From 982a0b2c54a608969760405679071e86917c19a1 Mon Sep 17 00:00:00 2001 From: Joe Brown Date: Tue, 17 Jul 2018 16:13:48 -0700 Subject: [PATCH] Updates report (#15) * changes to address count bug * updates report and its dependencies --- .gitignore | 1 + MANIFEST.in | 1 + README.md | 6 +- README.rst | 6 +- docs/install.rst | 12 +- example/log.txt | 50 -- example/mothur_sop_data/Sample1_R1.fastq | 12 - example/mothur_sop_data/Sample1_R1.fastq.gz | Bin 0 -> 637 bytes example/mothur_sop_data/Sample1_R2.fastq | 12 - example/mothur_sop_data/Sample1_R2.fastq.gz | Bin 0 -> 646 bytes hundo/Snakefile | 690 +++----------------- hundo/__init__.py | 2 +- hundo/environment.yml | 9 +- hundo/scripts/build_report.py | 596 +++++++++++++++++ index.html | 149 +++-- 15 files changed, 804 insertions(+), 742 deletions(-) delete mode 100644 example/log.txt delete mode 100644 example/mothur_sop_data/Sample1_R1.fastq create mode 100644 example/mothur_sop_data/Sample1_R1.fastq.gz delete mode 100644 example/mothur_sop_data/Sample1_R2.fastq create mode 100644 example/mothur_sop_data/Sample1_R2.fastq.gz create mode 100644 hundo/scripts/build_report.py diff --git a/.gitignore b/.gitignore index 743a3cf..c9ac32d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ example/annotation_references example/mothur_sop_results example/example_its_results +example/additional_data .snakemake # Byte-compiled / optimized / DLL files diff --git a/MANIFEST.in b/MANIFEST.in index 602bab5..837f7d0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include hundo/Snakefile include hundo/environment.yml +include hundo/scripts/build_report.py diff --git a/README.md b/README.md index 7606812..7d94964 100644 --- a/README.md +++ b/README.md @@ -27,10 +27,8 @@ https://bioconda.github.io/#using-bioconda Really, you just need to make sure `conda` is executable and you've set up your channels (numbers 1 and 2). Then: ``` -conda install python=3.6 \ - pyyaml snakemake>=5.1.4 biopython \ - biom-format=2.1.6 numpy pandas=0.23.1 \ - plotly=2.7.0 +conda install python>=3.6 click \ + pyyaml snakemake>=5.1.4 biopython pip install hundo ``` diff --git a/README.rst b/README.rst index 639acd1..b361650 100644 --- a/README.rst +++ b/README.rst @@ -34,10 +34,8 @@ set up your channels (numbers 1 and 2). Then: :: - conda install python=3.6 \ - pyyaml snakemake>=5.1.4 biopython \ - biom-format=2.1.6 numpy pandas=0.23.1 \ - plotly=2.7.0 + conda install python>=3.6 click \ + pyyaml snakemake>=5.1.4 biopython pip install hundo diff --git a/docs/install.rst b/docs/install.rst index 694ff91..df67ddb 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -11,10 +11,8 @@ set up your channels (steps 1 and 2). Then: :: - conda install python=3.6 \ - pyyaml snakemake>=5.1.4 biopython \ - biom-format=2.1.6 numpy pandas=0.23.1 \ - plotly=2.7.0 + conda install python>=3.6 click \ + pyyaml snakemake>=5.1.4 biopython pip install hundo To update to the newest version of Hundo, run @@ -26,10 +24,8 @@ To update to the newest version of Hundo, run Alternatively, if you do not want any new executables in your environment you can install into a new conda environment, e.g. 
hundo_env:: - conda create --name hundo_env python=3.6 \ - pyyaml snakemake>=5.1.4 biopython \ - biom-format=2.1.6 numpy pandas=0.23.1 \ - plotly=2.7.0 + conda create --name hundo_env python>=3.6 \ + click pyyaml snakemake>=5.1.4 biopython source activate hundo_env pip install hundo diff --git a/example/log.txt b/example/log.txt deleted file mode 100644 index 05dfa02..0000000 --- a/example/log.txt +++ /dev/null @@ -1,50 +0,0 @@ -[2018-06-06 10:56 INFO] Executing: snakemake --snakefile /Users/brow015/devel/hundo/hundo/Snakefile --directory /Users/brow015/devel/hundo/example/mothur_sop_results --printshellcmds --jobs 8 --rerun-incomplete --nolock --use-conda --config fastq_dir=/Users/brow015/devel/hundo/example/mothur_sop_data author='brow015' threads=8 database_dir=/Users/brow015/devel/hundo/example/annotation_references filter_adapters=/Users/brow015/devel/hundo/example/qc_references/adapters.fa.gz filter_contaminants=/Users/brow015/devel/hundo/example/qc_references/phix174.fa.gz allowable_kmer_mismatches=1 reference_kmer_match_length=27 reduced_kmer_min=8 minimum_passing_read_length=100 minimum_base_quality=10 minimum_merge_length=150 fastq_allowmergestagger=False fastq_maxdiffs=5 fastq_minovlen=16 maximum_expected_error=1.0 reference_chimera_filter=True minimum_sequence_abundance=2 percent_of_allowable_difference=3.0 reference_database=silva blast_minimum_bitscore=125 blast_top_fraction=0.95 read_identity_requirement=0.97 prefilter_file_size=100000 no_temp_declared=False --force summary.html -Finding samples in /Users/brow015/devel/hundo/example/mothur_sop_data -Sample1 is being omitted from analysis due to its file size (3 paired sequences). - -Found 4 samples for processing: - -F3D144_S210_L001_001: /Users/brow015/devel/hundo/example/mothur_sop_data/F3D144_S210_L001_R1_001.fastq.gz; /Users/brow015/devel/hundo/example/mothur_sop_data/F3D144_S210_L001_R2_001.fastq.gz -F3D142_S208_L001_001: /Users/brow015/devel/hundo/example/mothur_sop_data/F3D142_S208_L001_R1_001.fastq.gz; /Users/brow015/devel/hundo/example/mothur_sop_data/F3D142_S208_L001_R2_001.fastq.gz -F3D143_S209_L001_001: /Users/brow015/devel/hundo/example/mothur_sop_data/F3D143_S209_L001_R1_001.fastq.gz; /Users/brow015/devel/hundo/example/mothur_sop_data/F3D143_S209_L001_R2_001.fastq.gz -F3D141_S207_L001_001: /Users/brow015/devel/hundo/example/mothur_sop_data/F3D141_S207_L001_R1_001.fastq.gz; /Users/brow015/devel/hundo/example/mothur_sop_data/F3D141_S207_L001_R2_001.fastq.gz - -Building DAG of jobs... -Using shell: /usr/local/Cellar/bash/4.4.19/bin/bash -Provided cores: 8 -Rules claiming more threads will be scaled down. 
-Job counts: - count jobs - 1 report - 1 - -rule report: - input: OTU.biom, OTU.txt, hundo_results.zip, logs/F3D144_S210_L001_001_merged_filtered_eestats.txt, logs/F3D142_S208_L001_001_merged_filtered_eestats.txt, logs/F3D143_S209_L001_001_merged_filtered_eestats.txt, logs/F3D141_S207_L001_001_merged_filtered_eestats.txt, logs/F3D144_S210_L001_001_R1_eestats.txt, logs/F3D142_S208_L001_001_R1_eestats.txt, logs/F3D143_S209_L001_001_R1_eestats.txt, logs/F3D141_S207_L001_001_R1_eestats.txt, logs/F3D144_S210_L001_001_R2_eestats.txt, logs/F3D142_S208_L001_001_R2_eestats.txt, logs/F3D143_S209_L001_001_R2_eestats.txt, logs/F3D141_S207_L001_001_R2_eestats.txt, logs/raw_counts/F3D144_S210_L001_001_R1.count, logs/raw_counts/F3D142_S208_L001_001_R1.count, logs/raw_counts/F3D143_S209_L001_001_R1.count, logs/raw_counts/F3D141_S207_L001_001_R1.count, logs/filtered_counts/F3D144_S210_L001_001_R1.count, logs/filtered_counts/F3D142_S208_L001_001_R1.count, logs/filtered_counts/F3D143_S209_L001_001_R1.count, logs/filtered_counts/F3D141_S207_L001_001_R1.count, logs/merged_counts/F3D144_S210_L001_001_R1.count, logs/merged_counts/F3D142_S208_L001_001_R1.count, logs/merged_counts/F3D143_S209_L001_001_R1.count, logs/merged_counts/F3D141_S207_L001_001_R1.count - output: summary.html - jobid: 0 - -Finding samples in /Users/brow015/devel/hundo/example/mothur_sop_data -Sample1 is being omitted from analysis due to its file size (3 paired sequences). - -Found 4 samples for processing: - -F3D144_S210_L001_001: /Users/brow015/devel/hundo/example/mothur_sop_data/F3D144_S210_L001_R1_001.fastq.gz; /Users/brow015/devel/hundo/example/mothur_sop_data/F3D144_S210_L001_R2_001.fastq.gz -F3D142_S208_L001_001: /Users/brow015/devel/hundo/example/mothur_sop_data/F3D142_S208_L001_R1_001.fastq.gz; /Users/brow015/devel/hundo/example/mothur_sop_data/F3D142_S208_L001_R2_001.fastq.gz -F3D143_S209_L001_001: /Users/brow015/devel/hundo/example/mothur_sop_data/F3D143_S209_L001_R1_001.fastq.gz; /Users/brow015/devel/hundo/example/mothur_sop_data/F3D143_S209_L001_R2_001.fastq.gz -F3D141_S207_L001_001: /Users/brow015/devel/hundo/example/mothur_sop_data/F3D141_S207_L001_R1_001.fastq.gz; /Users/brow015/devel/hundo/example/mothur_sop_data/F3D141_S207_L001_R2_001.fastq.gz - -Building DAG of jobs... -Using shell: /usr/local/Cellar/bash/4.4.19/bin/bash -Job counts: - count jobs - 1 report - 1 -Changing to shadow directory: /Users/brow015/devel/hundo/example/mothur_sop_results/.snakemake/shadow/tmpgpw8ps6v -/Users/brow015/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: - -Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. - -Finished job 0. -1 of 1 steps (100%) done -Complete log: /Users/brow015/devel/hundo/example/mothur_sop_results/.snakemake/log/2018-06-06T105606.865615.snakemake.log -Protocol [hundo, version 1.1.13] completed successfully. 
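The Sample1 omission recorded in the deleted log above ("3 paired sequences") comes from the Snakefile's `count_sequences` prefilter, whose docstring this patch expands; the gzipped example files swapped in below exercise that same check on compressed input. A minimal sketch of the counter — not part of the patch, and assuming it follows the cited Stack Overflow approach of tallying newlines over 1 MB binary blocks, with four lines per FASTQ record:

```python
import gzip

def _make_gen(reader, block_size=1024 * 1024):
    # Yield fixed-size binary blocks until the reader is exhausted.
    block = reader(block_size)
    while block:
        yield block
        block = reader(block_size)

def count_sequences(filename):
    # FASTQ records span four lines each, so the sequence count is the
    # newline count divided by four; raw binary block reads keep this
    # cheap since it runs before any real processing starts.
    if filename.endswith(".gz"):
        fh = gzip.open(filename, "rb")
        reader = fh.read
    else:
        fh = open(filename, "rb")
        reader = fh.raw.read
    try:
        return sum(block.count(b"\n") for block in _make_gen(reader)) // 4
    finally:
        fh.close()

# For the example data below: count_sequences("Sample1_R1.fastq.gz") == 3
```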
diff --git a/example/mothur_sop_data/Sample1_R1.fastq b/example/mothur_sop_data/Sample1_R1.fastq deleted file mode 100644 index db17059..0000000 --- a/example/mothur_sop_data/Sample1_R1.fastq +++ /dev/null @@ -1,12 +0,0 @@ -@M03018:92:000000000-BMHLN:1:1106:13637:14338 1:N:0:0 CGCAAGCCCGCG -TACGGAGGGTGCAAGCGTTACCCGGAATCACTGGGCGTAAAGGGCGTGTAGGCGGAAGGTTAAGTCCGACTTTAAAGACCGGGGCTCAACCCCGGGCCTGGGTTGGAGACTGGCCTTCTGGACCTCTGGAGAGGCAACTGGAATTCCTGGTGTAGCGGTGGAATGCGTAGATACCAGGAGGAACACCGATGGCGAAGGCAGGTTGCTGGACAGAAGGTGACGCTGAGGCGCGAAAGTGTGGGGAGCGAACC -+ -CCCCCABBCAABGGGGGGGGGGHGGGGGHGHHHHHHGGGGHHHGGGGGGHHHHGG/EEGHFHHHHHHGEFGGHHHHFHHHGGGGGGHHHHHGGGGGGGGGHGHGGGHHHGHHHHHHHHHHBGHCHHHHHHHHGGGGHHHHHGGGGGGGGGGGGGGGGGGGGGGGGGGGFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0BBBFDDFFFFEFAFB=FD0BBFFFFFFFFFDF@ -@M03018:92:000000000-BMHLN:1:2105:5526:23637 1:N:0:0 CGCAAGCCCGCG -TAATTAGATACCCCTGTAGTCCGGCTGGCTGACTATCTCGTATGCCGTCTTCTGCTTGAAAAAAATTTACTCTAACTACTTGTTTCTATCTTCCTCTTTCTCTTTCCTCCTATCCTTATCTCCATTCTCCTGTTTAAATCCACTCCTTTCTCTCATATTTCCTTCTGTCTCTACCTAGTACAATCTTATTTTTCCCTACTTGTCCAATTTCTATTTCCTACCCTCCTTCACGAACTCTCTTTCATCATACT -+ -AAA1A3C3B@FFFEFAGFFG33EEEAA0EF000D1D1DFA/A0G2B///BA22D1FGEB11DGG>011@2B12B@B11@11111BB222212211@@B1222221B@11001>B022211BB11>2222111<<1B2122B1B1@@<1@@22@1111111111111<1?D1<11>1<11<<11111100=0000=00D00<<..<..:0000...-.000:090;C000;;0 -@M03018:92:000000000-BMHLN:1:2108:10170:23267 1:N:0:0 CGCAAGCCCGCG -TACGGAGGGTGCAAGCGTTACCCGGAATCACTGGGCGTAAAGGGCGTGTAGGCGGAAGGTTAAGTCCGACTTTAAAGACCGGGGCTCAACCCCGGGCCTGGGTTGGAGACTGGCTTTCTGGACCTCTGGAGAGGCAACTGGAATTCCTGGTGTAGCGGTGGAATGCGTAGATACCAGGAGGAACACCGATGGCGAAGGCAGGTTGCTGGACAGAAGGTGACGCTGAGGCGCGAAAGTGTGGGGAGCGAACC -+ -BBBBBBBBBABBGGGGGGFEF5FEGGGGFHHHHHHHGGGGHHHGGGGGGHHHHGGG/>EGGHHHHFFFGEEFHHHHE?GHEEGGGGHHHHHGGGGFFGGGAFGGC@GGHGHHHHHFFGHDGHFGHHHHFHHHGGCCCGHHHHGGGGGFFGGGGGGGGGGGAEGFFGGGFFFFFBFFFFFFFFFFFFFCBDFDFFFFFFF;A>99.AFFFFFBEF//9BBFFDFFFFFFFFFFFFFF/FFFFFFFFDBDDF> diff --git a/example/mothur_sop_data/Sample1_R1.fastq.gz b/example/mothur_sop_data/Sample1_R1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..6b17702433d6998d2a9a361cbb26a8aa85111930 GIT binary patch literal 637 zcmV-@0)qV?iwFpCml0b415;sbaBO8UUs5qHW?^%5aRAL!&2HN;48G?ndLJ5#@{%@= z>zbq``Ox;H_kWgsr0k$U(G9x{V-1nwf5k7H-$NhhdAP(O?DQz_>)Xdb15r2+)SvsO zfqwS=^8*bZLm0w?7G*A~;N?(ROW{&VZQ4?yQPXi%)(Q)B%-jTs#*5wI$)&;#X%ge1h zNT%?9&5JFowa0!{J^VOWlgc8fFXcw-cCJn?H|7GQE~T>O&75*m=R2qMQ?_->u--_@ zs=NF(;w$C9r63}CSxiisv%KU5s+)Pq^I`rScp`<_7&m5MB%bIF=B`@p#1NxHukZ}cwgvzI8O=E8 literal 0 HcmV?d00001 diff --git a/example/mothur_sop_data/Sample1_R2.fastq b/example/mothur_sop_data/Sample1_R2.fastq deleted file mode 100644 index 0d1bb99..0000000 --- a/example/mothur_sop_data/Sample1_R2.fastq +++ /dev/null @@ -1,12 +0,0 @@ -@M03018:92:000000000-BMHLN:1:1106:13637:14338 2:N:0:0 CGCAAGCCCGCG -CCGGTTCGCTCCCCACACTTTCGCGCCTCAGCGTCACCTTCTGTCCAGCAACCTGCCTTCGCCATCGGTGTTCCTCCTGGTATCTACGCATTCCACCGCTACACCAGGAATTCCAGTTGCCTCTCCAGAGGTCCAGAAGGCCAGTCTCCAACCCAGGCCCGGGGTTGAGCCCCGGTCTTTAAAGTCGGACTTAACCTTCCGCCTACACGCCCTTTACGCCCAGTGATTCCGGGTAACGCTTGCACCCTCCG -+ -BBBBBBBBBBBBGGGGAEGGGGGGGGGGGHHHGGGGGHHHHHHHHHHHHHHGHHHHHHHHHGGGGHGCGGGGHHHHHHHHHHGHHHHGGGGGGHHHHGGGEGHGHHHGGHGHHHHHHGHGHHHHHHHHFHGHHHHHHHHGHHHHHFGHHHHHGGGFGHGGGGGGGGGBGHHGGGGGGGGGGGGGGGFGAGGGGGGFFFFFDFFFFFFFDFFFFFFFFFFFFBFFFFFFFFFFFFFBBDFFFFEBBFFE:D. 
-@M03018:92:000000000-BMHLN:1:2105:5526:23637 2:N:0:0 CGCAAGCCCGCG -TATTATCGCGGCTGCTTGCACACAATTACAATACGCGGGCTTGCGAGCGTATTGATCTTGGTGGTCGCCGTATCATTACCAATACATACATTTCCTTCCCACATACAGTTATCAATTCAGCCCTAAGATCCTCTCTCCACATATTCTTCACTCCCATTTATTCTTTCTTCTCACAATTTTCACCCCCACAATTCATAGCACCACCCTCACCCTATACACTTATTCACTCCTCCCTACGCTTATTTATCGCT -+ -1>>A1331111>0001110111B00AD1F11BBA//E//AA/?A@>//E//>12@@FH12@1/?/1>///EA0<2FD2111B1<12B00?0/11?B122>2?<111?11<0?000011111110110101><<0<0000000EC@CGCCGHHHHGGGGG@GGG0F0B9BB-AAGGGCFFFFFFFFBFFFFFF@E@@FFFFFF@@FFF9BFFFBF-@B@FFB@BFEFFFFFEE-- diff --git a/example/mothur_sop_data/Sample1_R2.fastq.gz b/example/mothur_sop_data/Sample1_R2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..3a28a486a0360874bc8d88b6a9e2cf45bd1e6ab7 GIT binary patch literal 646 zcmV;10(t!(iwFpFml0b415;sbaBO8UUs5tIW?^%5aRAL!$&%b4488YP%spkZIMXwx z9k&Gr`H-0t|Nkw~6Lv4DEXPDHKs*Ua1|ot{7=5sZbn#>R{ux|w&YpuC&cj=9 zUx(rS#fQ&egMCSyh&Ux#`6M&fDryxVk$hFRVy{WOSY}acs-giPY*`_hsp32$AV3ve zkV;4ajF6-vsvKo%n0RY+)dg;=C~8xfk{&>fjQUvXMJkj>1V4$Fplc~wFY>?-E~I` z4=`p@Q!}t~iSU$oLy(4*LCP3C(T=pCgsP2^09Z3Afi%6mCITso>h1-Sf~|TC98f^P zx|gN)o#+~?lJcQZLT62v+G&kd6;SV!)*3#;VD3OfG#t@k+ zEwk#g^UG2;nVq>BC!on>Ctp^Nqi&km`^m+1_2Md_uGTnrjn4boUnlXoIyVX0^0}7N z$~RAwo%TnZ5*PdJw!PR#Nw)7U-Pre{k1~w4{iTNpHjWn)Op&rKrpaX-O~8R}q;6EG z>$<;k=RHVGZ&t|T&%Yth|I>KrA2c2!M{SV)$7%nqbI$k1!|vFWA{SiJ(Lk6@N$_ky z;=H#H2Iirc-FmZVquAR_Dafe)g&mrUG@i0*6=mxCIj2Sc&08~DWY}Wk7-Odur=Pma gl7-n#!#YMabxV|8Tw=*;%UQ_t1K)M-vbF{Q0AABawg3PC literal 0 HcmV?d00001 diff --git a/hundo/Snakefile b/hundo/Snakefile index dca0d18..ab147e3 100644 --- a/hundo/Snakefile +++ b/hundo/Snakefile @@ -5,10 +5,16 @@ import subprocess import sys from snakemake.logging import logger -from snakemake.utils import report + + +localrules: all, print_protocol_version, count_raw_reads, combine_merged_reads def get_conda_env(): + """ + Retrieves the file path of the conda definitions included in the hundo + Python package. + """ if config.get("environment"): yml = config.get("environment") else: @@ -30,7 +36,14 @@ def _make_gen(reader): def count_sequences(filename): - """https://stackoverflow.com/a/27518377""" + """ + Used to count sequences within files of very low size. If protocol starts + with a file that does not have reads that will pass the sample_group + stage, the protocol will fail as a subsequent rule will not have an input + file. + + Source: https://stackoverflow.com/a/27518377 + """ if filename.endswith(".gz"): import gzip @@ -43,6 +56,10 @@ def count_sequences(filename): def get_sample_files(fastq_dirs, prefilter_file_size): + """ + Looks into folders and file wildcard patterns to grab all relevant files + and their paired sequence. Sample names are determined from the file name. 
+ """ if not fastq_dirs: logger.error( ( @@ -186,8 +203,10 @@ def get_run_reference_chimera_filter_inputs(wildcards): def filter_fastas(fasta1, uclust, fasta2, output): - """fasta1 is the larger set of reads to be filtered - by read IDs in uclust and fasta2""" + """ + fasta1 is the larger set of reads to be filtered + by read IDs in uclust and fasta2 + """ accepted_reads = set() with open(fasta2) as fh: for line in fh: @@ -263,13 +282,6 @@ BLASTREF = [os.path.join(config.get("database_dir", "."), i) for i in REF] REFFASTA = [os.path.join(config.get("database_dir", "."), i) for i in REF if ".fasta" in i][0].rpartition(".")[0] REFMAP = [os.path.join(config.get("database_dir", "."), i) for i in REF if i.endswith(".map")][0] REFTRE = [os.path.join(config.get("database_dir", "."), i) for i in REF if i.endswith(".tre")][0] -STYLE = """ - - - -""" rule all: @@ -825,19 +837,9 @@ rule make_deliverables_archive: "zip {output} {input}" -rule report: - input: - biom = "OTU.biom", - txt = "OTU.txt", - archive = "hundo_results.zip", - fastq_stats_post = expand("logs/{sample}_merged_filtered_eestats.txt", sample=SAMPLES.keys()), - raw_r1_quals = expand("logs/{sample}_R1_eestats.txt", sample=SAMPLES.keys()), - raw_r2_quals = expand("logs/{sample}_R2_eestats.txt", sample=SAMPLES.keys()), - raw_counts = expand("logs/raw_counts/{sample}_R1.count", sample=SAMPLES.keys()), - filtered_counts = expand("logs/filtered_counts/{sample}_R1.count", sample=SAMPLES.keys()), - merged_counts = expand("logs/merged_counts/{sample}_R1.count", sample=SAMPLES.keys()) - shadow: - "shallow" +rule aggregate_run_parameters: + output: + txt = "PARAMS.txt" params: fastq_dir = config.get("fastq_dir"), filter_adapters = "None" if not config.get("filter_adapters") else config.get("filter_adapters"), @@ -859,580 +861,82 @@ rule report: blast_minimum_bitscore = config.get("blast_minimum_bitscore"), blast_top_fraction = config.get("blast_top_fraction"), read_identity_requirement = config.get("read_identity_requirement") + run: + with open(output.txt, "w") as fh: + print("fastq_dir:", params.fastq_dir, sep=" ", file=fh) + print("filter_adapters:", params.filter_adapters, sep=" ", file=fh) + print("filter_contaminants:", params.filter_contaminants, sep=" ", file=fh) + print("allowable_kmer_mismatches:", params.allowable_kmer_mismatches, sep=" ", file=fh) + print("reference_kmer_match_length:", params.reference_kmer_match_length, sep=" ", file=fh) + print("reduced_kmer_min:", params.reduced_kmer_min, sep=" ", file=fh) + print("minimum_passing_read_length:", params.minimum_passing_read_length, sep=" ", file=fh) + print("minimum_base_quality:", params.minimum_base_quality, sep=" ", file=fh) + print("minimum_merge_length:", params.minimum_merge_length, sep=" ", file=fh) + print("fastq_allowmergestagger:", params.fastq_allowmergestagger, sep=" ", file=fh) + print("fastq_maxdiffs:", params.fastq_maxdiffs, sep=" ", file=fh) + print("fastq_minovlen:", params.fastq_minovlen, sep=" ", file=fh) + print("maximum_expected_error:", params.maximum_expected_error, sep=" ", file=fh) + print("reference_chimera_filter:", params.reference_chimera_filter, sep=" ", file=fh) + print("minimum_sequence_abundance:", params.minimum_sequence_abundance, sep=" ", file=fh) + print("percent_of_allowable_difference:", params.percent_of_allowable_difference, sep=" ", file=fh) + print("reference_database:", params.reference_database, sep=" ", file=fh) + print("blast_minimum_bitscore:", params.blast_minimum_bitscore, sep=" ", file=fh) + print("blast_top_fraction:", 
params.blast_top_fraction, sep=" ", file=fh) + print("read_identity_requirement:", params.read_identity_requirement, sep=" ", file=fh) + + +rule build_report: + input: + report_script = os.path.join( + os.path.dirname(os.path.abspath(workflow.snakefile)), + "scripts", + "build_report.py" + ), + biom = "OTU.biom", + txt = "OTU.txt", + archive = "hundo_results.zip", + params = "PARAMS.txt", + env = CONDAENV, + fastq_stats_post = expand("logs/{sample}_merged_filtered_eestats.txt", + sample=SAMPLES.keys() + ), + raw_r1_quals = expand("logs/{sample}_R1_eestats.txt", + sample=SAMPLES.keys() + ), + raw_r2_quals = expand("logs/{sample}_R2_eestats.txt", + sample=SAMPLES.keys() + ), + raw_counts = expand("logs/raw_counts/{sample}_R1.count", + sample=SAMPLES.keys() + ), + filtered_counts = expand("logs/filtered_counts/{sample}_R1.count", + sample=SAMPLES.keys() + ), + merged_counts = expand("logs/merged_counts/{sample}_R1.count", + sample=SAMPLES.keys() + ) + shadow: + "shallow" + params: + omitted = "'%s'" % ",".join(["{sample}:{count}".format(sample=k, count=v) for k, v in OMITTED.items()]), + author = config.get("author", "hundo"), + version = "'%s'" % PROTOCOL_VERSION output: html = "summary.html" + conda: + CONDAENV group: "combined_group" - run: - import csv - from collections import defaultdict - - import numpy as np - import pandas as pd - import plotly.figure_factory as ff - import plotly.graph_objs as go - from biom import parse_table - from plotly import offline - - - PLOTLY_PARAMS = dict( - include_plotlyjs=False, show_link=False, output_type="div" - ) - BLUE = "rgba(31,119,180,0.3)" - RED = "rgba(214,39,40,0.3)" - - def shannon_x(x, total): - return (x / total) * np.log(x / total) if x > 0 else 0 - - shannon_xf = np.vectorize(shannon_x, otypes=[np.float]) - - def shannon_index(arr): - return -1 * sum(shannon_xf(arr, sum(arr))) - - - # summary stats: stats from the biom table - with open(input.biom) as fh: - bt = parse_table(fh) - biom_df = bt.to_dataframe() - s_idx = biom_df.apply(shannon_index, axis=0) - s_idx.sort_values(inplace=True) - # sample order for plots - sample_order = s_idx.index.tolist() - otu_df = [ - ["Samples", "OTUs", "OTU Total Count", "OTU Table Density"], - [bt.length("sample"), bt.length("observation"), bt.sum(), '{:f}'.format(bt.get_table_density())] - ] - otu_df = pd.DataFrame(otu_df) - # make the first row the column labels - otu_df.rename(columns=otu_df.iloc[0], inplace=True) - # drop the first row - otu_df = otu_df.reindex(otu_df.index.drop(0)) - # build the table - summary_table = otu_df.to_html( - index=False, bold_rows=False, classes=["table", "table-bordered"], table_id="otuTable" - ) - # fix for pandoc - summary_table = summary_table.replace("\n", "\n" + 10 * " ") - del(otu_df) - - # quality plot - raw_qual_stats = defaultdict(lambda: defaultdict(list)) - for r1_ee_file in input.raw_r1_quals: - sample_name = r1_ee_file.partition("logs/")[-1].partition("_R1")[0] - r2_ee_file = "_R2".join(r1_ee_file.rsplit("_R1", 1)) - with open(r1_ee_file) as fh: - reader = csv.DictReader(fh, delimiter="\t") - for row in reader: - raw_qual_stats["R1"][sample_name].append(float(row["Mean_Q"])) - with open(r2_ee_file) as fh: - reader = csv.DictReader(fh, delimiter="\t") - for row in reader: - q = float(row["Mean_Q"]) - raw_qual_stats["R2"][sample_name].append(q) - data = [] - for read_index, sample_data in raw_qual_stats.items(): - color = BLUE if read_index == "R1" else RED - for sample_name, sample_stats in sample_data.items(): - data.append(go.Scatter( - 
x=list(range(1,len(sample_stats))), - y=sample_stats, - name=sample_name, - text=sample_name, - hoverinfo="text+x+y", - legendgroup=read_index, - mode="lines", - line=dict( - color=color, - dash="solid" - ) - ) - ) - layout = go.Layout( - title='Mean Quality Scores for R1 and R2', - margin={"b": "auto", "r": "auto"}, - xaxis={"title":"Position"}, - yaxis={"title":"Quality (Phred score)"}, - hovermode="closest", - showlegend=False, - autosize=True, - annotations=[ - dict( - x=0, - y=1.1, - xref="paper", - yref="paper", - text="Forward", - showarrow=False, - font=dict( - size=16, - color="#ffffff" - ), - align="left", - borderpad=4, - bgcolor="#1f77b4", - ), - dict( - x=0.15, - y=1.1, - xref="paper", - yref="paper", - text="Reverse", - showarrow=False, - font=dict( - size=16, - color='#ffffff' - ), - align='left', - borderpad=4, - bgcolor="#d62728", - ) - ] - ) - fig = go.Figure(data=data, layout=layout) - quality_plot = offline.plot( - fig, - **PLOTLY_PARAMS - ) - - # taxonomy plot with buttons - df = pd.read_table(input.txt) - # sample_cols = [i for i in df.columns if i not in ["#OTU ID", "taxonomy"]] - df[["kingdom", "phylum", "class", "order", "family", "genus", "species"]] = df["taxonomy"].str.split(",", expand=True) - tax_dataframes = [] - tax_dataframes_rel = [] - levels = ["phylum", "class", "order"] - for level in levels: - sub = df[[level] + sample_order].copy() - # cols = sub[level].tolist() - sub[level] = sub[level].str.replace("%s__" % level[0], "") - # sum same assignments over rows - sub = sub.groupby([level]).sum().reset_index() - # sort the stacks - sub["TotalCount"] = sub[sample_order].sum(axis=1) - sub.sort_values("TotalCount", ascending=False, inplace=True) - # redefine columns after sort - sub = sub[[level] + sample_order] - # relative abundance dataframe - sub_p = sub.copy() - sub_p[sample_order] = sub_p[sample_order].div(sub_p[sample_order].sum()) - sub_p[sample_order] = sub_p[sample_order] * 100 - sub_p = sub_p.transpose() - sub_p.columns = sub_p.loc[level] - sub_p.drop([level], inplace=True) - tax_dataframes_rel.append(sub_p) - # absolute abundance dataframe - sub = sub.transpose() - sub.columns = sub.loc[level] - sub.drop([level], inplace=True) - tax_dataframes.append(sub) - del(df) - data = [] - lengths = [] - for i, subset in enumerate(tax_dataframes): - data += [go.Bar(x=subset.index, - y=subset[tax], - name=tax, - text=tax, - hoverinfo="text+y", - visible=True if i == 0 else False - ) for tax in subset.columns] - lengths.append(len(subset.columns)) - # plot buttons - updatemenus = list([ - dict(type="buttons", - active=0, - buttons=list([ - dict(label = 'Phylum', - method = 'update', - args = [{'visible': [True]*lengths[0] + [False]*lengths[1] + [False]*lengths[2]}, - {"title":"Assigned Taxonomy Per Sample (Level: Phylum)"} - ]), - dict(label = 'Class', - method = 'update', - args = [{'visible': [False]*lengths[0] + [True]*lengths[1] + [False]*lengths[2]}, - {"title":"Assigned Taxonomy Per Sample (Level: Class)"} - ]), - dict(label = 'Order', - method = 'update', - args = [{'visible': [False]*lengths[0] + [False]*lengths[1] + [True]*lengths[2]}, - {"title":"Assigned Taxonomy Per Sample (Level: Order)"} - ]) - ]), - direction = 'left', - pad = {'r': 0, 't': 0}, - showactive = True, - x = 0, - xanchor = 'left', - y = 1.05, - yanchor = 'top' - ) - ]) - layout = dict(title="Assigned Taxonomy Per Sample (Level: Phylum)", - updatemenus=updatemenus, - barmode="stack", - height=700, - margin={"b": "auto", "r": "auto"}, - # xaxis={"tickangle": -60}, - yaxis={"title": 
"Count"}, - showlegend=False, - hovermode='closest' - ) - fig = go.Figure(data=data, layout=layout) - taxonomy_plot = offline.plot( - fig, - **PLOTLY_PARAMS - ) - # make the relative abundance taxonomy plot - data = [] - for i, subset in enumerate(tax_dataframes_rel): - data += [go.Bar(x=subset.index, - y=subset[tax], - name=tax, - text=tax, - hoverinfo="text+y", - visible=True if i == 0 else False - ) for tax in subset.columns] - layout["yaxis"]["title"] = "Relative Abundance" - fig = go.Figure(data=data, layout=layout) - relative_abundance_taxonomy_plot = offline.plot( - fig, - **PLOTLY_PARAMS - ) - - # merged reads quality plot - ee_stats = defaultdict(list) - for stats_file in input.fastq_stats_post: - sample_name = stats_file.partition("logs/")[-1].partition("_merged")[0] - with open(stats_file) as fh: - reader = csv.DictReader(fh, delimiter="\t") - for row in reader: - ee_stats[sample_name].append(row["Max_EE"]) - data = [] - for sample_name, sample_stats in ee_stats.items(): - data.append(go.Scatter( - x=list(range(1,len(sample_stats))), - y=sample_stats, - name=sample_name, - text=sample_name, - hoverinfo="text+x+y", - mode="lines", - line=dict( - color=BLUE, - dash="solid" - ) - ) - ) - layout = go.Layout( - title='Merged Sequence Quality (Max EE: {})'.format(params.maximum_expected_error), - margin={"b": "auto", "r": "auto"}, - yaxis={"title":"Expected Error"}, - xaxis={"title":"Position"}, - hovermode="closest", - showlegend=False - ) - fig = go.Figure(data=data, layout=layout) - merged_reads_quality_plot = offline.plot( - fig, - **PLOTLY_PARAMS - ) - - # sample summary table - count_data = [] - for sample, count in OMITTED.items(): - count_data.append([sample, count, "NA", "NA", "NA", "NA"]) - for sample in sample_order: - sd = [sample] - count_file = "logs/raw_counts/{}_R1.count".format(sample) - with open(count_file) as fh: - for line in fh: - sd.append(int(float(line.strip()))) - with open(count_file.replace("raw_counts/", "filtered_counts/")) as fh: - for line in fh: - sd.append(int(float(line.strip()))) - with open(count_file.replace("raw_counts/", "merged_counts/")) as fh: - for line in fh: - sd.append(int(float(line.strip()))) - sd.append(int(biom_df[sample].sum())) - sd.append('{:f}'.format(s_idx.loc[sample])) - count_data.append(sd) - ss = pd.DataFrame(count_data, columns=["Sample", "Raw", "Filtered", "Merged", "In OTUs", "Shannon"]) - sample_summary_table = ss.to_html( - index=False, bold_rows=False, classes=["table", "table-bordered"], table_id="summaryTable" - ) - sample_summary_table = sample_summary_table.replace("\n", "\n" + 10 * " ") - - # sample summary counts plot - # remove omitted samples from df - keep = [idx for idx, sample in ss.Sample.items() if sample not in OMITTED] - ss = ss.iloc[keep] - data = [go.Bar(x=ss.Sample, - y=ss[metric], - name=metric, - visible=True if i == 0 else "legendonly", - ) for i, metric in enumerate(["Raw", "Filtered", "Merged", "In OTUs"]) - ] - layout = dict(title="Counts Per Sample By Stage", - barmode="group", - height=700, - margin={"b": "auto", "r": "auto"}, - # xaxis={"tickangle": -60}, - yaxis={"title": "Count"}, - showlegend=True, - hovermode="compare", - bargroupgap=0.1, - legend={"orientation": "h", - "x": 0, - "y":1.06} - ) - fig = go.Figure(data=data, layout=layout) - sample_counts_plot = offline.plot( - fig, - **PLOTLY_PARAMS - ) - - # conda environment - conda_env = "" - with open(CONDAENV) as fh: - for i, line in enumerate(fh): - if i == 0: - conda_env += line - else: - conda_env += " " + line - - report_str = """ - 
- .. raw:: html - {STYLE} - - - - - - - ============================================================= - hundo_ - Experiment Summary - ============================================================= - - .. contents:: - :backlinks: none - :depth: 2 - - Summary - ------- - - Sequence Counts - *************** - - In the plots, samples are ordered by diversity (least to most). - - .. raw:: html - -
- {sample_summary_table} -
- {sample_counts_plot} - - Sequence Quality - **************** - - .. raw:: html - - {quality_plot} - {merged_reads_quality_plot} - - OTUs - **** - - .. raw:: html - -
- {summary_table} -
- - Taxonomy by Count - ***************** - - .. raw:: html - - {taxonomy_plot} - - Taxonomy by Percent - ******************* - - .. raw:: html - - {relative_abundance_taxonomy_plot} - - Methods - ------- - - ``hundo`` wraps a number of functions and its exact steps are - dependent upon a given user's configuration. - - Paired-end sequence reads are quality trimmed and can be trimmed of - adapters and filtered of contaminant sequences using BBDuk2 of the - BBTools_ package. Passing reads are merged using - VSEARCH_ then aggregated into a single FASTA file with headers - describing the origin and count of the sequence. - - Prior to creating clusters, reads are filtered again based on expected - error rates: - - To create clusters, the aggregated, merged reads are dereplicated to - remove singletons by default using VSEARCH. Sequences are preclustered - into centroids using VSEARCH to accelerate chimera filtering. Chimera - filtering is completed in two steps: *de novo* and then reference - based. The reference by default is the entire annotation database. - Following chimera filtering, sequences are placed into clusters using - distance-based, greedy cluster with VSEARCH based on the allowable - percent difference of the configuration. - - After OTU sequences have been determined, BLAST_ is used to align - sequences to the reference database. Reference databases were curated - by the CREST_ team and hundo incorporates the CREST LCA method. - - Counts are assigned to OTUs using the global alignment method of - VSEARCH, which outputs the final OTU table as a tab-delimited text - file. The Biom_ command line tool is used to convert the tab table - to biom. - - Multiple alignment of sequences is completed using MAFFT_. - A tree based on the aligned sequences is built using FastTree2_. - - This workflow is built using Snakemake_ and makes use of Bioconda_ - to install its dependencies. - - .. _BBTools: https://sourceforge.net/projects/bbmap/ - .. _VSEARCH: https://github.com/torognes/vsearch - .. _BLAST: https://blast.ncbi.nlm.nih.gov/Blast.cgi - .. _CREST: https://github.com/lanzen/CREST - .. _Biom: http://biom-format.org/ - .. _MAFFT: https://mafft.cbrc.jp/alignment/software/ - .. _FastTree2: http://www.microbesonline.org/fasttree/ - .. _Snakemake: https://snakemake.readthedocs.io/en/stable/ - .. 
_Bioconda: https://bioconda.github.io/ - - Configuration - ------------- - - :: - - fastq_dir: {params.fastq_dir} - filter_adapters: {params.filter_adapters} - filter_contaminants: {params.filter_contaminants} - allowable_kmer_mismatches: {params.allowable_kmer_mismatches} - reference_kmer_match_length: {params.reference_kmer_match_length} - allowable_kmer_mismatches: {params.allowable_kmer_mismatches} - reference_kmer_match_length: {params.reference_kmer_match_length} - reduced_kmer_min: {params.reduced_kmer_min} - minimum_passing_read_length: {params.minimum_passing_read_length} - minimum_base_quality: {params.minimum_base_quality} - minimum_merge_length: {params.minimum_merge_length} - allow_merge_stagger: {params.fastq_allowmergestagger} - max_diffs: {params.fastq_maxdiffs} - min_overlap: {params.fastq_minovlen} - maximum_expected_error: {params.maximum_expected_error} - reference_chimera_filter: {params.reference_chimera_filter} - minimum_sequence_abundance: {params.minimum_sequence_abundance} - percent_of_allowable_difference: {params.percent_of_allowable_difference} - reference_database: {params.reference_database} - blast_minimum_bitscore: {params.blast_minimum_bitscore} - blast_top_fraction: {params.blast_top_fraction} - read_identity_requirement: {params.read_identity_requirement} - - - Execution Environment - --------------------- - - :: - - {conda_env} - - - Output Files - ------------ - - Not all files written by the workflow are contained within the - Downloads section of this page to minimize the size of the this - document. Other output files are described and are written to the - results directory. - - - Attached - ******** - - The zip archive contains the following files: - - OTU.biom - ```````` - - Biom table with raw counts per sample and their associated - taxonomic assignment formatted to be compatible with downstream tools - like phyloseq_. - - OTU.fasta - ````````` - - Representative DNA sequences of each OTU. - - OTU.tree - ```````` - - Newick tree representation of aligned OTU sequences. - - OTU.txt - ``````` - - Tab-delimited text table with columns OTU ID, a column for each sample, - and taxonomy assignment in the final column as a comma delimited list. - - - Other Result Files - ****************** - - Other files that may be needed, but that are not included in the - attached archive include: - - OTU_aligned.fasta - ````````````````` - - OTU sequences after alignment using MAFFT. - - all-sequences.fasta - ``````````````````` - - Quality-controlled, dereplicated DNA sequences of all samples. The - header of each record identifies the sample of origin and the count - resulting from dereplication. - - blast-hits.txt - `````````````` - - The BLAST assignments per OTU sequence. - - Downloads - --------- - - .. _hundo: https://github.com/pnnl/hundo - .. 
_phyloseq: https://joey711.github.io/phyloseq/ - + shell: + """ + python {input.report_script} --biom {input.biom} --txt {input.txt} \ + --zip {input.archive} --env {input.env} --params {input.params} \ + --eeplot 'logs/*_merged_filtered_eestats.txt' \ + --r1quals 'logs/*_R1_eestats.txt' \ + --raw 'logs/raw_counts/*_R1.count' --omitted {params.omitted} \ + --html {output.html} --author {params.author} \ + --version {params.version} """ - report(report_str, - output.html, - metadata="Author: " + config.get("author", "hundo"), - stylesheet="", - archive=input.archive - ) onsuccess: diff --git a/hundo/__init__.py b/hundo/__init__.py index bb6d454..96f8c3b 100644 --- a/hundo/__init__.py +++ b/hundo/__init__.py @@ -1 +1 @@ -__version__ = "1.1.18" +__version__ = "1.1.19" diff --git a/hundo/environment.yml b/hundo/environment.yml index a7fd5ee..de57259 100644 --- a/hundo/environment.yml +++ b/hundo/environment.yml @@ -4,18 +4,23 @@ channels: - conda-forge - defaults dependencies: - - python=3.6 + - python>=3.6 - bbmap=37.17 - biom-format=2.1.6 - biopython - blast=2.6.0 - bzip2=1.0.6 + - click=6.7 + - docutils=0.14 - mafft=7.313 - fasttree=2.1.10 - numpy>=1.14.0 - pandas>=0.23.0 - pigz=2.3.4 - - plotly=2.7.0 - pyyaml>=3.12 - vsearch=2.6.0 - zip=3.0 + - pip + - pip: + - relatively + - plotly>=3.0 diff --git a/hundo/scripts/build_report.py b/hundo/scripts/build_report.py new file mode 100644 index 0000000..051bb81 --- /dev/null +++ b/hundo/scripts/build_report.py @@ -0,0 +1,596 @@ +#!/usr/bin/env python +# coding=utf-8 + +import argparse +import base64 +import csv +import datetime +import io +import mimetypes +import os +import textwrap +from collections import defaultdict +from glob import glob + +import numpy as np +import pandas as pd +import plotly.figure_factory as ff +import plotly.graph_objs as go +import relatively +from biom import parse_table +from docutils.parsers.rst import directives +from docutils.core import publish_file, publish_parts +from plotly import offline + + +STYLE = """ + + + + """ +SCRIPT = """ + + + + + + """ +BLUE = "rgba(31,119,180,0.3)" +RED = "rgba(214,39,40,0.3)" + + +def build_uri(file, file_type=None, default_encoding="utf8"): + fname = os.path.basename(file) + type, encoding = mimetypes.guess_type(file) + if file_type is None: + file_type = type + if encoding is not None: + default_encoding = encoding + with open(file, mode="rb") as fh: + fdata = base64.b64encode(fh.read()) + return ( + f'data:{file_type};charset={default_encoding};filename={fname};base64,{fdata.decode("utf-8")}' + ) + + +def parse_biom(biom): + with open(biom) as fh: + bt = parse_table(fh) + df = bt.to_dataframe() + otu_df = [ + ["Samples", "OTUs", "OTU Total Count", "OTU Table Density"], + [ + bt.length("sample"), + bt.length("observation"), + bt.sum(), + "{:f}".format(bt.get_table_density()), + ], + ] + otu_df = pd.DataFrame(otu_df) + # make the first row the column labels + otu_df.rename(columns=otu_df.iloc[0], inplace=True) + # drop the first row + otu_df = otu_df.reindex(otu_df.index.drop(0)) + # build the table + summary_table = otu_df.to_html( + index=False, + bold_rows=False, + classes=["table", "table-bordered"], + table_id="otuTable", + ) + # fix for rst + summary_table = summary_table.replace("\n", "\n" + 10 * " ") + return df, summary_table + + +def make_div(figure_or_data, include_plotlyjs=False, show_link=False): + div = offline.plot( + figure_or_data, + include_plotlyjs=include_plotlyjs, + show_link=show_link, + output_type="div", + ) + if ".then(function ()" in div: + div = 
f"""{div.partition(".then(function ()")[0]}""" + return div + + +def build_summary_table(raw, biom_df, div_df, omitted=None): + # sample summary table + header = ["Sample", "Raw", "Filtered", "Merged", "In OTUs"] + count_data = [] + omit = [] + if omitted: + for sample_count in omitted.split(","): + sample, count = sample_count.split(":") + omit.append(sample) + count_data.append([sample, count]) + for count_file in raw: + sample_name = count_file.partition("logs/raw_counts/")[-1].partition( + "_R1.count" + )[0] + sd = [sample_name] + with open(count_file) as fh: + for line in fh: + sd.append(int(float(line.strip()))) + with open(count_file.replace("raw_counts/", "filtered_counts/")) as fh: + for line in fh: + sd.append(int(float(line.strip()))) + with open(count_file.replace("raw_counts/", "merged_counts/")) as fh: + for line in fh: + sd.append(int(float(line.strip()))) + sd.append(int(biom_df[sample_name].sum())) + count_data.append(sd) + df = pd.DataFrame(count_data, columns=header) + df = df.merge(div_df, how="outer", left_on="Sample", right_index=True) + df.sort_values(by="shannon", inplace=True) + # summary table + df_str = df.to_html( + index=False, + bold_rows=False, + classes=["table", "table-bordered"], + table_id="summaryTable", + ) + df_str = df_str.replace("\n", "\n" + 10 * " ") + df = df.reset_index() + # pull out the omitted samples + keep = [idx for idx, sample in df.Sample.items() if sample not in omit] + df = df.iloc[keep] + df.sort_values(by="Raw", inplace=True) + plot_data = [] + visible = [True] * 4 + if len(df.Sample) > 10: + visible = [True] + ["legendonly"] * 3 + for i, metric in enumerate(header[1:]): + plot_data.append( + go.Bar( + x=df.Sample, + y=df[metric], + name=metric, + visible=visible[i], + ) + ) + layout = dict( + title="Counts Per Sample By Stage", + barmode="group", + height=500, + yaxis={"title": "Count"}, + showlegend=True, + hovermode="x", + bargroupgap=0.1, + legend={"orientation": "h", "x": 0, "y": 1.06}, + ) + fig = go.Figure(data=plot_data, layout=layout) + return df_str, make_div(fig) + + +def build_quality_plot(r1_quals): + # quality plot + raw_qual_stats = defaultdict(lambda: defaultdict(list)) + for r1_ee_file in r1_quals: + sample_name = r1_ee_file.partition("logs/")[-1].partition("_R1")[0] + r2_ee_file = "_R2".join(r1_ee_file.rsplit("_R1", 1)) + with open(r1_ee_file) as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + raw_qual_stats["R1"][sample_name].append(float(row["Mean_Q"])) + with open(r2_ee_file) as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + q = float(row["Mean_Q"]) + raw_qual_stats["R2"][sample_name].append(q) + data = [] + for read_index, sample_data in raw_qual_stats.items(): + color = BLUE if read_index == "R1" else RED + for sample_name, sample_stats in sample_data.items(): + data.append( + go.Scatter( + x=list(range(1, len(sample_stats))), + y=sample_stats, + name=sample_name, + text=sample_name, + hoverinfo="text+x+y", + legendgroup=read_index, + mode="lines", + line=dict(color=color, dash="solid"), + ) + ) + layout = go.Layout( + title="Mean Quality Scores for R1 and R2", + xaxis={"title": "Position"}, + yaxis={"title": "Quality (Phred score)"}, + hovermode="closest", + showlegend=False, + autosize=True, + annotations=[ + dict( + x=0, + y=1.1, + xref="paper", + yref="paper", + text="Forward", + showarrow=False, + font=dict(size=16, color="#ffffff"), + align="left", + borderpad=4, + bgcolor="#1f77b4", + ), + dict( + x=0.15, + y=1.1, + xref="paper", + yref="paper", + 
text="Reverse", + showarrow=False, + font=dict(size=16, color="#ffffff"), + align="left", + borderpad=4, + bgcolor="#d62728", + ), + ], + ) + fig = go.Figure(data=data, layout=layout) + return make_div(fig) + + +def build_taxonomy_plot(txt, value_cols, height=900): + df = pd.read_table(txt) + levels = ["kingdom", "phylum", "class", "order", "family", "genus", "species"] + df[levels] = df["taxonomy"].str.split(",", expand=True) + # remove "k__", etc., from taxonomies + for level in levels: + df[level] = df[level].str.replace(f"{level[0]}__", "") + hierarchy = ["phylum", "class", "order"] + df[hierarchy] = df[hierarchy].fillna("NA") + df[value_cols] = df[value_cols].fillna(0) + df = df[hierarchy + value_cols] + dfs = relatively.get_dfs_across_hierarchy(df, hierarchy, value_cols, reorder=None) + fig = relatively.get_abundance_figure_from_dfs( + dfs, hierarchy, "Assigned Taxonomy Per Sample", height=height + ) + return make_div(fig) + + +def build_ee_plot(eeplot): + # merged reads quality plot + ee_stats = defaultdict(list) + for stats_file in eeplot: + sample_name = stats_file.partition("logs/")[-1].partition("_merged")[0] + with open(stats_file) as fh: + reader = csv.DictReader(fh, delimiter="\t") + for row in reader: + ee_stats[sample_name].append(row["Mean_EE"]) + data = [] + for sample_name, sample_stats in ee_stats.items(): + data.append( + go.Scatter( + x=list(range(1, len(sample_stats))), + y=sample_stats, + name=sample_name, + text=sample_name, + hoverinfo="text+x+y", + mode="lines", + line=dict(color=BLUE, dash="solid"), + ) + ) + layout = go.Layout( + title="Mean Expected Error Rate Across Merged Reads", + yaxis={"title": "Expected Error"}, + xaxis={"title": "Base Position"}, + hovermode="closest", + showlegend=False, + ) + fig = go.Figure(data=data, layout=layout) + return make_div(fig) + + +def format_from_file(txt): + ret = "" + with open(txt) as fh: + for i, line in enumerate(fh): + if i == 0: + ret += line + else: + ret += " " * 4 + line + return ret + + +def get_metadata(author=None, version=None, sep=" // "): + md = [datetime.date.today().isoformat()] + if version: + md.insert(0, version) + if author: + md.insert(0, author) + return sep.join(md) + + +def build_attachment(zip): + data = build_uri(zip) + name = os.path.basename(zip) + return f"""{name}""" + + +def main( + biom, txt, zip, env, params, eeplot, r1quals, raw, omitted, html, author, version +): + # resolve the file patterns + eeplot = glob(eeplot, recursive=False) + r1quals = glob(r1quals, recursive=False) + raw = glob(raw, recursive=False) + + biom_df, otu_summary_table = parse_biom(biom) + div_idx = relatively.calculate_diversity( + biom_df, + biom_df.index, + biom_df.columns, + index=["shannon", "simpson", "invsimpson"], + ) + sample_summary_table, count_summary_plot = build_summary_table( + raw, biom_df, div_idx, omitted + ) + sample_order = div_idx.sort_values(by="shannon").index.tolist() + quality_plot = build_quality_plot(r1quals) + taxonomy_plot = build_taxonomy_plot(txt, sample_order, 800) + ee_plot = build_ee_plot(eeplot) + params_str = format_from_file(params) + conda_str = format_from_file(env) + metadata = get_metadata(author, version) + attachment = build_attachment(zip) + report_str = f""" + +.. raw:: html + + {STYLE} + {SCRIPT} + +============================================================= +hundo_ - Experiment Summary +============================================================= + +.. contents:: + :backlinks: none + :depth: 2 + +Summary +------- + +Sequence Counts +*************** + +.. 
raw:: html + +
+ {sample_summary_table} +
+ {count_summary_plot} + +Sequence Quality +**************** + +.. raw:: html + + {quality_plot} + {ee_plot} + +OTUs +**** + +.. raw:: html + +
+ {otu_summary_table} +
+ +Taxonomy +******** + +Samples are ordered from least diverse to most. + +.. raw:: html + + {taxonomy_plot} + +Methods +------- + +``hundo`` wraps a number of functions and its exact steps are +dependent upon a given user's configuration. + +Paired-end sequence reads are quality trimmed and can be trimmed of +adapters and filtered of contaminant sequences using BBDuk2 of the +BBTools_ package. Passing reads are merged using +VSEARCH_ then aggregated into a single FASTA file with headers +describing the origin and count of the sequence. + +Prior to creating clusters, reads are filtered again based on expected +error rates: + +To create clusters, the aggregated, merged reads are dereplicated to +remove singletons by default using VSEARCH. Sequences are preclustered +into centroids using VSEARCH to accelerate chimera filtering. Chimera +filtering is completed in two steps: *de novo* and then reference +based. The reference by default is the entire annotation database. +Following chimera filtering, sequences are placed into clusters using +distance-based, greedy cluster with VSEARCH based on the allowable +percent difference of the configuration. + +After OTU sequences have been determined, BLAST_ is used to align +sequences to the reference database. Reference databases were curated +by the CREST_ team and hundo incorporates the CREST LCA method. + +Counts are assigned to OTUs using the global alignment method of +VSEARCH, which outputs the final OTU table as a tab-delimited text +file. The Biom_ command line tool is used to convert the tab table +to biom. + +Multiple alignment of sequences is completed using MAFFT_. +A tree based on the aligned sequences is built using FastTree2_. + +This workflow is built using Snakemake_ and makes use of Bioconda_ +to install its dependencies. + +.. _BBTools: https://sourceforge.net/projects/bbmap/ +.. _VSEARCH: https://github.com/torognes/vsearch +.. _BLAST: https://blast.ncbi.nlm.nih.gov/Blast.cgi +.. _CREST: https://github.com/lanzen/CREST +.. _Biom: http://biom-format.org/ +.. _MAFFT: https://mafft.cbrc.jp/alignment/software/ +.. _FastTree2: http://www.microbesonline.org/fasttree/ +.. _Snakemake: https://snakemake.readthedocs.io/en/stable/ +.. _Bioconda: https://bioconda.github.io/ + +Configuration +------------- + +:: + + {params_str} + + +Execution Environment +--------------------- + +:: + + {conda_str} + + +Output Files +------------ + +Not all files written by the workflow are contained within the +Downloads section of this page to minimize the size of the this +document. Other output files are described and are written to the +results directory. + + +Attached +******** + +The zip archive contains the following files: + +OTU.biom +```````` + +Biom table with raw counts per sample and their associated +taxonomic assignment formatted to be compatible with downstream tools +like phyloseq_. + +OTU.fasta +````````` + +Representative DNA sequences of each OTU. + +OTU.tree +```````` + +Newick tree representation of aligned OTU sequences. + +OTU.txt +``````` + +Tab-delimited text table with columns OTU ID, a column for each sample, +and taxonomy assignment in the final column as a comma delimited list. + + +Other Result Files +****************** + +Other files that may be needed, but that are not included in the +attached archive include: + +OTU_aligned.fasta +````````````````` + +OTU sequences after alignment using MAFFT. + +all-sequences.fasta +``````````````````` + +Quality-controlled, dereplicated DNA sequences of all samples. 
The +header of each record identifies the sample of origin and the count +resulting from dereplication. + +blast-hits.txt +`````````````` + +The BLAST assignments per OTU sequence. + +Downloads +--------- + +.. _hundo: https://github.com/pnnl/hundo +.. _phyloseq: https://joey711.github.io/phyloseq/ + + +.. container:: + :name: attachments + + + .. container:: + :name: archive + + archive: + .. raw:: html + + {attachment} + + +.. container:: + :name: metadata + + {metadata} + +""" + with open(html, "w") as fh: + publish_file( + source=io.StringIO(report_str), + destination=fh, + writer_name="html", + settings_overrides={"stylesheet_path": ""}, + ) + + +if __name__ == "__main__": + p = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + p.add_argument("--biom") + p.add_argument("--txt") + p.add_argument("--zip") + p.add_argument("--env") + p.add_argument("--params") + p.add_argument("--eeplot") + p.add_argument("--r1quals") + p.add_argument("--raw") + p.add_argument("--omitted") + p.add_argument("--html") + p.add_argument("--author") + p.add_argument("--version") + args = p.parse_args() + main( + args.biom, + args.txt, + args.zip, + args.env, + args.params, + args.eeplot, + args.r1quals, + args.raw, + args.omitted, + args.html, + args.author, + args.version, + ) diff --git a/index.html b/index.html index 0ed4fa9..20df8c1 100644 --- a/index.html +++ b/index.html @@ -12,15 +12,16 @@
 hundo - Experiment Summary
 [HTML <script>/<link> and plot <div> markup was lost in extraction; this
 hunk updates the report's script and style includes and replaces the
 plot containers under the "Sequence Quality" and "OTUs" headings]
@@ -141,12 +170,16 @@
 OTUs
 [markup lost in extraction; this hunk drops the "Taxonomy by Count" and
 "Taxonomy by Percent" sections in favor of a single "Taxonomy" section
 introduced by the note "Samples are ordered from least diverse to most."]
 Methods
@@ -182,19 +215,18 @@
 Methods
 Configuration
+fastq_dir: /Users/brow015/devel/hundo/example/mothur_sop_data
 filter_adapters: /Users/brow015/devel/hundo/example/qc_references/adapters.fa.gz
 filter_contaminants: /Users/brow015/devel/hundo/example/qc_references/phix174.fa.gz
 allowable_kmer_mismatches: 1
 reference_kmer_match_length: 27
-allowable_kmer_mismatches: 1
-reference_kmer_match_length: 27
 reduced_kmer_min: 8
 minimum_passing_read_length: 100
 minimum_base_quality: 10
 minimum_merge_length: 150
-allow_merge_stagger: False
-max_diffs: 5
-min_overlap: 16
+fastq_allowmergestagger: False
+fastq_maxdiffs: 5
+fastq_minovlen: 16
 maximum_expected_error: 1.0
 reference_chimera_filter: True
 minimum_sequence_abundance: 2
@@ -214,21 +246,26 @@
 Execution Environment
 - conda-forge
 - defaults
 dependencies:
-  - python=3.6
+  - python>=3.6
   - bbmap=37.17
   - biom-format=2.1.6
   - biopython
   - blast=2.6.0
   - bzip2=1.0.6
+  - click=6.7
+  - docutils=0.14
   - mafft=7.313
   - fasttree=2.1.10
   - numpy>=1.14.0
   - pandas>=0.23.0
   - pigz=2.3.4
-  - plotly=2.7.0
   - pyyaml>=3.12
   - vsearch=2.6.0
   - zip=3.0
+  - pip
+  - pip:
+    - relatively
+    - plotly>=3.0
@@ -287,13 +324,13 @@
 Downloads
-Author: brow015 | 2018-06-18
+brow015 // hundo, version 1.1.19 // 2018-07-17
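The metadata footer replaced above is produced by `get_metadata` in `hundo/scripts/build_report.py`. A self-contained sketch of that format, with the function body copied from the patch (the date in the output tracks the day the report is built):

```python
import datetime

def get_metadata(author=None, version=None, sep=" // "):
    # As in hundo/scripts/build_report.py: author and version are
    # prepended ahead of today's ISO date, then joined with " // ".
    md = [datetime.date.today().isoformat()]
    if version:
        md.insert(0, version)
    if author:
        md.insert(0, author)
    return sep.join(md)

print(get_metadata(author="brow015", version="hundo, version 1.1.19"))
# -> "brow015 // hundo, version 1.1.19 // 2018-07-17" when run on 2018-07-17
```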