From 226057c5cf5157c2d82b4c501d333b40c123d6cb Mon Sep 17 00:00:00 2001 From: alex Date: Thu, 14 Oct 2021 14:51:50 -0500 Subject: [PATCH 01/47] added treecluster to yml --- vampirus_env.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/vampirus_env.yml b/vampirus_env.yml index cb10503..868a0ef 100644 --- a/vampirus_env.yml +++ b/vampirus_env.yml @@ -38,3 +38,4 @@ dependencies: - r-biocmanager=1.30.10 - pip: - oligotyping + - treecluster From 1eaffcf3a5af4dfd558d26bee61b2b7dee2c797e Mon Sep 17 00:00:00 2001 From: aveglia Date: Thu, 14 Oct 2021 14:57:52 -0500 Subject: [PATCH 02/47] altered the config file --- vampirus.config | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/vampirus.config b/vampirus.config index aaeac84..dd7ac59 100644 --- a/vampirus.config +++ b/vampirus.config @@ -58,12 +58,24 @@ params { // Minimum length of overlap for sequence merging to occur for a pair minoverlap="10" - // ASV generation and clustering parameters + // ASV generation parameters // Alpha value for denoising - the higher the alpha the higher the chance of false positives in ASV generation (1 or 2) alpha="1" // Minimum size or representation in dataset for sequence to be considered in ASV generation (ex. If set to 4, any unique sequence that is not seen in the data more 3 times is removed) minSize="8" + + // ASV filtering parameters - You can set the filtering to run with the command --filter + + // Path to database containing sequences that if ASVs match, are then removed prior to any analyses + filtDB="" + // Path to database containing sequences that if ASVs match to, are kept for final ASV file to be used in susequent analyses + keepDB="" + // Keep any sequences without hits - for yes, set keepnohit to ="true" + keepnohit="true" + + // ASV clustering parameters + // Percent similarity to cluster nucleotide ASV sequences (used when --ncASV is set) clusterNuclID="85" // List of percent similarities to cluster nucleotide ASV sequences - must be separated by a comma (ex. "95,96") @@ -75,14 +87,11 @@ params { // Minimum length of amino acid translation to be considered during protein clustered ASV (pcASV) generation. Recommended to put this at the expected amino acid sequence length based on your maximum read length (e.g. 
if maxLen="420", then minAA should be 420/3 so 140) minAA="140" - // ASV filtering parameters - You can set the filtering to run with the command --filter + // Tree-based ASV/AminoType clustering parameters + + // + - // Path to database containing sequences that if ASVs match, are then removed prior to any analyses - filtDB="" - // Path to database containing sequences that if ASVs match to, are kept for final ASV file to be used in susequent analyses - keepDB="" - // Keep any sequences without hits - for yes, set keepnohit to ="true" - keepnohit="true" //Parameters for diamond command for filtering From 4bbff63e3206d1c037ba77d985bf0e0c810c136f Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Fri, 15 Oct 2021 16:26:43 -0500 Subject: [PATCH 03/47] added more quality trimming parameters to config --- vAMPirus.nf | 9 +++++---- vampirus.config | 13 +++++++++++-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/vAMPirus.nf b/vAMPirus.nf index 78d02bb..7b66f3e 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -678,7 +678,7 @@ if (params.DataCheck || params.Analyze) { echo ${sample_id} fastp -i ${reads[0]} -I ${reads[1]} -o left-${sample_id}.filter.fq -O right-${sample_id}.filter.fq --detect_adapter_for_pe \ - --average_qual 25 -c --overrepresentation_analysis --html ${sample_id}.fastp.html --json ${sample_id}.fastp.json --thread ${task.cpus} \ + --average_qual ${params.avQ} -n ${params.nM} -c --overrepresentation_analysis --html ${sample_id}.fastp.html --json ${sample_id}.fastp.json --thread ${task.cpus} \ --report_title ${sample_id} bash get_readstats.sh ${sample_id}.fastp.json @@ -721,7 +721,7 @@ if (params.DataCheck || params.Analyze) { RTRIM=\$( echo ${GlobTrim} | cut -f 2 -d "," ) bbduk.sh in=${reads[0]} out=${sample_id}_bb_R1.fastq.gz ftl=\${FTRIM} t=${task.cpus} bbduk.sh in=${reads[1]} out=${sample_id}_bb_R2.fastq.gz ftl=\${RTRIM} t=${task.cpus} - repair.sh in1=${sample_id}_bb_R1.fastq.gz in2=${sample_id}_bb_R2.fastq.gz out1=${sample_id}_bbduk_R1.fastq.gz out2=${sample_id}_bbduk_R2.fastq.gz outs=sing.fq repair + repair.sh in1=${sample_id}_bb_R1.fastq.gz in2=${sample_id}_bb_R2.fastq.gz out1=${sample_id}_bbduk_R1.fastq.gz out2=${sample_id}_bbduk_R2.fastq.gz outs=sing.fq repair """ } else if ( params.multi && params.primers ) { """ @@ -881,11 +881,12 @@ if (params.DataCheck || params.Analyze) { cat \$x | tr "\t" "," > \${pre}.csv rm \$x done - reformat.sh in=${reads} out=${params.projtag}_preFilt_preclean.fasta t=${task.cpus} + bbduk.sh in=${reads} out=${params.projtag}_qtrimmed.fastq t=${task.cpu} qtrim=rl trimq=${params.trimq} + reformat.sh in=${params.projtag}_qtrimmed.fastq out=${params.projtag}_preFilt_preclean.fasta t=${task.cpus} echo "sample,reads" >> reads_per_sample_preFilt_preClean.csv grep ">" ${params.projtag}_preFilt_preclean.fasta | awk -F ">" '{print \$2}' | awk -F "." 
'{print \$1}' | sort --parallel=${task.cpus} | uniq -c | sort -brg --parallel=${task.cpus} | awk '{print \$2","\$1}' >> reads_per_sample_preFilt_preClean.csv rm ${params.projtag}_preFilt_preclean.fasta - fastp -i ${reads} -o ${params.projtag}_merged_preFilt_clean.fastq -b ${params.maxLen} -l ${params.minLen} --thread ${task.cpus} -n 1 + fastp -i ${reads} -o ${params.projtag}_merged_preFilt_clean.fastq -b ${params.maxLen} -l ${params.minLen} --thread ${task.cpus} -n ${params.maxn} reformat.sh in=${params.projtag}_merged_preFilt_clean.fastq out=${params.projtag}_merged_preFilt_clean.fasta t=${task.cpus} bbduk.sh in=${params.projtag}_merged_preFilt_clean.fastq out=${params.projtag}_merged_clean_Lengthfiltered.fastq minlength=${params.maxLen} maxlength=${params.maxLen} t=${task.cpus} bbduk.sh in=${params.projtag}_merged_clean_Lengthfiltered.fastq bhist=${params.projtag}_all_merged_postFilt_baseFrequency_hist.txt qhist=${params.projtag}_all_merged_postFilt_qualityScore_hist.txt gchist=${params.projtag}_all_merged_postFilt_gcContent_hist.txt aqhist=${params.projtag}_all_merged_postFilt_averageQuaulity_hist.txt lhist=${params.projtag}_all_merged_postFilt_length_hist.txt gcbins=auto diff --git a/vampirus.config b/vampirus.config index dd7ac59..e1341a3 100644 --- a/vampirus.config +++ b/vampirus.config @@ -23,6 +23,15 @@ params { // Name of directory created to store output of vAMPirus analyses (Nextflow will create this directory in the working directory) outdir="results" + // Quality filter/trimming options + + // Average read quality - forward or reverse reads will be discarded if average base quality across the read is below the number set below (25 is a good start) + avQ="25" + // Maximum number of "N"s acceptable in the forward or reverse reads (default for fastp is 5) + mN="5" + // Minmum base quality to be trimmed + trimq="15" + // Primer Removal parameters // If not specifying primer sequences, forward and reverse reads will be trimmed by number of bases specified using "--GlobTrim #basesfromforward,#basesfromreverse" @@ -53,8 +62,8 @@ params { maxEE="3" // Maximum number of non-matching nucleotides allowed in overlap region diffs="10" - // Maximum number of "N"'s in a sequence - if above the specified value, sequence will be discarded - maxn="20" + // Maximum number of "N"'s in a sequence - if above the specified value, sequence will be discarded (should be similar to what is set for "mN" above in fastp parameters) + maxn="10" // Minimum length of overlap for sequence merging to occur for a pair minoverlap="10" From 28725c191bf0e9109e5d6936e71fd3fa759e772a Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Sun, 17 Oct 2021 11:28:26 -0500 Subject: [PATCH 04/47] added TreeClutser options and processes --- README.md | 6 ++++- vAMPirus.nf | 71 +++++++++++++++++++++++++++++++++++++++++++++++-- vampirus.config | 45 +++++++++++++++++-------------- 3 files changed, 99 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 11ba23e..b27400a 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,9 @@ 6. (EXPERIMENTAL) Added Minimum Entropy Decomposition analysis using the oligotyping program produced by the Meren Lab. This allows for sequence clustering based on sequence positions of interest (biologically meaningful) or top positions with the highest Shannon's Entropy (read more here: https://merenlab.org/software/oligotyping/ ; and below). -7. Color nodes on phylogenetic trees based on Taxonomy or Minimum Entropy Decomposition results +7. 
Phylogeny-based clustering ASV or AminoType sequences with TreeCluster (https://github.com/niemasd/TreeCluster; https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0221068) + +8. Color nodes on phylogenetic trees based on Taxonomy or Minimum Entropy Decomposition results 8. PCoA plots added to Analyze report if NMDS does not converge. @@ -305,3 +307,5 @@ If you do use vAMPirus for your analyses, please cite the following -> 15. UNOISE algorithm - R.C. Edgar (2016). UNOISE2: improved error-correction for Illumina 16S and ITS amplicon sequencing, https://doi.org/10.1101/081257 16. Oligotyping - A. Murat Eren, Gary G. Borisy, Susan M. Huse, Jessica L. Mark Welch (2014). Oligotyping analysis of the human oral microbiome. Proceedings of the National Academy of Sciences Jul 2014, 111 (28) E2875-E2884; DOI: 10.1073/pnas.1409644111 + +17. Balaban M, Moshiri N, Mai U, Jia X, Mirarab S (2019). "TreeCluster: Clustering biological sequences using phylogenetic trees." PLoS ONE. 14(8):e0221068. doi:10.1371/journal.pone.0221068 diff --git a/vAMPirus.nf b/vAMPirus.nf index 7b66f3e..4331c0d 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -2308,7 +2308,7 @@ if (params.DataCheck || params.Analyze) { output: tuple file("*_aln.fasta"), file("*_aln.html"), file("*.tree"), file("*.log"), file("*iq*"), file("*mt*") into align_results_asv - file("*iq.treefile") into (nucl_phyl_plot_asv, asvphy_med) + file("*iq.treefile") into (nucl_phyl_plot_asv, asvphy_med, asv_treeclust) script: """ @@ -2339,6 +2339,39 @@ if (params.DataCheck || params.Analyze) { } } + if (params.asvTClust){ + + process ASV_PhyloClustering { + + label 'norm_cpus' + + publishDir "${params.workingdir}/${params.outdir}/Analyze/Analyses/ASVs/ + + input: + file(tree) from asv_treeclust + + output: + + + script: + """ + TreeCluster.py -i ${tree} ${params.asvTCopp} > ${params.projtag}_ASV_treeclustering.out + #create headless treeclustering.out + tail -n +2 ${params.projtag}_ASV_treeclustering.out | sed 's/-1/0/g' > headless.treeout + #summarizing clustering results + awk -F "\t" '{print $2}' | sort | uniq > group.list + g=1 + echo "SequenceID,PhyloGroup" > ${params.projtag}_phylogroup.csv + for x in \$(cat group.list) + do grep "$x" headless.out > tmp.tsv + groupID="phyloGroup\${g}" + awk -F "\t" '{print \$1",'\${groupID}'"}' tmp.tsv >> ${params.projtag}_phylogroup.csv + g=\$((\$g+1)) + done + rm tmp.csv + } + + } if (params.asvMED) { process ASV_Minimum_Entropy_Decomposition { @@ -2937,7 +2970,7 @@ if (params.DataCheck || params.Analyze) { output: tuple file("*_aln.fasta"), file("*_aln.html"), file("*.log"), file("*iq*"), file("*mt*") into alignprot_results - file("*iq.treefile") into (amino_rax_plot, amino_repphy) + file("*iq.treefile") into (amino_rax_plot, amino_repphy, amino_treeclust) script: """ @@ -2975,6 +3008,40 @@ if (params.DataCheck || params.Analyze) { } } + if (params.aminoTClust){ + + process AminoType_PhyloClustering { + + label 'norm_cpus' + + publishDir "${params.workingdir}/${params.outdir}/Analyze/Analyses/ASVs/ + + input: + file(tree) from amino_treeclust + + output: + + + script: + """ + TreeCluster.py -i ${tree} ${params.asvTCopp} > ${params.projtag}_AminoType_treeclustering.out + #create headless treeclustering.out + tail -n +2 ${params.projtag}_AminoType_treeclustering.out | sed 's/-1/0/g' > headless.treeout + #summarizing clustering results + awk -F "\t" '{print $2}' | sort | uniq > group.list + g=1 + echo "SequenceID,PhyloGroup" > ${params.projtag}_phylogroup.csv + for x in \$(cat group.list) + do grep "$x" 
headless.out > tmp.tsv + groupID="phyloGroup\${g}" + awk -F "\t" '{print \$1",'\${groupID}'"}' tmp.tsv >> ${params.projtag}_phylogroup.csv + g=\$((\$g+1)) + done + rm tmp.csv + } + + } + process Generate_AminoTypes_Counts_Table { label 'high_cpus' diff --git a/vampirus.config b/vampirus.config index e1341a3..cb03ae3 100644 --- a/vampirus.config +++ b/vampirus.config @@ -51,7 +51,6 @@ params { // Minimum non-merged read length after adapter and primer removal (default = 200) minilen="100" - // Merged read length filtering parameters // Minimum merged read length - reads with lengths greater than minLen and below the specified maximum read length will be used for counts only @@ -83,6 +82,17 @@ params { // Keep any sequences without hits - for yes, set keepnohit to ="true" keepnohit="true" + //Parameters for diamond command for ASV filtering + + // Set minimum percent amino acid similarity for best hit to be counted in taxonomy assignment + filtminID="80" + // Set minimum amino acid alignment length for best hit to be counted in taxonomy assignment + filtminaln="30" + // Set sensitivity parameters for DIAMOND aligner (read more here: https://github.com/bbuchfink/diamond/wiki; default = ultra-sensitive) + filtsensitivity="ultra-sensitive" + // Set the max e-value for best hit to be recorded + filtevalue="0.001" + // ASV clustering parameters // Percent similarity to cluster nucleotide ASV sequences (used when --ncASV is set) @@ -96,24 +106,6 @@ params { // Minimum length of amino acid translation to be considered during protein clustered ASV (pcASV) generation. Recommended to put this at the expected amino acid sequence length based on your maximum read length (e.g. if maxLen="420", then minAA should be 420/3 so 140) minAA="140" - // Tree-based ASV/AminoType clustering parameters - - // - - - - //Parameters for diamond command for filtering - - // Set minimum percent amino acid similarity for best hit to be counted in taxonomy assignment - filtminID="80" - // Set minimum amino acid alignment length for best hit to be counted in taxonomy assignment - filtminaln="30" - // Set sensitivity parameters for DIAMOND aligner (read more here: https://github.com/bbuchfink/diamond/wiki; default = ultra-sensitive) - filtsensitivity="ultra-sensitive" - // Set the max e-value for best hit to be recorded - filtevalue="0.001" - - // Minimum Entropy Decomposition (MED) parameters for clustering (https://merenlab.org/2012/05/11/oligotyping-pipeline-explained/) // If you plan to do MED on ASVs using the option "--asvMED" you can set here the number of entopy peak positions or a comma seperated list of biologically meaningful positons (e.g. 35,122,21) for oligotyping to take into consideration. If you want to use a single specific position, make "asvSingle="true"". 
@@ -123,6 +115,17 @@ params { aminoC="" aminoSingle="false" + // Phylogeny-based ASV/AminoType clustering parameters using the program TreeCluster (https://github.com/niemasd/TreeCluster) + + // Add the "--asvTClust" option to the launch command to run phylogeny-based clustering of ASVs ; Add "--aminoTClust" to launch command for phylogeny-based clustering on AminoTypes + // NOTE: you can't use "--skipPhylogeny" when doing this form of sequence clustering + + // TreeCluster command options for ASV clustering (--asvTClust) -- (Example: "-option1 A -option2 B -option3 C -option4 D") - See TreeCluster paper and github page to determine the best options (a good start is what is below) + asvTCopp="-t 0.045 -max_clade" + + // TreeCluster command options for AminoType clustering (--aminoTClust) -- (Example: "-option1 A -option2 B -option3 C -option4 D") - See TreeCluster paper and github page to determine the best options + aminoTCopp="-t 0.045 -max_clade" + // Counts table generation parameters // Percent similarity to use for ASV/cASV counts table generation with vsearch @@ -169,7 +172,7 @@ params { // Phylogeny analysis parameters - // Color nodes on phylogenetic tree in Analyze report with MED Group information (nodeCol="MED") or taxonomy (nodeCol=TAX) hit. If you would like nodes colored by sequence ID, leave nodeCol="" below. + // Color nodes on phylogenetic tree in Analyze report with MED Group information (nodeCol="MED"), taxonomy (nodeCol=TAX) hit, or TreeCluster Group Information (nodeCol=TC). If you would like nodes colored by sequence ID, leave nodeCol="" below. nodeCol="" // Customs options for IQ-TREE (Example: "-option1 A -option2 B -option3 C -option4 D") @@ -224,6 +227,8 @@ params { aminoMED = false // Filter ASVs filter = false + // Phylogeny-based ASV clustering + asvTClust = false // Skip options // Skip all Read Processing steps skipReadProcessing = false From bb7e42d01376f3cd39ea2f5d33f870e984b341a6 Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Sun, 17 Oct 2021 19:45:16 -0500 Subject: [PATCH 05/47] Added treeclustering info to docs and added outputs for treeclustering processes --- README.md | 2 +- docs/HelpDocumentation.md | 46 ++++++++++++++++++++++++++++++++------- vAMPirus.nf | 13 ++++++----- 3 files changed, 47 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index b27400a..588be48 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ * [Running vAMPirus](#Running-vAMPirus) * [Who to cite](#Who-to-cite) -# New in vAMPirus version 2.0.0 +# New in vAMPirus version 2.0.1 1. Reduced redundancy of processes and the volume of generated result files per full run (Example - read processing only done once if running DataCheck then Analyze). diff --git a/docs/HelpDocumentation.md b/docs/HelpDocumentation.md index 20fb68b..1561b34 100644 --- a/docs/HelpDocumentation.md +++ b/docs/HelpDocumentation.md @@ -21,11 +21,11 @@ The vAMPirus program contains two different pipelines: If you have a feature request or any feedback/questions, feel free to email vAMPirusHelp@gmail.com or you can open an Issue on GitHub. -## Changes in version 2.0.0 +## Changes in version 2.0.1 -1. (EXPERIMENTAL) Added Minimum Entropy Decomposition analysis using the oligotyping program produced by the Meren Lab. This allows for sequence clustering based on sequence positions of interest (biologically meaningful) or top positions with the highest Shannon's Entropy (read more here: https://merenlab.org/software/oligotyping/ ; and below). +1. 
Reduced redundancy of processes and the volume of generated result files per full run (Example - read processing only done once if running DataCheck then Analyze). -2. Added more useful taxonomic classification of sequences leveraging the RVDB annotation database and/or NCBI taxonomy files (read more below). +2. Added further taxonomic classification of sequences using the RVDB annotation database and/or NCBI taxonomy files (see manual for more info). 3. Replaced the used of MAFFT with muscle v5 (Edgar 2021) for more accurate virus gene alignments (see https://www.biorxiv.org/content/10.1101/2021.06.20.449169v1.full). @@ -33,12 +33,13 @@ If you have a feature request or any feedback/questions, feel free to email vAMP 5. ASV filtering - you can now provide a "filter" and "keep" database to remove certain sequences from the analysis -6. Reduced redundancy of processes and the volume of generated result files per full run (Example - read processing only done once if running DataCheck then Analyze). +6. (EXPERIMENTAL) Added Minimum Entropy Decomposition analysis using the oligotyping program produced by the Meren Lab. This allows for sequence clustering based on sequence positions of interest (biologically meaningful) or top positions with the highest Shannon's Entropy (read more here: https://merenlab.org/software/oligotyping/ ; and below). -7. Color nodes on phylogenetic trees based on Taxonomy or Minimum Entropy Decomposition results +7. Phylogeny-based clustering ASV or AminoType sequences with TreeCluster (https://github.com/niemasd/TreeCluster; https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0221068) -8. PCoA plots added to report if NMDS does not converge. +8. Color nodes on phylogenetic trees based on Taxonomy or Minimum Entropy Decomposition results +8. PCoA plots added to Analyze report if NMDS does not converge. ## Who to cite @@ -76,6 +77,8 @@ If you do use vAMPirus for your analyses, please cite the following -> 16. Oligotyping - A. Murat Eren, Gary G. Borisy, Susan M. Huse, Jessica L. Mark Welch (2014). Oligotyping analysis of the human oral microbiome. Proceedings of the National Academy of Sciences Jul 2014, 111 (28) E2875-E2884; DOI: 10.1073/pnas.1409644111 +17. Balaban M, Moshiri N, Mai U, Jia X, Mirarab S (2019). "TreeCluster: Clustering biological sequences using phylogenetic trees." PLoS ONE. 14(8):e0221068. doi:10.1371/journal.pone.0221068 + # Getting started with vAMPirus @@ -1079,7 +1082,7 @@ Here are the options stored within the configuration file: ### Clustering options -Dependining on the virus type/marker gene, ASV-level results can be noisy and to combat this vAMPirus has three different approaches to clustering ASV sequences: +Depending on the virus type/marker gene, ASV-level results can be noisy and to combat this vAMPirus has three different approaches to clustering ASV sequences: 1. AminoTyping - @@ -1186,9 +1189,36 @@ MED related options within the configuration file: aminoC="" aminoSingle="" +## Phylogeny-based clustering of ASV or AminoType sequences with TreeCluster (https://github.com/niemasd/TreeCluster; https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0221068) + +New in version 2.0.1, you can now use the program TreeCluster to cluster ASV or AminoType sequences based on phylogenetic relationship. 
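As a rough illustration of what happens under the hood, here is a minimal stand-alone sketch of a TreeCluster run and of how its output table can be collapsed into a `SequenceID,PhyloGroup` CSV like the one the pipeline writes. The file names (`asv_iq.treefile`, `phylogroup.csv`) are placeholders rather than actual pipeline outputs, and the options are simply the starting values suggested in the configuration file, so adjust both to your own data:

```bash
# Cluster the leaves of an IQ-TREE tree with a 0.045 distance threshold,
# the "max_clade" method, and a 0.5 branch-support cutoff.
TreeCluster.py -i asv_iq.treefile -t 0.045 -m max_clade -s 0.5 > treeclustering.out

# TreeCluster prints "SequenceName<TAB>ClusterNumber" with a header row;
# leaves assigned -1 are singletons. Reformat that into the SequenceID,PhyloGroup
# CSV layout used by the reports (singletons lumped into phyloGroup0 here).
echo "SequenceID,PhyloGroup" > phylogroup.csv
awk -F "\t" -v OFS="," 'NR > 1 {grp = ($2 == -1 ? 0 : $2); print $1, "phyloGroup" grp}' \
    treeclustering.out >> phylogroup.csv
```
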
+ +From the TreeCluster GitHub (https://github.com/niemasd/TreeCluster): + +"TreeCluster is a tool that, given a tree T (Newick format) and a distance threshold t, finds the minimum number of clusters of the leaves of T such that some user-specified constraint is met in each cluster. The user can also specify a branch support threshold s such that no pair of leaves in any cluster can be connected by branches with support less than or equal to s. Note that all leaves given a cluster of -1 are singletons, meaning they did not cluster with any other leaves (i.e., each leaf with a cluster of -1 is in its own cluster). + +TreeCluster was motivated by Cluster Picker. + +The default method is "Max Clade" (see Clustering Methods). There is no explicit default distance threshold, but because Cluster Picker recommends a distance threshold of 0.045 and because the same objective function is optimized by both Cluster Picker and TreeCluster "Max Clade", we currently recommend 0.045 as well." + +If you plan to use this form of clustering, be sure to read through the TreeCluster manuscript and help documentation linked above to use the most appropriate command. + +To signal the use of phylogeny-based clustering, add "--asvTClust" to cluster ASVs and "--aminoTClust" to cluster aminotypes. You will then need to check and edit the TreeClust command you would like to use in the configuration file: + + // Phylogeny-based ASV/AminoType clustering parameters using the program TreeCluster (https://github.com/niemasd/TreeCluster) + + // Add the "--asvTClust" option to the launch command to run phylogeny-based clustering of ASVs ; Add "--aminoTClust" to launch command for phylogeny-based clustering on AminoTypes + // NOTE: you can't use "--skipPhylogeny" when doing this form of sequence clustering + + // TreeCluster command options for ASV clustering (--asvTClust) -- (Example: asvTCopp="-option1 A -option2 B -option3 C -option4 D") - See TreeCluster paper and github page to determine the best options (a good start is what is below) + asvTCopp="-t 0.045 -m max_clade -s 0.5" + + // TreeCluster command options for AminoType clustering (--aminoTClust) -- (Example: aminoTCopp"-option1 A -option2 B -option3 C -option4 D") - See TreeCluster paper and github page to determine the best options + aminoTCopp="-t 0.045 -m max_clade -s 0.5" + ## Counts tables and percent ID matrices -vAMPirus generates nucleotide-based counts tables using vsearch and protein-based counts tables using DIAMOND and a custom bash script. Counts tables and percent ID matrices are always produced for each ASV, AminoType and all cASV fasta files produced. +vAMPirus generates nucleotide-based counts tables using vsearch and protein-based counts tables using DIAMOND and a custom bash script. Counts tables, pairwise distance, and percent ID matrices are always produced for each ASV, AminoType and all cASV fasta files produced. 
Here are the parameters you can edit at lines 61-70: diff --git a/vAMPirus.nf b/vAMPirus.nf index 4331c0d..db939ce 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -2293,7 +2293,7 @@ if (params.DataCheck || params.Analyze) { """ } - if (!params.skipPhylogeny) { // need to edit paths + if (!params.skipPhylogeny) { process ASV_Phylogeny { @@ -2345,17 +2345,19 @@ if (params.DataCheck || params.Analyze) { label 'norm_cpus' - publishDir "${params.workingdir}/${params.outdir}/Analyze/Analyses/ASVs/ + publishDir "${params.workingdir}/${params.outdir}/Analyze/Analyses/ASVs/TreeClustering", mode: "copy", overwrite: true input: file(tree) from asv_treeclust output: - + file("*treeclustering*.out") into asvtreeclustering_res + file("*phylogroup.csv") into asv_phylogroupcsv script: """ TreeCluster.py -i ${tree} ${params.asvTCopp} > ${params.projtag}_ASV_treeclustering.out + reeCluster.py -i ${tree} ${params.asvTCopp} > ${params.projtag}_ASV_treeclustering_verbose.out #create headless treeclustering.out tail -n +2 ${params.projtag}_ASV_treeclustering.out | sed 's/-1/0/g' > headless.treeout #summarizing clustering results @@ -3014,13 +3016,14 @@ if (params.DataCheck || params.Analyze) { label 'norm_cpus' - publishDir "${params.workingdir}/${params.outdir}/Analyze/Analyses/ASVs/ + publishDir "${params.workingdir}/${params.outdir}/Analyze/Analyses/AminoTypes/TreeCluster, mode: "copy", overwrite: true input: file(tree) from amino_treeclust output: - + file("*treeclustering*.out") into aminotreeclustering_res + file("*phylogroup.csv") into amino_phylogroupcsv script: """ From 5ed4105c31f053b4e80a738481cc0f9682324127 Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 08:25:28 -0500 Subject: [PATCH 06/47] added pairwise distance matrices to datacheck report --- vAMPirus.nf | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vAMPirus.nf b/vAMPirus.nf index db939ce..bdf3aee 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -1059,7 +1059,7 @@ if (params.DataCheck || params.Analyze) { output: file("number_per_percentage_nucl.csv") into number_per_percent_nucl_plot - + file("${params.projtag}_ASV_PairwiseDistance.matrix") into asvpdm script: if (params.datacheckntIDlist) { """ @@ -1073,6 +1073,9 @@ if (params.DataCheck || params.Analyze) { done yo=\$(grep -c ">" ${fasta}) echo "1.0,\${yo}" >> number_per_percentage_nucl.csv + clustalo -i ${fasta} --distmat-out=${params.projtag}_distance.matrix --full --force --threads=${task.cpus} + cat ${params.projtag}_distance.matrix | tr " " "," | grep "," >${params.projtag}_ASV_PairwiseDistance.matrix + rm ${params.projtag}_distance.matrix """ } } @@ -1111,6 +1114,8 @@ if (params.DataCheck || params.Analyze) { output: file("number_per_percentage_prot.csv") into number_per_percent_prot_plot file("*aminoacid_pcASV1.0_noTaxonomy.fasta") into amino_med + file("${params.projtag}_AminoType_PairwiseDistance.matrix") into aminopdm + script: // add awk script to count seqs """ @@ -1201,6 +1206,9 @@ if (params.DataCheck || params.Analyze) { tail -\$(( \${yesirr}-1 )) number_per_percentage_protz.csv > number_per_percentage_prot.csv head -1 number_per_percentage_protz.csv >> number_per_percentage_prot.csv rm number_per_percentage_protz.csv + clustalo -i aminoacid_pcASV1.0_noTaxonomy.fasta --distmat-out=${params.projtag}_distance.matrix --full --force --threads=${task.cpus} + cat ${params.projtag}_distance.matrix | tr " " "," | grep "," >${params.projtag}_AminoType_PairwiseDistance.matrix + rm ${params.projtag}_distance.matrix """ } @@ -1473,7 +1481,7 @@ 
if (params.DataCheck || params.Analyze) { } report_dc_in = Channel.create() - fastp_csv_dc.mix( reads_per_sample_preFilt, reads_per_sample_postFilt, prefilt_basefreq, postFilt_basefreq, prefilt_qualityscore, postFilt_qualityscore, prefilt_gccontent, postFilt_gccontent, prefilt_averagequality, postFilt_averagequality, prefilt_length, postFilt_length, number_per_percent_nucl_plot, number_per_percent_prot_plot, amino_entro_csv, amino_entropy, asv_entro_csv, asv_entropy).into(report_dc_in) + fastp_csv_dc.mix( reads_per_sample_preFilt, reads_per_sample_postFilt, prefilt_basefreq, postFilt_basefreq, prefilt_qualityscore, postFilt_qualityscore, prefilt_gccontent, postFilt_gccontent, prefilt_averagequality, postFilt_averagequality, prefilt_length, postFilt_length, number_per_percent_nucl_plot, number_per_percent_prot_plot, amino_entro_csv, amino_entropy, asv_entro_csv, asv_entropy, asvpdm, aminopdm).into(report_dc_in) process Report_DataCheck { From 3651666092562d17c5e03aa76b6895669b3d57ed Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 08:59:10 -0500 Subject: [PATCH 07/47] added pariwise distance heatmaps to DC report --- bin/vAMPirus_DC_Report.Rmd | 98 +++++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 18 deletions(-) diff --git a/bin/vAMPirus_DC_Report.Rmd b/bin/vAMPirus_DC_Report.Rmd index c986bb5..b4eea2c 100644 --- a/bin/vAMPirus_DC_Report.Rmd +++ b/bin/vAMPirus_DC_Report.Rmd @@ -39,11 +39,9 @@ library(scales) library(cowplot) library(dplyr) library(plotly) -#library(BiocParallel) library(knitr) -library(kableExtra) #install.packages("kableExtra") +library(kableExtra) library(rmarkdown) -#register(MulticoreParam(4)) ``` ```{r colors, include=FALSE} @@ -360,6 +358,12 @@ postlhp <- postlhp %>% layout(yaxis = list(title = "Count")) postlhp <- postlhp %>% layout(xaxis = list(title = "Read Length")) postlhp <- postlhp %>% config(toImageButtonOptions=list(format='svg',filename='ReadsperLen_postfilt', height= 500, width= 800, scale= 1)) postlhp +``` +
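The pairwise distance heatmaps added below read `asv_matrix.txt` and `amino_matrix.txt`, which are the comma-separated Clustal Omega distance matrices produced by the Nextflow processes above. A minimal sketch of that upstream conversion, with placeholder file names standing in for the project-tagged outputs:

```bash
# Full pairwise distance matrix from Clustal Omega; the first line of the output
# holds only the sequence count.
clustalo -i asvs.fasta --distmat-out=distance.matrix --full --force --threads=4

# Turn spaces into commas; keeping only lines that now contain commas drops the
# count line, leaving a plain CSV that read.csv() can load in this report.
tr " " "," < distance.matrix | grep "," > asv_matrix.txt
```
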
+```{bash load_datasets_bash, include=FALSE} +mv *AminoType_PairwiseDistance.matrix ./amino_matrix.txt +mv *_ASV_PairwiseDistance.matrix ./asv_matrix.txt + ```
@@ -383,24 +387,35 @@ nnp NOTE: The "1" on the x-axis represents number of ASVs identified by vsearch
-### Number of pcASVs per clustering percentage - -
- -```{r prot_number, echo=FALSE} -pn=read.csv("number_per_percentage_prot.csv", header=TRUE) -#pn=read.csv("number_per_percentage_prot.csv", header=T) -pnp <- plot_ly(pn, x=pn[,1], y=pn[,2], type="scatter", mode = 'lines+markers', marker=list(color='#088da5', line=list(color = 'black', - width = .1)), hovertemplate = paste('ID%: %{x}','
Number of pcASVs: %{y}','')) -pnp <- pnp %>% layout(yaxis = list(title = "Number of pcASVs")) -pnp <- pnp %>% layout(xaxis = list(title = "Clustering ID %")) -pnp <- pnp %>% config(toImageButtonOptions=list(format='svg',filename='Protclustresults', height= 500, width= 800, scale= 1)) -pnp +

### ASV Pairwise Distance Heatmap

+ +
+
+```{r asvheatmap, echo=FALSE} +simmatrix<- read.csv("asv_matrix.txt", header=FALSE) +rownames(simmatrix) <- simmatrix[,1] +simmatrix <- simmatrix[,-1] +colnames(simmatrix) <-rownames(simmatrix) +cols <- dim(simmatrix)[2] +simmatrix$AA <- rownames(simmatrix) +rval=nrow(simmatrix) +simmatrix2 <- simmatrix %>% + gather(1:rval, key=sequence, value=similarity) +x=reorder(simmatrix2$AA,simmatrix2$similarity) +y=reorder(simmatrix2$sequence,simmatrix2$similarity) +similaritymatrix <- ggplot(simmatrix2, aes(x=x, y=y,fill=similarity))+ + geom_raster()+ + scale_fill_distiller(palette="Spectral")+ + theme(axis.text.x = element_text(angle = 90))+ + theme(axis.title.x=element_blank())+ + theme(axis.title.y=element_blank()) + +heat <- ggplotly(similaritymatrix) +heat <- heat %>% config(toImageButtonOptions=list(format='svg',filename='heatmap', height= 500, width= 800, scale= 1)) +heat ``` -NOTE: The "1" represents the number of AminoTypes which are unique amino acid sequences in your dataset
- ### ASV Shannon Entropy Analysis (https://merenlab.org/2012/05/11/oligotyping-pipeline-explained/)
@@ -435,6 +450,53 @@ paged_table(med_asv_csv, options = list(rows.print = 10)) ```
+### Number of pcASVs per clustering percentage + +
+ +```{r prot_number, echo=FALSE} +pn=read.csv("number_per_percentage_prot.csv", header=TRUE) +#pn=read.csv("number_per_percentage_prot.csv", header=T) +pnp <- plot_ly(pn, x=pn[,1], y=pn[,2], type="scatter", mode = 'lines+markers', marker=list(color='#088da5', line=list(color = 'black', + width = .1)), hovertemplate = paste('ID%: %{x}','
Number of pcASVs: %{y}','')) +pnp <- pnp %>% layout(yaxis = list(title = "Number of pcASVs")) +pnp <- pnp %>% layout(xaxis = list(title = "Clustering ID %")) +pnp <- pnp %>% config(toImageButtonOptions=list(format='svg',filename='Protclustresults', height= 500, width= 800, scale= 1)) +pnp +``` +NOTE: The "1" represents the number of AminoTypes which are unique amino acid sequences (AminoTypes) in your dataset + +
+

### AminoType Pairwise Distance Heatmap

+ +
+
+```{r aminoheatmap, echo=FALSE} +simmatrix<- read.csv("amino_matrix.txt", header=FALSE) +rownames(simmatrix) <- simmatrix[,1] +simmatrix <- simmatrix[,-1] +colnames(simmatrix) <-rownames(simmatrix) +cols <- dim(simmatrix)[2] +simmatrix$AA <- rownames(simmatrix) +rval=nrow(simmatrix) +simmatrix2 <- simmatrix %>% + gather(1:rval, key=sequence, value=similarity) +x=reorder(simmatrix2$AA,simmatrix2$similarity) +y=reorder(simmatrix2$sequence,simmatrix2$similarity) +similaritymatrix <- ggplot(simmatrix2, aes(x=x, y=y,fill=similarity))+ + geom_raster()+ + scale_fill_distiller(palette="Spectral")+ + theme(axis.text.x = element_text(angle = 90))+ + theme(axis.title.x=element_blank())+ + theme(axis.title.y=element_blank()) + +heat <- ggplotly(similaritymatrix) +heat <- heat %>% config(toImageButtonOptions=list(format='svg',filename='heatmap', height= 500, width= 800, scale= 1)) +heat +``` +
+
+ ### AminoTypes Shannon Entropy Analysis (https://merenlab.org/2012/05/11/oligotyping-pipeline-explained/)
From 0965b8714f1d1a62e1b34f120923ee920c7f312f Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 09:04:05 -0500 Subject: [PATCH 08/47] phylogrouping added to nf --- vAMPirus.nf | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/vAMPirus.nf b/vAMPirus.nf index bdf3aee..536aff3 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -2365,7 +2365,7 @@ if (params.DataCheck || params.Analyze) { script: """ TreeCluster.py -i ${tree} ${params.asvTCopp} > ${params.projtag}_ASV_treeclustering.out - reeCluster.py -i ${tree} ${params.asvTCopp} > ${params.projtag}_ASV_treeclustering_verbose.out + TreeCluster.py -i ${tree} ${params.asvTCopp} > ${params.projtag}_ASV_treeclustering_verbose.out #create headless treeclustering.out tail -n +2 ${params.projtag}_ASV_treeclustering.out | sed 's/-1/0/g' > headless.treeout #summarizing clustering results @@ -2379,9 +2379,11 @@ if (params.DataCheck || params.Analyze) { g=\$((\$g+1)) done rm tmp.csv + """ } } + if (params.asvMED) { process ASV_Minimum_Entropy_Decomposition { @@ -3018,13 +3020,13 @@ if (params.DataCheck || params.Analyze) { } } - if (params.aminoTClust){ + if (params.aminoTClust) { process AminoType_PhyloClustering { label 'norm_cpus' - publishDir "${params.workingdir}/${params.outdir}/Analyze/Analyses/AminoTypes/TreeCluster, mode: "copy", overwrite: true + publishDir "${params.workingdir}/${params.outdir}/Analyze/Analyses/AminoTypes/TreeCluster", mode: "copy", overwrite: true input: file(tree) from amino_treeclust @@ -3036,6 +3038,7 @@ if (params.DataCheck || params.Analyze) { script: """ TreeCluster.py -i ${tree} ${params.asvTCopp} > ${params.projtag}_AminoType_treeclustering.out + TreeCluster.py -i ${tree} ${params.asvTCopp} > ${params.projtag}_ASV_treeclustering_verbose.out #create headless treeclustering.out tail -n +2 ${params.projtag}_AminoType_treeclustering.out | sed 's/-1/0/g' > headless.treeout #summarizing clustering results @@ -3049,10 +3052,11 @@ if (params.DataCheck || params.Analyze) { g=\$((\$g+1)) done rm tmp.csv - } - + """ + } } + process Generate_AminoTypes_Counts_Table { label 'high_cpus' @@ -4304,7 +4308,7 @@ if (params.DataCheck || params.Analyze) { */ report_asv = Channel.create() - asv_counts_plots.mix(taxplot_asv, asv_heatmap, nucl_phyl_plot_asv, asvgroupscsv, asvgroupcounts, asv_group_rep_tree, tax_table_asv, tax_nodCol_asv).flatten().buffer(size:9).dump(tag:'asv').into(report_asv) + asv_counts_plots.mix(taxplot_asv, asv_heatmap, nucl_phyl_plot_asv, asvgroupscsv, asvgroupcounts, asv_group_rep_tree, tax_table_asv, tax_nodCol_asv, asv_phylogroupcsv).flatten().buffer(size:10).dump(tag:'asv').into(report_asv) if (params.ncASV) { report_ncasv = Channel.create() @@ -4349,7 +4353,7 @@ if (params.DataCheck || params.Analyze) { if (!params.skipAminoTyping) { report_aminotypes = Channel.create() - aminocounts_plot.mix(taxplot2, aminotype_heatmap, amino_rax_plot, atygroupscsv, amino_group_rep_tree, amino_groupcounts, tax_table_amino, tax_nodCol_amino).flatten().buffer(size:9).dump(tag:'amino').into(report_aminotypes) + aminocounts_plot.mix(taxplot2, aminotype_heatmap, amino_rax_plot, atygroupscsv, amino_group_rep_tree, amino_groupcounts, tax_table_amino, tax_nodCol_amino, amino_phylogroupcsv).flatten().buffer(size:10).dump(tag:'amino').into(report_aminotypes) /* Report_AminoTypes aminocounts_plot -> ${params.projtag}_AminoType_counts.csv From b23e58768e0124344a0ef26d9d9fe26aa93c4ab9 Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 09:11:06 -0500 Subject: [PATCH 09/47] added 
phylogroup to report --- bin/vAMPirus_Report.Rmd | 581 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 580 insertions(+), 1 deletion(-) diff --git a/bin/vAMPirus_Report.Rmd b/bin/vAMPirus_Report.Rmd index 51904cc..e747313 100644 --- a/bin/vAMPirus_Report.Rmd +++ b/bin/vAMPirus_Report.Rmd @@ -736,7 +736,7 @@ if (params$skipPhylogeny == "true") { p2 <- p1 + geom_point(data = metat2, aes(x = x, y = y, label = id, color = GroupID)) gg = ggplotly(p2, tooltip = c("label","GroupID")) gg <- gg %>% config(toImageButtonOptions=list(format='svg',filename='Tree_MED_colors', height= 500, width= 800, scale= 1)) - gg + gg } else if ((params$skipTaxonomy == "false") && (params$nodeCol == "TAX")) { taxMap=read.csv("quicker_taxbreakdown.csv", header=T, check.names = F) tree=read.tree("tree.txt") @@ -770,6 +770,585 @@ This tree is a maximum likelihood tree made with IQTREE2 and the parameters you


+# Post Phylogeny-based Clustering Analyses +```{bash bash_phy_table, include=FALSE} +if [[ $(ls | grep -wc "asv_medfile.csv") -eq 1 ]];then + awk -F "," '{print $2}' asv_medfile.csv | sort | uniq | sort -g >group.list + for x in $(cat group.list);do + echo "${x}" >> ${x}.col + grep -w "$x" asv_medfile.csv | awk -F "," '{print $1}' | sort >> ${x}.col + done + paste -d "," *.col > asv_medtable.csv + rm *.col +else + echo "Not MED" +fi + +if [[ $(ls | grep -wc "amino_medfile.csv") -eq 1 ]];then + awk -F "," '{print $2}' amino_medfile.csv | sort | awk -F "Group" '{print $2}' | sort -n | uniq > group.list + for x in $(cat group.list);do + gr="Group"${x}"" + echo "${gr}" > ${x}.col + grep -w "$gr" amino_medfile.csv | awk -F "," '{print $1}' | sort >> ${x}.col + done + paste -d "," *.col > amino_medtable.csv + rm *.col +else + echo "Not MED" +fi +``` + +```{r PhyloGroup_table, echo=FALSE} +if (params$asvMED == "true" && params$type == "ASV"){ + #med_group=read.csv("asv_medfile.csv", header = F) + #colnames(med_group) <- c("SequenceID", "Group", "Sequence") + # change name of sequence to med peak or something + #paged_table(med_group,options = list(rows.print = 20)) + med_group=read.csv("asv_medtable.csv", header = T) + knitr::kable(med_group, digits = 2, align = 'c', booktabs = TRUE, caption = "Table: ASV MED Table") %>% + kable_styling(font_size = 12, full_width = F)%>% + scroll_box(width = "100%", height = "100%") +} + +if (params$aminoMED == "true" && params$type == "AminoType"){ + #med_group=read.csv("amino_medfile.csv", header = F) + #colnames(med_group) <- c("SequenceID", "Group", "Sequence") + #paged_table(med_group,options = list(rows.print = 20)) + med_group=read.csv("amino_medtable.csv", header = T) + knitr::kable(med_group, digits = 2, align = 'c', booktabs = TRUE, caption = "Table: Aminotype MED Table") %>% + kable_styling(font_size = 12, full_width = F) %>% + scroll_box(width = "100%", height = "100%") +} +``` + +
+
+
+
+
+ +

## PhyloGroup Relative Abundance Plot

+ +
+
+ +```{r med_plot, echo=FALSE} +if (params$asvMED == "true" && params$type == "ASV"){ + #medgroup=read.csv("uVP_AminoType_Groupingcounts.csv", check.names=FALSE) + medgroup=read.csv("asv_groupcounts.csv", check.names=FALSE) + meddata <- medgroup %>% select(-2) + meddata2 <- aggregate(.~GroupID, meddata, sum) + meddata3 <-as.data.frame(t(meddata2)) + meddata3$sample <- row.names(meddata3) + colnames(meddata3)<- as.matrix(meddata3[1,]) + as.data.frame(meddata3) + meddata3 <- meddata3[-1,] + meddata3 <- meddata3 %>% + rename(sample=GroupID) + meddata3dim <- dim(meddata3) + ##Loading metadata + samples <- read.csv(sample_metadata, header = TRUE) + #samples <- read.csv("rna_virus_meta.csv", header = TRUE) + ##Combining data and metadata + meddata4 <- merge(meddata3, samples, by="sample") + #if "meddata4" removed it does not work + #meddata4 + dim_meddata4 <- dim(meddata4) + dim_samples <- dim(samples) + cols <- dim_meddata4[2]-dim_samples[2]+1 + first <-colnames(meddata4)[2] + last <- colnames(meddata4)[cols] + meddata4[,2:cols] <- lapply(meddata4[,2:cols], as.character) + meddata4[,2:cols] <- lapply(meddata4[,2:cols], as.numeric) + #Calculate total reads per sample + meddata5 <- meddata4%>% + mutate(sum=select(.,2:cols)%>% + apply(1, sum, na.rm=TRUE)) + #meddata5 + ##Filter samples with low reads + nfil=params$minimumCounts + #nfil=1000 + meddata5 <- meddata5 %>% + filter(sum>nfil) + #can cause errors + meddata5dim <-dim(meddata5) + minreads<-min(meddata5$sum) + + dataz <- decostand(meddata5[,2:cols],method="total") #method 'total' normalizes data to sum up to 1 + dataz$sample <- meddata5$sample + metadata <- meddata5[,(cols+1):meddata5dim[2]] + metadata$sample <- meddata5$sample + datay <- merge(dataz, metadata, by="sample") + datalong <- datay %>% + tidyr::gather(first:last, key=hit, value=reads) + ddd <- plot_ly(datalong, x=~sample, y=~reads, color=~hit, colors=mycol, type='bar') + ddd <- ddd %>% layout(barmode = 'stack') + ddd <- ddd %>% layout(legend = list(x=10,y=.5), xaxis=list(title = "Sample"), yaxis=list(title = "Relative abundance")) + ddd <- ddd %>% config(toImageButtonOptions=list(format='svg',filename='Relative_abundance', height= 500, width= 800, scale= 1)) + ddd +} +if (params$aminoMED == "true" && params$type == "AminoType"){ + #medgroup=read.csv("uVP_AminoType_Groupingcounts.csv") + medgroup=read.csv("amino_groupcounts.csv", check.names=FALSE) + meddata <- medgroup %>% select(-2) + meddata2 <- aggregate(.~GroupID, meddata, sum) + meddata3 <-as.data.frame(t(meddata2)) + meddata3$sample <- row.names(meddata3) + colnames(meddata3)<- as.matrix(meddata3[1,]) + as.data.frame(meddata3) + meddata3 <- meddata3[-1,] + meddata3 <- meddata3 %>% + rename(sample=GroupID) + meddata3dim <- dim(meddata3) + ##Loading metadata + samples <- read.csv(sample_metadata, header = TRUE) + ##Combining data and metadata + meddata4 <- merge(meddata3, samples, by="sample") + #if "meddata4" removed it does not work + #meddata4 + dim_meddata4 <- dim(meddata4) + dim_samples <- dim(samples) + cols <- dim_meddata4[2]-dim_samples[2]+1 + first <-colnames(meddata4)[2] + last <- colnames(meddata4)[cols] + meddata4[,2:cols] <- lapply(meddata4[,2:cols], as.character) + meddata4[,2:cols] <- lapply(meddata4[,2:cols], as.numeric) + #Calculate total reads per sample + meddata5 <- meddata4%>% + mutate(sum=select(.,2:cols)%>% + apply(1, sum, na.rm=TRUE)) + #meddata5 + ##Filter samples with low reads + nfil=params$minimumCounts + #nfil=1000 + meddata5 <- meddata5 %>% + filter(sum>nfil) + #can cause errors + meddata5dim 
<-dim(meddata5) + minreads<-min(meddata5$sum) + + dataz <- decostand(meddata5[,2:cols],method="total") #method 'total' normalizes data to sum up to 1 + dataz$sample <- meddata5$sample + metadata <- meddata5[,(cols+1):meddata5dim[2]] + metadata$sample <- meddata5$sample + datay <- merge(dataz, metadata, by="sample") + datalong <- datay %>% + tidyr::gather(first:last, key=hit, value=reads) + ddd <- plot_ly(datalong, x=~sample, y=~reads, color=~hit, colors=mycol, type='bar') + ddd <- ddd %>% layout(barmode = 'stack') + ddd <- ddd %>% layout(legend = list(x=10,y=.5), xaxis=list(title = "Sample"), yaxis=list(title = "Relative abundance")) + ddd <- ddd %>% config(toImageButtonOptions=list(format='svg',filename='Relative_abundance', height= 500, width= 800, scale= 1)) + ddd +} +``` + +
+
+
+
+
+ +

## PhyloGroup Rarefaction Curves

+ +
+
+ +```{r phy_rarefaction, echo=FALSE, cache=FALSE} +if ((params$asvMED == "true" || params$aminoMED == "true") && (params$type == "ASV" || params$type == "AminoType")){ +##Rarefaction curves +rarefaction <- rarecurve(meddata5[,2:cols]) + +##rarefied dataset +raredata <- as.data.frame(rrarefy(meddata5[,2:cols], sample=minreads)) +} +``` + +
+
+
+
+
+
+ +

## PhyloGroup Diversity Analyses Plots

+ +
+
+ +### PhyloGroup Shannon diversity + +
+```{r phy_diversity_analysis1a, echo=FALSE} +if ((params$asvMED == "true" || params$aminoMED == "true") && (params$type == "ASV" || params$type == "AminoType")){ + metadata <- meddata5[,(cols+1):meddata5dim[2]] + metadata$sample <- meddata5$sample + index <-diversity(raredata, index= "shannon") + shannondata5 <- as.data.frame(index) + shannondata5$sample<- meddata5$sample + shannondata5_2 <- merge(shannondata5, metadata, by="sample") + + sh <- plot_ly(shannondata5_2, x=~treatment, y=~index, color=~treatment, colors=mycol, type="box", boxpoints = "all", pointpos = 0, jitter = 0.5) + #sh <- sh %>% layout(title = list(text="Shannon diversty",y=.99)) + sh <- sh %>% layout(legend = list(x=10,y=.5), yaxis=list(title = "Index"), xaxis=list(title = "Treatment")) + sh <- sh %>% config(toImageButtonOptions=list(format='svg',filename='ShannonDiv', height= 500, width= 800, scale= 1)) + sh +} +``` + +```{r phy_diversity_analysis1b, echo=FALSE} +#divided because error +if ((params$asvMED == "true" || params$aminoMED == "true") && (params$type == "ASV" || params$type == "AminoType")){ + if (params$stats == "true" ) { + shannonaov <- aov(index ~ treatment, data= shannondata5_2) + st <- shapiro.test(resid(shannonaov)) + bt <- bartlett.test(index ~ treatment, data= shannondata5_2) + + if (st$p.value > .05 && bt$p.value > .05) { + print("Shapiro Test of normality - data is normal p-value > 0.05") + print(shapiro.test(resid(shannonaov))) + writeLines("\n--------------------------------------------------------------\n") + print("Bartlett Test variance homogeneity - variance is homogeneous p-value > 0.05") + print(bartlett.test(index ~ treatment, data= shannondata5_2)) + writeLines("\n--------------------------------------------------------------\n") + print("ANOVA Results") + print(summary(shannonaov)) + writeLines("\n--------------------------------------------------------------\n") + #Tukey Honest Significant Differences (pairwise comparison) - significant p <.05 + print("Tukey HSD - Pairwise comparison - significant differences indicated by p-value < 0.05") + print(TukeyHSD(shannonaov)) + writeLines("\n--------------------------------------------------------------\n") + } else { + print("Shapiro Test of normality - data is normal if p-value > 0.05") + print(shapiro.test(resid(shannonaov))) + writeLines("\n--------------------------------------------------------------\n") + print("Bartlett Test variance homogeneity - variance is homogeneous if p-value > 0.05") + print(bartlett.test(index ~ treatment, data= shannondata5_2)) + writeLines("\n--------------------------------------------------------------\n") + print("Data either not normal or variance not homogenous") + print("Kruskal-Wallis Test - test significant if p <.05") + #Kruskal-Wallis test - significant p <.05 + mykt <- kruskal.test(index ~ treatment, data= shannondata5_2) + print(mykt) + writeLines("\n--------------------------------------------------------------\n") + if (mykt$p.value < .05) { + #Pairwise comparison + print("Wilcox.test - pairwise comparison") + print(pairwise.wilcox.test(shannondata5_2$index, shannondata5_2$treatment, p.adjust.method = "BH")) + } else { + print("Data not significant. Skipping pairwise comparison") + } + writeLines("\n--------------------------------------------------------------\n") + print("ANOVA - one or more of the assumptions not met, take with a grain of salt.") + print(summary(shannonaov)) + writeLines("\n--------------------------------------------------------------\n") + } + } else { + print("Stats skipped. 
To toggle on use \"--stats run\" in vAMPirus launch command") + } +} +``` +
+
+
+
+ +### PhyloGroup Simpson diversity + +
+```{r phy_diversity_analysis2a, echo=FALSE} +if ((params$asvMED == "true" || params$aminoMED == "true") && (params$type == "ASV" || params$type == "AminoType")){ + index <- diversity(raredata, index= "simpson") + simpsondata5 <- as.data.frame(index) + simpsondata5$sample<- meddata5$sample + simpsondata5_2 <- merge(simpsondata5, metadata, by="sample") + + s <- plot_ly(simpsondata5_2, x=~treatment, y=~index, color=~treatment, colors=mycol, type="box", boxpoints = "all", pointpos = 0, jitter = 0.5) + #s <- s %>% layout(title = list(text="Simpson diversty",y=.99)) + s <- s %>% layout(legend = list(x=10,y=.5), yaxis=list(title = "Index"), xaxis=list(title = "Treatment")) + s <- s %>% config(toImageButtonOptions=list(format='svg',filename='SimpsonDiv', height= 500, width= 800, scale= 1)) + s +} +``` + +```{r phy_diversity_analysis2b, echo=FALSE} +if ((params$asvMED == "true" || params$aminoMED == "true") && (params$type == "ASV" || params$type == "AminoType")){ + if (params$stats == "true" ) { + simpsonaov <- aov(index ~ treatment, data= simpsondata5_2) + st <- shapiro.test(resid(simpsonaov)) + bt <- bartlett.test(index ~ treatment, data= simpsondata5_2) + + if (st$p.value > .05 && bt$p.value > .05) { + print("Shapiro Test of normality - data is normal p-value > 0.05") + print(shapiro.test(resid(simpsonaov))) + writeLines("\n--------------------------------------------------------------\n") + print("Bartlett Test variance homogeneity - variance is homogeneous p-value > 0.05") + print(bartlett.test(index ~ treatment, data= simpsondata5_2)) + writeLines("\n--------------------------------------------------------------\n") + print("ANOVA Results") + print(summary(simpsonaov)) + writeLines("\n--------------------------------------------------------------\n") + #Tukey Honest Significant Differences (pairwise comparison) - significant p <.05 + print("Tukey HSD - Pairwise comparison - significant differences indicated by p-value < 0.05") + print(TukeyHSD(simpsonaov)) + writeLines("\n--------------------------------------------------------------\n") + } else { + print("Shapiro Test of normality - data is normal if p-value > 0.05") + print(shapiro.test(resid(simpsonaov))) + writeLines("\n--------------------------------------------------------------\n") + print("Bartlett Test variance homogeneity - variance is homogeneous if p-value > 0.05") + print(bartlett.test(index ~ treatment, data= simpsondata5_2)) + writeLines("\n--------------------------------------------------------------\n") + print("Data either not normal or variance not homogenous") + print("Kruskal-Wallis Test - test significant if p <.05") + #Kruskal-Wallis test - significant p <.05 + mykt <- kruskal.test(index ~ treatment, data= simpsondata5_2) + print(mykt) + writeLines("\n--------------------------------------------------------------\n") + if (mykt$p.value < .05) { + #Pairwise comparison + print("Wilcox.test - pairwise comparison") + print(pairwise.wilcox.test(simpsondata5_2$index, simpsondata5_2$treatment, p.adjust.method = "BH")) + } else { + print("Data not significant. Skipping pairwise comparison") + } + writeLines("\n--------------------------------------------------------------\n") + print("ANOVA - one or more of the assumptions not met, take with a grain of salt.") + print(summary(simpsonaov)) + writeLines("\n--------------------------------------------------------------\n") + } + } else { + print("Stats skipped. To toggle on use \"--stats run\" in vAMPirus launch command") + } +} +``` +
+
+
+
+ +### PhyloGroup Richness + +
+```{r phy_diversity_analysis3a, echo=FALSE} +if ((params$asvMED == "true" || params$aminoMED == "true") && (params$type == "ASV" || params$type == "AminoType")){ + mind5<-min(meddata5$sum) + index <- rarefy(meddata5[,2:cols], sample=mind5) + rarerichnessdata5 <- as.data.frame(index) + rarerichnessdata5$sample <-meddata5$sample + richdata5_2 <- merge(rarerichnessdata5, metadata, by="sample") + + ri <- plot_ly(richdata5_2, x=~treatment, y=~index, color=~treatment, colors=mycol, type="box", boxpoints = "all", pointpos = 0, jitter = 0.5) + #ri <- ri %>% layout(title = list(text="ASV Richness",y=.99)) + ri <- ri %>% layout(legend = list(x=10,y=.5), yaxis=list(title = "Index"), xaxis=list(title = "Treatment")) + ri <- ri %>% config(toImageButtonOptions=list(format='svg',filename='SpeciesRich', height= 500, width= 800, scale= 1)) + ri +} +``` + +```{r med_diversity_analysis3b, echo=FALSE} +if ((params$asvMED == "true" || params$aminoMED == "true") && (params$type == "ASV" || params$type == "AminoType")){ + if (params$stats == "true" ) { + richaov <- aov(index ~ treatment, data= richdata5_2) + st <- shapiro.test(resid(richaov)) + bt <- bartlett.test(index ~ treatment, data= richdata5_2) + + if (st$p.value > .05 && bt$p.value > .05) { + print("Shapiro Test of normality - data is normal p-value > 0.05") + print(shapiro.test(resid(richaov))) + writeLines("\n--------------------------------------------------------------\n") + print("Bartlett Test variance homogeneity - variance is homogeneous p-value > 0.05") + print(bartlett.test(index ~ treatment, data= richdata5_2)) + writeLines("\n--------------------------------------------------------------\n") + print("ANOVA Results") + print(summary(richaov)) + #Tukey Honest Significant Differences (pairwise comparison) - significant p <.05 + writeLines("\n--------------------------------------------------------------\n") + print("Tukey HSD - Pairwise comparison - significant differences indicated by p-value < 0.05") + print(TukeyHSD(richaov)) + writeLines("\n--------------------------------------------------------------\n") + } else { + print("Shapiro Test of normality - data is normal if p-value > 0.05") + print(shapiro.test(resid(richaov))) + writeLines("\n--------------------------------------------------------------\n") + print("Bartlett Test variance homogeneity - variance is homogeneous if p-value > 0.05") + print(bartlett.test(index ~ treatment, data= richdata5_2)) + writeLines("\n--------------------------------------------------------------\n") + print("Data either not normal or variance not homogenous") + print("Kruskal-Wallis Test - test significant if p <.05") + #Kruskal-Wallis test - significant p <.05 + mykt <- kruskal.test(index ~ treatment, data= richdata5_2) + print(mykt) + writeLines("\n--------------------------------------------------------------\n") + if (mykt$p.value < .05) { + #Pairwise comparison + print("Wilcox.test - pairwise comparison") + print(pairwise.wilcox.test(richdata5_2$index, richdata5_2$treatment, p.adjust.method = "BH")) + } else { + print("Data not significant. Skipping pairwise comparison") + } + writeLines("\n--------------------------------------------------------------\n") + print("ANOVA - one or more of the assumptions not met, take with a grain of salt.") + print(summary(richaov)) + writeLines("\n--------------------------------------------------------------\n") + } + } else { + print("Stats skipped. To toggle on use \"--stats run\" in vAMPirus launch command") + } +} +``` +
+
+
+
+
+ +

## PhyloGroup Distance To Centroid

+
+```{r med_distance2a, echo=FALSE} +if ((params$asvMED == "true" || params$aminoMED == "true") && (params$type == "ASV" || params$type == "AminoType")){ + ##Distance + intermediate <- raredata + bray.distance <- vegdist(sqrt(intermediate), method="bray") + + ##Dispersion + disper <- betadisper(bray.distance, group = metadata$treatment, type="centroid") + df <- data.frame(Distance_to_centroid=disper$distances,Group=disper$group) + df$sample <- meddata5$sample + df2 <- merge(df, metadata, by="sample") + + cen <- plot_ly(df2, x=~treatment, y=~Distance_to_centroid, color=~treatment, colors=mycol, type="box", boxpoints = "all", pointpos = 0, jitter = 0.5) + #cen <- cen %>% layout(title = list(text="Distance to centroid",y=.99)) + cen <- cen %>% layout(legend = list(x=10,y=.5), yaxis=list(title = "Distance"), xaxis=list(title = "Treatment")) + cen <- cen %>% config(toImageButtonOptions=list(format='svg',filename='Dispersion', height= 500, width= 800, scale= 1)) + cen +} +``` +```{r med_distance2b, echo=FALSE} +if ((params$asvMED == "true" || params$aminoMED == "true") && (params$type == "ASV" || params$type == "AminoType")){ + if (params$stats == "true" ) { + adn <- adonis(bray.distance~meddata5$treatment) + adn + } else { + print("Stats skipped. To toggle on use \"--stats run\" in vAMPirus launch command") + } +} +``` +
+
+
+
+
+ +

  NMDS Plots

+ +
+ +### PhyloGroup 2D NMDS + +
+```{r med_nmds2d, echo=FALSE} +if ((params$asvMED == "true" || params$aminoMED == "true") && (params$type == "ASV" || params$type == "AminoType")){ + ##NMDS + datax <- decostand(raredata,method="total") #method 'total' normalizes data to sum up to 1 --data5[,2:cols] + + MDS <- metaMDS(sqrt(datax), + distance = "bray",autotransform = FALSE, + k = 2, + maxit = 999, + trymax = params$try, + wascores = TRUE) + + if (MDS$converged == "TRUE") { + + data.scores <- as.data.frame(scores(MDS)) + data.scores$sample <- meddata5$sample + data.scores.2 <- merge(data.scores, metadata, by="sample") + + p <- ggplot(data.scores.2, aes(x=NMDS1, y=NMDS2,color=treatment))+ + geom_point(size=2)+ + theme_classic()+ + theme(legend.title = element_blank()) + + #fff <- ggplotly(p) + #fff <- fff %>% layout(legend=list(y=.5)) + #fff + + fff <-plot_ly(data.scores.2, x=~NMDS1, y=~NMDS2, color=~treatment, colors=mycol, text = ~paste("Sample: ", sample)) + fff <- fff %>% layout(legend=list(y=.5)) + fff <- fff %>% config(toImageButtonOptions=list(format='svg',filename='2Dnmds', height= 500, width= 800, scale= 1)) + fff + + } else { + print("NMDS did not converge. Printing PCoA") + #calculate bray curtis distance + bcdist <- vegdist(datax, "bray") + res <- pcoa(bcdist) + #res$values + #biplot(res) #to make the boring 2D PCoA + comp <- as.data.frame(res$vectors) + comp$sample <- data5$sample + comp2 <- merge(comp, metadata, by="sample") + fig <- plot_ly(comp2, x = ~Axis.1, y = ~Axis.2, color= ~treatment, text = ~paste("Sample: ", sample,"
Treatment: ",treatment)) + fig <- fig %>% layout(xaxis = list(title = "PCoA 1"), + yaxis = list(title = "PCoA 2")) + fig <- fig %>% layout(legend=list(y=.5)) + fig <- fig %>% config(toImageButtonOptions=list(format='svg',filename='2D_PostMED_PCoA', height= 500, width= 800, scale= 1)) + fig + } +} +``` + +
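Aside: the design choice in this chunk is to try non-metric ordination first and only fall back to a metric PCoA when `metaMDS()` fails to converge. A condensed sketch of that branch, assuming the vegan and ape packages and an invented abundance matrix:

```r
# Condensed sketch of the "NMDS first, PCoA fallback" branch.
library(vegan)
library(ape)

set.seed(1)
counts <- decostand(matrix(rpois(8 * 20, lambda = 5), nrow = 8), method = "total")  # rows sum to 1

mds <- metaMDS(sqrt(counts), distance = "bray", k = 2,
               autotransform = FALSE, maxit = 999, trymax = 50, wascores = TRUE)

if (isTRUE(mds$converged)) {
  ord <- as.data.frame(scores(mds, display = "sites"))   # NMDS1 / NMDS2 per sample
} else {
  pco <- pcoa(vegdist(counts, method = "bray"))          # metric ordination when NMDS stalls
  ord <- as.data.frame(pco$vectors[, 1:2])               # Axis.1 / Axis.2 per sample
}
head(ord)
```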
+
+
+
+ +### PhyloGroup 3D NMDS + +
+ +```{r med_nmds3d, echo=FALSE} +if ((params$asvMED == "true" || params$aminoMED == "true") && (params$type == "ASV" || params$type == "AminoType")){ + MDS3 <- metaMDS(sqrt(datax), + distance = "bray",autotransform = FALSE, + k = 3, + maxit = 999, + trymax = params$try, + wascores = TRUE) + + if (MDS3$converged == "TRUE") { + + data.scores3 <- as.data.frame(scores(MDS3)) + data.scores3$sample <- meddata5$sample + data.scores.3 <- merge(data.scores3, metadata, by="sample") + p3d <- plot_ly(data.scores.3, x=~NMDS1, y=~NMDS2, z=~NMDS3, text=~paste("Sample: ", sample), + color=~treatment, colors=mycol, + mode = 'markers', symbol = ~treatment, symbols = c('square','circle'), + marker = list(opacity = .8,line=list(color = 'darkblue',width = 1)) + ) + p3d <- p3d %>% layout(legend=list(y=.5)) + p3d <- p3d %>% config(toImageButtonOptions=list(format='svg',filename='3Dnmds', height= 500, width= 800, scale= 1)) + p3d + + } else { + print("NMDS did not converge. Printing PCoA") + #calculate bray curtis distance + bcdist <- vegdist(datax, "bray") + res <- pcoa(bcdist) + comp <- as.data.frame(res$vectors) + comp$sample <- data5$sample + comp2 <- merge(comp, metadata, by="sample") + fig <- plot_ly(comp2, x = ~Axis.1, y = ~Axis.2, z = ~Axis.3, color= ~treatment, text = ~paste("Sample: ", sample,"
Treatment: ",treatment)) + fig <- fig %>% layout(xaxis = list(title = "PCoA 1"), + yaxis = list(title = "PCoA 2"), + zaxis = list(title = "PCoA 3")) + fig <- fig %>% layout(legend=list(y=.5)) + fig <- fig %>% config(toImageButtonOptions=list(format='svg',filename='3D_PCoA', height= 500, width= 800, scale= 1)) + fig + } +} +``` +

From 5b09710754a4a061c8289e7690a8da3976f40d82 Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 09:37:53 -0500 Subject: [PATCH 10/47] fixed values --- vAMPirus.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vAMPirus.nf b/vAMPirus.nf index 536aff3..a448aad 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -2369,11 +2369,11 @@ if (params.DataCheck || params.Analyze) { #create headless treeclustering.out tail -n +2 ${params.projtag}_ASV_treeclustering.out | sed 's/-1/0/g' > headless.treeout #summarizing clustering results - awk -F "\t" '{print $2}' | sort | uniq > group.list + awk -F "\t" '{print \$2}' | sort | uniq > group.list g=1 echo "SequenceID,PhyloGroup" > ${params.projtag}_phylogroup.csv for x in \$(cat group.list) - do grep "$x" headless.out > tmp.tsv + do grep "\$x" headless.out > tmp.tsv groupID="phyloGroup\${g}" awk -F "\t" '{print \$1",'\${groupID}'"}' tmp.tsv >> ${params.projtag}_phylogroup.csv g=\$((\$g+1)) @@ -3042,11 +3042,11 @@ if (params.DataCheck || params.Analyze) { #create headless treeclustering.out tail -n +2 ${params.projtag}_AminoType_treeclustering.out | sed 's/-1/0/g' > headless.treeout #summarizing clustering results - awk -F "\t" '{print $2}' | sort | uniq > group.list + awk -F "\t" '{print \$2}' | sort | uniq > group.list g=1 echo "SequenceID,PhyloGroup" > ${params.projtag}_phylogroup.csv for x in \$(cat group.list) - do grep "$x" headless.out > tmp.tsv + do grep "\$x" headless.out > tmp.tsv groupID="phyloGroup\${g}" awk -F "\t" '{print \$1",'\${groupID}'"}' tmp.tsv >> ${params.projtag}_phylogroup.csv g=\$((\$g+1)) From 97a161bdd13b9e77fc1024ad251a0bd6e78916e0 Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 09:39:42 -0500 Subject: [PATCH 11/47] fixed parameter --- vAMPirus.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vAMPirus.nf b/vAMPirus.nf index a448aad..a8b2b93 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -678,7 +678,7 @@ if (params.DataCheck || params.Analyze) { echo ${sample_id} fastp -i ${reads[0]} -I ${reads[1]} -o left-${sample_id}.filter.fq -O right-${sample_id}.filter.fq --detect_adapter_for_pe \ - --average_qual ${params.avQ} -n ${params.nM} -c --overrepresentation_analysis --html ${sample_id}.fastp.html --json ${sample_id}.fastp.json --thread ${task.cpus} \ + --average_qual ${params.avQ} -n ${params.mN} -c --overrepresentation_analysis --html ${sample_id}.fastp.html --json ${sample_id}.fastp.json --thread ${task.cpus} \ --report_title ${sample_id} bash get_readstats.sh ${sample_id}.fastp.json From a46d681c9f4f60d52734d47bc7b6203a0aff7f0a Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 09:52:56 -0500 Subject: [PATCH 12/47] fixed error in pairwisedistance --- vAMPirus.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vAMPirus.nf b/vAMPirus.nf index a8b2b93..a0af158 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -1206,7 +1206,7 @@ if (params.DataCheck || params.Analyze) { tail -\$(( \${yesirr}-1 )) number_per_percentage_protz.csv > number_per_percentage_prot.csv head -1 number_per_percentage_protz.csv >> number_per_percentage_prot.csv rm number_per_percentage_protz.csv - clustalo -i aminoacid_pcASV1.0_noTaxonomy.fasta --distmat-out=${params.projtag}_distance.matrix --full --force --threads=${task.cpus} + clustalo -i *aminoacid_pcASV1.0_noTaxonomy.fasta --distmat-out=${params.projtag}_distance.matrix --full --force --threads=${task.cpus} cat ${params.projtag}_distance.matrix | tr " " "," | grep "," 
>${params.projtag}_AminoType_PairwiseDistance.matrix rm ${params.projtag}_distance.matrix """ From a05432aec490ebd1f9df5e3d811f7faf12b7c263 Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 10:01:26 -0500 Subject: [PATCH 13/47] edit --- bin/vAMPirus_DC_Report.Rmd | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bin/vAMPirus_DC_Report.Rmd b/bin/vAMPirus_DC_Report.Rmd index b4eea2c..dc11d8b 100644 --- a/bin/vAMPirus_DC_Report.Rmd +++ b/bin/vAMPirus_DC_Report.Rmd @@ -400,10 +400,10 @@ cols <- dim(simmatrix)[2] simmatrix$AA <- rownames(simmatrix) rval=nrow(simmatrix) simmatrix2 <- simmatrix %>% - gather(1:rval, key=sequence, value=similarity) -x=reorder(simmatrix2$AA,simmatrix2$similarity) -y=reorder(simmatrix2$sequence,simmatrix2$similarity) -similaritymatrix <- ggplot(simmatrix2, aes(x=x, y=y,fill=similarity))+ + gather(1:rval, key=sequence, value=Distance) +x=reorder(simmatrix2$AA,simmatrix2$Distance) +y=reorder(simmatrix2$sequence,simmatrix2$Distance) +similaritymatrix <- ggplot(simmatrix2, aes(x=x, y=y,fill=Distance))+ geom_raster()+ scale_fill_distiller(palette="Spectral")+ theme(axis.text.x = element_text(angle = 90))+ @@ -480,10 +480,10 @@ cols <- dim(simmatrix)[2] simmatrix$AA <- rownames(simmatrix) rval=nrow(simmatrix) simmatrix2 <- simmatrix %>% - gather(1:rval, key=sequence, value=similarity) -x=reorder(simmatrix2$AA,simmatrix2$similarity) -y=reorder(simmatrix2$sequence,simmatrix2$similarity) -similaritymatrix <- ggplot(simmatrix2, aes(x=x, y=y,fill=similarity))+ + gather(1:rval, key=sequence, value=Distance) +x=reorder(simmatrix2$AA,simmatrix2$Distance) +y=reorder(simmatrix2$sequence,simmatrix2$Distance) +similaritymatrix <- ggplot(simmatrix2, aes(x=x, y=y,fill=Distance))+ geom_raster()+ scale_fill_distiller(palette="Spectral")+ theme(axis.text.x = element_text(angle = 90))+ From ee88eedfd53eef3e4e1919f7049ba11f89a21db1 Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 10:19:24 -0500 Subject: [PATCH 14/47] fixed phyloclusteing commands --- example_data/conf/test.config | 2 ++ vAMPirus.nf | 4 ++++ vampirus.config | 2 ++ 3 files changed, 8 insertions(+) diff --git a/example_data/conf/test.config b/example_data/conf/test.config index cbc63e1..660912b 100644 --- a/example_data/conf/test.config +++ b/example_data/conf/test.config @@ -24,4 +24,6 @@ params { stats = true aminoMED = true asvMED = true + asvTClust = true + aminoTClust = true } diff --git a/vAMPirus.nf b/vAMPirus.nf index a0af158..371ec61 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -2382,6 +2382,8 @@ if (params.DataCheck || params.Analyze) { """ } + } else { + asv_phylogroupcsv = Channel.empty() } if (params.asvMED) { @@ -3054,6 +3056,8 @@ if (params.DataCheck || params.Analyze) { rm tmp.csv """ } + } else { + amino_phylogroupcsv = Channel.empty() } diff --git a/vampirus.config b/vampirus.config index cb03ae3..8a0bd04 100644 --- a/vampirus.config +++ b/vampirus.config @@ -229,6 +229,8 @@ params { filter = false // Phylogeny-based ASV clustering asvTClust = false + // Phylogeny-based AminoType clustering + aminoTClust = false // Skip options // Skip all Read Processing steps skipReadProcessing = false From 1f55b193c59caa44ba11380791627aa5e7fd3b2d Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 10:28:21 -0500 Subject: [PATCH 15/47] fixed treeclustering options --- vampirus.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vampirus.config b/vampirus.config index 8a0bd04..3e854a3 100644 --- a/vampirus.config +++ 
b/vampirus.config @@ -121,10 +121,10 @@ params { // NOTE: you can't use "--skipPhylogeny" when doing this form of sequence clustering // TreeCluster command options for ASV clustering (--asvTClust) -- (Example: "-option1 A -option2 B -option3 C -option4 D") - See TreeCluster paper and github page to determine the best options (a good start is what is below) - asvTCopp="-t 0.045 -max_clade" + asvTCopp="-t 0.045 -m max_clade" // TreeCluster command options for AminoType clustering (--aminoTClust) -- (Example: "-option1 A -option2 B -option3 C -option4 D") - See TreeCluster paper and github page to determine the best options - aminoTCopp="-t 0.045 -max_clade" + aminoTCopp="-t 0.045 -m max_clade" // Counts table generation parameters From 33fe9c21fb97d1d88b1c6955ccabccd6b3189818 Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 10:31:58 -0500 Subject: [PATCH 16/47] fixe treecluster --- vAMPirus.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vAMPirus.nf b/vAMPirus.nf index 371ec61..1540574 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -2369,13 +2369,13 @@ if (params.DataCheck || params.Analyze) { #create headless treeclustering.out tail -n +2 ${params.projtag}_ASV_treeclustering.out | sed 's/-1/0/g' > headless.treeout #summarizing clustering results - awk -F "\t" '{print \$2}' | sort | uniq > group.list + awk -F "\\t" '{print \$2}' | sort | uniq > group.list g=1 echo "SequenceID,PhyloGroup" > ${params.projtag}_phylogroup.csv for x in \$(cat group.list) do grep "\$x" headless.out > tmp.tsv groupID="phyloGroup\${g}" - awk -F "\t" '{print \$1",'\${groupID}'"}' tmp.tsv >> ${params.projtag}_phylogroup.csv + awk -F "\\t" '{print \$1",'\${groupID}'"}' tmp.tsv >> ${params.projtag}_phylogroup.csv g=\$((\$g+1)) done rm tmp.csv @@ -3044,13 +3044,13 @@ if (params.DataCheck || params.Analyze) { #create headless treeclustering.out tail -n +2 ${params.projtag}_AminoType_treeclustering.out | sed 's/-1/0/g' > headless.treeout #summarizing clustering results - awk -F "\t" '{print \$2}' | sort | uniq > group.list + awk -F "\\t" '{print \$2}' | sort | uniq > group.list g=1 echo "SequenceID,PhyloGroup" > ${params.projtag}_phylogroup.csv for x in \$(cat group.list) do grep "\$x" headless.out > tmp.tsv groupID="phyloGroup\${g}" - awk -F "\t" '{print \$1",'\${groupID}'"}' tmp.tsv >> ${params.projtag}_phylogroup.csv + awk -F "\\t" '{print \$1",'\${groupID}'"}' tmp.tsv >> ${params.projtag}_phylogroup.csv g=\$((\$g+1)) done rm tmp.csv From ad309d42d25bf8636615ab2d3ce453324f52e0f2 Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 10:40:33 -0500 Subject: [PATCH 17/47] fixed treeclusteiing process --- vAMPirus.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vAMPirus.nf b/vAMPirus.nf index 1540574..42f0258 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -2369,7 +2369,7 @@ if (params.DataCheck || params.Analyze) { #create headless treeclustering.out tail -n +2 ${params.projtag}_ASV_treeclustering.out | sed 's/-1/0/g' > headless.treeout #summarizing clustering results - awk -F "\\t" '{print \$2}' | sort | uniq > group.list + awk -F "\\t" '{print \$2}' headless.treeout | sort | uniq > group.list g=1 echo "SequenceID,PhyloGroup" > ${params.projtag}_phylogroup.csv for x in \$(cat group.list) @@ -3044,7 +3044,7 @@ if (params.DataCheck || params.Analyze) { #create headless treeclustering.out tail -n +2 ${params.projtag}_AminoType_treeclustering.out | sed 's/-1/0/g' > headless.treeout #summarizing clustering results - awk -F "\\t" '{print \$2}' | sort 
| uniq > group.list + awk -F "\\t" '{print \$2}' headless.treeout | sort | uniq > group.list g=1 echo "SequenceID,PhyloGroup" > ${params.projtag}_phylogroup.csv for x in \$(cat group.list) From 6e9cb6269a67c5f35a97a5e17c3e2a5eda1f7f8f Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 10:42:05 -0500 Subject: [PATCH 18/47] fix --- vAMPirus.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vAMPirus.nf b/vAMPirus.nf index 42f0258..7818d42 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -2373,7 +2373,7 @@ if (params.DataCheck || params.Analyze) { g=1 echo "SequenceID,PhyloGroup" > ${params.projtag}_phylogroup.csv for x in \$(cat group.list) - do grep "\$x" headless.out > tmp.tsv + do grep "\$x" headless.treeout > tmp.tsv groupID="phyloGroup\${g}" awk -F "\\t" '{print \$1",'\${groupID}'"}' tmp.tsv >> ${params.projtag}_phylogroup.csv g=\$((\$g+1)) @@ -3048,7 +3048,7 @@ if (params.DataCheck || params.Analyze) { g=1 echo "SequenceID,PhyloGroup" > ${params.projtag}_phylogroup.csv for x in \$(cat group.list) - do grep "\$x" headless.out > tmp.tsv + do grep "\$x" headless.treeout > tmp.tsv groupID="phyloGroup\${g}" awk -F "\\t" '{print \$1",'\${groupID}'"}' tmp.tsv >> ${params.projtag}_phylogroup.csv g=\$((\$g+1)) From f1f1d3273efa05bcd9bcede0159f810ae8d0c8b1 Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 10:43:18 -0500 Subject: [PATCH 19/47] fixed remove command --- vAMPirus.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vAMPirus.nf b/vAMPirus.nf index 7818d42..3e74a6b 100644 --- a/vAMPirus.nf +++ b/vAMPirus.nf @@ -2378,7 +2378,7 @@ if (params.DataCheck || params.Analyze) { awk -F "\\t" '{print \$1",'\${groupID}'"}' tmp.tsv >> ${params.projtag}_phylogroup.csv g=\$((\$g+1)) done - rm tmp.csv + rm tmp.tsv """ } @@ -3053,7 +3053,7 @@ if (params.DataCheck || params.Analyze) { awk -F "\\t" '{print \$1",'\${groupID}'"}' tmp.tsv >> ${params.projtag}_phylogroup.csv g=\$((\$g+1)) done - rm tmp.csv + rm tmp.tsv """ } } else { From 908a53487de3aa1dfd592d689719fa59a3e4e802 Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 11:41:59 -0500 Subject: [PATCH 20/47] updated ymlfile --- vampirus_env.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vampirus_env.yml b/vampirus_env.yml index 868a0ef..ccdcbf3 100644 --- a/vampirus_env.yml +++ b/vampirus_env.yml @@ -32,10 +32,10 @@ dependencies: - r-tidyverse=1.3.0 - r-cowplot=1.0.0 - r-scales=1.1.1 - - bioconductor-ggtree=2.2.1 - - bioconductor-biocparallel=1.24.0 + - bioconductor-ggtree + - r-devtools - pigz=2.4 - - r-biocmanager=1.30.10 + - r-biocmanager - pip: - oligotyping - treecluster From b26f3cff8d1c7765c1ece13cd8d290b26bb6f025 Mon Sep 17 00:00:00 2001 From: "@Aveglia" Date: Wed, 20 Oct 2021 15:36:07 -0500 Subject: [PATCH 21/47] revised treeclustering processes --- bin/vAMPirus_Report.Rmd | 2 ++ vAMPirus.nf | 62 +++++++++++++++++++++++++---------------- 2 files changed, 40 insertions(+), 24 deletions(-) diff --git a/bin/vAMPirus_Report.Rmd b/bin/vAMPirus_Report.Rmd index e747313..49bbfe2 100644 --- a/bin/vAMPirus_Report.Rmd +++ b/bin/vAMPirus_Report.Rmd @@ -18,6 +18,8 @@ params: aminoMED: !r commandArgs(trailingOnly=T)[12] type: !r commandArgs(trailingOnly=T)[13] nodeCol: !r commandArgs(trailingOnly=T)[14] + asvTClust: !r commandArgs(trailingOnly=T)[15] + aminoTClust: !r commandArgs(trailingOnly=T)[16] ---
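Aside: the two new `params` entries follow the report's existing convention of filling each YAML field from a positional trailing argument at render time. A hedged sketch of how such a call could be wired up; the command line, argument order, and defaults shown here are illustrative assumptions, not the pipeline's actual invocation.

```r
# Hypothetical wiring of the report parameters; the real launch command lives in vAMPirus.nf.
# From the shell, something along the lines of:
#   Rscript -e 'rmarkdown::render("vAMPirus_Report.Rmd")' <arg1> ... <arg14> true false
# leaves the trailing arguments visible to commandArgs() inside the YAML header.
args <- commandArgs(trailingOnly = TRUE)
asvTClust   <- if (length(args) >= 15) args[15] else "false"   # toggles ASV phylogroup sections
aminoTClust <- if (length(args) >= 16) args[16] else "false"   # toggles AminoType phylogroup sections
```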