Update to v3.0.0.2

* Collation of cutadapt JSON results into single JSON file * Collation of SeqKit statistics results into a single TSV file * Update version of pyQUEST to version 1.1.0 * Improved handling of 0-length reads * Ability to extract top 50 library-independent counts as FASTA
cancerit · Oct 11, 2023 · a018409 · a018409
2 parents 44e79f7 + 5079736
commit a018409
Show file tree

Hide file tree

Showing 13 changed files with 169 additions and 10 deletions.
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -99,6 +99,7 @@ jobs:
           allow-repeats: false
 
   nf-core:
+    if: false
     runs-on: ubuntu-latest
     steps:
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -37,11 +37,20 @@ Initial release of QUANTS, created with the [nf-core](https://nf-co.re/) templat
 ## 3.0.0.0 - [21st August 2023]
 
 * Split read trimming into two stages
-	* Adapter trimming - removes user-defined adapter sequences and takes forward both trimmed and untrimmed reads
-	* Primer trimming - removes user-defined primer sequences and takes forward only trimmed reads
+    * Adapter trimming - removes user-defined adapter sequences and takes forward both trimmed and untrimmed reads
+    * Primer trimming - removes user-defined primer sequences and takes forward only trimmed reads
 * Add a read modification process which can append user-defined sequences to trimmed reads
 * Add library transformer to allow users to provide libraries in a different format (e.g. the meta CSV from VaLiAnT) and convert it for use with pyQUEST
 
 ## 3.0.0.1 - [12th September 2023]
 
-* Primer trimming - bugfix to ensure cutadapt splits reads into trimmed and untrimmed files
+* Primer trimming - bugfix to ensure cutadapt splits reads into trimmed and untrimmed files
+
+## 3.0.0.2 - [11th October 2023]
+
+* Collation of cutadapt JSON results into single JSON file
+* Collation of SeqKit statistics results into a single TSV file
+* Update pyQUEST to version 1.1.0
+    * Improved handling of 0-length reads
+    * Ability to extract top 50 library-independent counts as FASTA
+
diff --git a/conf/base.config b/conf/base.config
@@ -54,4 +54,8 @@ process {
         errorStrategy = 'retry'
         maxRetries    = 2
     }
+
+    withName:COLLATE_CUTADAPT_JSONS {
+        executor = 'local'
+    }
 }
diff --git a/functions/functions.nf b/functions/functions.nf
@@ -0,0 +1,45 @@
+//
+// Tags one output channel of a workflow with the workflow's short name and
+// merges the result into an existing channel.
+//
+//   channel     - channel that the tagged items are mixed into
+//   workflow    - workflow object whose `out` holds the channel to read
+//   out_channel - name of the output channel to pick from `workflow.out`
+//
+// The stage label is the last ':'-separated component of `workflow.name`.
+// Returns the mixed channel; items taken from the workflow output are
+// combined with the stage label before being mixed in.
+//
+def add_stats_with_stage(channel, workflow, String out_channel) {
+    def stage = workflow.name.split(':').last()
+    def tagged = workflow.out.getProperty(out_channel).combine([stage])
+    return channel.mix(tagged)
+}
+
+//
+// Strips a trailing stage suffix from a sample name.
+// The patterns are applied one after another, in the order listed, each
+// anchored to the end of the (possibly already shortened) name.
+//
+def trim_sample_name(sample_name) {
+    def suffix_patterns = [
+        /_raw$/,
+        /_primer_trimmed$/,
+        /_adapter_trimmed$/,
+        /_merged$/,
+        /_merged_filtered$/
+    ]
+    return suffix_patterns.inject(sample_name) { name, pattern ->
+        name.replaceFirst(pattern, "")
+    }
+}
+
+//
+// Prefixes every line of a seqkit stats file with two tab-separated columns:
+// "sample" and "stage" on the first (header) line, and the trimmed sample
+// name and the given stage on every following line.
+//
+//   meta  - sample metadata map; `meta.id` is used as the sample name
+//   path  - location of the seqkit stats file to read
+//   stage - stage label written into the second column
+//
+// Returns the modified lines joined with "\n" and terminated by a newline.
+//
+def modify_seqkit_stats(meta, path, stage) {
+    // TODO should be removed in the future once sample name handling in the pipeline is consistent
+    def sample_name = trim_sample_name(meta.id)
+
+    // Declared with `def` so they stay local to this call; undeclared names
+    // would be stored in the script-wide binding and shared between calls.
+    def newLines = []
+    file(path)
+        .readLines()
+        .eachWithIndex { row, i ->
+            def prefix = (i == 0) ? "sample" + "\t" + "stage" : sample_name + "\t" + stage
+            newLines.add(prefix + "\t" + row)
+        }
+
+    return newLines.join("\n") + "\n"
+}
diff --git a/modules/local/cutadapt/main.nf b/modules/local/cutadapt/main.nf
@@ -24,9 +24,9 @@ process CUTADAPT {
     output:
     tuple val(meta), path('*_trimmed{,_1,_2}.fastq.gz')  , emit: reads
     tuple val(meta), path('*_untrimmed{,_1,_2}.fastq.gz'), emit: untrimmed_reads, optional: true
-    tuple val(meta), path('*.log')               , emit: log
-    tuple val(meta), path('*.json')              , emit: json
-    path '*.version.txt'                         , emit: version
+    tuple val(meta), path('*.log')                       , emit: log
+    tuple val(meta), path('*.json')                      , emit: json
+    path '*.version.txt'                                 , emit: version
 
     script:
     def software       = getSoftwareName(task.process)

diff --git a/modules/local/cutadapt_json_collation/functions.nf b/modules/local/cutadapt_json_collation/functions.nf
@@ -0,0 +1,25 @@
+import groovy.json.JsonSlurper
+
+//
+// Builds the collated record for one sample from its cutadapt JSON reports.
+//
+//   meta      - sample metadata map; `meta.id` becomes the top-level key
+//   pathList  - cutadapt JSON files for the sample
+//   stageList - stage labels, paired with pathList by position
+//
+// Each parsed report gains `read1_with_adapter_percent` and
+// `read2_with_adapter_percent` under `read_counts`. A percentage is null when
+// the corresponding count is absent/null or when `input` is 0 or absent
+// (the ratio is undefined), and a count of 0 yields 0 rather than null.
+//
+// Returns a single-entry map: [sample id: [stage: parsed report, ...]].
+//
+def compose_cutadapt_jsons(meta, pathList, stageList) {
+    def jsonSlurper = new JsonSlurper()
+    def record = [:]
+
+    // Explicit null check on the count: Groovy truth would treat 0 as "missing".
+    // Guarding the total avoids an ArithmeticException on division by zero.
+    def percent = { count, total ->
+        (count == null || !total) ? null : 100 * count / total
+    }
+
+    [pathList, stageList].transpose().each { path, stage ->
+        def object = jsonSlurper.parse(path)
+        def counts = object["read_counts"]
+
+        counts["read1_with_adapter_percent"] = percent(counts["read1_with_adapter"], counts["input"])
+        counts["read2_with_adapter_percent"] = percent(counts["read2_with_adapter"], counts["input"])
+
+        record[stage] = object
+    }
+
+    return [(meta.id): record]
+}
diff --git a/modules/local/cutadapt_json_collation/main.nf b/modules/local/cutadapt_json_collation/main.nf
@@ -0,0 +1,33 @@
+import groovy.json.JsonOutput
+
+// Import generic module functions
+include { compose_cutadapt_jsons } from './functions'
+
+// Collates the cutadapt reports of all samples into one JSON object written
+// to 'cutadapt.json', which is published under "${params.outdir}/cutadapt".
+// The output object has one entry per element of inputList.
+process COLLATE_CUTADAPT_JSONS {
+    label 'process_low'
+    publishDir "${params.outdir}/cutadapt", mode: params.publish_dir_mode
+
+    input:
+    val inputList  // list of tuples [meta, [list of jsons], [list of stages]]
+
+    output:
+    path 'cutadapt.json', emit: json
+
+    exec:
+    // Native (exec) block: the file is created directly inside the task work directory
+    String filename = [task.workDir, 'cutadapt.json'].join(File.separator)
+
+    new File(filename).withWriter { writer ->
+        // The enclosing braces are written by hand; each sample contributes one line in between
+        writer.writeLine('{')
+
+        inputList.eachWithIndex { e, index ->
+            def (meta, pathList, stageList) = e
+            // record is a single-entry map keyed by the sample id
+            def record = compose_cutadapt_jsons(meta, pathList, stageList)
+            String record_string = JsonOutput.toJson(record)
+            // every entry except the last one is followed by a comma
+            String comma = index + 1 < inputList.size() ? ',' : ''
+            // [1..-2] drops the outer '{' and '}' of the serialized record so the
+            // entry can sit inside the enclosing object
+            String output_string = '  ' + record_string[1..-2] + comma
+            writer.writeLine(output_string)
+        }
+
+        writer.writeLine('}')
+    }
+}
diff --git a/modules/local/cutadapt_json_collation/meta.yml b/modules/local/cutadapt_json_collation/meta.yml
@@ -0,0 +1,17 @@
+name: cutadapt_json_collation
+description: Collate all cutadapt output jsons into one file
+keywords:
+  - cutadapt
+input:
+  - inputList:
+      type: list
+      description: |
+        Groovy list containing tuples of three objects:
+        meta, list of cutadapt jsons, list of stages
+output:
+  - json:
+      type: file
+      description: collated cutadapt json file for all samples
+      pattern: "cutadapt.json"
+authors:
+  - "@y-popov"
diff --git a/modules/local/pyquest/main.nf b/modules/local/pyquest/main.nf
@@ -18,7 +18,7 @@ process PYQUEST {
         container "quay.io/biocontainers/flash2:2.2.00--h5bf99c6_3"
     }
     */
-    container "quay.io/wtsicgp/pyquest:1.0.0"
+    container "quay.io/wtsicgp/pyquest:1.1.0"
 
     input:
         tuple val(meta), path(reads)

diff --git a/nextflow.config b/nextflow.config
@@ -190,7 +190,7 @@ manifest {
     description     = 'Analysis pipeline for saturation genome editing screens'
     mainScript      = 'main.nf'
     nextflowVersion = '!>=21.10.6'
-    version         = '3.0.0.1'
+    version         = '3.0.0.2'
 }
 
 // Function to ensure that resource requirements don't go beyond

diff --git a/subworkflows/local/adapter_trimming.nf b/subworkflows/local/adapter_trimming.nf
@@ -27,7 +27,7 @@ workflow ADAPTER_TRIMMING {
 
             CUTADAPT_ADAPTER ( reads )
             ch_trimmed_reads = CUTADAPT_ADAPTER.out.reads
-            ch_trimmed_stats = CUTADAPT_ADAPTER.out.log
+            ch_trimmed_stats = CUTADAPT_ADAPTER.out.json
         }
     emit:
         reads = ch_trimmed_reads

diff --git a/subworkflows/local/primer_trimming.nf b/subworkflows/local/primer_trimming.nf
@@ -27,7 +27,7 @@ workflow PRIMER_TRIMMING {
 
             CUTADAPT_PRIMER ( reads )
             ch_trimmed_reads = CUTADAPT_PRIMER.out.reads
-            ch_trimmed_stats = CUTADAPT_PRIMER.out.log
+            ch_trimmed_stats = CUTADAPT_PRIMER.out.json
         }
     emit:
         reads = ch_trimmed_reads

diff --git a/workflows/sge.nf b/workflows/sge.nf
@@ -175,13 +175,20 @@ include { SEQUENCING_QC as RAW_SEQUENCING_QC;
           SEQUENCING_QC as PRIMER_TRIMMED_SEQUENCING_QC;
           SEQUENCING_QC as FILTERED_SEQUENCING_QC
         } from '../subworkflows/local/sequencing_qc' addParams( options: [:] )
+include { COLLATE_CUTADAPT_JSONS } from '../modules/local/cutadapt_json_collation/main.nf' addParams( options: [:] )
 // editorconfig-checker-disable
 
 //
 // MODULE: Installed directly from nf-core/modules
 //
 include { MULTIQC } from '../modules/nf-core/multiqc/main' addParams( options: multiqc_options   )
 
+//
+// FUNCTIONS: collection of custom functions
+//
+include { modify_seqkit_stats } from '../functions/functions.nf'
+include { add_stats_with_stage } from '../functions/functions.nf'
+
 /*
 ========================================================================================
     RUN MAIN WORKFLOW
@@ -194,6 +201,8 @@ def multiqc_report = []
 workflow SGE {
     // Set up empty channels
     ch_software_versions = Channel.empty()
+    seqkit_stat_ch = Channel.empty()
+    cutadapt_jsons_ch = Channel.empty()
 
     if (params.input_type == 'cram') {
         //
@@ -224,6 +233,7 @@ workflow SGE {
         ch_raw_read_qc = ch_raw_reads.map{it -> [[id: it[0].id + '_raw', single_end: it[0].single_end], it[1]]}
         RAW_SEQUENCING_QC ( ch_raw_read_qc )
         ch_software_versions = ch_software_versions.mix(RAW_SEQUENCING_QC.out.fastqc_version, RAW_SEQUENCING_QC.out.seqkit_version)
+        seqkit_stat_ch = add_stats_with_stage(seqkit_stat_ch, RAW_SEQUENCING_QC, 'seqkit_stats')
     }
 
     //
@@ -233,13 +243,15 @@ workflow SGE {
         // Run adapter trimming
         ADAPTER_TRIMMING ( ch_adapter_trim )
         ch_software_versions = ch_software_versions.mix(ADAPTER_TRIMMING.out.versions)
+        cutadapt_jsons_ch = add_stats_with_stage(cutadapt_jsons_ch, ADAPTER_TRIMMING, 'stats')
         //
         //SUBWORKFLOW: Run FASTQC on adapter trimmed reads
         //
         if (params.adapter_trimming_qc) {
             ch_adapter_trimming_qc = ADAPTER_TRIMMING.out.reads.map{it -> [[id: it[0].id + '_adapter_trimmed', single_end: it[0].single_end], it[1]]}
             ADAPTER_TRIMMED_SEQUENCING_QC ( ch_adapter_trimming_qc )
             ch_software_versions = ch_software_versions.mix(ADAPTER_TRIMMED_SEQUENCING_QC.out.fastqc_version, ADAPTER_TRIMMED_SEQUENCING_QC.out.seqkit_version)
+            seqkit_stat_ch = add_stats_with_stage(seqkit_stat_ch, ADAPTER_TRIMMED_SEQUENCING_QC, 'seqkit_stats')
         }
         // Send to next stage
         ch_primer_trim = ADAPTER_TRIMMING.out.reads
@@ -254,13 +266,15 @@ workflow SGE {
         // Run primer trimming
         PRIMER_TRIMMING ( ch_primer_trim )
         ch_software_versions = ch_software_versions.mix(PRIMER_TRIMMING.out.versions)
+        cutadapt_jsons_ch = add_stats_with_stage(cutadapt_jsons_ch, PRIMER_TRIMMING, 'stats')
         //
         //SUBWORKFLOW: Run FASTQC on primer trimmed reads
         //
         if (params.primer_trimming_qc) {
             ch_primer_trimming_qc = PRIMER_TRIMMING.out.reads.map{it -> [[id: it[0].id + '_primer_trimmed', single_end: it[0].single_end], it[1]]}
             PRIMER_TRIMMED_SEQUENCING_QC ( ch_primer_trimming_qc )
             ch_software_versions = ch_software_versions.mix(PRIMER_TRIMMED_SEQUENCING_QC.out.fastqc_version, PRIMER_TRIMMED_SEQUENCING_QC.out.seqkit_version)
+            seqkit_stat_ch = add_stats_with_stage(seqkit_stat_ch, PRIMER_TRIMMED_SEQUENCING_QC, 'seqkit_stats')
         }
         // Send to next stage
         ch_read_merge = PRIMER_TRIMMING.out.reads
@@ -283,6 +297,7 @@ workflow SGE {
             ch_merged_read_qc = ch_read_transform
             MERGED_SEQUENCING_QC ( ch_merged_read_qc )
             ch_software_versions = ch_software_versions.mix(MERGED_SEQUENCING_QC.out.fastqc_version, MERGED_SEQUENCING_QC.out.seqkit_version)
+            seqkit_stat_ch = add_stats_with_stage(seqkit_stat_ch, MERGED_SEQUENCING_QC, 'seqkit_stats')
         }
     } else {
         ch_read_transform = ch_read_merge
@@ -316,6 +331,7 @@ workflow SGE {
             ch_filtered_read_qc = READ_FILTERING.out.reads.map{it -> [[id: it[0].id + '_filtered', single_end: true], it[1]]}
             FILTERED_SEQUENCING_QC ( ch_filtered_read_qc )
             ch_software_versions = ch_software_versions.mix(FILTERED_SEQUENCING_QC.out.fastqc_version, FILTERED_SEQUENCING_QC.out.seqkit_version)
+            seqkit_stat_ch = add_stats_with_stage(seqkit_stat_ch, FILTERED_SEQUENCING_QC, 'seqkit_stats')
         }
     } else {
         ch_reads_to_modify = ch_read_filter
@@ -361,6 +377,15 @@ workflow SGE {
         ch_software_versions.map { it }.collect()
     )
 
+    seqkit_stat_ch
+        .map { meta, file, stage -> modify_seqkit_stats(meta, file, stage) }
+        .collectFile(keepHeader: true, name: 'seqkit_stats.tsv', storeDir: "${params.outdir}/seqkit_stats")
+
+    cutadapt_jsons_ch
+       .groupTuple()
+       .toList()
+       | COLLATE_CUTADAPT_JSONS
+
     //
     // MODULE: MultiQC
     //