Adding new option to conditionally identify/quantify novel miRs

OpenOmics · Sep 23, 2024 · 3d5fa75 · 3d5fa75
1 parent 26e989b
commit 3d5fa75
Show file tree

Hide file tree

Showing 3 changed files with 57 additions and 10 deletions.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.2.1
+0.3.0
diff --git a/mir-seek b/mir-seek
@@ -296,6 +296,7 @@ def parsed_arguments(name, description):
                 [--silent] [--threads THREADS] [--tmp-dir TMP_DIR] \\
                 [--min-read-length MIN_READ_LENGTH] \\
                 [--max-read-length MAX_READ_LENGTH] \\
+                [--novel-mir-identification]
                 --input INPUT [INPUT ...] \\
                 --output OUTPUT \\
                 --genome {{hg38,mm10,custom.json}}
@@ -364,6 +365,16 @@ def parsed_arguments(name, description):
                                 value to this option.
                                   Default: 27
                                   Example: --max-read-length 27
+          --novel-mir-identification
+                                Enables novel miR identification. If this option is
+                                provided, the pipeline will run mirdeep2 using a two
+                                pass approach to generate a novel miR counts matrix. 
+                                In the first-pass,  novel miRs are identified using
+                                information across all samples. In the second-pass, 
+                                the expression of each novel is quantified. If you
+                                are interested in identifying and quantifying novel 
+                                miRs, please provide this option.
+                                  Example: --novel-mir-identification
 
         {3}{4}Orchestration options:{5}
           --mode {{slurm,local}}  
@@ -536,6 +547,15 @@ def parsed_arguments(name, description):
         help = argparse.SUPPRESS
     )
 
+    # Novel miR identification
+    subparser_run.add_argument(
+        '--novel-mir-identification',
+        action = 'store_true',
+        required = False,
+        default = False,
+        help = argparse.SUPPRESS
+    )
+
     # Orchestration Options
     # Execution Method, run locally 
     # on a compute node or submit to 

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -31,6 +31,11 @@ if genome.endswith('.json'):
 # Analysis options
 min_read_length = config['options']['min_read_length']  # Min length of a trimmed read (discarded if less)
 max_read_length = config['options']['max_read_length']  # Max length of a trimmed read (cropped if greater)
+# Run mirdeep2 in two passes to
+# identify/quantify novel miRNAs
+identify_novel_mirs = str_bool(
+    config['options']['novel_mir_identification']
+)
 
 # Read in resource information,
 # containing information about 
@@ -103,25 +108,47 @@ rule all:
         # each sample using the novel miR fasta that
         # was generated in the first pass.
         # First-pass rules for novel miR quantification,
-        # @imported from `rule mirdeep2_novel_p1_concatenate` in rules/novel.smk
-        join(workpath, "novel", "cohort_trimmed_cleaned.fa"),
+        # @imported from `rule mirdeep2_novel_p1_concatenate` in rules/novel.smk.
+        # This rule is only run if `identify_novel_mirs` is set to True.
+        provided(
+            [join(workpath, "novel", "cohort_trimmed_cleaned.fa")], 
+            identify_novel_mirs
+        ),
         # @imported from `rule mirdeep2_novel_p1_mapper` in rules/novel.smk
-        join(workpath, "novel", "mapper", "cohort_mapped.arf"),
-        join(workpath, "novel", "mapper", "cohort_collapsed.fa"),
+        # This rule is only run if `identify_novel_mirs` is set to True.
+        provided(
+            [join(workpath, "novel", "mapper", "cohort_mapped.arf")],
+            identify_novel_mirs
+        ),
+        provided(
+            [join(workpath, "novel", "mapper", "cohort_collapsed.fa")],
+            identify_novel_mirs
+        ),
         # @imported from `rule mirdeep2_novel_p1_run` in rules/novel.smk
-        join(workpath, "novel", "pass1", "cohort_novel_mature_miRNA.tsv"),
-        join(workpath, "novel", "pass1", "cohort_novel_hairpin_miRNA.tsv"),
+        # This rule is only run if `identify_novel_mirs` is set to True.
+        provided(
+            [join(workpath, "novel", "pass1", "cohort_novel_mature_miRNA.tsv")],
+            identify_novel_mirs
+        ),
+        provided(
+            [join(workpath, "novel", "pass1", "cohort_novel_hairpin_miRNA.tsv")],
+            identify_novel_mirs
+        ),
         # Second-pass rules for novel miR quantification,
         # @imported from `rule mirdeep2_novel_p2_quantifier` in rules/novel.smk
+        # This rule is only run if `identify_novel_mirs` is set to True.
         expand(
             join(workpath, "novel", "counts", "{sample}_novel_miRNA_expressed.tsv"),
-            sample=samples
+            sample=provided(samples, identify_novel_mirs)
         ),
         expand(
             join(workpath, "novel", "counts", "{sample}_novel_mature_miRNA_expression.tsv"),
-            sample=samples
+            sample=provided(samples, identify_novel_mirs)
+        ),
+        provided(
+            [join(workpath, "novel", "counts", "miRDeep2_novel_mature_miRNA_counts.tsv")],
+            identify_novel_mirs
         ),
-        join(workpath, "novel", "counts", "miRDeep2_novel_mature_miRNA_counts.tsv"),
         # Aggregated quality-control report,
         # @imported from `rule multiqc` in rules/qc.smk
         join(workpath, "reports", "multiqc_report.html"),