final touches email, fix stats sequence upload to parkour

maxplanck-ie · Jul 12, 2024 · f9b25cf · f9b25cf
1 parent 7c8a249
commit f9b25cf
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 29 deletions.
diff --git a/BRB/ET.py b/BRB/ET.py
@@ -33,17 +33,17 @@ def getNReads(d):
 
 def getOffSpeciesRate(d, organism = None) -> float:
     """
-    Parses 
+    Parses kraken report for number of reads mapping to unexpected organisms
     """
     fname = glob.glob("{}/*rep".format(d))[0]
-    if not os.path.exists(fname):
-        return 0
     # match parkour org to kraken db organism/group
     org_map = {
         'Human (GRCh38)': 'humangrp',
         'Human (GRCh37 / hg19)': 'humangrp',
         'Mouse (GRCm38 / mm10)': 'mousegrp',
         'Mouse (GRCm39)': 'mousegrp',
+        'mouse': 'mousegrp',
+        'human': 'humangrp',
         'Escherichia phage Lambda':'lambdaphage',
         'Caenorhabditis_elegans': 'c-elegans',
         'lamprey': 'sea-lamprey',
@@ -52,6 +52,7 @@ def getOffSpeciesRate(d, organism = None) -> float:
         'drosophila': 'flygrp',
     }
     if organism not in org_map:
+        log.info(f"getOffSpeciesRate - organism {organism} is not in the org_map!")
         return 0
     with open(fname) as f:
         for line in f:
@@ -97,7 +98,20 @@ def DNA(config, outputDir, baseDict, sample2lib):
     Add % mapped, % dupped, and insert size to baseDict. Filter it for those actually in the output
     """
     # baseDict, sample2lib = getBaseStatistics(config, outputDir)
-
+    # If we have RELACS, the sample2lib won't match what we find here.
+    # We could re-parse the sampleSheet to upload actual statistics of the RELACS demuxed samples.
+    if Path(outputDir, 'RELACS_sampleSheet.txt').exists():
+        # RELACS is a problem for parkour (matching is done at the sampleID / barcode level).
+        # For now, just return a list of dicts with the previous info.
+        m = []
+        for k, v in baseDict.items():
+            m.append({'barcode': k,
+                    'reads_pf_sequenced': v[1],
+                    'confident_reads': v[2],
+                    'optical_duplicates': v[3]})
+        log.info(f"ET - DNA module detected RELACS. Returning {m}")
+        return m
+
     # % Mapped
     for fname in glob.glob("{}/Bowtie2/*.Bowtie2_summary.txt".format(outputDir)):
         sampleName = os.path.basename(fname).split(".Bowtie2_summary")[0]
@@ -117,19 +131,19 @@ def DNA(config, outputDir, baseDict, sample2lib):
         medInsertSize = insert_size_df.loc[insert_size_df["Unnamed: 0"]=="filtered_bam/"+sampleName+".filtered.bam"]
         medInsertSize = medInsertSize["Frag. Len. Median"].values[0]
         baseDict[sample2lib[sampleName]].append(int(medInsertSize))
+
+    log.info(f"ET - DNA module parsed {baseDict}")
 
-    # # Filter
-    outputDict = {k: v for k, v in baseDict.items() if len(v) == 8}
     # Reformat into a matrix
     m = []
-    for k, v in outputDict.items():
+    for k, v in baseDict.items():
         m.append({'barcode': k,
                   'reads_pf_sequenced': v[1],
                   'confident_reads': v[2],
                   'optical_duplicates': v[3],
-                  'dupped_reads': v[6],
-                  'mapped_reads': v[5],
-                  'insert_size': v[7]})
+                  'dupped_reads': v[5],
+                  'mapped_reads': v[4],
+                  'insert_size': v[6]})
     return m
 
 
@@ -139,8 +153,7 @@ def RNA(config, outputDir, baseDict, sample2lib):
 
     Add % mapped to baseDict. Filter it for those actually in the output
     """
-    # baseDict, sample2lib = getBaseStatistics(config, outputDir)
-    # % Mapped
+
     for fname in glob.glob("{}/STAR/*/*.Log.final.out".format(outputDir)):
         f = open(fname)
         tot = 0
@@ -173,21 +186,19 @@ def RNA(config, outputDir, baseDict, sample2lib):
         baseDict[sample2lib[sampleName]].append(assigned_rate)
 
 
-
-    # Filter
-    outputDict = {k: v for k, v in baseDict.items() if len(v) == 10}
+    log.info(f"ET - RNA module parsed {baseDict}")
     # Reformat into a matrix
     m = []
-    for k, v in outputDict.items():
+    for k, v in baseDict.items():
         m.append({'barcode': k,
                   'reads_pf_sequenced': v[1],
                   'confident_reads': v[2],
                   'optical_duplicates': v[3],
-                  'mapped_reads': v[5],
-                  'uniq_mapped': v[6],
-                  'multi_mapped': v[7],
-                  'dupped_reads': v[8],
-                  'assigned_reads': v[9]})
+                  'mapped_reads': v[4],
+                  'uniq_mapped': v[5],
+                  'multi_mapped': v[6],
+                  'dupped_reads': v[7],
+                  'assigned_reads': v[8]})
     return m
 
 
@@ -210,9 +221,6 @@ def phoneHome(config, outputDir, pipeline, samples_tuples, organism, project, li
     """
     samples_id = [row[0] for row in samples_tuples]
     baseDict, sample2lib = getBaseStatistics(config, outputDir, samples_id, organism)
-
-    log.info("phoneHome: baseDict: {}, sample2lib: {}".format(baseDict, sample2lib))
-
     msg = None
     if pipeline == 'DNA':
         msg = DNA(config, outputDir, baseDict, sample2lib)
@@ -226,7 +234,7 @@ def phoneHome(config, outputDir, pipeline, samples_tuples, organism, project, li
                       'confident_reads': v[2],
                       'optical_duplicates': v[3]})
         msg = m
-
+    log.info(f"phoneHome: got msg = {msg}")
     if msg is not None:
         ret = sendToParkour(config, msg)
     else:

diff --git a/BRB/PushButton.py b/BRB/PushButton.py
@@ -591,7 +591,6 @@ def GetResults(config, project, libraries):
                 #hence the pacifier is applied on the project in each pipeline separately
                 outputDir, rv, sambaUpdate = globals()[pipeline](config, group, project, organism, libraryType, tuples)
                 if rv == 0:
-                    #try:
                     msg = msg + [BRB.ET.phoneHome(config, outputDir, pipeline, tuples, organism, project, libraryType) + [sambaUpdate]]
                     log.info(f"Processed project {BRB.misc.pacifier(project)} with the {pipeline} pipeline. {libraryType}, {organism}")
                 else:

diff --git a/BRB/email.py b/BRB/email.py
@@ -33,14 +33,14 @@ def finishedEmail(config, msg):
         if [i[4] for i in msg].count('success') == len(msg):
             recipient = config.get("Email","deepSeq")
             _html.add(div(
-                "Post-processing is ready, deepSeq's sambda drive is updated for at least one project.",
+                f"Post-processing is ready, Samba drive is updated for {[i[6] for i in msg].count(True)} project(s).",
                 br()
             ))
-    
+
     mailer['To'] = recipient
     # Table
     tabHead = ['Project', 'organism', 'libraryType', 'workflow', 'workflow_status', 'parkour_status', 'sambaUpdate']
-    message = tabulate(
+    message =  _html.render() + '\n\n' + tabulate(
         msg, tabHead, tablefmt="html", disable_numparse=True
     )