Merge pull request #69 from maxplanck-ie/dev_wd

e-mail and parkour updates
maxplanck-ie · Jul 12, 2024 · af9b61a · af9b61a
2 parents a960323 + 7b89d5a
commit af9b61a
Show file tree

Hide file tree

Showing 7 changed files with 213 additions and 163 deletions.
diff --git a/BRB/ET.py b/BRB/ET.py
@@ -6,7 +6,7 @@
 import BRB.misc
 from BRB.logger import log
 import pandas as pd
-
+from pathlib import Path
 
 def getNReads(d):
     """
@@ -33,17 +33,17 @@ def getNReads(d):
 
 def getOffSpeciesRate(d, organism = None) -> float:
     """
-    Parses 
+    Parses kraken report for number of reads mapping to unexpected organisms
     """
     fname = glob.glob("{}/*rep".format(d))[0]
-    if not os.path.exists(fname):
-        return 0
     # match parkour org to kraken db organism/group
     org_map = {
         'Human (GRCh38)': 'humangrp',
         'Human (GRCh37 / hg19)': 'humangrp',
         'Mouse (GRCm38 / mm10)': 'mousegrp',
         'Mouse (GRCm39)': 'mousegrp',
+        'mouse': 'mousegrp',
+        'human': 'humangrp',
         'Escherichia phage Lambda':'lambdaphage',
         'Caenorhabditis_elegans': 'c-elegans',
         'lamprey': 'sea-lamprey',
@@ -52,11 +52,14 @@ def getOffSpeciesRate(d, organism = None) -> float:
         'drosophila': 'flygrp',
     }
     if organism not in org_map:
+        log.info(f"getOffSpeciesRate - organism {organism} is not in the org_map!")
         return 0
     with open(fname) as f:
         for line in f:
             if org_map[organism] in line:
-                off = float(line.strip().split()[0])/100
+                off = 1-(float(line.strip().split()[0])/100)
+    # off-species actually means fraction of non-expected organism reads !
+    # off-species reads vs of-species reads ;)
     log.info(f"confident reads for {fname} = {off}")
     return off
 
@@ -95,7 +98,20 @@ def DNA(config, outputDir, baseDict, sample2lib):
     Add % mapped, % dupped, and insert size to baseDict. Filter it for those actually in the output
     """
     # baseDict, sample2lib = getBaseStatistics(config, outputDir)
-
+    # If we have RELACS, the sample2lib won't match what we find here. 
+    # We can re-parse the sampleSheet to upload actual statistics of the RELACS demuxed samples.
+    if Path(outputDir, 'RELACS_sampleSheet.txt').exists():
+        # RELACS is a problem for parkour (matching is in sampleID / barcode level).
+        # Just return a list of dicts with the previous info
+        m = []
+        for k, v in baseDict.items():
+            m.append({'barcode': k,
+                    'reads_pf_sequenced': v[1],
+                    'confident_reads': v[2],
+                    'optical_duplicates': v[3]})
+        log.info(f"ET - DNA module detected RELACS. Returning {m}")
+        return m
+
     # % Mapped
     for fname in glob.glob("{}/Bowtie2/*.Bowtie2_summary.txt".format(outputDir)):
         sampleName = os.path.basename(fname).split(".Bowtie2_summary")[0]
@@ -115,19 +131,19 @@ def DNA(config, outputDir, baseDict, sample2lib):
         medInsertSize = insert_size_df.loc[insert_size_df["Unnamed: 0"]=="filtered_bam/"+sampleName+".filtered.bam"]
         medInsertSize = medInsertSize["Frag. Len. Median"].values[0]
         baseDict[sample2lib[sampleName]].append(int(medInsertSize))
+
+    log.info(f"ET - DNA module parsed {baseDict}")
 
-    # # Filter
-    outputDict = {k: v for k, v in baseDict.items() if len(v) == 8}
     # Reformat into a matrix
     m = []
-    for k, v in outputDict.items():
+    for k, v in baseDict.items():
         m.append({'barcode': k,
                   'reads_pf_sequenced': v[1],
                   'confident_reads': v[2],
                   'optical_duplicates': v[3],
-                  'dupped_reads': v[6],
-                  'mapped_reads': v[5],
-                  'insert_size': v[7]})
+                  'dupped_reads': v[5],
+                  'mapped_reads': v[4],
+                  'insert_size': v[6]})
     return m
 
 
@@ -137,8 +153,7 @@ def RNA(config, outputDir, baseDict, sample2lib):
 
     Add % mapped to baseDict. Filter it for those actually in the output
     """
-    # baseDict, sample2lib = getBaseStatistics(config, outputDir)
-    # % Mapped
+
     for fname in glob.glob("{}/STAR/*/*.Log.final.out".format(outputDir)):
         f = open(fname)
         tot = 0
@@ -171,21 +186,19 @@ def RNA(config, outputDir, baseDict, sample2lib):
         baseDict[sample2lib[sampleName]].append(assigned_rate)
 
 
-
-    # Filter
-    outputDict = {k: v for k, v in baseDict.items() if len(v) == 10}
+    log.info(f"ET - RNA module parsed {baseDict}")
     # Reformat into a matrix
     m = []
-    for k, v in outputDict.items():
+    for k, v in baseDict.items():
         m.append({'barcode': k,
                   'reads_pf_sequenced': v[1],
                   'confident_reads': v[2],
                   'optical_duplicates': v[3],
-                  'mapped_reads': v[5],
-                  'uniq_mapped': v[6],
-                  'multi_mapped': v[7],
-                  'dupped_reads': v[8],
-                  'assigned_reads': v[9]})
+                  'mapped_reads': v[4],
+                  'uniq_mapped': v[5],
+                  'multi_mapped': v[6],
+                  'dupped_reads': v[7],
+                  'assigned_reads': v[8]})
     return m
 
 
@@ -198,17 +211,16 @@ def sendToParkour(config, msg):
     log.info(f"sendToParkour: Sending {d} to Parkour")
     res = requests.post(config.get("Parkour", "ResultsURL"), auth=(config.get("Parkour", "user"), config.get("Parkour", "password")), data=d, verify=config.get("Parkour", "cert"))
     log.info(f"sendToParkour return {res}")
+    return res
 
 
-def phoneHome(config, outputDir, pipeline, samples_tuples, organism):
+
+def phoneHome(config, outputDir, pipeline, samples_tuples, organism, project, libType):
     """
     Return metrics to Parkour, the results are in outputDir and pipeline needs to be run on them
     """
     samples_id = [row[0] for row in samples_tuples]
     baseDict, sample2lib = getBaseStatistics(config, outputDir, samples_id, organism)
-
-    log.info("phoneHome: baseDict: {}, sample2lib: {}".format(baseDict, sample2lib))
-
     msg = None
     if pipeline == 'DNA':
         msg = DNA(config, outputDir, baseDict, sample2lib)
@@ -222,34 +234,43 @@ def phoneHome(config, outputDir, pipeline, samples_tuples, organism):
                       'confident_reads': v[2],
                       'optical_duplicates': v[3]})
         msg = m
-
+    log.info(f"phoneHome: got msg = {msg}")
     if msg is not None:
-        sendToParkour(config, msg)
+        ret = sendToParkour(config, msg)
+    else:
+        ret = None
+
+    return [project, organism, libType, pipeline, 'success', ret]
 
 
 def telegraphHome(config, group, project, skipList, organism=None):
     """
     The skipList is a list of samples/libraries for which we don't run a pipeline, but it'd be nice to still send back sequencing metrics
-
-    ([library, sampleName])
+    Structure of skipList:
+    [library, sampleName, libraryType]
     """
+    log.info(f"telegraphHome triggered for {project}")
     # make a fake output directory path
-    baseDir = "{}/{}/{}/{}/Analysis_{}".format(config.get('Paths', 'groupData'),
-                                                            BRB.misc.pacifier(group),
-                                                            BRB.misc.getLatestSeqdir(config.get('Paths','groupData'), group),
-                                                            config.get('Options', 'runID'),
-                                                            BRB.misc.pacifier(project))
-    outputDir = os.path.join(baseDir, "DNA_mouse") # does not matter what it is, it is just a generic name. No pipeline is run on these data
+    baseDir = Path(
+        config.get('Paths', 'groupData'),
+        BRB.misc.pacifier(group),
+        BRB.misc.getLatestSeqdir(config.get('Paths','groupData'), group),
+        config.get('Options', 'runID'),
+        BRB.misc.pacifier(project)
+    )
+    # Mock path
+    outputDir = baseDir / "DNA_mouse"
     samples_id = [row[0] for row in skipList]
-    print("SAMPLESID = {}".format(samples_id))
     baseDict, sample2lib = getBaseStatistics(config, outputDir, samples_id, organism)
-    print("telegraphHome: baseDict: {}, sample2lib: {}".format(baseDict, sample2lib))
     # Reformat into a matrix
     m = []
     for k, v in baseDict.items():
         m.append({'barcode': k,
                   'reads_pf_sequenced': v[1],
                   'confident_reads': v[2],
                   'optical_duplicates': v[3]})
-    sendToParkour(config, m)
-    return "Sent information on {} libraries from project {} from the {} group to Parkour\n".format(len(skipList), project, group)
+    ret = sendToParkour(config, m)
+    # Format the libtypes
+    libTypes = ','.join(set([i[2] for i in skipList]))
+    # [project, organism, libtypes, workflow, workflow status, parkour status, sambaUpdate]
+    return [project, organism, libTypes, None, None, ret, False]