Skip to content

Commit

Permalink
Merge pull request #69 from maxplanck-ie/dev_wd
Browse files Browse the repository at this point in the history
e-mail and parkour updates
  • Loading branch information
WardDeb authored Jul 12, 2024
2 parents a960323 + 7b89d5a commit af9b61a
Show file tree
Hide file tree
Showing 7 changed files with 213 additions and 163 deletions.
103 changes: 62 additions & 41 deletions BRB/ET.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import BRB.misc
from BRB.logger import log
import pandas as pd

from pathlib import Path

def getNReads(d):
"""
Expand All @@ -33,17 +33,17 @@ def getNReads(d):

def getOffSpeciesRate(d, organism = None) -> float:
"""
Parses
Parses kraken report for number of reads mapping to unexpected organisms
"""
fname = glob.glob("{}/*rep".format(d))[0]
if not os.path.exists(fname):
return 0
# match parkour org to kraken db organism/group
org_map = {
'Human (GRCh38)': 'humangrp',
'Human (GRCh37 / hg19)': 'humangrp',
'Mouse (GRCm38 / mm10)': 'mousegrp',
'Mouse (GRCm39)': 'mousegrp',
'mouse': 'mousegrp',
'human': 'humangrp',
'Escherichia phage Lambda':'lambdaphage',
'Caenorhabditis_elegans': 'c-elegans',
'lamprey': 'sea-lamprey',
Expand All @@ -52,11 +52,14 @@ def getOffSpeciesRate(d, organism = None) -> float:
'drosophila': 'flygrp',
}
if organism not in org_map:
log.info(f"getOffSpeciesRate - organism {organism} is not in the org_map!")
return 0
with open(fname) as f:
for line in f:
if org_map[organism] in line:
off = float(line.strip().split()[0])/100
off = 1-(float(line.strip().split()[0])/100)
# off-species actually means fraction of non-expected organism reads !
# off-species reads vs of-species reads ;)
log.info(f"confident reads for {fname} = {off}")
return off

Expand Down Expand Up @@ -95,7 +98,20 @@ def DNA(config, outputDir, baseDict, sample2lib):
Add % mapped, % dupped, and insert size to baseDict. Filter it for those actually in the output
"""
# baseDict, sample2lib = getBaseStatistics(config, outputDir)

# If we have RELACS, the sample2lib won't match what we find here.
# We can re-parse the sampleSheet to upload actual statistics of the RELACS demuxed samples.
if Path(outputDir, 'RELACS_sampleSheet.txt').exists():
# RELACS is a problem for parkour (matching is in sampleID / barcode level).
# Just return a list of dicts with the previous info
m = []
for k, v in baseDict.items():
m.append({'barcode': k,
'reads_pf_sequenced': v[1],
'confident_reads': v[2],
'optical_duplicates': v[3]})
log.info(f"ET - DNA module detected RELACS. Returning {m}")
return m

# % Mapped
for fname in glob.glob("{}/Bowtie2/*.Bowtie2_summary.txt".format(outputDir)):
sampleName = os.path.basename(fname).split(".Bowtie2_summary")[0]
Expand All @@ -115,19 +131,19 @@ def DNA(config, outputDir, baseDict, sample2lib):
medInsertSize = insert_size_df.loc[insert_size_df["Unnamed: 0"]=="filtered_bam/"+sampleName+".filtered.bam"]
medInsertSize = medInsertSize["Frag. Len. Median"].values[0]
baseDict[sample2lib[sampleName]].append(int(medInsertSize))

log.info(f"ET - DNA module parsed {baseDict}")

# # Filter
outputDict = {k: v for k, v in baseDict.items() if len(v) == 8}
# Reformat into a matrix
m = []
for k, v in outputDict.items():
for k, v in baseDict.items():
m.append({'barcode': k,
'reads_pf_sequenced': v[1],
'confident_reads': v[2],
'optical_duplicates': v[3],
'dupped_reads': v[6],
'mapped_reads': v[5],
'insert_size': v[7]})
'dupped_reads': v[5],
'mapped_reads': v[4],
'insert_size': v[6]})
return m


Expand All @@ -137,8 +153,7 @@ def RNA(config, outputDir, baseDict, sample2lib):
Add % mapped to baseDict. Filter it for those actually in the output
"""
# baseDict, sample2lib = getBaseStatistics(config, outputDir)
# % Mapped

for fname in glob.glob("{}/STAR/*/*.Log.final.out".format(outputDir)):
f = open(fname)
tot = 0
Expand Down Expand Up @@ -171,21 +186,19 @@ def RNA(config, outputDir, baseDict, sample2lib):
baseDict[sample2lib[sampleName]].append(assigned_rate)



# Filter
outputDict = {k: v for k, v in baseDict.items() if len(v) == 10}
log.info(f"ET - RNA module parsed {baseDict}")
# Reformat into a matrix
m = []
for k, v in outputDict.items():
for k, v in baseDict.items():
m.append({'barcode': k,
'reads_pf_sequenced': v[1],
'confident_reads': v[2],
'optical_duplicates': v[3],
'mapped_reads': v[5],
'uniq_mapped': v[6],
'multi_mapped': v[7],
'dupped_reads': v[8],
'assigned_reads': v[9]})
'mapped_reads': v[4],
'uniq_mapped': v[5],
'multi_mapped': v[6],
'dupped_reads': v[7],
'assigned_reads': v[8]})
return m


Expand All @@ -198,17 +211,16 @@ def sendToParkour(config, msg):
log.info(f"sendToParkour: Sending {d} to Parkour")
res = requests.post(config.get("Parkour", "ResultsURL"), auth=(config.get("Parkour", "user"), config.get("Parkour", "password")), data=d, verify=config.get("Parkour", "cert"))
log.info(f"sendToParkour return {res}")
return res


def phoneHome(config, outputDir, pipeline, samples_tuples, organism):

def phoneHome(config, outputDir, pipeline, samples_tuples, organism, project, libType):
"""
Return metrics to Parkour, the results are in outputDir and pipeline needs to be run on them
"""
samples_id = [row[0] for row in samples_tuples]
baseDict, sample2lib = getBaseStatistics(config, outputDir, samples_id, organism)

log.info("phoneHome: baseDict: {}, sample2lib: {}".format(baseDict, sample2lib))

msg = None
if pipeline == 'DNA':
msg = DNA(config, outputDir, baseDict, sample2lib)
Expand All @@ -222,34 +234,43 @@ def phoneHome(config, outputDir, pipeline, samples_tuples, organism):
'confident_reads': v[2],
'optical_duplicates': v[3]})
msg = m

log.info(f"phoneHome: got msg = {msg}")
if msg is not None:
sendToParkour(config, msg)
ret = sendToParkour(config, msg)
else:
ret = None

return [project, organism, libType, pipeline, 'success', ret]


def telegraphHome(config, group, project, skipList, organism=None):
"""
The skipList is a list of samples/libraries for which we don't run a pipeline, but it'd be nice to still send back sequencing metrics
([library, sampleName])
Structure of skipList:
[library, sampleName, libraryType]
"""
log.info(f"telegraphHome triggered for {project}")
# make a fake output directory path
baseDir = "{}/{}/{}/{}/Analysis_{}".format(config.get('Paths', 'groupData'),
BRB.misc.pacifier(group),
BRB.misc.getLatestSeqdir(config.get('Paths','groupData'), group),
config.get('Options', 'runID'),
BRB.misc.pacifier(project))
outputDir = os.path.join(baseDir, "DNA_mouse") # does not matter what it is, it is just a generic name. No pipeline is run on these data
baseDir = Path(
config.get('Paths', 'groupData'),
BRB.misc.pacifier(group),
BRB.misc.getLatestSeqdir(config.get('Paths','groupData'), group),
config.get('Options', 'runID'),
BRB.misc.pacifier(project)
)
# Mock path
outputDir = baseDir / "DNA_mouse"
samples_id = [row[0] for row in skipList]
print("SAMPLESID = {}".format(samples_id))
baseDict, sample2lib = getBaseStatistics(config, outputDir, samples_id, organism)
print("telegraphHome: baseDict: {}, sample2lib: {}".format(baseDict, sample2lib))
# Reformat into a matrix
m = []
for k, v in baseDict.items():
m.append({'barcode': k,
'reads_pf_sequenced': v[1],
'confident_reads': v[2],
'optical_duplicates': v[3]})
sendToParkour(config, m)
return "Sent information on {} libraries from project {} from the {} group to Parkour\n".format(len(skipList), project, group)
ret = sendToParkour(config, m)
# Format the libtypes
libTypes = ','.join(set([i[2] for i in skipList]))
# [project, organism, libtypes, workflow, workflow status, parkour status, sambaUpdate]
return [project, organism, libTypes, None, None, ret, False]
Loading

0 comments on commit af9b61a

Please sign in to comment.