Commit

Merge pull request #67 from maxplanck-ie/dev_wd
Dev wd
WardDeb authored Jul 11, 2024
2 parents 6e70257 + 55f4945 commit a960323
Showing 11 changed files with 102 additions and 359 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/lint.yml
@@ -0,0 +1,8 @@
name: Ruff
on: [push, pull_request]
jobs:
  ruff:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: chartboost/ruff-action@v1
12 changes: 12 additions & 0 deletions .github/workflows/pip.yml
@@ -0,0 +1,12 @@
name: install
on: [push, pull_request]
jobs:
  pip:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip'
      - run: pip install .
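
Both new workflows run on every push and pull request: one lints the code base with Ruff, the other checks that the package installs cleanly with pip under Python 3.12. As a rough local equivalent (a sketch, not part of the commit; the helper run() is hypothetical and it assumes ruff and pip are available in the active environment):

import subprocess
import sys

def run(cmd):
    # Run a command from the repository root and return its exit code.
    print("$ " + " ".join(cmd))
    return subprocess.run(cmd).returncode

if __name__ == "__main__":
    lint_rc = run(["ruff", "check", "."])                            # mirrors chartboost/ruff-action
    install_rc = run([sys.executable, "-m", "pip", "install", "."])  # mirrors the pip job
    sys.exit(1 if (lint_rc or install_rc) else 0)
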
104 changes: 35 additions & 69 deletions BRB/ET.py
@@ -2,7 +2,6 @@
import subprocess
import os
import glob
import csv
import json
import BRB.misc
from BRB.logger import log
@@ -32,54 +31,34 @@ def getNReads(d):
return (int(res) / 4), 0.


def getOffSpeciesRate(d, organism = None):
def getOffSpeciesRate(d, organism = None) -> float:
"""
Get the percentage of off-species reads from a directory
This is copied from the bcl2fastq pipeline
Parses
"""
fname = glob.glob("{}/*_screen.txt".format(d))[0]
fname = glob.glob("{}/*rep".format(d))[0]
if not os.path.exists(fname):
return 0,0
total = 0
species=[]
ohol=[]
mhol=[]
i = 0
maxi = 0
rRNA = ""
rRNA_rate = 0
if organism in ['human', 'mouse']:
rRNA = "{}rRNA".format(organism)
elif organism == 'drosophila':
rRNA = "flyrRNA"
for line in csv.reader(open(fname, "r"), dialect="excel-tab") :
if(len(line) == 0) :
break
if(line[0].startswith("#")) :
print("Hit a #")
continue
if(line[0].startswith("Library")) :
continue
if(line[0].startswith("Genome")) :
continue
if(line[0].startswith("PhiX") or line[0].startswith("Adapters") or line[0].startswith("Vectors") or line[0].startswith("rRNA")):
continue
if((len(rRNA)!=0) and (line[0].startswith(rRNA))):
rRNA_rate = float(line[5]) + float(line[7])
continue
species.append(line[0])
ohol.append(float(line[5]))

if(ohol[maxi] < ohol[i]) :
maxi = i
i += 1

off = 0
for i in range(len(ohol)) :
if(i != maxi) :
off += ohol[i]
print("off is {}, rRNA_rate is {}".format(off, rRNA_rate))
return off, rRNA_rate
return 0
# match parkour org to kraken db organism/group
org_map = {
'Human (GRCh38)': 'humangrp',
'Human (GRCh37 / hg19)': 'humangrp',
'Mouse (GRCm38 / mm10)': 'mousegrp',
'Mouse (GRCm39)': 'mousegrp',
'Escherichia phage Lambda':'lambdaphage',
'Caenorhabditis_elegans': 'c-elegans',
'lamprey': 'sea-lamprey',
'medaka': 'japanese-medaka',
'zebrafish': 'zebrafish',
'drosophila': 'flygrp',
}
if organism not in org_map:
return 0
with open(fname) as f:
for line in f:
if org_map[organism] in line:
off = float(line.strip().split()[0])/100
log.info(f"confident reads for {fname} = {off}")
return off


def getBaseStatistics(config, outputDir, samples_id, organism = None):
@@ -103,12 +82,8 @@ def getBaseStatistics(config, outputDir, samples_id, organism = None):
sampleName = glob.glob("{}/*_R1_fastqc.zip".format(d))[0]
sampleName = os.path.split(sampleName)[1][:-14]
nReads, optDupes = getNReads(d) # opt. dup.
try:
offRate, rRNA_rate = getOffSpeciesRate(d,organism)
except:
offRate = 0
rRNA_rate = 0
baseDict[libName] = [sampleName, nReads, offRate, optDupes, rRNA_rate]
offRate = getOffSpeciesRate(d,organism)
baseDict[libName] = [sampleName, nReads, offRate, optDupes]
s2l[sampleName] = libName
return baseDict, s2l

@@ -126,10 +101,7 @@ def DNA(config, outputDir, baseDict, sample2lib):
sampleName = os.path.basename(fname).split(".Bowtie2_summary")[0]
lastLine = open(fname).read().split("\n")[-2]
mappedPercent = lastLine.split("%")[0]
try:
baseDict[sample2lib[sampleName]].append(float(mappedPercent))
except:
continue
baseDict[sample2lib[sampleName]].append(float(mappedPercent))
# % Duplicated
dup_info = glob.glob("{}/multiQC/multiqc_data/multiqc_samtools_flagstat.txt".format(outputDir))[0]
dup_df = pd.read_csv(dup_info, sep ="\t", usecols=["Sample", "total_passed", "duplicates_passed"])
@@ -155,8 +127,7 @@ def DNA(config, outputDir, baseDict, sample2lib):
'optical_duplicates': v[3],
'dupped_reads': v[6],
'mapped_reads': v[5],
'insert_size': v[7],
'rRNA_rate': v[4]})
'insert_size': v[7]})
return m


@@ -214,22 +185,19 @@ def RNA(config, outputDir, baseDict, sample2lib):
'uniq_mapped': v[6],
'multi_mapped': v[7],
'dupped_reads': v[8],
'assigned_reads': v[9],
'rRNA_rate': v[4]})
'assigned_reads': v[9]})
return m


def sendToParkour(config, msg):
FCID = config.get("Options", "runID").split("_")[3][1:] # C605HACXX from 150416_SN7001180_0196_BC605HACXX
FCID = config.get("Options", "runID").split("_")[3][1:]
if '-' in FCID:
FCID = FCID.split('-')[-1]
d = {'flowcell_id': FCID}
d['sequences'] = json.dumps(msg)
log.info("sendToParkour: Sending {} to Parkour".format(d))
#print("Sending:")
#print("{}".format(d))
#print("To parkour")
log.info(f"sendToParkour: Sending {d} to Parkour")
res = requests.post(config.get("Parkour", "ResultsURL"), auth=(config.get("Parkour", "user"), config.get("Parkour", "password")), data=d, verify=config.get("Parkour", "cert"))
log.info(f"sendToParkour return {res}")


def phoneHome(config, outputDir, pipeline, samples_tuples, organism):
@@ -252,8 +220,7 @@ def phoneHome(config, outputDir, pipeline, samples_tuples, organism):
m.append({'barcode': k,
'reads_pf_sequenced': v[1],
'confident_reads': v[2],
'optical_duplicates': v[3],
'rRNA_rate': v[4]})
'optical_duplicates': v[3]})
msg = m

if msg is not None:
@@ -283,7 +250,6 @@ def telegraphHome(config, group, project, skipList, organism=None):
m.append({'barcode': k,
'reads_pf_sequenced': v[1],
'confident_reads': v[2],
'optical_duplicates': v[3],
'rRNA_rate': v[4]})
'optical_duplicates': v[3]})
sendToParkour(config, m)
return "Sent information on {} libraries from project {} from the {} group to Parkour\n".format(len(skipList), project, group)
111 changes: 38 additions & 73 deletions BRB/PushButton.py
@@ -2,7 +2,6 @@
import shutil
import glob
import subprocess
import BRB.galaxy
import BRB.ET
import BRB.misc
from BRB.logger import log
@@ -11,7 +10,7 @@

def createPath(config, group, project, organism, libraryType, tuples):
"""Ensures that the output path exists, creates it otherwise, and return where it is"""
if tuples[0][3] == True: # if external data
if tuples[0][3]:
baseDir = "{}/{}/Analysis_{}".format(config.get('Paths', 'baseData'),
config.get('Options', 'runID'),
BRB.misc.pacifier(project))
@@ -23,14 +22,14 @@ def createPath(config, group, project, organism, libraryType, tuples):
BRB.misc.pacifier(project))
os.makedirs(baseDir, mode=0o750, exist_ok=True)

oDir = os.path.join(baseDir, "{}_{}".format(BRB.misc.pacifier(libraryType), organism))
oDir = os.path.join(baseDir, "{}_{}".format(BRB.misc.pacifier(libraryType), organism.split(' ')[0].lower()))
os.makedirs(oDir, exist_ok=True)
return oDir


def linkFiles(config, group, project, odir, tuples):
"""Create symlinks in odir to fastq files in {project}. Return 1 if paired-end, 0 otherwise."""
if tuples[0][3] == True: # if external data
if tuples[0][3]:
baseDir = "{}/{}/Project_{}".format(config.get('Paths', 'baseData'),
config.get('Options', 'runID'),
BRB.misc.pacifier(project))
@@ -58,20 +57,15 @@ def linkFiles(config, group, project, odir, tuples):

def removeLinkFiles(d):
"""Remove symlinks created by linkFiles()"""
try:
files = glob.glob("{}/originalFASTQ/*_R?.fastq.gz".format(d))
files = glob.glob("{}/originalFASTQ/*_R?.fastq.gz".format(d))
if files:
for fname in files:
os.unlink(fname)
except:
print("check if originalFASTQ exists!")
log.warning("removeLinkeFiles: check if originalFASTQ exists!")
files = glob.glob("{}/*_R?.fastq.gz".format(d))
for fname in files:
os.unlink(fname)




def relinkFiles(config, group, project, organism, libraryType, tuples):
"""
Generate symlinks under the snakepipes originalFASTQ folder directly from the project folder.
@@ -87,10 +81,9 @@ def relinkFiles(config, group, project, organism, libraryType, tuples):
log.info(f"Multiqc report found for {group} project {project}.")
of = Path(config.get('Paths', 'bioinfoCoreDir')) / 'Analysis' + project + '_multiqc.html'
log.info(f"Trying to copy mqc report to {of}.")
try:
shutil.copyfile(mqcf, of)
except:
log.warning(f"Copying {mqcf} to {of} failed.")
shutil.copyfile(mqcf, of)
else:
log.info(f"no multiqc report under {mqcf}.")


def organism2Org(config, organism):
@@ -135,15 +128,10 @@ def copyCellRanger(config, d):
short_fid = str(os.path.basename(lane_dir)).split('_')[2] + '_'
bioinfoCoreDirPath = Path(config.get('Paths', 'bioinfoCoreDir')) / Path(short_fid + nname)
nname = seqfac_lane_dir / nname
try:
shutil.copyfile(fname, nname)
except:
log.warning("copyCellRanger from {} to {} failed.".format(fname, nname))
shutil.copyfile(fname, nname)
# to bioinfocore dir
try:
shutil.copyfile(fname, bioinfoCoreDirPath)
except:
log.warning("copyCellRanger from {} to {} failed.".format(fname, bioinfoCoreDirPath))
shutil.copyfile(fname, bioinfoCoreDirPath)
log.warning("copyCellRanger from {} to {} failed.".format(fname, bioinfoCoreDirPath))


def copyRELACS(config, d):
@@ -175,49 +163,41 @@ def copyRELACS(config, d):
seqfac_lane_dir = Path(config.get('Paths', 'seqFacDir')) / year_postfix / lane_dir
os.makedirs(seqfac_lane_dir, exist_ok=True)
nname = seqfac_lane_dir / nname
try:
shutil.copyfile(fname, nname)
except:
log.warning("copyCellRanger from {} to {} failed.".format(fname, nname))

shutil.copyfile(fname, nname)

def tidyUpABit(d):
"""
Reduce the number of files in the analysis folder.
"""
try:
shutil.rmtree(os.path.join(d, 'cluster_logs'))
os.unlink(os.path.join(d, 'config.yaml'))
shutil.rmtree(os.path.join(d, '.snakemake'))
# multiqc data
mqc_log = os.path.join(d, 'multiQC', 'multiqc_data', 'multiqc.log')
mqc_out = os.path.join(d, 'multiQC', 'multiQC.out')
mqc_err = os.path.join(d, 'multiQC', 'multiQC.err')
if os.path.exists(mqc_log):
os.unlink(mqc_log)
if os.path.exists(mqc_out):
os.unlink(mqc_out)
if os.path.exists(mqc_err):
os.unlink(mqc_err)

for f in glob.glob(os.path.join(d, '*.log')):
os.unlink(f)

for d2 in glob.glob(os.path.join(d, 'FASTQ*')):
shutil.rmtree(d2)
except:
pass
shutil.rmtree(os.path.join(d, 'cluster_logs'))
os.unlink(os.path.join(d, 'config.yaml'))
shutil.rmtree(os.path.join(d, '.snakemake'))
# multiqc data
mqc_log = os.path.join(d, 'multiQC', 'multiqc_data', 'multiqc.log')
mqc_out = os.path.join(d, 'multiQC', 'multiQC.out')
mqc_err = os.path.join(d, 'multiQC', 'multiQC.err')
if os.path.exists(mqc_log):
os.unlink(mqc_log)
if os.path.exists(mqc_out):
os.unlink(mqc_out)
if os.path.exists(mqc_err):
os.unlink(mqc_err)

for f in glob.glob(os.path.join(d, '*.log')):
os.unlink(f)

for d2 in glob.glob(os.path.join(d, 'FASTQ*')):
shutil.rmtree(d2)


def stripRights(d):
# Strip rights.
try:
for r, dirs, files in os.walk(d):
for d in dirs:
os.chmod(os.path.join(r, d), stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP)
for f in files:
os.chmod(os.path.join(r, f), stat.S_IRWXU | stat.S_IRGRP)
except:
pass
for r, dirs, files in os.walk(d):
for d in dirs:
os.chmod(os.path.join(r, d), stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP)
for f in files:
os.chmod(os.path.join(r, f), stat.S_IRWXU | stat.S_IRGRP)


def touchDone(outputDir, fname="analysis.done"):
open(os.path.join(outputDir, fname), "w").close()
@@ -623,24 +603,9 @@ def GetResults(config, project, libraries):
#hence the pacifier is applied on the project in each pipeline separately
outputDir, rv = globals()[pipeline](config, group, project, organism, libraryType, tuples)
if rv == 0:
# galaxyUsers = BRB.misc.fetchGalaxyUsers(config.get('Galaxy','Users'))
# if BRB.misc.pacifier(project).split('_')[1] in galaxyUsers:
# try:
# BRB.galaxy.linkIntoGalaxy(config, group, BRB.misc.pacifier(project), outputDir)
# msg += 'Always nice to see a Galaxy user! I linked {} into Galaxy. '.format(BRB.misc.pacifier(project))
# except:
# msg += 'I did my best to link {} into Galaxy, but I failed. '.format(BRB.misc.pacifier(project))
# continue
# else:
msg += "I deliberately didn't link {} into Galaxy. ".format(BRB.misc.pacifier(project))

#try:
BRB.ET.phoneHome(config, outputDir, pipeline, tuples, organism)
msg += 'Processed project {} with the {} pipeline. The samples were of type {} from a {}.\n'.format(BRB.misc.pacifier(project), pipeline, libraryType, organism)
#except:
# msg += 'Failed to phone {} home. I was using outDir {}. I am not giving up though, BRB keeps running! \n'.format(BRB.misc.pacifier(project),outputDir)
# continue
else:
msg += "I received an error processing {}_{}_{}_{} for you.\n".format(BRB.misc.pacifier(project), pipeline, libraryType, organism)

return msg
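
Two smaller behavioural changes in BRB/PushButton.py stand out in this diff: the broad try/except wrappers around several file operations are removed, so failures now surface instead of being silently swallowed, and createPath() now derives the analysis-folder suffix from the first word of the Parkour organism label, lower-cased, rather than from the raw label. An illustration of the new suffix rule (not part of the commit; organism_suffix() is a hypothetical helper mirroring the expression used in createPath()):

def organism_suffix(organism):
    # First word of the Parkour organism label, lower-cased; createPath()
    # appends it to the pacified library type to name the analysis folder.
    return organism.split(' ')[0].lower()

print(organism_suffix('Human (GRCh38)'))   # human
print(organism_suffix('Mouse (GRCm39)'))   # mouse
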
1 change: 0 additions & 1 deletion BRB/demultiplex_relacs.py
@@ -5,7 +5,6 @@
import glob
from multiprocessing import Pool
import subprocess
import gzip
import matplotlib.pyplot as plt
import numpy as np
import editdistance as ed