From 743dc77d5b1f6897d8328493baf6e8e44e3ca91d Mon Sep 17 00:00:00 2001 From: leefall Date: Sun, 24 Jul 2022 18:06:04 +0900 Subject: [PATCH] Script for Oncomine Output --- OncomineConverter.py | 302 +++++++++++++++++++++++++++++++++++++++++++ Oncomine_QC.py | 46 +++++++ 2 files changed, 348 insertions(+) create mode 100644 OncomineConverter.py create mode 100644 Oncomine_QC.py diff --git a/OncomineConverter.py b/OncomineConverter.py new file mode 100644 index 0000000..9b08a97 --- /dev/null +++ b/OncomineConverter.py @@ -0,0 +1,302 @@ +#!/usr/bin/env python +import os, glob, json, unicodedata, sys + + + +def HGVS(sChr, nPosition, sRef, sAlt): + + + sChr=sChr.replace("chr","NC_0000").replace("X","23").replace("Y","24").replace("MT","M") + + + #print (sChr, nPosition, sRef, sAlt) + + if len(sRef)len(sAlt): + return sChr+".13:g."+str(nPosition)+"_"+str(int(nPosition)+len(sAlt))+"del"+sRef + else: + return sChr+".13:g."+str(nPosition)+sRef+">"+sAlt + + + +def HGVS_CNV(sChr, nStartPosition, nEndPosition, nRatio): + sChr=sChr.replace("chr","NC_0000").replace("X","23").replace("Y","24").replace("MT","M") + + + #print (sChr, nPosition, sRef, sAlt) + + if float(nRatio)>1: + return sChr+".13:g."+str(nStartPosition)+"_"+str(int(nEndPosition))+"dup" + elif len(nRatio)<1: + return sChr+".13:g."+str(nStartPosition)+"_"+str(int(nEndPosition))+"del" + + +def HGVS_Fusion(sLocus1, sLocus2): + + (sChr1,Position1)=sLocus1.split(":") + (sChr2,Position2)=sLocus2.split(":") + + + + sChr1=sChr1.replace("chr","NC_0000").replace("X","23").replace("Y","24").replace("MT","M") + + sChr2=sChr2.replace("chr","NC_0000").replace("X","23").replace("Y","24").replace("MT","M") + + + + + return sChr1+".13:g."+str(nPosition1)+"::"+sChr2+"13.g."+str(nPosition2) + +def GetNormalized(sChr,nPosition,sRef): + rawVcf=glob.glob("*Non-Filtered*.vcf") + fVCF=open(rawVcf[0],"r") + + sKey=sChr+"_"+str(nPosition)+"_"+sRef + + for sLine in fVCF.xreadlines(): + if sLine[0]=="#": + pass + else: + t=sLine.split("\t") + sLineKey=t[0]+"_"+t[1]+"_"+t[3] + if sLineKey==sKey: + #print sChr,nPosition,sRef + sFUNC=t[-3] + sFUNC=sFUNC.split("=[")[1] + sFUNC=sFUNC.replace("]","") + sFUNC=sFUNC.replace("'",'"') + #print sFUNC + sjsonDict=json.loads(sFUNC) +# print sLine + try: + (sNormalPos, sNormalRef, sNormalAlt)=(sjsonDict["normalizedPos"],sjsonDict['normalizedRef'],sjsonDict['normalizedAlt']) + nNormalPos=unicodedata.normalize('NFKD', sNormalPos).encode('ascii','ignore') + sNormalRef=unicodedata.normalize('NFKD', sNormalRef).encode('ascii','ignore') + sNormalAlt=unicodedata.normalize('NFKD', sNormalAlt).encode('ascii','ignore') + sGenotype=t[-1] + + sGenotype=sGenotype.split(":")[0] + (sAlt1, sAlt2)=sGenotype.split("/") + + if sAlt1==sAlt2: + sAllelicState="Homozygous" + else: + sAllelicState="Heterozygous" + + return (nNormalPos, sNormalRef, sNormalAlt,sAllelicState) + except: + sGenotype=t[-1] + sGenotype=sGenotype.split(":")[0] + (sAlt1, sAlt2)=sGenotype.split("/") + if sAlt1==sAlt2: + sAllelicState="Homozygous" + else: + sAllelicState="Heterozygous" + + return (t[1], t[3], t[4], sAllelicState) + + +def GetTranscriptID(sChr,nPosition,sRef): + rawVcf=glob.glob("*Non-Filtered*.vcf") + fVCF=open(rawVcf[0],"r") + + sKey=sChr+"_"+str(nPosition)+"_"+sRef + + for sLine in fVCF.xreadlines(): + if sLine[0]=="#": + pass + else: + t=sLine.split("\t") + sLineKey=t[0]+"_"+t[1]+"_"+t[3] + if sLineKey==sKey: + #print sChr,nPosition,sRef + + sFUNC=t[-3] + print sFUNC + sFUNC=sFUNC.split("=[")[1] + #if + sFUNC=sFUNC.replace("]","") + sFUNC=sFUNC.replace("'",'"') + #print sFUNC + sjsonDict=json.loads(sFUNC) + (sTranscript, sGene,location)=(sjsonDict["transcript"],sjsonDict['gene'],sjsonDict['location']) + sTranscript=unicodedata.normalize('NFKD', sTranscript).encode('ascii','ignore') + sGene=unicodedata.normalize('NFKD', sGene).encode('ascii','ignore') + location=unicodedata.normalize('NFKD', location).encode('ascii','ignore') + + if location=="exonic": + nExonicPosition=sjsonDict["exon"] + else: + nExonicPosition='' + + return (sTranscript, sGene, location,nExonicPosition) + + + +def SNV_INDEL_Parser(sLine,fSNV,sGenomebuild): + sLine=sLine.strip() + t=sLine.split("\t") + (sLocus, sRef, nLength, sGenotype, sFilter, sGene, sNMID, sLocation, sExon, sProtein, sCoding, sClinvar, sCosmic, sDbsnp, sType,sFunction)=\ + (t[0], t[2], t[3], t[4], t[5], t[17], t[18], t[19], t[22], t[23], t[24], t[30], t[31], t[32], t[1],t[20]) + + + + + + (sChr, nPosition)=sLocus.split(":") + (sAlt1, sAlt2)=(sGenotype.split("/")[0], sGenotype.split("/")[1]) + + + + #(sNMID, sGene, sLocation, sExon)=GetTranscriptID(sChr,nPosition,sRef) + + if sAlt1==sAlt2: + sAllelicState="Homozygous" + else: + sAllelicState="Heterozygous" + + + if sAlt1!=sRef: + sAlt=sAlt1 + elif sAlt2!=sRef: + sAlt=sAlt2 + else: + print "Exception Error" + print sLine + sys.exit() + + if ((sType=="SNV") and (len(sRef)!=1) and (len(sAlt1)!=1 or len(sAlt2)!=1)): + (nPosition, sRef, sAlt, sAllelicState)=GetNormalized(sChr,nPosition,sRef) + + + if "|" in sNMID: + sNMID=sNMID.split("|")[1] + + + sHGVS=HGVS(sChr, nPosition, sRef, sAlt) + + if ((len(sRef)==1) and (len(sAlt)==1)): + sVariantType="Substitution" + elif (len(sRef)>len(sAlt)): + sVariantType="Deletion" + elif (len(sRef)2.0: + sCopynumberType="GN" + else: + sCopynumberType="LS" + + + fCNV.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\t{16}\t{17}\t{18}\t{19}\t{20}\t{21}\t{22}\t{23}\t{24}\t{25}\t{26}\n".format(\ + sHGVS,"","","HGVS version 15.11",sChr.replace("chr",""), nStartPosition, nEndPosition, "","",nRatio,sCopynumberType,sGene,"","",sGenomebuild,"Somatic","","","","",sCosmic,"","","","","","","","","")) + + + + + +def Fusion_Parser(sLine, fFusion): + t=sLine.split("\t") + + + (sLocus, sGene, sExon)=(t[0], t[17], t[22]) + + sFusion_Locus=sLocus.split("_")[0] + + (sLocus1, sLocus2)=sFusion_Locus.split("-")[0], sFusion_Locus.split("-")[1] + (sGene1, sGene2)=sGene.split("|") + (nExon1, nExon2)=sExon.split("|") + + sHGVS=HGVS_Fusion(sLocus1, sLocus2) + + fFusion.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\t{16}\t{17}\t{18}\t{19}\t{20}\t{21}\t{22}\t{23}\t{24}\t{25}\t{26}\n".format(\ + sHGVS, "", "", "HGVS version 15.11", sGene1, "", "", sGene2, "","","","Upstream","Downstream", sGenomebuild,"","","","","","","","","","","","","","","","")) + + + + + + + + +lFile=glob.glob("*RNA-full.tsv") + +fp=open(lFile[0],"r") + +sReference=fp.readline() +sReference=sReference.strip() +sGenomebuild=sReference.split("=")[1] + +fp.readline() +fp.readline() + +fSNV=open("SNVINDEL_"+lFile[0],"w") +fCNV=open("CNV_"+lFile[0],"w") +fFusion=open("Translocation_"+lFile[0],"w") + +fCNV.write("HGVS genomic change\tHGVS coding change\tHGVS protein change\tHGVS version\tChromosome\tGenomic Start Position\tGenomic Stop Position\tBreakpoint exon, From\tBreakpoint exon, To\tCopy number\tCopy number type\tHGNC gene symbol(s)\tEntrez gene ID(s)\tEnsembl gene ID(s)\tGenome Build\tGenomic source\tCopy number ratio type\tCopy number ratio\tFunctional Domain\tdbVar Identification Number\tCOSMIC Identification Number\tCoding Size\tBiomarker name\tBiomarker state\tWildtype biomarker YN\n") +fSNV.write("HGVS genomic change\tHGVS coding change\tHGVS protein change\tHGVS version\tGenome build\tGenomic source\tHGNC gene symbol\tEntrez gene ID\tEnsembl gene ID\tChromosome\tPosition\tReference allele\tAlternative allele\tCytogenetic location\tStrand orientation\tCodon\tExon\tMolecular Effects\tVariant type\tGenotype\tdbSNP Identification Number\tclinVar Variation Identification Number\tCOSMIC Identification Number\tBiomarker name\tBiomarker state\tWildtype biomarker YN\tFunctional Domain\n") + + + + + + +for sLine in fp.xreadlines(): + t=sLine.split("\t") + if t[5]=="PASS": + if (t[1]=="SNV") or (t[1]=="INDEL"): + SNV_INDEL_Parser(sLine,fSNV,sGenomebuild) + elif (t[1]=="CNV"): + CNV_Parser(sLine, fCNV) + elif (t[1]=="FUSION"): + Fusion_Parser(sLine, fFusion) + else: + pass + + + + + + + + + + + + + + + + + + + + diff --git a/Oncomine_QC.py b/Oncomine_QC.py new file mode 100644 index 0000000..9a7ac67 --- /dev/null +++ b/Oncomine_QC.py @@ -0,0 +1,46 @@ +#!/usr/bin/bash +import glob, slate, os + +lPDFlist=glob.glob("*.pdf") + +fp=open(lPDFlist[0],"r") +doc=slate.PDF(fp) + +Linelist=doc[0].split("\n") + +nTotalNumberofReads = Linelist[-31] +nTotalnumberofBases = Linelist[-29] +nTotalNumberofBasesAQ20=Linelist[-27] +nMeanCoverageDepth=Linelist[-25] +nCoveragewithTargetregion= Linelist[-23] +nMeanReadLengthAQ20=Linelist[-21] +#nMeanReadLength(AQ30)=Linelist[-19] + +lSecondlist=doc[1].split("\n") +TotalMappedFusionPanelReads =lSecondlist[-13] + +sCurrentDir=os.getcwd() +sID=sCurrentDir.split("/")[-2] + + + +fout=open("../../QC_Report_DNA_"+sID+".txt","w") + +fout.write("Total Reads\tTotal Aligned Reads\t% Reads Aligned\tTotal Bases\tTotal Mapped Bases\tAverage on target depth\tStandard deviation on target depth \tOn Target Bases\n") + +fout.write(str(nTotalNumberofReads)+"\t") +fout.write("\t") +fout.write("\t") +fout.write(str(float(nTotalnumberofBases)*1000000)+"\t") +fout.write(str(nMeanCoverageDepth)+"\t") +fout.write("\t") +#print nTotalNumberofBasesAQ20 +#print nMeanCoverageDepth +fout.write(str(nCoveragewithTargetregion)+"\n") +fout.close() +fout=open("../../QC_Report_RNA_"+sID+".txt","w") +fout.write("Total Reads\tTotal Aligned Reads\t% Reads Aligned\tTotal Bases\tTotal Mapped Bases\tAverage on target depth\tStandard deviation on target depth \tOn Target Bases\n") +fout.write(TotalMappedFusionPanelReads) + + +