From 6870a5382bfeda7a33305652f02774c458f018c8 Mon Sep 17 00:00:00 2001 From: J35P312 Date: Wed, 15 Aug 2018 16:04:54 +0200 Subject: [PATCH] modified: ../README.md modified: ../TIDDIT.py modified: DBSCAN.py modified: TIDDIT.cpp new file: TIDDIT_calling.py deleted: TIDDIT_clustering.py new file: TIDDIT_coverage.py new file: TIDDIT_filtering.py new file: TIDDIT_signals.py modified: common.h modified: data_structures/CoverageModule.cpp modified: data_structures/ProgramModules.h modified: data_structures/Translocation.cpp modified: data_structures/Translocation.h modified: data_structures/findTranslocationsOnTheFly.cpp modified: setup.py --- README.md | 5 + TIDDIT.py | 26 +- src/DBSCAN.py | 130 ++- src/TIDDIT.cpp | 205 ++++- src/TIDDIT_calling.py | 254 ++++++ src/TIDDIT_clustering.py | 788 ------------------ src/TIDDIT_coverage.py | 128 +++ src/TIDDIT_filtering.py | 144 ++++ src/TIDDIT_signals.py | 173 ++++ src/common.h | 74 +- src/data_structures/CoverageModule.cpp | 55 +- src/data_structures/ProgramModules.h | 2 +- src/data_structures/Translocation.cpp | 10 +- src/data_structures/Translocation.h | 5 +- .../findTranslocationsOnTheFly.cpp | 32 +- src/setup.py | 2 +- 16 files changed, 1094 insertions(+), 939 deletions(-) mode change 100644 => 100755 TIDDIT.py create mode 100644 src/TIDDIT_calling.py delete mode 100644 src/TIDDIT_clustering.py create mode 100644 src/TIDDIT_coverage.py create mode 100644 src/TIDDIT_filtering.py create mode 100644 src/TIDDIT_signals.py diff --git a/README.md b/README.md index 3518dc7..3dbc904 100644 --- a/README.md +++ b/README.md @@ -45,8 +45,13 @@ The SV module ============= The main TIDDIT module detects structural variants using discordant pairs, split reads and coverage information + python TIDDIT.py --sv [Options] --bam bam + +Optionally, TIDDIT accepts a reference fasta for GC correction: + python TIDDIT.py --sv [Options] --bam bam --ref reference.fasta + NOTE: It is important that you use the TIDDIT.py wrapper for SV detection. The TIDDIT binary in the TIDDIT/bin folder does not perform any clustering, it simply extracts SV signatures into a tab file. Where bam is the input bam file, and reference.fasta is the reference fasta used to align the sequencing data: TIDDIT will crash if the reference fasta is different from the one used to align the reads. The reads of the input bam file must be sorted on genome position.
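When --ref is supplied, the wrapper runs the new --gc module on the reference before clustering. A minimal Python sketch of that dispatch (the prefix, reference path and installation directory are illustrative; the piped commands mirror the TIDDIT.py change below):

    import os

    wd = "/path/to/TIDDIT"      # TIDDIT installation directory (illustrative)
    ref = "reference.fasta.gz"  # reference fasta, optionally gzipped (illustrative)
    prefix = "sample"           # output prefix, passed as -o (illustrative)

    # Gzipped references are streamed through zcat, plain fasta through cat;
    # either way the sequence arrives on stdin of the --gc module, which writes
    # <prefix>.gc.tab with GC and N fractions in 100 bp bins, matching the
    # 100 bp bins of the coverage tab file used during clustering.
    if ref.endswith(".gz"):
        os.system("zcat {} | {}/bin/TIDDIT --gc -z 100 -o {}".format(ref, wd, prefix))
    else:
        os.system("cat {} | {}/bin/TIDDIT --gc -z 100 -o {}".format(ref, wd, prefix))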
diff --git a/TIDDIT.py b/TIDDIT.py old mode 100644 new mode 100755 index 575a9f8..8d7a2e4 --- a/TIDDIT.py +++ b/TIDDIT.py @@ -5,9 +5,9 @@ wd=os.path.dirname(os.path.realpath(__file__)) sys.path.insert(0, '{}/src/'.format(wd)) -import TIDDIT_clustering +import TIDDIT_calling -version = "2.2.6" +version = "2.3.0" parser = argparse.ArgumentParser("""TIDDIT-{}""".format(version),add_help=False) parser.add_argument('--sv' , help="call structural variation", required=False, action="store_true") parser.add_argument('--cov' , help="generate a coverage bed file", required=False, action="store_true") @@ -27,25 +27,27 @@ parser.add_argument('-n', type=int,default=2, help="the ploidy of the organism,(default = 2)") parser.add_argument('-e', type=int, help="clustering distance parameter, discordant pairs closer than this distance are considered to belong to the same variant(default = sqrt(insert-size*2)*12)") parser.add_argument('-l', type=int,default=3, help="min-pts parameter (default=3),must be set > 2") + parser.add_argument('-s', type=int,default=50000000, help="Number of reads to sample when computing library statistics(default=50000000)") parser.add_argument('-z', type=int,default=100, help="minimum variant size (default=100), variants smaller than this will not be printed( z < 10 is not recommended)") parser.add_argument('--force_ploidy',action="store_true", help="force the ploidy to be set to -n across the entire genome (i.e skip coverage normalisation of chromosomes)") parser.add_argument('--debug',action="store_true", help="rerun the tiddit clustering procedure") parser.add_argument('--n_mask',type=float,default=0.5, help="exclude regions from coverage calculation if they contain more than this fraction of N (default = 0.5)") - parser.add_argument('--ref',required=True, type=str, help="reference fasta") + parser.add_argument('--ref', type=str, help="reference fasta, used for GC correction") args= parser.parse_args() args.wd=os.path.dirname(os.path.realpath(__file__)) if args.l < 2: print "error, too low -l value!"
quit() - if not os.path.isfile(args.ref): - print "error, could not find the reference file" - quit() + if args.ref: + if not os.path.isfile(args.ref): + print "error, could not find the reference file" + quit() if not os.path.isfile(args.bam): print "error, could not find the bam file" quit() - command_str="{}/bin/TIDDIT --sv -b {} -o {} -p {} -r {} -q {} -n {}".format(args.wd,args.bam,args.o,args.p,args.r,args.q,args.n) + command_str="{}/bin/TIDDIT --sv -b {} -o {} -p {} -r {} -q {} -n {} -s {}".format(args.wd,args.bam,args.o,args.p,args.r,args.q,args.n,args.s) if args.i: command_str += " -i {}".format(args.i) if args.d: @@ -53,8 +55,14 @@ if not args.debug: os.system(command_str) - - TIDDIT_clustering.cluster(args) + + if args.ref: + if args.ref.endswith(".gz"): + os.system("zcat {} | {}/bin/TIDDIT --gc -z 100 -o {}".format(args.ref,args.wd,args.o)) + else: + os.system("cat {} | {}/bin/TIDDIT --gc -z 100 -o {}".format(args.ref,args.wd,args.o)) + + TIDDIT_calling.cluster(args) elif args.cov: parser = argparse.ArgumentParser("""TIDDIT --cov --bam inputfile [-o prefix]""") diff --git a/src/DBSCAN.py b/src/DBSCAN.py index 844daa5..25c4f5e 100644 --- a/src/DBSCAN.py +++ b/src/DBSCAN.py @@ -1,5 +1,123 @@ import numpy -#supports up to 4D data + +#The functions of this script perform DBSCAN and return the variant clusters + +#Analyse the reads of the cluster, and collect statistics such as position and orientation +def analyse_pos(candidate_signals,discordants,library_stats,args): + analysed_signals={} + + max_a=max(candidate_signals[:,0:1])[0] + min_a=min(candidate_signals[:,0:1])[0] + + max_b=max(candidate_signals[:,1:2])[0] + min_b=min(candidate_signals[:,1:2])[0] + + + FF=0 + FR=0 + RF=0 + RR=0 + + splitsINV=0 + splits=0 + + discs=0 + + avgQb=[] + avgQa=[] + + for i in range(0,len(candidate_signals)): + if candidate_signals[i][-2] == 1: + if(candidate_signals[i][2] != candidate_signals[i][4]): + splitsINV +=1 + else: + if(candidate_signals[i][2] and not candidate_signals[i][4]): + FR+=1 + elif( not candidate_signals[i][2] and candidate_signals[i][4]): + RF+=1 + elif( not candidate_signals[i][2] and not candidate_signals[i][4]): + RR+=1 + else: + FF+=1 + + if candidate_signals[i][-2] == 1: + splits +=1 + else: + discs+=1 + + if candidate_signals[i][3] != -1: + avgQa.append(candidate_signals[i][3]) + if candidate_signals[i][5] != -1: + avgQb.append(candidate_signals[i][5]) + + if avgQa: + avgQa=numpy.mean(avgQa) + else: + avgQa=-1 + + if avgQb: + avgQb=numpy.mean(avgQb) + else: + avgQb=-1 + + major_orientation=max([FF,FR,RF,RR]) + if not discs: + posA=max_a + posB=min_b + else: + if library_stats["Orientation"] == "innie": + if major_orientation == FF: + posA=max_a + posB=max_b + elif major_orientation == FR: + posA=max_a + posB=min_b + elif major_orientation == RF: + posA=min_a + posB=max_b + else: + posA=min_a + posB=min_b + else: + if major_orientation == FF: + posA=min_a + posB=min_b + elif major_orientation == FR: + posA=min_a + posB=max_b + elif major_orientation == RF: + posA=max_a + posB=min_b + else: + posA=max_a + posB=max_b + analysed_signals={"posA":posA,"posB":posB,"min_A":min_a,"max_A":max_a,"min_B":min_b,"max_B":max_b,"QA":avgQa,"QB":avgQb,"FF":FF,"FR":FR,"RF":RF,"RR":RR,"splitsINV":splitsINV,"splits":splits,"discs":discs,"signals":candidate_signals} + return(analysed_signals) + +#Call the cluster algorithm and the statistics function, and return the final clusters +def generate_clusters(chrA,chrB,coordinates,library_stats,args): + candidates=[] + 
coordinates=coordinates[numpy.lexsort((coordinates[:,1],coordinates[:,0]))] + db=main(coordinates[:,0:2],args.e,int(round(args.l+library_stats["ploidies"][chrA]/(args.n*10)))) + unique_labels = set(db) + + for var in unique_labels: + if var == -1: + continue + class_member_mask = (db == var) + candidate_signals =coordinates[class_member_mask] + resolution=candidate_signals[:,-2] + support=len(set(candidate_signals[:,-1])) + discordants=True + if len(set(resolution)) == 1 and max(resolution) == 1: + discordants=False + + if discordants and support >= args.p: + candidates.append( analyse_pos(candidate_signals,discordants,library_stats,args) ) + elif not discordants and support >= args.r and chrA == chrB: + candidates.append( analyse_pos(candidate_signals,discordants,library_stats,args) ) + + return(candidates) def x_coordinate_clustering(data,epsilon,m): clusters=numpy.zeros(len(data)) @@ -97,13 +215,3 @@ def main(data,epsilon,m): clusters,cluster_id=y_coordinate_clustering(data,epsilon,m,cluster_id,clusters) return(clusters) - - -data=numpy.array([[1,1],[10481823,10483880],[10481947,10483785],[10481947,1],[10481947,1],[10481947,1],[10482033,10483984],[10482079,10483801],[10482111,10483972],[10482121,10483788],[10482125,10483769],[10482126,10484204],[10482163,10483811],[10482177,10483909],[10482186,10483906],[10482191,10483836],[10482202,10484150],[10482262,10483947],[10482285,10483797],[10482342,10483968],[10483770,10482390],[10482390,10483814],[10483770,10482405],[10483769,10482428],[10483770,10482405],[10483770,10482405],[10483770,1],[10483770,1],[10483770,1],[10483770,1]]) - - -#data=data[numpy.lexsort((data[:,1],data[:,0]))] -#print data -#data=numpy.array([[1,2],[1,3],[1,4],[1,5],[5,100],[5,100],[10,2],[10,3],[10,4],[10,5],[20,100]]) -#print main(data,398,3) - diff --git a/src/TIDDIT.cpp b/src/TIDDIT.cpp index 4463a85..10489fc 100644 --- a/src/TIDDIT.cpp +++ b/src/TIDDIT.cpp @@ -16,7 +16,6 @@ #include #include "data_structures/ProgramModules.h" - //converts a string to int int convert_str(string to_be_converted,string option){ int converted; @@ -31,21 +30,22 @@ int convert_str(string to_be_converted,string option){ } int main(int argc, char **argv) { - //MAIN VARIABLE + //MAIN VARIABLES - bool outtie = true; // library orientation + bool outtie = true; // library orientation uint32_t minimumSupportingPairs = 4; uint32_t minimumSupportingReads = 6; - int min_insert = 100; // min insert size - int max_insert = 100000; // max insert size - int minimum_mapping_quality =10; + int min_insert = 100; // min insert size + int max_insert = 80000; // max insert size + int minimum_mapping_quality = 10; float coverage; float coverageStd; float meanInsert; float insertStd; int min_variant_size= 100; + int sample = 100000000; string outputFileHeader ="output"; - string version = "2.2.6"; + string version = "2.3.0"; //collect all options as a vector vector<string> arguments(argv, argv + argc); @@ -59,8 +59,9 @@ int main(int argc, char **argv) { string general_help="TIDDIT-" + version + " - a structural variant caller\nUsage: TIDDIT <module> [options] \n"; general_help+="modules\n\t--help\tproduce help message\n"; - general_help+="\t--sv\tselect the sv module to find structural variations\n"; + general_help+="\t--sv\tcollect SV signals\n"; general_help+="\t--cov\tselect the cov module to analyse the coverage of the genome using bins of a specified size\n"; + general_help+="\t--gc\tselect the gc module to compute the gc content across the genome using bins of a specified size(accepts a fasta through stdin)\n"; //the sv module
vm["--sv"]="store"; @@ -70,6 +71,7 @@ int main(int argc, char **argv) { vm["-r"]=""; vm["-q"]=""; vm["-c"]=""; + vm["-s"]=""; vm["-n"]=""; vm["-m"] = ""; @@ -81,18 +83,26 @@ sv_help +="\t-r\tMinimum number of supporting split reads to call a small variant (default 3)\n"; sv_help +="\t-q\tMinimum mapping quality to consider an alignment (default 10)\n"; sv_help +="\t-c\taverage coverage, (default= computed from the bam file)\n"; + sv_help +="\t-s\tNumber of reads to sample when computing library statistics, (default= 100000000)\n"; sv_help +="\t-n\tthe number of sets of chromosomes,(default = 2)\n"; sv_help +="\t-m\tminimum variant size,(default = 100)\n"; //the coverage module vm["--cov"]="store"; vm["-z"]=""; + + //the gc module + vm["--gc"]="store"; string coverage_help="\nUsage: TIDDIT --cov [Mode] -b inputfile [-o prefix]\n"; coverage_help +="\t-b\tcoordinate sorted bam file(required)\n"; coverage_help +="\n\t-z\tuse bins of specified size(default = 500bp) to measure the coverage of the entire bam file, set output to stdout to print to stdout\n"; - + string gc_help="\nUsage: cat in.fa | TIDDIT --gc [Mode] [-o prefix]\n"; + gc_help +="\t-r\treference fasta file(required)\n"; + gc_help +="\n\t-z\tuse bins of specified size(default = 500bp) to compute the GC content, set output to stdout to print to stdout\n"; + + //store the options in a map for(int i = 0; i < arguments.size(); i += 2){ if(vm.count( arguments[i] ) ){ @@ -127,6 +137,8 @@ int main(int argc, char **argv) { cout << sv_help; }else if(vm["--cov"] == "found"){ cout << coverage_help; + }else if(vm["--gc"] == "found"){ + cout << gc_help; }else{ cout << general_help; } } //if no module is chosen - if(vm["--sv"] == "store" and vm["--cov"] == "store"){ + if(vm["--sv"] == "store" and vm["--cov"] == "store" and vm["--gc"] == "store" ){ cout << general_help; cout << "ERROR: select a module" << endl; return(0); - }else if(vm["--sv"] == "found" and vm["--cov"] == "found"){ + }else if( (vm["--sv"] == "found" and vm["--cov"] == "found") or (vm["--sv"] == "found" and vm["--gc"] == "found") or (vm["--cov"] == "found" and vm["--gc"] == "found") ){ cout << general_help; cout << "ERROR: only one module may be selected" << endl; return(0); } + + if (vm["--gc"] == "found"){ + int binSize=500; + if(vm["-z"] != ""){ + binSize =convert_str( vm["-z"], "-z"); + } + + string fasta = vm["-r"]; + string outputfile="output"; + if(vm["-o"] != ""){ + outputfile=vm["-o"]; + } + + ifstream in(fasta.c_str()); + + string chromosome=""; + bool first = true; + long pos = 0; + long element =0; + vector<string> chromosomes; + map<string, vector< vector<int> > > bins; + + int A =0; + int G =0; + int other=0; + for (string line; getline(cin, line);) { + if (line[0] == '>'){ + + if (A > 0 or G > 0 or other > 0 ){ + vector<int> bin; + bin.push_back(A); + bin.push_back(G); + bin.push_back(other); + bins[chromosome].push_back(bin); + } + + chromosome=""; + for (int i=1;i < line.size();i++){ + if (line[i] == ' '){ + break; + } + chromosome += line[i]; + } + chromosomes.push_back(chromosome); + vector< vector<int> > row; + bins[chromosome]=row; + pos=0; + element=0; + A=0; + G=0; + other=0; + }else{ + for(int i=0; i< line.size();i++){ + pos+=1; + if (line[i] == 'A' or line[i] =='a'){ + A+=1; + }else if (line[i] == 'C' or line[i] =='c'){ + G+=1; + }else if (line[i] == 'g' or line[i] == 'G'){ + G+=1; + }else if (line[i] == 't' or line[i] == 'T'){ + A+=1; + }else{ + other+=1; + } + + if (pos >= (element+1)*binSize){
+ vector<int> bin; + bin.push_back(A); + bin.push_back(G); + bin.push_back(other); + bins[chromosome].push_back(bin); + element+=1; + A=0;G=0;other=0; + } + } + } + + } + + if (A > 0 or G > 0 or other > 0 ){ + vector<int> bin; + bin.push_back(A); + bin.push_back(G); + bin.push_back(other); + bins[chromosome].push_back(bin); + } + + + if (outputfile == "stdout"){ + cout << "#chromosome\tstart\tend\tGC\tN" << endl; + for(int i=0;i< chromosomes.size();i++){ + for (int j=0;j< bins[chromosomes[i]].size();j++){ + float gc=0; + if (bins[chromosomes[i]][j][0]+bins[chromosomes[i]][j][1] > 0){ + gc=(float)bins[chromosomes[i]][j][1]/(bins[chromosomes[i]][j][1]+bins[chromosomes[i]][j][0]); + } + + float n = 0; + if (bins[chromosomes[i]][j][0]+bins[chromosomes[i]][j][1] > 0){ + n=(float)bins[chromosomes[i]][j][2]/( bins[chromosomes[i]][j][0]+bins[chromosomes[i]][j][1]+bins[chromosomes[i]][j][2] ); + }else{ + n=1; + } + + cout << chromosomes[i] << "\t" << j*binSize << "\t" << (j+1)*binSize << "\t" << gc << "\t" << n << endl; + + } + } + }else{ + ofstream gcOutput; + gcOutput.open((outputfile+".gc.tab").c_str()); + ostream& gcout=gcOutput; + gcout << "#chromosome\tstart\tend\tGC\tN" << endl; + for(int i=0;i< chromosomes.size();i++){ + for (int j=0;j< bins[chromosomes[i]].size();j++){ + float gc=0; + if (bins[chromosomes[i]][j][0]+bins[chromosomes[i]][j][1] > 0){ + gc=(float)bins[chromosomes[i]][j][1]/(bins[chromosomes[i]][j][1]+bins[chromosomes[i]][j][0]); + } + + float n = 0; + if (bins[chromosomes[i]][j][0]+bins[chromosomes[i]][j][1] > 0){ + n=(float)bins[chromosomes[i]][j][2]/( bins[chromosomes[i]][j][0]+bins[chromosomes[i]][j][1]+bins[chromosomes[i]][j][2] ); + }else{ + n=1; + } + + gcout << chromosomes[i] << "\t" << j*binSize << "\t" << (j+1)*binSize << "\t" << gc << "\t" << n << endl; + + } + } + } + return(0); + } + //the bam file is required by all modules if(vm["-b"] == ""){ @@ -178,6 +326,7 @@ } bamFile.Close(); + //if the find structural variations module is chosen collect the options if(vm["--sv"] == "found"){ if(vm["-o"] != ""){ @@ -194,6 +343,9 @@ if(vm["-r"] != ""){ minimumSupportingReads=convert_str( vm["-r"] , "-r"); } + if(vm["-s"] != ""){ + sample=convert_str( vm["-s"] , "-s"); + } if(vm["-d"] != ""){ if (vm["-d"] == "outtie"){ @@ -214,7 +366,7 @@ //now compute library stats LibraryStatistics library; size_t start = time(NULL); - library = computeLibraryStats(alignmentFile, genomeLength, max_insert, 50 , outtie,minimum_mapping_quality,outputFileHeader); + library = computeLibraryStats(alignmentFile, genomeLength, max_insert, 50 , outtie,minimum_mapping_quality,outputFileHeader,sample); printf ("library stats time consumption= %lds\n", time(NULL) - start); @@ -234,7 +386,7 @@ meanInsert = library.insertMean; insertStd = library.insertStd; - max_insert=meanInsert+3*insertStd; + max_insert=meanInsert+2.1*insertStd; if(outtie == true){ max_insert=meanInsert+4*insertStd; } @@ -246,26 +398,25 @@ - if (vm["-m"] != ""){ - min_variant_size = convert_str( vm["-m"],"-m" ); - } + if (vm["-m"] != ""){ + min_variant_size = convert_str( vm["-m"],"-m" ); + } - map<string,int> SV_options; + map<string,int> SV_options; SV_options["max_insert"] = max_insert; SV_options["pairs"] = minimumSupportingPairs; SV_options["mapping_quality"] = minimum_mapping_quality; SV_options["readLength"] = library.readLength; SV_options["ploidy"] = ploidy; SV_options["contigsNumber"] = contigsNumber; - SV_options["meanInsert"] = meanInsert; - SV_options["STDInsert"] = insertStd; - SV_options["min_variant_size"] = min_variant_size; + SV_options["meanInsert"] = meanInsert; +
SV_options["STDInsert"] = insertStd; + SV_options["min_variant_size"] = min_variant_size; SV_options["splits"] = minimumSupportingReads; StructuralVariations *FindTranslocations; FindTranslocations = new StructuralVariations(); - FindTranslocations -> findTranslocationsOnTheFly(alignmentFile, outtie, coverage,outputFileHeader, version, "TIDDIT" + argString,SV_options); - + FindTranslocations -> findTranslocationsOnTheFly(alignmentFile, outtie, coverage,outputFileHeader, version, "TIDDIT" + argString,SV_options, genomeLength); //the coverage module }else if(vm["--cov"] == "found"){ @@ -274,16 +425,18 @@ int main(int argc, char **argv) { if(vm["-z"] != ""){ binSize =convert_str( vm["-z"], "-z"); } + if(vm["-o"] != ""){ outputFileHeader=vm["-o"]; } - bool discordants = false; - calculateCoverage = new Cov(binSize,alignmentFile,outputFileHeader,discordants); + + calculateCoverage = new Cov(binSize,alignmentFile,outputFileHeader); BamReader bam; bam.Open(alignmentFile); BamAlignment currentRead; - while ( bam.GetNextAlignmentCore(currentRead) ) { - calculateCoverage -> bin(currentRead); + while ( bam.GetNextAlignmentCore(currentRead) ) { + readStatus alignmentStatus = computeReadType(currentRead, 100000,100, true); + calculateCoverage -> bin(currentRead, alignmentStatus); } bam.Close(); calculateCoverage -> printCoverage(); diff --git a/src/TIDDIT_calling.py b/src/TIDDIT_calling.py new file mode 100644 index 0000000..1f2e39d --- /dev/null +++ b/src/TIDDIT_calling.py @@ -0,0 +1,254 @@ +import numpy +import math +import copy +import sys +import sqlite3 +import os +import time + +import DBSCAN +import TIDDIT_coverage +import TIDDIT_filtering +import TIDDIT_signals + +def retrieve_coverage(chromosome,start,end,coverage_data): + start_index=int(math.floor(start/100.0)) + end_index=int(math.floor(end/100.0))+1 + regional_coverage=coverage_data[chromosome][start_index:end_index,0] + coverage=numpy.average(regional_coverage[numpy.where(regional_coverage > -1)]) + max_coverage=max(regional_coverage[numpy.where(regional_coverage > -1)]) + quality=numpy.average(coverage_data[chromosome][start_index:end_index,1]) + return (coverage,max_coverage,int(round(quality))) + + +#Find the number of discordant pairs within a region +def retrieve_discs(chromosome,start,end,coverage_data): + discs=0 + start_index=int(math.floor(start/100.0)) + end_index=int(math.floor(end/100.0))+1 + discs=sum(coverage_data[chromosome][start_index:end_index,2]) + return(discs) + +#split inversions into two separate calls +def redefine_inv(vcf_line,signals,library_stats,args): + analysed_signals=DBSCAN.analyse_pos(signals,True,library_stats,args) + vcf_line[1]=analysed_signals["posA"] + vcf_line[7]=vcf_line[7].split(";END=")[0]+";END="+str(analysed_signals["posB"]) +";SVLEN=" + vcf_line[7].split(";SVLEN=")[-1] + vcf_line[7]=vcf_line[7].split("SVLEN=")[0]+"SVLEN="+str(analysed_signals["posB"]-analysed_signals["posA"]+1) +";COVM=" + vcf_line[7].split(";COVM=")[-1] + + vcf_line[7]=vcf_line[7].split("CIPOS=")[0]+"CIPOS={},{}".format(analysed_signals["min_A"]-analysed_signals["posA"],analysed_signals["max_A"]-analysed_signals["posA"]) + ";CIEND" + vcf_line[7].split(";CIEND")[-1] + vcf_line[7]=vcf_line[7].split("CIEND=")[0]+"CIEND={},{}".format(analysed_signals["min_B"]-analysed_signals["posB"],analysed_signals["max_B"]-analysed_signals["posB"]) + ";END=" + vcf_line[7].split(";END=")[-1] + return(vcf_line) + +def inv_recluster(vcf_line,candidate,library_stats,args): + RR=[] + FF=[] + + for signal in candidate["signals"]: + if( not signal[2] 
and not signal[4] ): + RR.append(signal) + else: + FF.append(signal) + + vcf_line_rr=copy.copy(vcf_line) + vcf_line_ff=copy.copy(vcf_line) + vcf_line_rr=redefine_inv(vcf_line_rr,numpy.array(RR),library_stats,args) + vcf_line_ff=redefine_inv(vcf_line_ff,numpy.array(FF),library_stats,args) + var_id=list(vcf_line_ff[2]) + var_id[-1]="2" + vcf_line_ff[2]="".join(var_id) + + return([vcf_line_rr,vcf_line_ff]) + +def generate_vcf_line(chrA,chrB,n,candidate,args,library_stats): + vcf_line=[] + if chrA == chrB and candidate["posA"] > candidate["posB"]: + candidate["posA"]=candidate["min_A"] + candidate["posB"]=candidate["max_B"] + + vcf_line.append(chrA) + vcf_line.append(candidate["posA"]) + vcf_line.append("SV_{}_1".format(n)) + vcf_line.append("N") + var,variant_type,GT=TIDDIT_filtering.fetch_variant_type(chrA,chrB,candidate,args,library_stats) + vcf_line.append(var) + vcf_line.append(".") + vcf_line.append(TIDDIT_filtering.fetch_filter(chrA,chrB,candidate,args,library_stats)) + INFO="{};CIPOS={},{};CIEND={},{}".format(variant_type,candidate["min_A"]-candidate["posA"],candidate["max_A"]-candidate["posA"],candidate["min_B"]-candidate["posB"],candidate["max_B"]-candidate["posB"]) + if chrA == chrB: + if not "BND" in variant_type: + INFO += ";END={};SVLEN={}".format(candidate["posB"],abs(candidate["posB"]-candidate["posA"]+1)) + INFO += ";COVM={}".format(candidate["covM"]) + stats=";COVA={};COVB={};LFA={};LFB={};LTE={};E1={};E2={};OR={},{},{},{};ORSR={},{};QUALA={};QUALB={}".format(candidate["covA"],candidate["covB"],int(round(candidate["discsA"])),int(round(candidate["discsB"])),candidate["discs"]+candidate["splits"],candidate["e1"],candidate["e2"],candidate["FF"],candidate["RR"] ,candidate["RF"],candidate["FR"],candidate["splitsINV"],candidate["splits"]-candidate["splitsINV"],candidate["QRA"],candidate["QRB"]) + + INFO+=stats + + vcf_line.append(INFO) + FORMAT_FORMAT="GT:CN:DV:RV" + vcf_line.append(FORMAT_FORMAT) + CN="." 
+ if "DEL" in var or "DUP" in var: + CN=int(round(candidate["covM"]/(library_stats["Coverage"]/args.n))) + if "DEL" in var: + CN=library_stats["ploidies"][chrA]-CN + if CN < 0: + CN=library_stats["ploidies"][chrA] + + FORMAT_STR="{}:{}:{}:{}".format(GT,CN,candidate["discs"],candidate["splits"]) + vcf_line.append(FORMAT_STR) + if not "BND" in variant_type and not "INV" in variant_type: + return([vcf_line]) + elif "INV" in variant_type: + if candidate["FF"] and candidate["RR"] and candidate["RR"] > args.p/2 and candidate["FF"] > args.p/2 and candidate["discs"]: + return(inv_recluster(vcf_line,candidate,library_stats,args)) + else: + return([vcf_line]) + else: + vcf_line_a=copy.copy(vcf_line) + vcf_line_b=copy.copy(vcf_line) + inverted=False + before=True + if candidate["posA"] == candidate["max_A"]: + before=False + + if candidate["FF"] + candidate["RR"] > candidate["RF"] + candidate["FR"]: + inverted=True + + if not inverted and not before: + alt_str_a="N[{}:{}[".format(chrB,candidate["posB"]) + alt_str_b="]{}:{}]N".format(chrA,candidate["posA"]) + elif not inverted and before: + alt_str_a="]{}:{}]N".format(chrB,candidate["posB"]) + alt_str_b="N[{}:{}[".format(chrA,candidate["posA"]) + elif inverted and not before: + alt_str_a="N]{}:{}]".format(chrB,candidate["posB"]) + alt_str_b="[{}:{}[N".format(chrA,candidate["posA"]) + else: + alt_str_a="[{}:{}[N".format(chrB,candidate["posB"]) + alt_str_b="N]{}:{}]".format(chrA,candidate["posA"]) + + vcf_line_a[4]=alt_str_a + vcf_line_b[4]=alt_str_b + vcf_line_b[2]="SV_{}_2".format(n) + vcf_line_b[0]=chrB + vcf_line_b[1]=candidate["posB"] + return([vcf_line_a,vcf_line_b]) + +#main function +def cluster(args): + start_time=time.time() + + if args.ref: + Ncontent=TIDDIT_coverage.retrieve_N_content(args) + else: + Ncontent=[] + + coverage_data=TIDDIT_coverage.coverage(args) + + conn = sqlite3.connect(args.o+".db") + args.c = conn.cursor() + + tableListQuery = "SELECT name FROM sqlite_master WHERE type=\'table\'" + args.c.execute(tableListQuery) + tables = map(lambda t: t[0], args.c.fetchall()) + if "TIDDITcall" in tables: + args.c.execute("DROP TABLE TIDDITcall") + + args.c.execute("CREATE TABLE TIDDITcall (chrA TEXT,chrB TEXT,posA INT,posB INT,forwardA INT,forwardB INT,qualA INT, qualB INT,cigarA TEXT,cigarB TEXT, resolution INT,name INT)") + header,chromosomes,library_stats=TIDDIT_signals.signals(args,coverage_data) + args.c.execute("CREATE INDEX CHR ON TIDDITcall (chrA, chrB)") + + ploidies,library_stats,coverage_data=TIDDIT_coverage.determine_ploidy(args,chromosomes,coverage_data,Ncontent,library_stats) + library_stats["ploidies"]=ploidies + + if not args.e: + args.e=int(math.sqrt(library_stats["MeanInsertSize"]*2)*12) + n=1 + print "clustering signals on chromosome:" + calls={} + for chrA in chromosomes: + calls[chrA] =[] + print "{}".format(chrA) + for chrB in chromosomes: + signal_data=numpy.array([ [hit[0],hit[1],hit[2],hit[3],hit[4],hit[5],hit[6],hit[7]] for hit in args.c.execute('SELECT posA,posB,forwardA,qualA,forwardB,qualB,resolution,name FROM TIDDITcall WHERE chrA == \'{}\' AND chrB == \'{}\''.format(chrA,chrB)).fetchall()]) + + if not len(signal_data): + continue + + candidates=DBSCAN.generate_clusters(chrA,chrB,signal_data,library_stats,args) + + for i in range(0,len(candidates)): + candidates[i]["covA"],candidates[i]["MaxcovA"],candidates[i]["QRA"]=retrieve_coverage(chrA,candidates[i]["min_A"],candidates[i]["max_A"],coverage_data) + 
candidates[i]["covB"],candidates[i]["MaxcovB"],candidates[i]["QRB"]=retrieve_coverage(chrB,candidates[i]["min_B"],candidates[i]["max_B"],coverage_data) + if chrA == chrB: + if candidates[i]["posB"] > candidates[i]["posA"]: + candidates[i]["covM"],candidates[i]["MaxcovM"],candidates[i]["QRM"]=retrieve_coverage(chrB,candidates[i]["posA"],candidates[i]["posB"],coverage_data) + else: + candidates[i]["covM"],candidates[i]["MaxcovM"],candidates[i]["QRM"]=retrieve_coverage(chrB,candidates[i]["posB"],candidates[i]["posA"],coverage_data) + else: + candidates[i]["covM"]=0 + candidates[i]["QRM"]=0 + candidates[i]["discsA"]=retrieve_discs(chrA,candidates[i]["min_A"],candidates[i]["max_A"],coverage_data) + candidates[i]["discsB"]=retrieve_discs(chrB,candidates[i]["min_B"],candidates[i]["max_B"],coverage_data) + sizeA=candidates[i]["max_A"]-candidates[i]["min_A"]+library_stats["ReadLength"] + sizeB=candidates[i]["max_B"]-candidates[i]["min_B"]+library_stats["ReadLength"] + gap=[] + for signal in candidates[i]["signals"]: + gap.append(abs(sizeA-(signal[0]-candidates[i]["min_A"])+(signal[1]-candidates[i]["min_B"]))) + gap=numpy.average(gap) + if sizeA > library_stats["MeanInsertSize"] or sizeB > library_stats["MeanInsertSize"]: + gap=1 + elif gap > sizeA: + gap = gap - sizeA + else: + gap=1 + if ploidies[chrA]: + coverageA=candidates[i]["covA"]/ploidies[chrA] + else: + coverageA=candidates[i]["covA"]/float(args.n) + if ploidies[chrB]: + coverageB=candidates[i]["covB"]/ploidies[chrB] + else: + coverageB=candidates[i]["covB"]/float(args.n) + + if candidates[i]["discs"] and not ( candidates[i]["splits"] and abs(candidates[i]["posA"]-candidates[i]["posB"]) < 3*library_stats["STDInsertSize"] and chrA == chrB ): + candidates[i]["e1"]=int(round(TIDDIT_filtering.expected_links(coverageA,coverageB,sizeA,sizeB,gap,library_stats["MeanInsertSize"],library_stats["STDInsertSize"],library_stats["ReadLength"]))) + candidates[i]["e2"]= int( round( (min([coverageA,coverageB])*(library_stats["MeanInsertSize"]-library_stats["ReadLength"]/3)) / (float(library_stats["ReadLength"]))/2.0 ) ) + else: + candidates[i]["e1"]=int(round( min([coverageA,coverageB])*0.75 )) + candidates[i]["e2"]=int(round( min([coverageA,coverageB])*0.75 )) + vcf_line=generate_vcf_line(chrA,chrB,n,candidates[i],args,library_stats) + + if len(vcf_line) == 1: + calls[chrA].append(vcf_line[0]) + else: + calls[chrA].append(vcf_line[0]) + if not chrB in calls: + calls[chrB]=[] + calls[chrB].append(vcf_line[1]) + n+=1 + + outfile=open(args.o+".vcf", 'w') + new_header=[] + for line in header.strip().split("\n"): + if "##TIDDITcmd" in line: + new_header.append("##TIDDITcmd=\"{}\"\n".format(" ".join(sys.argv))) + continue + new_header.append(line+"\n") + header="".join(new_header) + outfile.write(header) + for chromosome in chromosomes: + for call in sorted(calls[chromosome],key=lambda x: x[1]): + output=call + output[1]=str(output[1]) + if "MinSize" == output[6]: + continue + + output="\t".join(output)+"\n" + outfile.write(output) + + print "variant clustering completed in {}".format(time.time()-start_time) + print "Work complete!" 
+ os.remove("{}.db".format(args.o)) + return() + diff --git a/src/TIDDIT_clustering.py b/src/TIDDIT_clustering.py deleted file mode 100644 index 9fcc226..0000000 --- a/src/TIDDIT_clustering.py +++ /dev/null @@ -1,788 +0,0 @@ -import numpy -import math -import copy -import DBSCAN -import gzip -import sys -import sqlite3 -import itertools - -from scipy.stats import norm - -#analyse the cigar operation of the read -def read_cigar(cigar): - deletions=0 - insertions=0 - SC = ["".join(x) for _, x in itertools.groupby(cigar, key=str.isdigit)] - length=0 - first=True - clip_after=True - - aligned_range=[] - current_pos=1 - for i in range(0,len(SC)/2): - if first and SC[i*2+1] == "M": - first = False - elif first and SC[i*2+1] == "S": - first = False - clip_after=False - - if SC[i*2+1] == "M": - length += int( SC[i*2] ) - bases=range(0,int( SC[i*2] )) - for j in range(0,len(bases)): - bases[j] += current_pos - - aligned_range += bases - current_pos += int( SC[i*2] ) - elif SC[i*2+1] == "I": - insertions+=1 - bases=range(0,int( SC[i*2] )) - for j in range(0,len(bases)): - bases[j] += current_pos - aligned_range += bases - - current_pos += int( SC[i*2] ) - elif SC[i*2+1] == "D": - length += int( SC[i*2] ) - deletions +=1 - else: - current_pos += int( SC[i*2] ) - - return deletions,insertions,length,clip_after,aligned_range - -def coverage(args): - coverage_data={} - for line in open(args.o+".tab"): - if line[0] == "#": - continue - content=line.strip().split() - if not content[0] in coverage_data: - coverage_data[content[0]]=[] - coverage_data[content[0]].append([ float(content[3]),float(content[4]),0 ]) - - for chromosome in coverage_data: - coverage_data[chromosome]=numpy.array(coverage_data[chromosome]) - - return(coverage_data) - -def signals(args,coverage_data): - signal_data=[] - header="" - first_signal=True - read_list={} - n_reads=0 - for line in open(args.o+".signals.tab"): - if line[0] == "#": - header += line - continue - - if first_signal: - chromosomes,library_stats,chromosome_len=find_contigs(header) - first_signal=False - - content=line.strip().split() - - read_name=content[0] - if read_name in read_list: - name=read_list[read_name] - else: - name=n_reads - read_list[read_name]=n_reads - n_reads+=1 - - chrA=content[1] - posA=int(content[2]) - if content[3] == "+": - orientationA=1 - else: - orientationA=0 - - cigarA=content[4] - if ("S" in cigarA or "H" in cigarA) and not cigarA == "NA": - deletions,insertions,length,clip_after,aligned_range=read_cigar(cigarA) - if clip_after: - posA+=length - else: - if "M" in cigarA: - deletions,insertions,length,clip_after,aligned_range=read_cigar(cigarA) - else: - length=library_stats["ReadLength"] - - if library_stats["Orientation"] == "innie": - if orientationA: - posA+=length-1 - else: - if not orientationA: - posA+=length-1 - - if posA >= chromosome_len[chrA]: - posA=chromosome_len[chrA]-1 - - qualA=int(content[5]) - - chrB=content[6] - posB=int(content[7]) - if content[8] == "+": - orientationB=1 - else: - orientationB=0 - cigarB=content[9] - qualB=int(content[10]) - - if ("S" in cigarB or "H" in cigarB) and not cigarB == "NA": - deletions,insertions,length,clip_after,aligned_range=read_cigar(cigarB) - if clip_after: - posB+=length - else: - if "M" in cigarB: - deletions,insertions,length,clip_after,aligned_range=read_cigar(cigarB) - else: - length=library_stats["ReadLength"] - - if library_stats["Orientation"] == "innie": - if orientationB: - posB+=length-1 - else: - if not orientationB: - posB+=length-1 - - if posB >= chromosome_len[chrB]: 
- posB=chromosome_len[chrB]-1 - - resolution=int(content[-1]) - - - if chrA > chrB or (posB < posA and chrA == chrB): - signal_data.append([chrB,chrA,posB,posA,orientationB,orientationA,qualB,qualA,cigarB,cigarA,resolution,name]) - else: - signal_data.append([chrA,chrB,posA,posB,orientationA,orientationB,qualA,qualB,cigarA,cigarB,resolution,name]) - - - if len (signal_data) > 1000000: - args.c.executemany('INSERT INTO TIDDITcall VALUES (?,?,?,?,?,?,?,?,?,?,?,?)',signal_data) - signal_data=[] - - idx_b=int(math.floor(posB/100.0)) - if idx_b >= len(coverage_data[chrB]): - idx_b =len(coverage_data[chrB])-1 - coverage_data[chrB][idx_b,2]+=1 - - idx_a=int(math.floor(posA/100.0)) - if idx_a >= len(coverage_data[chrA]): - idx_a=len(coverage_data[chrA])-1 - coverage_data[chrA][idx_a,2]+=1 - - if len(signal_data): - args.c.executemany('INSERT INTO TIDDITcall VALUES (?,?,?,?,?,?,?,?,?,?,?,?)',signal_data) - return(header,chromosomes,library_stats) - -def find_contigs(header): - chromosomes=[] - library_stats={} - chromosome_len={} - for line in header.split("\n"): - if "##contig=")[0]) - if "##LibraryStats=" in line: - stats=line.strip().split(" ") - library_stats={"Coverage":float(stats[1].split("=")[-1]),"ReadLength":int(stats[2].split("=")[-1]),"MeanInsertSize":int(stats[3].split("=")[-1]),"STDInsertSize":int(stats[4].split("=")[-1]),"Orientation":stats[5].split("=")[-1]} - return (chromosomes,library_stats,chromosome_len) - -def analyse_pos(candidate_signals,discordants,library_stats,args): - analysed_signals={} - - max_a=max(candidate_signals[:,0:1])[0] - min_a=min(candidate_signals[:,0:1])[0] - - max_b=max(candidate_signals[:,1:2])[0] - min_b=min(candidate_signals[:,1:2])[0] - - - FF=0 - FR=0 - RF=0 - RR=0 - - splitsINV=0 - splits=0 - - discs=0 - - avgQb=[] - avgQa=[] - - for i in range(0,len(candidate_signals)): - if candidate_signals[i][-2] == 1: - if(candidate_signals[i][2] != candidate_signals[i][4]): - splitsINV +=1 - else: - if(candidate_signals[i][2] and not candidate_signals[i][4]): - FR+=1 - elif( not candidate_signals[i][2] and candidate_signals[i][4]): - RF+=1 - elif( not candidate_signals[i][2] and not candidate_signals[i][4]): - RR+=1 - else: - FF+=1 - - if candidate_signals[i][-2] == 1: - splits +=1 - else: - discs+=1 - - if candidate_signals[i][3] != -1: - avgQa.append(candidate_signals[i][3]) - if candidate_signals[i][5] != -1: - avgQb.append(candidate_signals[i][5]) - - if avgQa: - avgQa=numpy.mean(avgQa) - else: - avgQa=-1 - - if avgQb: - avgQb=numpy.mean(avgQb) - else: - avgQb=-1 - - major_orientation=max([FF,FR,RF,RR]) - if not discs: - posA=max_a - posB=min_b - else: - if library_stats["Orientation"] == "innie": - if major_orientation == FF: - posA=max_a - posB=max_b - elif major_orientation == FR: - posA=max_a - posB=min_b - elif major_orientation == RF: - posA=min_a - posB=max_b - else: - posA=min_a - posB=min_b - else: - if major_orientation == FF: - posA=min_a - posB=min_b - elif major_orientation == FR: - posA=min_a - posB=max_b - elif major_orientation == RF: - posA=max_a - posB=min_b - else: - posA=max_a - posB=max_b - analysed_signals={"posA":posA,"posB":posB,"min_A":min_a,"max_A":max_a,"min_B":min_b,"max_B":max_b,"QA":avgQa,"QB":avgQb,"FF":FF,"FR":FR,"RF":RF,"RR":RR,"splitsINV":splitsINV,"splits":splits,"discs":discs,"signals":candidate_signals} - return(analysed_signals) - - -def generate_clusters(chrA,chrB,coordinates,library_stats,args): - candidates=[] - coordinates=coordinates[numpy.lexsort((coordinates[:,1],coordinates[:,0]))] - 
db=DBSCAN.main(coordinates[:,0:2],args.e,int(round(args.l+library_stats["ploidies"][chrA]/(args.n*10)))) - unique_labels = set(db) - - for var in unique_labels: - if var == -1: - continue - class_member_mask = (db == var) - candidate_signals =coordinates[class_member_mask] - resolution=candidate_signals[:,-2] - support=len(set(candidate_signals[:,-1])) - discordants=True - if len(set(resolution)) == 1 and max(resolution) == 1: - disordants=False - - if discordants and support >= args.p: - candidates.append( analyse_pos(candidate_signals,discordants,library_stats,args) ) - elif not discordants and support >= args.r and chrA == chrB: - candidates.append( analyse_pos(candidate_signals,discordants,library_stats,args) ) - - return(candidates) - -def retrieve_coverage(chromosome,start,end,coverage_data): - start_index=int(math.floor(start/100.0)) - end_index=int(math.floor(end/100.0))+1 - regional_coverage=coverage_data[chromosome][start_index:end_index,0] - coverage=numpy.average(regional_coverage[numpy.where(regional_coverage > -1)]) - max_coverage=max(regional_coverage[numpy.where(regional_coverage > -1)]) - quality=numpy.average(coverage_data[chromosome][start_index:end_index,1]) - return (coverage,max_coverage,int(round(quality))) - - -#Find the number of discordant pairs within a region -def retrieve_discs(chromosome,start,end,coverage_data): - discs=0 - start_index=int(math.floor(start/100.0)) - end_index=int(math.floor(end/100.0))+1 - discs=sum(coverage_data[chromosome][start_index:end_index,2]) - return(discs) - - - -def Part(a, b, sizeA, sizeB, gap, insert_mean, insert_stddev, coverage, readLength ): - - readfrequency = 2 * readLength / coverage; - expr1 = (min([sizeA, sizeB]) - (readLength - 0)) / readfrequency * norm.cdf(a, 0, 1); - expr2 = -(- 0 ) / readfrequency * norm.cdf(b, 0, 1); - expr3 = (b * insert_stddev) / readfrequency * (norm.cdf(b, 0, 1) - norm.cdf(a, 0, 1)); - expr4 = (insert_stddev / readfrequency) * (norm.pdf(b, 0, 1) - norm.pdf(a, 0, 1)); - value = expr1 + expr2 + expr3 + expr4 - return(value) - -def expected_links(coverageA,coverageB,sizeA,sizeB,gap,insert_mean,insert_stddev,readLength): - coverage=numpy.average([coverageA,coverageB]) - - b1 = (sizeA + sizeB + gap - insert_mean) / float(insert_stddev) - a1 = (max([sizeA, sizeB]) + gap + readLength - insert_mean) / float(insert_stddev) - b2 = (min([sizeA, sizeB]) + gap + readLength - insert_mean) / float(insert_stddev) - a2 = (gap + 2 * readLength - insert_mean) / float(insert_stddev) - - e = Part(a1, b1, sizeA, sizeB, gap, insert_mean, insert_stddev, coverage, readLength ) - Part(a2, b2, sizeA, sizeB, gap, insert_mean, insert_stddev, coverage, readLength) - return(e) - -#determine the variant type -def fetch_variant_type(chrA,chrB,candidate,args,library_stats): - variant_type="SVTYPE=BND" - var="N[{}:{}[".format(chrB,candidate["posB"]) - if not library_stats["ploidies"][chrA] and chrA == chrB: - variant_type="SVTYPE=DUP" - var="" - - if chrA == chrB and library_stats["ploidies"][chrA]: - ploidy=library_stats["ploidies"][chrA] - if ploidy > 10: - if candidate["discs"] and abs(candidate["covM"]/library_stats["chr_cov"][chrA]-1) < 0.05: - if candidate["FF"] + candidate["RR"] > candidate["RF"] + candidate["FR"]: - variant_type="SVTYPE=INV" - var="" - elif not candidate["discs"] and abs(candidate["covM"]/library_stats["chr_cov"][chrA]-1) < 0.05: - if candidate["splitsINV"] > candidate["splits"]-candidate["splitsINV"]: - variant_type="SVTYPE=INV" - var="" - elif candidate["covM"]/library_stats["chr_cov"][chrA]-1 > 0.05: - 
variant_type="SVTYPE=DUP" - var="" - elif candidate["covM"]/library_stats["chr_cov"][chrA]-1 < -0.05: - variant_type="SVTYPE=DEL" - var="" - - elif candidate["discs"]: - if candidate["FF"] + candidate["RR"] > candidate["RF"] + candidate["FR"]: - variant_type="SVTYPE=INV" - var="" - elif library_stats["Orientation"] == "innie": - if candidate["covM"]/library_stats["chr_cov"][chrA] > (ploidy+0.5)/float(ploidy): - variant_type="SVTYPE=DUP" - var="" - if candidate["RF"] > candidate["FR"]: - variant_type="SVTYPE=TDUP" - var="" - elif candidate["covM"]/library_stats["chr_cov"][chrA] < (ploidy-0.5)/float(ploidy): - variant_type="SVTYPE=DEL" - var="" - - else: - if candidate["covM"]/library_stats["chr_cov"][chrA] > (ploidy+0.5)/float(ploidy): - variant_type="SVTYPE=DUP" - var="" - if candidate["RF"] < candidate["FR"]: - variant_type="SVTYPE=TDUP" - var="" - - elif candidate["covM"]/library_stats["chr_cov"][chrA] < (ploidy-0.5)/float(ploidy): - variant_type="SVTYPE=DEL" - var="" - else: - if candidate["splitsINV"] > candidate["splits"]-candidate["splitsINV"]: - variant_type="SVTYPE=INV" - var="" - elif candidate["covM"]/library_stats["chr_cov"][chrA] >(ploidy+0.5)/float(ploidy): - variant_type="SVTYPE=DUP" - var="" - elif candidate["covM"]/library_stats["chr_cov"][chrA] < (ploidy-0.5)/float(ploidy): - variant_type="SVTYPE=DEL" - var="" - - if candidate["discs"]: - if candidate["e2"]*1.6 <= (candidate["discs"]): - GT="1/1" - else: - GT="0/1" - else: - if candidate["e2"]*1.6 <= (candidate["splits"]): - GT="1/1" - else: - GT="0/1" - - if "DUP" in var or var == "": - if var == "DEL" and candidate["covM"]/library_stats["chr_cov"][chrA] < 0.1: - GT="1/1" - elif var == "DUP" and candidate["covM"]/library_stats["chr_cov"][chrA] > 1.8: - GT="1/1" - - return(var,variant_type,GT) - -#compute the filters -def fetch_filter(chrA,chrB,candidate,args,library_stats): - filt="PASS" - - #Less than the expected number of signals - if candidate["discs"] and not ( candidate["splits"] and abs(candidate["posA"]-candidate["posB"]) < 3*library_stats["STDInsertSize"] and chrA == chrB ): - if candidate["e1"]*0.6 >= candidate["discs"]+candidate["splits"]: - filt = "BelowExpectedLinks" - elif candidate["e2"]*library_stats["ReadLength"]/float(library_stats["MeanInsertSize"]) >= candidate["discs"]+candidate["splits"]: - filt = "BelowExpectedLinks" - else: - if candidate["e1"]*0.4 >= (candidate["splits"]+candidate["discs"]): - filt = "BelowExpectedLinks" - #The ploidy of this contig is 0, hence there shoud be no variant here - if library_stats["ploidies"][chrA] == 0 or library_stats["ploidies"][chrB] == 0: - return("Ploidy") - - #coverage is too high - if candidate["MaxcovA"] >= library_stats["chr_cov"][chrA]*(library_stats["ploidies"][chrA]+2) or candidate["MaxcovB"] >= library_stats["chr_cov"][chrB]*(library_stats["ploidies"][chrB]+2): - filt = "UnexpectedCoverage" - elif candidate["discsA"] > (candidate["discs"]+candidate["splits"])*(library_stats["ploidies"][chrA]+1) or candidate["discsB"] > (candidate["discs"]+candidate["splits"])*(library_stats["ploidies"][chrA]+1): - filt= "FewLinks" - elif chrA == chrB and candidate["max_A"] > candidate["min_B"]: - filt = "Smear" - elif chrA != chrB or (abs(candidate["posA"]-candidate["posB"]) > library_stats["MeanInsertSize"]+3*library_stats["STDInsertSize"] ): - if not candidate["discs"] or candidate["discs"]*4 < candidate["splits"] or candidate["discs"] <= args.p/2: - filt= "SplitsVSDiscs" - elif candidate["QRA"] < args.Q or candidate["QRB"] < args.Q: - filt = "RegionalQ" - - #fewer 
links than expected - if chrA == chrB and (abs(candidate["max_A"]-candidate["min_B"]) < args.z): - filt="MinSize" - - return(filt) - -#split inversions into two separate calls -def redefine_inv(vcf_line,signals,library_stats,args): - analysed_signals=analyse_pos(signals,True,library_stats,args) - vcf_line[1]=analysed_signals["posA"] - vcf_line[7]=vcf_line[7].split(";END=")[0]+";END="+str(analysed_signals["posB"]) +";SVLEN=" + vcf_line[7].split(";SVLEN=")[-1] - vcf_line[7]=vcf_line[7].split("SVLEN=")[0]+"SVLEN="+str(analysed_signals["posB"]-analysed_signals["posA"]+1) +";COVM=" + vcf_line[7].split(";COVM=")[-1] - - vcf_line[7]=vcf_line[7].split("CIPOS=")[0]+"CIPOS={},{}".format(analysed_signals["min_A"]-analysed_signals["posA"],analysed_signals["max_A"]-analysed_signals["posA"]) + ";CIEND" + vcf_line[7].split(";CIEND")[-1] - vcf_line[7]=vcf_line[7].split("CIEND=")[0]+"CIEND={},{}".format(analysed_signals["min_B"]-analysed_signals["posB"],analysed_signals["max_B"]-analysed_signals["posB"]) + ";END=" + vcf_line[7].split(";END=")[-1] - return(vcf_line) - -def inv_recluster(vcf_line,candidate,library_stats,args): - RR=[] - FF=[] - - for signal in candidate["signals"]: - if( not signal[2] and not signal[4] ): - RR.append(signal) - else: - FF.append(signal) - - vcf_line_rr=copy.copy(vcf_line) - vcf_line_ff=copy.copy(vcf_line) - vcf_line_rr=redefine_inv(vcf_line_rr,numpy.array(RR),library_stats,args) - vcf_line_ff=redefine_inv(vcf_line_ff,numpy.array(FF),library_stats,args) - var_id=list(vcf_line_ff[2]) - var_id[-1]="2" - vcf_line_ff[2]="".join(var_id) - - return([vcf_line_rr,vcf_line_ff]) - -def generate_vcf_line(chrA,chrB,n,candidate,args,library_stats): - vcf_line=[] - if chrA == chrB and candidate["posA"] > candidate["posB"]: - candidate["posA"]=candidate["min_A"] - candidate["posB"]=candidate["max_B"] - - vcf_line.append(chrA) - vcf_line.append(candidate["posA"]) - vcf_line.append("SV_{}_1".format(n)) - vcf_line.append("N") - var,variant_type,GT=fetch_variant_type(chrA,chrB,candidate,args,library_stats) - vcf_line.append(var) - vcf_line.append(".") - vcf_line.append(fetch_filter(chrA,chrB,candidate,args,library_stats)) - INFO="{};CIPOS={},{};CIEND={},{}".format(variant_type,candidate["min_A"]-candidate["posA"],candidate["max_A"]-candidate["posA"],candidate["min_B"]-candidate["posB"],candidate["max_B"]-candidate["posB"]) - if chrA == chrB: - if not "BND" in variant_type: - INFO += ";END={};SVLEN={}".format(candidate["posB"],abs(candidate["posB"]-candidate["posA"]+1)) - INFO += ";COVM={}".format(candidate["covM"]) - stats=";COVA={};COVB={};LFA={};LFB={};LTE={};E1={};E2={};OR={},{},{},{};ORSR={},{};QUALA={};QUALB={}".format(candidate["covA"],candidate["covB"],int(round(candidate["discsA"])),int(round(candidate["discsB"])),candidate["discs"]+candidate["splits"],candidate["e1"],candidate["e2"],candidate["FF"],candidate["RR"] ,candidate["RF"],candidate["FR"],candidate["splitsINV"],candidate["splits"]-candidate["splitsINV"],candidate["QRA"],candidate["QRB"]) - - INFO+=stats - - vcf_line.append(INFO) - FORMAT_FORMAT="GT:CN:DV:RV" - vcf_line.append(FORMAT_FORMAT) - CN="." 
- if "DEL" in var or "DUP" in var: - CN=int(round(candidate["covM"]/(library_stats["Coverage"]/args.n))) - if "DEL" in var: - CN=library_stats["ploidies"][chrA]-CN - if CN < 0: - CN=library_stats["ploidies"][chrA] - - FORMAT_STR="{}:{}:{}:{}".format(GT,CN,candidate["discs"],candidate["splits"]) - vcf_line.append(FORMAT_STR) - if not "BND" in variant_type and not "INV" in variant_type: - return([vcf_line]) - elif "INV" in variant_type: - if candidate["FF"] and candidate["RR"] and candidate["RR"] > args.p/2 and candidate["FF"] > args.p/2 and candidate["discs"]: - return(inv_recluster(vcf_line,candidate,library_stats,args)) - else: - return([vcf_line]) - else: - vcf_line_a=copy.copy(vcf_line) - vcf_line_b=copy.copy(vcf_line) - inverted=False - before=True - if candidate["posA"] == candidate["max_A"]: - before=False - - if candidate["FF"] + candidate["RR"] > candidate["RF"] + candidate["FR"]: - inverted=True - - if not inverted and not before: - alt_str_a="N[{}:{}[".format(chrB,candidate["posB"]) - alt_str_b="]{}:{}]N".format(chrA,candidate["posA"]) - elif not inverted and before: - alt_str_a="]{}:{}]N".format(chrB,candidate["posB"]) - alt_str_b="N[{}:{}[".format(chrA,candidate["posA"]) - elif inverted and not before: - alt_str_a="N]{}:{}]".format(chrB,candidate["posB"]) - alt_str_b="[{}:{}[N".format(chrA,candidate["posA"]) - else: - alt_str_a="[{}:{}[N".format(chrB,candidate["posB"]) - alt_str_b="N]{}:{}]".format(chrA,candidate["posA"]) - - vcf_line_a[4]=alt_str_a - vcf_line_b[4]=alt_str_b - vcf_line_b[2]="SV_{}_2".format(n) - vcf_line_b[0]=chrB - vcf_line_b[1]=candidate["posB"] - return([vcf_line_a,vcf_line_b]) - -#normalise the coverage based on GC content -def gc_norm(args,median_coverage,normalising_chromosomes,coverage_data,Ncontent): - gc_vals=[0. 
, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1] - gc_dict={} - - for gc in gc_vals: - tmp=[] - gc_dict[gc]=median_coverage - for chromosome in normalising_chromosomes: - selected= Ncontent[chromosome] == gc - selected_coverage=coverage_data[chromosome][selected] - tmp+=list(selected_coverage[ numpy.where( selected_coverage[:,1] > args.q)[0] ,0]) - if len(tmp): - gc_dict[gc]=numpy.median(tmp) - if 0 == gc_dict[gc]: - gc_dict[gc]=median_coverage - - for chromosome in coverage_data: - for i in range(0,len(coverage_data[chromosome])): - if not Ncontent[chromosome][i] == -1: - coverage_data[chromosome][i,0]=median_coverage*coverage_data[chromosome][i,0]/gc_dict[ Ncontent[chromosome][i] ] - - return(coverage_data) - -#estimate the ploidy of each chromosome -def determine_ploidy(args,chromosomes,coverage_data,Ncontent,sequence_length,library_stats): - library_stats["chr_cov"]={} - ploidies={} - avg_coverage=[] - cov=[] - for chromosome in chromosomes: - try: - N_count=Ncontent[chromosome] - chr_cov=coverage_data[chromosome][numpy.where( (N_count > 0) & (coverage_data[chromosome][:,1] > args.q) ),0][0] - if len(chr_cov): - chromosomal_average=numpy.median(chr_cov) - cov+= list(chr_cov) - else: - chromosomal_average=0 - library_stats["chr_cov"][chromosome]=chromosomal_average - - except: - print "error: reference mismatch!" 
- print "make sure that the contigs of the bam file and the reference match" - quit() - - cov=numpy.array(cov) - if len(cov): - coverage_norm=numpy.median(cov) - else: - coverage_norm=1 - coverage_data=gc_norm(args,coverage_norm,chromosomes,coverage_data,Ncontent) - - chromosomal_average=0 - outfile=open(args.o+".ploidy.tab", 'w') - outfile.write("Contig\tploidy_rounded\tploidy_raw\tmedian_coverage\n") - for chromosome in chromosomes: - N_count=Ncontent[chromosome] - cov=coverage_data[chromosome][numpy.where( (N_count > -1) & ( (coverage_data[chromosome][:,1] > args.q) | (coverage_data[chromosome][:,1] == 0) ) ),0] - chromosomal_average=numpy.median(cov) - if not args.force_ploidy: - try: - ploidies[chromosome]=int(round((chromosomal_average)/coverage_norm*args.n)) - except: - ploidies[chromosome]=args.n - else: - ploidies[chromosome]=args.n - library_stats["chr_cov"][chromosome]=chromosomal_average - - outfile.write("{}\t{}\t{}\t{}\n".format(chromosome,ploidies[chromosome],round( library_stats["chr_cov"][chromosome]/coverage_norm*args.n,2),library_stats["chr_cov"][chromosome])) - - outfile.close() - return(ploidies,library_stats,coverage_data) - -#compute the GC content of the bins, and find which bins contain too many N -def retrieve_N_content(args): - if not args.ref.endswith(".gz"): - with open(args.ref, 'r') as f: - sequence=f.read().split(">") - else: - with gzip.open(args.ref, 'r') as f: - sequence=f.read().split(">") - - del sequence[0] - Ncontent={} - sequence_length={} - for chromosome in sequence: - content=chromosome.split("\n",1) - sequence=content[1].replace("\n","") - contig=content[0].split()[0] - regions=[sequence[i:i+100] for i in range(0, len(sequence), 100)] - Ncontent[contig]=[] - for region in regions: - region_upper=region.upper() - if region_upper.count("N")/100.0 > args.n_mask: - Ncontent[contig].append(-1) - else: - Ncontent[contig].append( round( (region_upper.count("G")+region_upper.count("C"))/100.0 ,2) ) - sequence_length[contig]=len(sequence) - Ncontent[contig]=numpy.array(Ncontent[contig]) - - - return(Ncontent,sequence_length) - -#main function -def cluster(args): - - Ncontent,sequence_length=retrieve_N_content(args) - coverage_data=coverage(args) - - conn = sqlite3.connect(args.o+".db") - args.c = conn.cursor() - - tableListQuery = "SELECT name FROM sqlite_master WHERE type=\'table\'" - args.c.execute(tableListQuery) - tables = map(lambda t: t[0], args.c.fetchall()) - if "TIDDITcall" in tables: - args.c.execute("DROP TABLE TIDDITcall") - A="CREATE TABLE TIDDITcall (chrA TEXT,chrB TEXT,posA INT,posB INT,forwardA INT,forwardB INT,qualA INT, qualB INT,cigarA TEXT,cigarB TEXT, resolution INT,name INT)" - args.c.execute(A) - header,chromosomes,library_stats=signals(args,coverage_data) - A="CREATE INDEX CHR ON TIDDITcall (chrA, chrB)" - args.c.execute(A) - - ploidies,library_stats,coverage_data=determine_ploidy(args,chromosomes,coverage_data,Ncontent,sequence_length,library_stats) - library_stats["ploidies"]=ploidies - - if not args.e: - args.e=int(math.sqrt(library_stats["MeanInsertSize"]*2)*12) - n=1 - print "clustering signals on chromosome:" - calls={} - for chrA in chromosomes: - calls[chrA] =[] - print "{}".format(chrA) - for chrB in chromosomes: - signal_data=numpy.array([ [hit[0],hit[1],hit[2],hit[3],hit[4],hit[5],hit[6],hit[7]] for hit in args.c.execute('SELECT posA,posB,forwardA,qualA,forwardB,qualB,resolution,name FROM TIDDITcall WHERE chrA == \'{}\' AND chrB == \'{}\''.format(chrA,chrB)).fetchall()]) - - if not len(signal_data): - continue - - 
candidates=generate_clusters(chrA,chrB,signal_data,library_stats,args) - - for i in range(0,len(candidates)): - candidates[i]["covA"],candidates[i]["MaxcovA"],candidates[i]["QRA"]=retrieve_coverage(chrA,candidates[i]["min_A"],candidates[i]["max_A"],coverage_data) - candidates[i]["covB"],candidates[i]["MaxcovB"],candidates[i]["QRB"]=retrieve_coverage(chrB,candidates[i]["min_B"],candidates[i]["max_B"],coverage_data) - if chrA == chrB: - if candidates[i]["posB"] > candidates[i]["posA"]: - candidates[i]["covM"],candidates[i]["MaxcovM"],candidates[i]["QRM"]=retrieve_coverage(chrB,candidates[i]["posA"],candidates[i]["posB"],coverage_data) - else: - candidates[i]["covM"],candidates[i]["MaxcovM"],candidates[i]["QRM"]=retrieve_coverage(chrB,candidates[i]["posB"],candidates[i]["posA"],coverage_data) - else: - candidates[i]["covM"]=0 - candidates[i]["QRM"]=0 - candidates[i]["discsA"]=retrieve_discs(chrA,candidates[i]["min_A"],candidates[i]["max_A"],coverage_data) - candidates[i]["discsB"]=retrieve_discs(chrB,candidates[i]["min_B"],candidates[i]["max_B"],coverage_data) - sizeA=candidates[i]["max_A"]-candidates[i]["min_A"]+library_stats["ReadLength"] - sizeB=candidates[i]["max_B"]-candidates[i]["min_B"]+library_stats["ReadLength"] - gap=[] - for signal in candidates[i]["signals"]: - gap.append(abs(sizeA-(signal[0]-candidates[i]["min_A"])+(signal[1]-candidates[i]["min_B"]))) - gap=numpy.average(gap) - if sizeA > library_stats["MeanInsertSize"] or sizeB > library_stats["MeanInsertSize"]: - gap=1 - elif gap > sizeA: - gap = gap - sizeA - else: - gap=1 - if ploidies[chrA]: - coverageA=candidates[i]["covA"]/ploidies[chrA] - else: - coverageA=candidates[i]["covA"]/float(args.n) - if ploidies[chrB]: - coverageB=candidates[i]["covB"]/ploidies[chrB] - else: - coverageB=candidates[i]["covB"]/float(args.n) - - if candidates[i]["discs"] and not ( candidates[i]["splits"] and abs(candidates[i]["posA"]-candidates[i]["posB"]) < 3*library_stats["STDInsertSize"] and chrA == chrB ): - candidates[i]["e1"]=int(round(expected_links(coverageA,coverageB,sizeA,sizeB,gap,library_stats["MeanInsertSize"],library_stats["STDInsertSize"],library_stats["ReadLength"]))) - candidates[i]["e2"]= int( round( (min([coverageA,coverageB])*(library_stats["MeanInsertSize"]-library_stats["ReadLength"]/3)) / (float(library_stats["ReadLength"]))/2.0 ) ) - else: - candidates[i]["e1"]=int(round( min([coverageA,coverageB])*0.75 )) - candidates[i]["e2"]=int(round( min([coverageA,coverageB])*0.75 )) - vcf_line=generate_vcf_line(chrA,chrB,n,candidates[i],args,library_stats) - - if len(vcf_line) == 1: - calls[chrA].append(vcf_line[0]) - else: - calls[chrA].append(vcf_line[0]) - if not chrB in calls: - calls[chrB]=[] - calls[chrB].append(vcf_line[1]) - n+=1 - - outfile=open(args.o+".vcf", 'w') - new_header=[] - for line in header.strip().split("\n"): - if "##TIDDITcmd" in line: - new_header.append("##TIDDITcmd=\"{}\"\n".format(" ".join(sys.argv))) - continue - new_header.append(line+"\n") - header="".join(new_header) - outfile.write(header) - for chromosome in chromosomes: - for call in sorted(calls[chromosome],key=lambda x: x[1]): - output=call - output[1]=str(output[1]) - if "MinSize" == output[6]: - continue - - output="\t".join(output)+"\n" - outfile.write(output) - - print "Work complete!" 
-    return()
-
diff --git a/src/TIDDIT_coverage.py b/src/TIDDIT_coverage.py
new file mode 100644
index 0000000..d6d8fc8
--- /dev/null
+++ b/src/TIDDIT_coverage.py
@@ -0,0 +1,128 @@
+import numpy
+
+#The functions in this script are used to read and process the coverage tab file, as well as to perform GC correction
+
+#read the coverage tab file
+def coverage(args):
+    coverage_data={}
+    for line in open(args.o+".tab"):
+        if line[0] == "#":
+            continue
+        content=line.strip().split()
+        if not content[0] in coverage_data:
+            coverage_data[content[0]]=[]
+        coverage_data[content[0]].append([ float(content[3]),float(content[4]),0 ])
+
+    for chromosome in coverage_data:
+        coverage_data[chromosome]=numpy.array(coverage_data[chromosome])
+
+    return(coverage_data)
+
+#estimate the ploidy of each chromosome
+def determine_ploidy(args,chromosomes,coverage_data,Ncontent,library_stats):
+    library_stats["chr_cov"]={}
+    ploidies={}
+    avg_coverage=[]
+    cov=[]
+    for chromosome in chromosomes:
+        try:
+            if args.ref:
+                N_count=Ncontent[chromosome]
+                chr_cov=coverage_data[chromosome][numpy.where( (N_count > 0) & (coverage_data[chromosome][:,1] > args.q) ),0][0]
+            else:
+                chr_cov=coverage_data[chromosome][numpy.where( coverage_data[chromosome][:,1] > args.q ),0][0]
+
+            if len(chr_cov):
+                chromosomal_average=numpy.median(chr_cov)
+                cov+= list(chr_cov)
+            else:
+                chromosomal_average=0
+            library_stats["chr_cov"][chromosome]=chromosomal_average
+
+        except:
+            print "error: reference mismatch!"
+            print "make sure that the contigs of the bam file and the reference match"
+            quit()
+
+    cov=numpy.array(cov)
+    if len(cov):
+        coverage_norm=numpy.median(cov)
+    else:
+        coverage_norm=1
+
+    if args.ref:
+        coverage_data=gc_norm(args,coverage_norm,chromosomes,coverage_data,Ncontent)
+
+    chromosomal_average=0
+    outfile=open(args.o+".ploidy.tab", 'w')
+    outfile.write("Contig\tploidy_rounded\tploidy_raw\tmedian_coverage\n")
+    for chromosome in chromosomes:
+
+        if args.ref:
+            N_count=Ncontent[chromosome]
+            cov=coverage_data[chromosome][numpy.where( (N_count > -1) & ( (coverage_data[chromosome][:,1] > args.q) | (coverage_data[chromosome][:,1] == 0) ) ),0]
+        else:
+            cov=coverage_data[chromosome][numpy.where( (coverage_data[chromosome][:,1] > args.q) | (coverage_data[chromosome][:,1] == 0) ),0]
+
+        chromosomal_average=numpy.median(cov)
+        if not args.force_ploidy:
+            try:
+                ploidies[chromosome]=int(round((chromosomal_average)/coverage_norm*args.n))
+            except:
+                ploidies[chromosome]=args.n
+        else:
+            ploidies[chromosome]=args.n
+        library_stats["chr_cov"][chromosome]=chromosomal_average
+
+        outfile.write("{}\t{}\t{}\t{}\n".format(chromosome,ploidies[chromosome],round( library_stats["chr_cov"][chromosome]/coverage_norm*args.n,2),library_stats["chr_cov"][chromosome]))
+
+    outfile.close()
+    return(ploidies,library_stats,coverage_data)
+
+#normalise the coverage based on GC content
+def gc_norm(args,median_coverage,normalising_chromosomes,coverage_data,Ncontent):
+    gc_vals=[0., 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1]
+    gc_dict={}
+
+    for gc in gc_vals:
+        tmp=[]
+        gc_dict[gc]=median_coverage
+        for chromosome in normalising_chromosomes:
+            selected= Ncontent[chromosome] == gc
+            selected_coverage=coverage_data[chromosome][selected]
+            tmp+=list(selected_coverage[ numpy.where( selected_coverage[:,1] > args.q)[0] ,0])
+        if len(tmp):
+            gc_dict[gc]=numpy.median(tmp)
+            if 0 == gc_dict[gc]:
+                gc_dict[gc]=median_coverage
+
+    for chromosome in coverage_data:
+        for i in range(0,len(coverage_data[chromosome])):
+            if not Ncontent[chromosome][i] == -1:
+                coverage_data[chromosome][i,0]=median_coverage*coverage_data[chromosome][i,0]/gc_dict[ Ncontent[chromosome][i] ]
+
+    return(coverage_data)
+
+#load the GC tab, and find which bins contain too many N
+def retrieve_N_content(args):
+
+    Ncontent={}
+    for line in open(args.o+".gc.tab"):
+        if line[0] == "#":
+            continue
+
+        content=line.strip().split()
+        if not content[0] in Ncontent:
+            Ncontent[ content[0] ] = []
+        contig=content[0]
+        gc=round(float(content[-2]),2)
+        n=float(content[-1])
+        if n > args.n_mask:
+            Ncontent[contig].append(-1)
+        else:
+            Ncontent[contig].append( gc )
+
+    for contig in Ncontent:
+        Ncontent[contig]=numpy.array(Ncontent[contig])
+
+    return(Ncontent)
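
For readers unfamiliar with median-based GC correction, the idea behind gc_norm above fits in a few lines. The following is a minimal, self-contained sketch with made-up coverage values and only two GC strata; the real module works on 100 bp bins and the Ncontent arrays parsed from the gc tab file:

    import numpy

    # Toy median-based GC correction: each bin has a raw coverage and a GC
    # fraction; bins are rescaled so that every GC stratum ends up with the
    # same median as the genome-wide median.
    coverage = numpy.array([30.0, 32.0, 28.0, 15.0, 16.0, 14.0])
    gc = numpy.array([0.4, 0.4, 0.4, 0.6, 0.6, 0.6])

    genome_median = numpy.median(coverage)  # 22.0 for these toy values
    corrected = coverage.copy()
    for stratum in numpy.unique(gc):
        mask = gc == stratum
        stratum_median = numpy.median(coverage[mask])
        if stratum_median > 0:
            corrected[mask] = genome_median * coverage[mask] / stratum_median

    print(corrected)  # both strata are now centred on ~22x

After this correction, a whole-chromosome median that is roughly half of the genome-wide median maps to ploidy 1 in determine_ploidy, a median close to the genome-wide value maps to ploidy args.n, and so on.
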
diff --git a/src/TIDDIT_filtering.py b/src/TIDDIT_filtering.py
new file mode 100644
index 0000000..bc256e2
--- /dev/null
+++ b/src/TIDDIT_filtering.py
@@ -0,0 +1,144 @@
+import numpy
+from scipy.stats import norm
+
+#This script contains functions for applying and computing the filters and statistics
+def expected_links(coverageA,coverageB,sizeA,sizeB,gap,insert_mean,insert_stddev,readLength):
+    coverage=numpy.average([coverageA,coverageB])
+
+    b1 = (sizeA + sizeB + gap - insert_mean) / float(insert_stddev)
+    a1 = (max([sizeA, sizeB]) + gap + readLength - insert_mean) / float(insert_stddev)
+    b2 = (min([sizeA, sizeB]) + gap + readLength - insert_mean) / float(insert_stddev)
+    a2 = (gap + 2 * readLength - insert_mean) / float(insert_stddev)
+
+    e = Part(a1, b1, sizeA, sizeB, gap, insert_mean, insert_stddev, coverage, readLength ) - Part(a2, b2, sizeA, sizeB, gap, insert_mean, insert_stddev, coverage, readLength)
+    return(e)
+
+#compute the filters
+def fetch_filter(chrA,chrB,candidate,args,library_stats):
+    filt="PASS"
+
+    #Less than the expected number of signals
+    if candidate["discs"] and not ( candidate["splits"] and abs(candidate["posA"]-candidate["posB"]) < 3*library_stats["STDInsertSize"] and chrA == chrB ):
+        if candidate["e1"]*0.6 >= candidate["discs"]+candidate["splits"]:
+            filt = "BelowExpectedLinks"
+        elif candidate["e2"]*library_stats["ReadLength"]/float(library_stats["MeanInsertSize"]) >= candidate["discs"]+candidate["splits"]:
+            filt = "BelowExpectedLinks"
+    else:
+        if candidate["e1"]*0.4 >= (candidate["splits"]+candidate["discs"]):
+            filt = "BelowExpectedLinks"
+
+    #The ploidy of this contig is 0, hence there should be no variant here
+    if library_stats["ploidies"][chrA] == 0 or library_stats["ploidies"][chrB] == 0:
+        return("Ploidy")
+
+    #coverage is too high
+    if candidate["MaxcovA"] >= library_stats["chr_cov"][chrA]*(library_stats["ploidies"][chrA]+2) or candidate["MaxcovB"] >= library_stats["chr_cov"][chrB]*(library_stats["ploidies"][chrB]+2):
+        filt = "UnexpectedCoverage"
+    elif candidate["discsA"] > (candidate["discs"]+candidate["splits"])*(library_stats["ploidies"][chrA]+1) or candidate["discsB"] > (candidate["discs"]+candidate["splits"])*(library_stats["ploidies"][chrA]+1):
+        filt= "FewLinks"
+    elif chrA == chrB and candidate["max_A"] > candidate["min_B"]:
+        filt = "Smear"
+    elif chrA != chrB or (abs(candidate["posA"]-candidate["posB"]) > library_stats["MeanInsertSize"]+3*library_stats["STDInsertSize"] ):
+        if not candidate["discs"] or candidate["discs"]*4 < candidate["splits"] or candidate["discs"] <= args.p/2:
+            filt= "SplitsVSDiscs"
+    elif candidate["QRA"] < args.Q or candidate["QRB"] < args.Q:
+        filt = "RegionalQ"
+
+    #the variant is smaller than the minimum size
+    if chrA == chrB and (abs(candidate["max_A"]-candidate["min_B"]) < args.z):
+        filt="MinSize"
+
+    return(filt)
+
+#determine the variant type
+def fetch_variant_type(chrA,chrB,candidate,args,library_stats):
+    variant_type="SVTYPE=BND"
+    var="N[{}:{}[".format(chrB,candidate["posB"])
+    if not library_stats["ploidies"][chrA] and chrA == chrB:
+        variant_type="SVTYPE=DUP"
+        var="<DUP>"
+
+    if chrA == chrB and library_stats["ploidies"][chrA]:
+        ploidy=library_stats["ploidies"][chrA]
+        if ploidy > 10:
+            if candidate["discs"] and abs(candidate["covM"]/library_stats["chr_cov"][chrA]-1) < 0.05:
+                if candidate["FF"] + candidate["RR"] > candidate["RF"] + candidate["FR"]:
+                    variant_type="SVTYPE=INV"
+                    var="<INV>"
+            elif not candidate["discs"] and abs(candidate["covM"]/library_stats["chr_cov"][chrA]-1) < 0.05:
+                if candidate["splitsINV"] > candidate["splits"]-candidate["splitsINV"]:
+                    variant_type="SVTYPE=INV"
+                    var="<INV>"
+            elif candidate["covM"]/library_stats["chr_cov"][chrA]-1 > 0.05:
+                variant_type="SVTYPE=DUP"
+                var="<DUP>"
+            elif candidate["covM"]/library_stats["chr_cov"][chrA]-1 < -0.05:
+                variant_type="SVTYPE=DEL"
+                var="<DEL>"
+
+        elif candidate["discs"]:
+            if candidate["FF"] + candidate["RR"] > candidate["RF"] + candidate["FR"]:
+                variant_type="SVTYPE=INV"
+                var="<INV>"
+            elif library_stats["Orientation"] == "innie":
+                if candidate["covM"]/library_stats["chr_cov"][chrA] > (ploidy+0.5)/float(ploidy):
+                    variant_type="SVTYPE=DUP"
+                    var="<DUP>"
+                    if candidate["RF"] > candidate["FR"]:
+                        variant_type="SVTYPE=TDUP"
+                        var="<TDUP>"
+                elif candidate["covM"]/library_stats["chr_cov"][chrA] < (ploidy-0.5)/float(ploidy):
+                    variant_type="SVTYPE=DEL"
+                    var="<DEL>"
+
+            else:
+                if candidate["covM"]/library_stats["chr_cov"][chrA] > (ploidy+0.5)/float(ploidy):
+                    variant_type="SVTYPE=DUP"
+                    var="<DUP>"
+                    if candidate["RF"] < candidate["FR"]:
+                        variant_type="SVTYPE=TDUP"
+                        var="<TDUP>"
+
+                elif candidate["covM"]/library_stats["chr_cov"][chrA] < (ploidy-0.5)/float(ploidy):
+                    variant_type="SVTYPE=DEL"
+                    var="<DEL>"
+        else:
+            if candidate["splitsINV"] > candidate["splits"]-candidate["splitsINV"]:
+                variant_type="SVTYPE=INV"
+                var="<INV>"
+            elif candidate["covM"]/library_stats["chr_cov"][chrA] > (ploidy+0.5)/float(ploidy):
+                variant_type="SVTYPE=DUP"
+                var="<DUP>"
+            elif candidate["covM"]/library_stats["chr_cov"][chrA] < (ploidy-0.5)/float(ploidy):
+                variant_type="SVTYPE=DEL"
+                var="<DEL>"
+
+    if candidate["discs"]:
+        if candidate["e2"]*1.6 <= (candidate["discs"]):
+            GT="1/1"
+        else:
+            GT="0/1"
+    else:
+        if candidate["e2"]*1.6 <= (candidate["splits"]):
+            GT="1/1"
+        else:
+            GT="0/1"
+
+    if "DUP" in var or var == "<DEL>":
+        if var == "<DEL>" and candidate["covM"]/library_stats["chr_cov"][chrA] < 0.1:
+            GT="1/1"
+        elif "DUP" in var and candidate["covM"]/library_stats["chr_cov"][chrA] > 1.8:
+            GT="1/1"
+
+    return(var,variant_type,GT)
+
+def Part(a, b, sizeA, sizeB, gap, insert_mean, insert_stddev, coverage, readLength ):
+
+    readfrequency = 2 * readLength / coverage
+    expr1 = (min([sizeA, sizeB]) - (readLength - 0)) / readfrequency * norm.cdf(a, 0, 1)
+    expr2 = -(- 0 ) / readfrequency * norm.cdf(b, 0, 1)
+    expr3 = (b * insert_stddev) / readfrequency * (norm.cdf(b, 0, 1) - norm.cdf(a, 0, 1))
+    expr4 = (insert_stddev / readfrequency) * (norm.pdf(b, 0, 1) - norm.pdf(a, 0, 1))
+    value = expr1 + expr2 + expr3 + expr4
+    return(value)
+
+
diff --git a/src/TIDDIT_signals.py b/src/TIDDIT_signals.py
new file mode 100644
index 0000000..61c0386
--- /dev/null
+++ b/src/TIDDIT_signals.py
@@ -0,0 +1,173 @@
+import itertools
+import sqlite3
+import math
+
+#The functions of this script are used to read the input signals, and to create the database of signals
+
+#extract the vcf header of the signals tab file
+def find_contigs(header):
+    chromosomes=[]
+    library_stats={}
+    chromosome_len={}
+    for line in header.split("\n"):
+        if "##contig=<ID=" in line:
+            chromosomes.append( line.split("ID=")[-1].split(",")[0] )
+            chromosome_len[ chromosomes[-1] ]=int(line.split("length=")[-1].split(">")[0])
+        if "##LibraryStats=" in line:
+            stats=line.strip().split(" ")
+            library_stats={"Coverage":float(stats[1].split("=")[-1]),"ReadLength":int(stats[2].split("=")[-1]),"MeanInsertSize":int(stats[3].split("=")[-1]),"STDInsertSize":int(stats[4].split("=")[-1]),"Orientation":stats[5].split("=")[-1]}
+    return (chromosomes,library_stats,chromosome_len)
+
+#Read the signals tab file
+def signals(args,coverage_data):
+    signal_data=[]
+    header=""
+    first_signal=True
+    read_list={}
+    n_reads=0
+    for line in open(args.o+".signals.tab"):
+        if line[0] == "#":
+            header += line
+            continue
+
+        if first_signal:
+            chromosomes,library_stats,chromosome_len=find_contigs(header)
+            first_signal=False
+
+        content=line.strip().split()
+
+        read_name=content[0]
+        if read_name in read_list:
+            name=read_list[read_name]
+        else:
+            name=n_reads
+            read_list[read_name]=n_reads
+            n_reads+=1
+
+        chrA=content[1]
+        posA=int(content[2])
+        if content[3] == "+":
+            orientationA=1
+        else:
+            orientationA=0
+
+        cigarA=content[4]
+        if ("S" in cigarA or "H" in cigarA) and not cigarA == "NA":
+            deletions,insertions,length,clip_after,aligned_range=read_cigar(cigarA)
+            if clip_after:
+                posA+=length
+        else:
+            if "M" in cigarA:
+                deletions,insertions,length,clip_after,aligned_range=read_cigar(cigarA)
+            else:
+                length=library_stats["ReadLength"]
+
+            if library_stats["Orientation"] == "innie":
+                if orientationA:
+                    posA+=length-1
+            else:
+                if not orientationA:
+                    posA+=length-1
+
+        if posA >= chromosome_len[chrA]:
+            posA=chromosome_len[chrA]-1
+
+        qualA=int(content[5])
+
+        chrB=content[6]
+        posB=int(content[7])
+        if content[8] == "+":
+            orientationB=1
+        else:
+            orientationB=0
+        cigarB=content[9]
+        qualB=int(content[10])
+
+        if ("S" in cigarB or "H" in cigarB) and not cigarB == "NA":
+            deletions,insertions,length,clip_after,aligned_range=read_cigar(cigarB)
+            if clip_after:
+                posB+=length
+        else:
+            if "M" in cigarB:
+                deletions,insertions,length,clip_after,aligned_range=read_cigar(cigarB)
+            else:
+                length=library_stats["ReadLength"]
+
+            if library_stats["Orientation"] == "innie":
+                if orientationB:
+                    posB+=length-1
+            else:
+                if not orientationB:
+                    posB+=length-1
+
+        if posB >= chromosome_len[chrB]:
+            posB=chromosome_len[chrB]-1
+
+        resolution=int(content[-1])
+
+        if chrA > chrB or (posB < posA and chrA == chrB):
+            signal_data.append([chrB,chrA,posB,posA,orientationB,orientationA,qualB,qualA,cigarB,cigarA,resolution,name])
+        else:
+            signal_data.append([chrA,chrB,posA,posB,orientationA,orientationB,qualA,qualB,cigarA,cigarB,resolution,name])
+
+        if len(signal_data) > 1000000:
+            args.c.executemany('INSERT INTO TIDDITcall VALUES (?,?,?,?,?,?,?,?,?,?,?,?)',signal_data)
+            signal_data=[]
+
+        idx_b=int(math.floor(posB/100.0))
+        if idx_b >= len(coverage_data[chrB]):
+            idx_b=len(coverage_data[chrB])-1
+        coverage_data[chrB][idx_b,2]+=1
+
+        idx_a=int(math.floor(posA/100.0))
+        if idx_a >= len(coverage_data[chrA]):
+            idx_a=len(coverage_data[chrA])-1
+        coverage_data[chrA][idx_a,2]+=1
+
+    if len(signal_data):
+        args.c.executemany('INSERT INTO TIDDITcall VALUES (?,?,?,?,?,?,?,?,?,?,?,?)',signal_data)
+    return(header,chromosomes,library_stats)
+
+#analyse the cigar operation of the read
+def read_cigar(cigar):
+    deletions=0
+    insertions=0
+    SC = ["".join(x) for _, x in itertools.groupby(cigar, key=str.isdigit)]
+    length=0
+    first=True
+    clip_after=True
+
+    aligned_range=[]
+    current_pos=1
+    for i in range(0,len(SC)/2):
+        if first and SC[i*2+1] == "M":
+            first = False
+        elif first and SC[i*2+1] == "S":
+            first = False
+            clip_after=False
+
+        if SC[i*2+1] == "M":
+            length += int( SC[i*2] )
+            bases=range(0,int( SC[i*2] ))
+            for j in range(0,len(bases)):
+                bases[j] += current_pos
+            aligned_range += bases
+            current_pos += int( SC[i*2] )
+        elif SC[i*2+1] == "I":
+            insertions+=1
+            bases=range(0,int( SC[i*2] ))
+            for j in range(0,len(bases)):
+                bases[j] += current_pos
+            aligned_range += bases
+
+            current_pos += int( SC[i*2] )
+        elif SC[i*2+1] == "D":
+            length += int( SC[i*2] )
+            deletions +=1
+        else:
+            current_pos += int( SC[i*2] )
+
+    return deletions,insertions,length,clip_after,aligned_range
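
read_cigar above works on a tokenised CIGAR string. A small illustration of the tokenisation and of the reference-consuming length it accumulates (M and D advance the reference position; S and I do not); the CIGAR string itself is an arbitrary example:

    import itertools

    # itertools.groupby on str.isdigit splits a CIGAR string into
    # alternating count/operation tokens, exactly as in read_cigar.
    cigar = "20S75M5D25M"
    tokens = ["".join(x) for _, x in itertools.groupby(cigar, key=str.isdigit)]
    print(tokens)  # ['20', 'S', '75', 'M', '5', 'D', '25', 'M']

    # Reference-consuming length in the spirit of read_cigar:
    length = sum(int(tokens[i]) for i in range(0, len(tokens), 2)
                 if tokens[i + 1] in ("M", "D"))
    print(length)  # 105 reference bases; the leading 20S also means the
                   # clip is before the alignment, i.e. clip_after=False

This is how a split-read signal with a trailing clip gets its breakpoint shifted to the end of the aligned block (posA+=length), while a leading clip leaves the breakpoint at the alignment start.
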
diff --git a/src/common.h b/src/common.h
index 4c32e45..f62f465 100644
--- a/src/common.h
+++ b/src/common.h
@@ -54,20 +54,17 @@ static inline std::string package_description() {
 
 enum readStatus {unmapped, lowQualty, singleton, pair_wrongChrs, pair_proper, pair_wrongDistance, pair_wrongOrientation};
 
-
-
 class Cov{
 public:
-	//the vraiables used in the coverage calculation function
-	ofstream coverageOutput;
+
+	//the variables used in the coverage calculation function
+	ofstream coverageOutput;
 	int binSize;
-	int binStart;
+	int binStart;
 	int binEnd;
 	int currentChr;
-	bool discordants;
 	vector< vector<unsigned int> > coverageStructure;
 	vector< vector< vector<unsigned int> > > qualityStructure;
-	vector< vector<unsigned int> > discordantsStructure;
 	map<unsigned int,string> position2contig;
 	map<string,unsigned int> contig2position;
 	vector<unsigned int> contigLength;
@@ -76,9 +73,9 @@ class Cov{
 	string output;
 	//constructor
-	Cov(int binSize,string bamFile,string output,bool discordants);
+	Cov(int binSize,string bamFile,string output);
 	//module used to calculate the coverage of the genome
-	void bin(BamAlignment currentRead);
+	void bin(BamAlignment currentRead, readStatus alignmentStatus);
 	void printCoverage();
 };
@@ -161,12 +158,8 @@ static readStatus computeReadType(BamAlignment al, uint32_t max_insert, uint32_t
 	}
 }
 
-static LibraryStatistics computeLibraryStats(string bamFileName, uint64_t genomeLength, uint32_t max_insert, uint32_t min_insert,bool is_mp,int quality,string outputFileHeader) {
-
-	Cov *calculateCoverage;
-	calculateCoverage = new Cov(100,bamFileName,outputFileHeader,true);
-
-
+static LibraryStatistics computeLibraryStats(string bamFileName, uint64_t genomeLength, uint32_t max_insert, uint32_t min_insert,bool is_mp,int quality,string outputFileHeader, int sample) {
+
 	BamReader bamFile;
 	bamFile.Open(bamFileName);
 	LibraryStatistics library;
@@ -177,6 +170,7 @@ static LibraryStatistics computeLibraryStats(string bamFileName, uint64_t genome
 	uint32_t lowQualityReads = 0;
 	uint32_t mappedReads = 0;
 	uint64_t mappedReadsLength= 0;
+	int sampled = 0;
 
 	uint64_t insertsLength = 0; // total inserts length
@@ -218,10 +212,8 @@ static LibraryStatistics computeLibraryStats(string bamFileName, uint64_t genome
 	int32_t currentTid = -1;
 	int32_t iSize;
-
 	BamAlignment al;
-
 	while ( bamFile.GetNextAlignmentCore(al) ) {
 		reads ++;
 		readStatus read_status = computeReadType(al, max_insert, min_insert,is_mp);
@@ -229,22 +221,12 @@ static LibraryStatistics computeLibraryStats(string bamFileName, uint64_t genome
 		mappedReads ++;
 		mappedReadsLength += al.Length;
-		//calculate the coverage in bins of size 400 bases
-		calculateCoverage -> bin(al);
 	}
 
-	vector<int> clipSizes;
-	vector< int > readPositions;
-	vector<int> genomePos;
-	if ( al.GetSoftClips(clipSizes,readPositions,genomePos) ){
-		splitReads += 1;
-	}
-
-
 	if (al.IsFirstMate() && al.IsMateMapped() and read_status != lowQualty ) {
 		if( al.IsReverseStrand() != al.IsMateReverseStrand() ){
-			if(al.IsMapped() and al.MapQuality > 30 and al.RefID == al.MateRefID and al.MatePosition-al.Position+1 < max_insert ){
+			if(al.RefID == al.MateRefID and abs(al.MatePosition-al.Position+1) < max_insert and al.MapQuality > quality ){
+				sampled+=1;
 				iSize = abs(al.InsertSize);
 				if(counterK == 1) {
 					Mk = iSize;
@@ -294,51 +276,37 @@ static LibraryStatistics computeLibraryStats(string bamFileName, uint64_t genome
 			break;
 		}
+
+		if (sampled >= sample){
+			break;
+		}
+
+
 	}
 	bamFile.Close();
 
 	cout << "LIBRARY STATISTICS\n";
-	cout << "\t total reads number " << reads << "\n";
-	cout << "\t total mapped reads " << mappedReads << "\n";
-	cout << "\t total unmapped reads " << unmappedReads << "\n";
-	cout << "\t split reads " << splitReads << "\n";
-	cout << "\t low quality reads " << lowQualityReads << "\n";
-	cout << "\t wrongly contig " << matedDifferentContig << "\n";
-	cout << "\t singletons " << singletonReads << "\n";
-	uint32_t total = matedReads + wrongDistanceReads + wronglyOrientedReads + matedDifferentContig + singletonReads ;
-	cout << "\ttotal " << total << "\n";
-	cout << "\tCoverage statistics\n";
-
-	library.C_A = C_A = mappedReadsLength/(float)genomeLength;
-	library.S_A = S_A = insertsLength/(float)genomeLength;
-	library.C_M = C_M = matedReadsLength/(float)genomeLength;
-	library.C_W = C_W = wronglyOrientedReadsLength/(float)genomeLength;
-	library.C_S = C_S = singletonReadsLength/(float)genomeLength;
-	library.C_D = C_D = matedDifferentContigLength/(float)genomeLength;
+	library.readLength= mappedReadsLength/mappedReads;
 
 	library.insertMean = insertMean = Mk;
 	if(reads-wronglyOrientedReads > wronglyOrientedReads){
 		library.mp=true;
+		cout << "\tPair orientation = Reverse-forward "<< endl;
 	}else{
 		library.mp=false;
+		cout << "\tPair orientation = Forward-reverse" << endl;
 	}
+
 	Qk = sqrt(Qk/counterK);
 	library.insertStd = insertStd = Qk;
-	cout << "\tC_A = " << C_A << endl;
-	cout << "\tS_A = " << S_A << endl;
-	cout << "\tC_M = " << C_M << endl;
-	cout << "\tC_W = " << C_W << endl;
-	cout << "\tC_S = " << C_S << endl;
-	cout << "\tC_D = " << C_D << endl;
 	cout << "\tRead length = " << library.readLength << endl;
 	cout << "\tMean Insert length = " << Mk << endl;
 	cout << "\tStd Insert length = " << Qk << endl;
 	cout << "----------\n";
-	calculateCoverage -> printCoverage();
 
 	return library;
 }
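
The Mk/Qk recurrences sampled above look like a Welford-style online mean and variance estimate over the sampled insert sizes, which is what allows the new -s option to stop sampling after a fixed number of pairs instead of reading the whole BAM. A sketch of that estimator in Python, with made-up insert sizes (the exact Qk update is not visible in this hunk, so this is the textbook form, not necessarily TIDDIT's verbatim formula):

    # Online (Welford) estimate of insert-size mean and std, mirroring the
    # roles of Mk (running mean) and Qk (running squared deviations).
    inserts = [310, 295, 402, 350, 288, 365, 330]  # toy iSize samples

    mk = 0.0
    qk = 0.0
    for k, i_size in enumerate(inserts, start=1):
        if k == 1:
            mk = float(i_size)  # same special case as counterK == 1 above
        else:
            old_mk = mk
            mk += (i_size - mk) / k
            qk += (i_size - old_mk) * (i_size - mk)

    std = (qk / len(inserts)) ** 0.5  # as in Qk = sqrt(Qk/counterK)
    print(mk)   # ~334.3
    print(std)  # ~37.9, population-style standard deviation
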
diff --git a/src/data_structures/CoverageModule.cpp b/src/data_structures/CoverageModule.cpp
index 35dd044..d3ef87a 100644
--- a/src/data_structures/CoverageModule.cpp
+++ b/src/data_structures/CoverageModule.cpp
@@ -24,7 +24,7 @@ ostream& test(ofstream &coverageOutput,string output){
 }
 
 //constructor
-Cov::Cov(int binSize,string bamFile,string output,bool discordants){
+Cov::Cov(int binSize,string bamFile,string output){
 	ostream& covout=test(coverageOutput,output);
 	if(output != "stdout"){
 		coverageOutput.open((output+".tab").c_str());
@@ -42,18 +42,13 @@ Cov::Cov(int binSize,string bamFile,string output,bool discordants){
 
 	this -> binSize = binSize;
 	this -> currentChr=-1;
-	this -> contigsNumber = 0;
-	this -> bamFile = bamFile;
-	this -> discordants=discordants;
+	this -> contigsNumber = 0;
+	this -> bamFile = bamFile;
 	this -> output = output;
-	if (this -> discordants == true){
-		covout << "#CHR" << "\t" << "start" << "\t" << "end" << "\t" << "coverage" <<"\t" << "quality" << "\t" << "discordants" << endl;
-	}else{
-		covout << "#CHR" << "\t" << "start" << "\t" << "end" << "\t" << "coverage" <<"\t" << "quality" << endl;
-	}
+	covout << "#CHR" << "\t" << "start" << "\t" << "end" << "\t" << "coverage" <<"\t" << "quality" << endl;
+	BamReader alignmentFile;
 
-	BamReader alignmentFile;
 	//open the bam file
 	alignmentFile.Open(bamFile);
@@ -66,20 +61,14 @@ Cov::Cov(int binSize,string bamFile,string output,bool discordants){
 		contigsNumber++;
 	}
 	this -> coverageStructure.resize(contigsNumber);
-	this -> discordantsStructure.resize(contigsNumber);
 	this -> qualityStructure.resize(2);
 	qualityStructure[0].resize(contigsNumber);
 	qualityStructure[1].resize(contigsNumber);
 	for(int i=0;i<contigsNumber;i++){
 		coverageStructure[i].resize(ceil(contigLength[i]/double(binSize)),0);
-		if (this -> discordants == true){
-			discordantsStructure[i].resize(ceil(contigLength[i]/double(binSize)),0);
-		}
 	}
@@ -90,19 +79,15 @@ Cov::Cov(int binSize,string bamFile,string output,bool discordants){
 
 //this function calculates the coverage in bins of size binSize across the entire bamfile
-void Cov::bin(BamAlignment currentRead){
+void Cov::bin(BamAlignment currentRead, readStatus alignmentStatus){
 	//initialise the chromosome ID
 	if(currentChr == -1){
 		currentChr=currentRead.RefID;
 	}
 
 	//if the quality of the read is high enough, it will be added to the data structure
-	readStatus alignmentStatus = computeReadType(currentRead, 100000,100, true);
 	if(alignmentStatus != lowQualty and alignmentStatus != unmapped) {
 		int element=floor(double(currentRead.Position)/double(binSize));
-		if (this -> discordants == true and currentRead.RefID != currentRead.MateRefID){
-			discordantsStructure[currentRead.RefID][element] +=1;
-		}
 		//if the entire read is inside the region, add all the bases to sequenced bases
 		if(currentRead.Position >= element*binSize and currentRead.Position+currentRead.Length-1 <= (element+1)*binSize){
 			coverageStructure[currentRead.RefID][element]+=currentRead.Length;
@@ -142,25 +127,21 @@ void Cov::printCoverage(){
 		static ostream& covout=cout;
 	}
-	for(int i=0;i<contigsNumber;i++){
-		for(int j=0;j<coverageStructure[i].size();j++){
-			binStart=j*binSize;
-			binEnd=binStart+binSize;
-			if(binEnd > contigLength[i]){
-				binEnd=contigLength[i];
+	for(int i=0;i<contigsNumber;i++){
+		for(int j=0;j<coverageStructure[i].size();j++){
+			binStart=j*binSize;
+			binEnd=binStart+binSize;
+			if(binEnd > contigLength[i]){
+				binEnd=contigLength[i];
 			}
-			double coverage=double(coverageStructure[i][j])/double(binEnd-binStart);
-			double quality = 0;
-			if(double(qualityStructure[1][i][j]) > 0){
-				quality=double(qualityStructure[0][i][j])/double(qualityStructure[1][i][j]);
-			}
-			if (this -> discordants == true){
-				covout << position2contig[i] << "\t" << binStart << "\t" << binEnd << "\t" << coverage << "\t" << quality << "\t" << discordantsStructure[i][j] << endl;
-			}else{
-				covout << position2contig[i] << "\t" << binStart << "\t" << binEnd << "\t" << coverage << "\t" << quality << endl;
+			double coverage=double(coverageStructure[i][j])/double(binEnd-binStart);
+			double quality = 0;
+			if(double(qualityStructure[1][i][j]) > 0){
+				quality=double(qualityStructure[0][i][j])/double(qualityStructure[1][i][j]);
 			}
-		}
+			covout << position2contig[i] << "\t" << binStart << "\t" << binEnd << "\t" << coverage << "\t" << quality << endl;
+		}
 	}
 }
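
Cov::bin distributes each read's bases over fixed-size bins (100 bp in this patch), splitting reads that straddle a bin border between the two bins. A Python sketch of that bookkeeping with a toy contig and made-up reads (the real module additionally tracks mapping quality per bin):

    # Fixed-bin base counting in the spirit of Cov::bin.
    bin_size = 100
    contig_length = 500
    bins = [0] * ((contig_length + bin_size - 1) // bin_size)

    def add_read(pos, length):
        # Clip to the contig, then distribute the bases bin by bin.
        end = min(pos + length, contig_length)
        while pos < end:
            element = pos // bin_size
            bin_end = (element + 1) * bin_size
            covered = min(end, bin_end) - pos
            bins[element] += covered
            pos += covered

    add_read(30, 100)   # 70 bases land in bin 0, 30 in bin 1
    add_read(210, 80)   # falls entirely inside bin 2
    print(bins)         # [70, 30, 80, 0, 0]; coverage = bins[i] / bin_size

Moving this binning out of computeLibraryStats and into findTranslocationsOnTheFly means the BAM file is now traversed once for signal extraction and coverage together, instead of once per task.
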
diff --git a/src/data_structures/ProgramModules.h b/src/data_structures/ProgramModules.h
index 4ce9416..1a0dee8 100644
--- a/src/data_structures/ProgramModules.h
+++ b/src/data_structures/ProgramModules.h
@@ -17,7 +17,7 @@ class StructuralVariations{
 	//constructor
 	StructuralVariations();
 	//main function
-	void findTranslocationsOnTheFly(string bamFileName, bool outtie, float meanCoverage,string outputFileHeader,string version,string commandline, map<string,int> SV_options);
+	void findTranslocationsOnTheFly(string bamFileName, bool outtie, float meanCoverage,string outputFileHeader,string version,string commandline, map<string,int> SV_options,uint64_t genomeLength);
 };
 
 #endif /* PROGRAMMODULES_H_ */
diff --git a/src/data_structures/Translocation.cpp b/src/data_structures/Translocation.cpp
index c9ccebd..a34ed40 100644
--- a/src/data_structures/Translocation.cpp
+++ b/src/data_structures/Translocation.cpp
@@ -14,7 +14,7 @@ string int2str(int to_be_converted){
 	return(converted);
 }
 
-void Window::initTrans(SamHeader head,string libraryData) {
+void Window::initTrans(SamHeader head) {
 	uint32_t contigsNumber = 0;
 	SamSequenceDictionary sequences = head.Sequences;
 	for(SamSequenceIterator sequence = sequences.Begin() ; sequence != sequences.End(); ++sequence) {
@@ -27,6 +27,11 @@ void Window::initTrans(SamHeader head,string libraryData) {
 		}
 		contigsNumber++;
 	}
+
+}
+
+void Window::printHeader(SamHeader head,string libraryData) {
+
 	string intra_chr_eventsVCF = outputFileHeader + ".signals.tab";
 	this->TIDDITVCF.open(intra_chr_eventsVCF.c_str());
 	if(head.HasReadGroups() == false){
@@ -45,8 +50,7 @@ void Window::initTrans(SamHeader head,string libraryData) {
 
 }
 
-void Window::insertRead(BamAlignment alignment) {
-	readStatus alignmentStatus = computeReadType(alignment, this->max_insert,this->min_insert, this->outtie);
+void Window::insertRead(BamAlignment alignment,readStatus alignmentStatus) {
 
 	if( not alignment.IsMateMapped() or alignment.MapQuality < minimum_mapping_quality or alignmentStatus == lowQualty) {
 		return; // in case the alignment is of no use discard it
diff --git a/src/data_structures/Translocation.h b/src/data_structures/Translocation.h
index 9ebb763..137855a 100644
--- a/src/data_structures/Translocation.h
+++ b/src/data_structures/Translocation.h
@@ -45,8 +45,9 @@ class Window {
 
 	Window(string bamFileName, bool outtie, float meanCoverage,string outputFileHeader, map<string,int> SV_options); // constructor
-	void initTrans(SamHeader head,string libraryData); // initialise the contig to position array
-	void insertRead(BamAlignment alignment); // inserts a new read
+	void initTrans(SamHeader head); // initialise the contig to position array
+	void printHeader(SamHeader head,string libraryData); //print header
+	void insertRead(BamAlignment alignment, readStatus alignmentStatus); // inserts a new read
 	string VCFHeader(string libraryData); //print the vcf header
 };
diff --git a/src/data_structures/findTranslocationsOnTheFly.cpp b/src/data_structures/findTranslocationsOnTheFly.cpp
index 9062249..d0feeb1 100644
--- a/src/data_structures/findTranslocationsOnTheFly.cpp
+++ b/src/data_structures/findTranslocationsOnTheFly.cpp
@@ -9,7 +9,7 @@
 //function used to find translocations
 StructuralVariations::StructuralVariations() { }
 
-void StructuralVariations::findTranslocationsOnTheFly(string bamFileName, bool outtie, float meanCoverage, string outputFileHeader, string version,string command, map<string,int> SV_options) {
+void StructuralVariations::findTranslocationsOnTheFly(string bamFileName, bool outtie, float meanCoverage, string outputFileHeader, string version,string command, map<string,int> SV_options,uint64_t genomeLength) {
 	size_t start = time(NULL);
 	//open the bam file
 	BamReader bamFile;
@@ -20,28 +20,41 @@ void StructuralVariations::findTranslocationsOnTheFly(string bamFileName, bool o
 	Window *window;
 	window = new Window(bamFileName,outtie,meanCoverage,outputFileHeader,SV_options);
 	window-> version = version;
-	stringstream ss;
+	window->initTrans(head);
 
 	string orientation_string="innie";
 	if(outtie == true){
 		orientation_string="outtie";
 	}
-	ss << "##LibraryStats=TIDDIT-" << version << " Coverage=" << meanCoverage << " ReadLength=" << SV_options["readLength"] << " MeanInsertSize=" << SV_options["meanInsert"] << " STDInsertSize=" << SV_options["STDInsert"] << " Orientation=" << orientation_string << "\n" << "##TIDDITcmd=\"" << command << "\"";
-	string libraryData=ss.str();
-	window->initTrans(head,libraryData);
 	for (int i=0;i< SV_options["contigsNumber"];i++){window -> SV_calls[i] = vector<string>();}
 
 	//Initialize bam entity
 	BamAlignment currentRead;
+	Cov *calculateCoverage;
+	calculateCoverage = new Cov(100,bamFileName,outputFileHeader);
+
+	uint64_t mappedReadsLength= 0;
+
 	//now start to iterate over the bam file
 	while ( bamFile.GetNextAlignmentCore(currentRead) ) {
-		if(currentRead.IsMapped()) {
-			window->insertRead(currentRead);
-		}
+		if(currentRead.IsMapped()){
+			readStatus alignmentStatus = computeReadType(currentRead, window->max_insert,window-> min_insert, window-> outtie);
+			window->insertRead(currentRead, alignmentStatus);
+
+			calculateCoverage -> bin(currentRead, alignmentStatus);
+
+			if (alignmentStatus != lowQualty){mappedReadsLength+= currentRead.Length;}
+		}
 	}
+
+	//print header
+	stringstream ss;
+	ss << "##LibraryStats=TIDDIT-" << version << " Coverage=" << float(mappedReadsLength)/genomeLength << " ReadLength=" << SV_options["readLength"] << " MeanInsertSize=" << SV_options["meanInsert"] << " STDInsertSize=" << SV_options["STDInsert"] << " Orientation=" << orientation_string << "\n" << "##TIDDITcmd=\"" << command << "\"";
+	string libraryData=ss.str();
+	window->printHeader(head,libraryData);
 
 	//print calls
 	for(int i=0;i< SV_options["contigsNumber"];i++){
@@ -52,5 +65,8 @@ void StructuralVariations::findTranslocationsOnTheFly(string bamFileName, bool o
 	window->TIDDITVCF.close();
 
 	printf ("signal extraction time consumption= %lds\n", time(NULL) - start);
+
+	calculateCoverage -> printCoverage();
+
 }
diff --git a/src/setup.py b/src/setup.py
index ab86dfb..f22ccf5 100644
--- a/src/setup.py
+++ b/src/setup.py
@@ -3,5 +3,5 @@
 
 setup(
     name = "TIDDIT",
-    ext_modules = cythonize(['TIDDIT_clustering.py',"DBSCAN.py"]),
+    ext_modules = cythonize(['TIDDIT_calling.py',"TIDDIT_coverage.py","TIDDIT_filtering.py","TIDDIT_signals.py","DBSCAN.py"]),
 )
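
After a run, the SQLite signal database that signals() fills through args.c can be inspected directly; the TIDDITcall layout is the one created by the calling module (the same CREATE TABLE statement that appears in the removed clustering code earlier in this patch). A sketch, assuming a previous run used -o sample so that sample.db exists:

    import sqlite3

    # Count extracted SV signals per chromosome pair in a TIDDIT signal
    # database ("sample" is a hypothetical -o prefix, not a shipped file).
    conn = sqlite3.connect("sample.db")
    c = conn.cursor()
    query = "SELECT chrA, chrB, COUNT(*) FROM TIDDITcall GROUP BY chrA, chrB"
    for chrA, chrB, n in c.execute(query):
        print("{}\t{}\t{}".format(chrA, chrB, n))
    conn.close()

This is also a convenient way to sanity-check the --debug re-clustering mode, which reuses the existing database instead of re-extracting signals from the BAM file.
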