
Commit

modified: ../README.md
	modified:   ../TIDDIT.py
	modified:   DBSCAN.py
	modified:   TIDDIT.cpp
	new file:   TIDDIT_calling.py
	deleted:    TIDDIT_clustering.py
	new file:   TIDDIT_coverage.py
	new file:   TIDDIT_filtering.py
	new file:   TIDDIT_signals.py
	modified:   common.h
	modified:   data_structures/CoverageModule.cpp
	modified:   data_structures/ProgramModules.h
	modified:   data_structures/Translocation.cpp
	modified:   data_structures/Translocation.h
	modified:   data_structures/findTranslocationsOnTheFly.cpp
	modified:   setup.py
J35P312 committed Aug 15, 2018
1 parent 0296658 commit 6870a53
Showing 16 changed files with 1,094 additions and 939 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -45,8 +45,13 @@ The SV module
=============
The main TIDDIT module detects structural variants using discordant pairs, split reads and coverage information

python TIDDIT.py --sv [Options] --bam bam

Optionally, TIDDIT accepts a reference fasta for GC correction:

python TIDDIT.py --sv [Options] --bam bam --ref reference.fasta


NOTE: It is important that you use the TIDDIT.py wrapper for SV detection. The TIDDIT binary in the TIDDIT/bin folder does not perform any clustering; it simply extracts SV signatures into a tab file.

Where bam is the input bam file and reference.fasta is the reference fasta used to align the sequencing data. TIDDIT will crash if the reference fasta differs from the one used to align the reads, and the reads of the input bam file must be sorted by genome position.
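For example (hypothetical file names, assuming the wrapper's -o output-prefix option), a typical run on a position-sorted alignment could look like:

python TIDDIT.py --sv -o sample_sv --bam sample.bam --ref reference.fasta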
26 changes: 17 additions & 9 deletions TIDDIT.py
File mode changed: 100644 → 100755
@@ -5,9 +5,9 @@

wd=os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, '{}/src/'.format(wd))
import TIDDIT_clustering
import TIDDIT_calling

version = "2.2.6"
version = "2.3.0"
parser = argparse.ArgumentParser("""TIDDIT-{}""".format(version),add_help=False)
parser.add_argument('--sv' , help="call structural variation", required=False, action="store_true")
parser.add_argument('--cov' , help="generate a coverage bed file", required=False, action="store_true")
@@ -27,34 +27,42 @@
parser.add_argument('-n', type=int,default=2, help="the ploidy of the organism,(default = 2)")
parser.add_argument('-e', type=int, help="clustering distance parameter, discordant pairs closer than this distance are considered to belong to the same variant (default = sqrt(insert-size*2)*12)")
parser.add_argument('-l', type=int,default=3, help="min-pts parameter (default=3), must be set > 2")
parser.add_argument('-s', type=int,default=50000000, help="Number of reads to sample when computing library statistics (default=50000000)")
parser.add_argument('-z', type=int,default=100, help="minimum variant size (default=100), variants smaller than this will not be printed (z < 10 is not recommended)")
parser.add_argument('--force_ploidy',action="store_true", help="force the ploidy to be set to -n across the entire genome (i.e. skip coverage normalisation of chromosomes)")
parser.add_argument('--debug',action="store_true", help="rerun the tiddit clustering procedure")
parser.add_argument('--n_mask',type=float,default=0.5, help="exclude regions from coverage calculation if they contain more than this fraction of N (default = 0.5)")
parser.add_argument('--ref',required=True, type=str, help="reference fasta")
parser.add_argument('--ref', type=str, help="reference fasta, used for GC correction")

args= parser.parse_args()
args.wd=os.path.dirname(os.path.realpath(__file__))
if args.l < 2:
	print "error, too low --l value!"
	quit()
if not os.path.isfile(args.ref):
	print "error, could not find the reference file"
	quit()
if args.ref:
	if not os.path.isfile(args.ref):
		print "error, could not find the reference file"
		quit()
if not os.path.isfile(args.bam):
	print "error, could not find the bam file"
	quit()

command_str="{}/bin/TIDDIT --sv -b {} -o {} -p {} -r {} -q {} -n {}".format(args.wd,args.bam,args.o,args.p,args.r,args.q,args.n)
command_str="{}/bin/TIDDIT --sv -b {} -o {} -p {} -r {} -q {} -n {} -s {}".format(args.wd,args.bam,args.o,args.p,args.r,args.q,args.n,args.s)
if args.i:
	command_str += " -i {}".format(args.i)
if args.d:
	command_str += " -d {}".format(args.d)

if not args.debug:
	os.system(command_str)

TIDDIT_clustering.cluster(args)

if args.ref:
	if args.ref.endswith(".gz"):
		os.system("zcat {} | {}/bin/TIDDIT --gc -z 100 -o {}".format(args.ref,args.wd,args.o))
	else:
		os.system("cat {} | {}/bin/TIDDIT --gc -z 100 -o {}".format(args.ref,args.wd,args.o))

TIDDIT_calling.cluster(args)

elif args.cov:
parser = argparse.ArgumentParser("""TIDDIT --cov --bam inputfile [-o prefix]""")
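For orientation (an approximate reading of the wrapper code above, not text from the diff): with a gzipped reference, the --sv branch runs the compiled binary twice before clustering, roughly as

bin/TIDDIT --sv -b sample.bam -o output -p P -r R -q Q -n 2 -s 50000000
zcat reference.fasta.gz | bin/TIDDIT --gc -z 100 -o output

where sample.bam, the output prefix, and P/R/Q stand in for the corresponding wrapper arguments; TIDDIT_calling.cluster(args) then clusters the extracted signals into the final variant calls.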
130 changes: 119 additions & 11 deletions src/DBSCAN.py
@@ -1,5 +1,123 @@
import numpy
#supports up to 4D data

#The functions of this script perform DBSCAN and return the variant clusters

#Analyse the reads of the cluster, and collect statistics such as position and orientation
def analyse_pos(candidate_signals,discordants,library_stats,args):
	analysed_signals={}

	max_a=max(candidate_signals[:,0:1])[0]
	min_a=min(candidate_signals[:,0:1])[0]

	max_b=max(candidate_signals[:,1:2])[0]
	min_b=min(candidate_signals[:,1:2])[0]


	FF=0
	FR=0
	RF=0
	RR=0

	splitsINV=0
	splits=0

	discs=0

	avgQb=[]
	avgQa=[]

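	# Layout of each candidate_signals row, as indexed below: columns 0 and 1 hold the A and B
	# coordinates, columns 2 and 4 the orientation flags of the two reads, columns 3 and 5 their
	# mapping qualities (-1 when unavailable); the second-to-last column marks split reads (1)
	# versus discordant pairs, and the last column is the read/pair identifier used to count support.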
	for i in range(0,len(candidate_signals)):
		if candidate_signals[i][-2] == 1:
			if(candidate_signals[i][2] != candidate_signals[i][4]):
				splitsINV +=1
		else:
			if(candidate_signals[i][2] and not candidate_signals[i][4]):
				FR+=1
			elif( not candidate_signals[i][2] and candidate_signals[i][4]):
				RF+=1
			elif( not candidate_signals[i][2] and not candidate_signals[i][4]):
				RR+=1
			else:
				FF+=1

		if candidate_signals[i][-2] == 1:
			splits +=1
		else:
			discs+=1

		if candidate_signals[i][3] != -1:
			avgQa.append(candidate_signals[i][3])
		if candidate_signals[i][5] != -1:
			avgQb.append(candidate_signals[i][5])

	if avgQa:
		avgQa=numpy.mean(avgQa)
	else:
		avgQa=-1

	if avgQb:
		avgQb=numpy.mean(avgQb)
	else:
		avgQb=-1

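	# Choose the reported breakpoint positions from the cluster edges according to the dominant
	# pair orientation and the library orientation; clusters supported only by split reads
	# fall back to posA=max_a, posB=min_b.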
	major_orientation=max([FF,FR,RF,RR])
	if not discs:
		posA=max_a
		posB=min_b
	else:
		if library_stats["Orientation"] == "innie":
			if major_orientation == FF:
				posA=max_a
				posB=max_b
			elif major_orientation == FR:
				posA=max_a
				posB=min_b
			elif major_orientation == RF:
				posA=min_a
				posB=max_b
			else:
				posA=min_a
				posB=min_b
		else:
			if major_orientation == FF:
				posA=min_a
				posB=min_b
			elif major_orientation == FR:
				posA=min_a
				posB=max_b
			elif major_orientation == RF:
				posA=max_a
				posB=min_b
			else:
				posA=max_a
				posB=max_b
	analysed_signals={"posA":posA,"posB":posB,"min_A":min_a,"max_A":max_a,"min_B":min_b,"max_B":max_b,"QA":avgQa,"QB":avgQb,"FF":FF,"FR":FR,"RF":RF,"RR":RR,"splitsINV":splitsINV,"splits":splits,"discs":discs,"signals":candidate_signals}
	return(analysed_signals)

#Call the clustering algorithm and the statistics function, and return the final clusters
def generate_clusters(chrA,chrB,coordinates,library_stats,args):
	candidates=[]
	coordinates=coordinates[numpy.lexsort((coordinates[:,1],coordinates[:,0]))]
	db=main(coordinates[:,0:2],args.e,int(round(args.l+library_stats["ploidies"][chrA]/(args.n*10))))
	unique_labels = set(db)

	for var in unique_labels:
		if var == -1:
			continue
		class_member_mask = (db == var)
		candidate_signals = coordinates[class_member_mask]
		resolution=candidate_signals[:,-2]
		support=len(set(candidate_signals[:,-1]))
		discordants=True
		if len(set(resolution)) == 1 and max(resolution) == 1:
			discordants=False

		if discordants and support >= args.p:
			candidates.append( analyse_pos(candidate_signals,discordants,library_stats,args) )
		elif not discordants and support >= args.r and chrA == chrB:
			candidates.append( analyse_pos(candidate_signals,discordants,library_stats,args) )

	return(candidates)

def x_coordinate_clustering(data,epsilon,m):
	clusters=numpy.zeros(len(data))
@@ -97,13 +215,3 @@ def main(data,epsilon,m):
	clusters,cluster_id=y_coordinate_clustering(data,epsilon,m,cluster_id,clusters)

	return(clusters)


data=numpy.array([[1,1],[10481823,10483880],[10481947,10483785],[10481947,1],[10481947,1],[10481947,1],[10482033,10483984],[10482079,10483801],[10482111,10483972],[10482121,10483788],[10482125,10483769],[10482126,10484204],[10482163,10483811],[10482177,10483909],[10482186,10483906],[10482191,10483836],[10482202,10484150],[10482262,10483947],[10482285,10483797],[10482342,10483968],[10483770,10482390],[10482390,10483814],[10483770,10482405],[10483769,10482428],[10483770,10482405],[10483770,10482405],[10483770,1],[10483770,1],[10483770,1],[10483770,1]])


#data=data[numpy.lexsort((data[:,1],data[:,0]))]
#print data
#data=numpy.array([[1,2],[1,3],[1,4],[1,5],[5,100],[5,100],[10,2],[10,3],[10,4],[10,5],[20,100]])
#print main(data,398,3)

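For illustration (a sketch, not part of this commit): the clustering entry point main(data,epsilon,m) used by generate_clusters takes an N x 2 numpy array of signal coordinates together with the epsilon and min-pts parameters, and returns one cluster label per row; generate_clusters treats a label of -1 as noise. A minimal usage example in the spirit of the test data removed by this commit, with made-up coordinates and parameter values:

import numpy
import DBSCAN  # assumes src/ is on sys.path, as TIDDIT.py arranges

# two tight groups of signal coordinates and one stray point (made-up values)
data = numpy.array([[100, 5000], [110, 5010], [120, 5020],
                    [900, 9000], [905, 9005], [910, 9010],
                    [400, 1]])
data = data[numpy.lexsort((data[:, 1], data[:, 0]))]  # sort as generate_clusters does
labels = DBSCAN.main(data, 50, 3)
print(labels)  # one label per row; -1 (if present) is skipped as noise downstream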