
Commit

modified: ../README.md
	modified:   ../TIDDIT.py
	modified:   DBSCAN.py
	modified:   TIDDIT.cpp
	new file:   TIDDIT_calling.py
	deleted:    TIDDIT_clustering.py
	new file:   TIDDIT_coverage.py
	new file:   TIDDIT_filtering.py
	new file:   TIDDIT_signals.py
	modified:   common.h
	modified:   data_structures/CoverageModule.cpp
	modified:   data_structures/ProgramModules.h
	modified:   data_structures/Translocation.cpp
	modified:   data_structures/Translocation.h
	modified:   data_structures/findTranslocationsOnTheFly.cpp
	modified:   setup.py
J35P312 committed Aug 15, 2018
1 parent 0296658 commit 6870a53
Showing 16 changed files with 1,094 additions and 939 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -45,8 +45,13 @@ The SV module
=============
The main TIDDIT module detects structural variants using discordant pairs, split reads and coverage information

python TIDDIT.py --sv [Options] --bam bam

Optionally, TIDDIT accepts a reference fasta for GC correction:

python TIDDIT.py --sv [Options] --bam bam --ref reference.fasta


NOTE: It is important that you use the TIDDIT.py wrapper for SV detection. The TIDDIT binary in the TIDDIT/bin folder does not perform any clustering; it simply extracts SV signatures into a tab file.

Where bam is the input bam file and reference.fasta is the reference fasta used to align the sequencing data. TIDDIT will crash if the reference fasta differs from the one used to align the reads, and the reads of the input bam file must be sorted by genome position.
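For example (hypothetical file names, assuming the wrapper's -o output-prefix option), a typical run on a position-sorted alignment could look like:

python TIDDIT.py --sv -o sample_sv --bam sample.bam --ref reference.fasta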
26 changes: 17 additions & 9 deletions TIDDIT.py
File mode changed: 100644 → 100755
@@ -5,9 +5,9 @@

wd=os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, '{}/src/'.format(wd))
import TIDDIT_clustering
import TIDDIT_calling

version = "2.2.6"
version = "2.3.0"
parser = argparse.ArgumentParser("""TIDDIT-{}""".format(version),add_help=False)
parser.add_argument('--sv' , help="call structural variation", required=False, action="store_true")
parser.add_argument('--cov' , help="generate a coverage bed file", required=False, action="store_true")
@@ -27,34 +27,42 @@
parser.add_argument('-n', type=int,default=2, help="the ploidy of the organism,(default = 2)")
parser.add_argument('-e', type=int, help="clustering distance parameter, discordant pairs closer than this distance are considered to belong to the same variant (default = sqrt(insert-size*2)*12)")
parser.add_argument('-l', type=int,default=3, help="min-pts parameter (default=3), must be set > 2")
parser.add_argument('-s', type=int,default=50000000, help="Number of reads to sample when computing library statistics (default=50000000)")
parser.add_argument('-z', type=int,default=100, help="minimum variant size (default=100), variants smaller than this will not be printed (z < 10 is not recommended)")
parser.add_argument('--force_ploidy',action="store_true", help="force the ploidy to be set to -n across the entire genome (i.e. skip coverage normalisation of chromosomes)")
parser.add_argument('--debug',action="store_true", help="rerun the tiddit clustering procedure")
parser.add_argument('--n_mask',type=float,default=0.5, help="exclude regions from coverage calculation if they contain more than this fraction of N (default = 0.5)")
parser.add_argument('--ref',required=True, type=str, help="reference fasta")
parser.add_argument('--ref', type=str, help="reference fasta, used for GC correction")

args= parser.parse_args()
args.wd=os.path.dirname(os.path.realpath(__file__))
if args.l < 2:
	print "error, too low --l value!"
	quit()
if not os.path.isfile(args.ref):
	print "error, could not find the reference file"
	quit()
if args.ref:
	if not os.path.isfile(args.ref):
		print "error, could not find the reference file"
		quit()
if not os.path.isfile(args.bam):
	print "error, could not find the bam file"
	quit()

command_str="{}/bin/TIDDIT --sv -b {} -o {} -p {} -r {} -q {} -n {}".format(args.wd,args.bam,args.o,args.p,args.r,args.q,args.n)
command_str="{}/bin/TIDDIT --sv -b {} -o {} -p {} -r {} -q {} -n {} -s {}".format(args.wd,args.bam,args.o,args.p,args.r,args.q,args.n,args.s)
if args.i:
	command_str += " -i {}".format(args.i)
if args.d:
	command_str += " -d {}".format(args.d)

if not args.debug:
	os.system(command_str)

TIDDIT_clustering.cluster(args)

if args.ref:
	if args.ref.endswith(".gz"):
		os.system("zcat {} | {}/bin/TIDDIT --gc -z 100 -o {}".format(args.ref,args.wd,args.o))
	else:
		os.system("cat {} | {}/bin/TIDDIT --gc -z 100 -o {}".format(args.ref,args.wd,args.o))

TIDDIT_calling.cluster(args)

elif args.cov:
parser = argparse.ArgumentParser("""TIDDIT --cov --bam inputfile [-o prefix]""")
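For orientation (an approximate reading of the wrapper code above, not text from the diff): with a gzipped reference, the --sv branch runs the compiled binary twice before clustering, roughly as

bin/TIDDIT --sv -b sample.bam -o output -p P -r R -q Q -n 2 -s 50000000
zcat reference.fasta.gz | bin/TIDDIT --gc -z 100 -o output

where sample.bam, the output prefix, and P/R/Q stand in for the corresponding wrapper arguments; TIDDIT_calling.cluster(args) then clusters the extracted signals into the final variant calls.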
130 changes: 119 additions & 11 deletions src/DBSCAN.py
@@ -1,5 +1,123 @@
import numpy
#supports up to 4D data

#The functions of this script perform DBSCAN and return the variant clusters

#Analyse the reads of the cluster, and collect statistics such as position and orientation
def analyse_pos(candidate_signals,discordants,library_stats,args):
	analysed_signals={}

	max_a=max(candidate_signals[:,0:1])[0]
	min_a=min(candidate_signals[:,0:1])[0]

	max_b=max(candidate_signals[:,1:2])[0]
	min_b=min(candidate_signals[:,1:2])[0]


	FF=0
	FR=0
	RF=0
	RR=0

	splitsINV=0
	splits=0

	discs=0

	avgQb=[]
	avgQa=[]

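	# Layout of each candidate_signals row, as indexed below: columns 0 and 1 hold the A and B
	# coordinates, columns 2 and 4 the orientation flags of the two reads, columns 3 and 5 their
	# mapping qualities (-1 when unavailable); the second-to-last column marks split reads (1)
	# versus discordant pairs, and the last column is the read/pair identifier used to count support.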
	for i in range(0,len(candidate_signals)):
		if candidate_signals[i][-2] == 1:
			if(candidate_signals[i][2] != candidate_signals[i][4]):
				splitsINV +=1
		else:
			if(candidate_signals[i][2] and not candidate_signals[i][4]):
				FR+=1
			elif( not candidate_signals[i][2] and candidate_signals[i][4]):
				RF+=1
			elif( not candidate_signals[i][2] and not candidate_signals[i][4]):
				RR+=1
			else:
				FF+=1

		if candidate_signals[i][-2] == 1:
			splits +=1
		else:
			discs+=1

		if candidate_signals[i][3] != -1:
			avgQa.append(candidate_signals[i][3])
		if candidate_signals[i][5] != -1:
			avgQb.append(candidate_signals[i][5])

	if avgQa:
		avgQa=numpy.mean(avgQa)
	else:
		avgQa=-1

	if avgQb:
		avgQb=numpy.mean(avgQb)
	else:
		avgQb=-1

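	# Choose the reported breakpoint positions from the cluster edges according to the dominant
	# pair orientation and the library orientation; clusters supported only by split reads
	# fall back to posA=max_a, posB=min_b.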
	major_orientation=max([FF,FR,RF,RR])
	if not discs:
		posA=max_a
		posB=min_b
	else:
		if library_stats["Orientation"] == "innie":
			if major_orientation == FF:
				posA=max_a
				posB=max_b
			elif major_orientation == FR:
				posA=max_a
				posB=min_b
			elif major_orientation == RF:
				posA=min_a
				posB=max_b
			else:
				posA=min_a
				posB=min_b
		else:
			if major_orientation == FF:
				posA=min_a
				posB=min_b
			elif major_orientation == FR:
				posA=min_a
				posB=max_b
			elif major_orientation == RF:
				posA=max_a
				posB=min_b
			else:
				posA=max_a
				posB=max_b
	analysed_signals={"posA":posA,"posB":posB,"min_A":min_a,"max_A":max_a,"min_B":min_b,"max_B":max_b,"QA":avgQa,"QB":avgQb,"FF":FF,"FR":FR,"RF":RF,"RR":RR,"splitsINV":splitsINV,"splits":splits,"discs":discs,"signals":candidate_signals}
	return(analysed_signals)

#Call the clustering algorithm and the statistics function, and return the final clusters
def generate_clusters(chrA,chrB,coordinates,library_stats,args):
	candidates=[]
	coordinates=coordinates[numpy.lexsort((coordinates[:,1],coordinates[:,0]))]
	db=main(coordinates[:,0:2],args.e,int(round(args.l+library_stats["ploidies"][chrA]/(args.n*10))))
	unique_labels = set(db)

	for var in unique_labels:
		if var == -1:
			continue
		class_member_mask = (db == var)
		candidate_signals = coordinates[class_member_mask]
		resolution=candidate_signals[:,-2]
		support=len(set(candidate_signals[:,-1]))
		discordants=True
		if len(set(resolution)) == 1 and max(resolution) == 1:
			discordants=False

		if discordants and support >= args.p:
			candidates.append( analyse_pos(candidate_signals,discordants,library_stats,args) )
		elif not discordants and support >= args.r and chrA == chrB:
			candidates.append( analyse_pos(candidate_signals,discordants,library_stats,args) )

	return(candidates)

def x_coordinate_clustering(data,epsilon,m):
	clusters=numpy.zeros(len(data))
@@ -97,13 +215,3 @@ def main(data,epsilon,m):
	clusters,cluster_id=y_coordinate_clustering(data,epsilon,m,cluster_id,clusters)

	return(clusters)


data=numpy.array([[1,1],[10481823,10483880],[10481947,10483785],[10481947,1],[10481947,1],[10481947,1],[10482033,10483984],[10482079,10483801],[10482111,10483972],[10482121,10483788],[10482125,10483769],[10482126,10484204],[10482163,10483811],[10482177,10483909],[10482186,10483906],[10482191,10483836],[10482202,10484150],[10482262,10483947],[10482285,10483797],[10482342,10483968],[10483770,10482390],[10482390,10483814],[10483770,10482405],[10483769,10482428],[10483770,10482405],[10483770,10482405],[10483770,1],[10483770,1],[10483770,1],[10483770,1]])


#data=data[numpy.lexsort((data[:,1],data[:,0]))]
#print data
#data=numpy.array([[1,2],[1,3],[1,4],[1,5],[5,100],[5,100],[10,2],[10,3],[10,4],[10,5],[20,100]])
#print main(data,398,3)

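For illustration (a sketch, not part of this commit): the clustering entry point main(data,epsilon,m) used by generate_clusters takes an N x 2 numpy array of signal coordinates together with the epsilon and min-pts parameters, and returns one cluster label per row; generate_clusters treats a label of -1 as noise. A minimal usage example in the spirit of the test data removed by this commit, with made-up coordinates and parameter values:

import numpy
import DBSCAN  # assumes src/ is on sys.path, as TIDDIT.py arranges

# two tight groups of signal coordinates and one stray point (made-up values)
data = numpy.array([[100, 5000], [110, 5010], [120, 5020],
                    [900, 9000], [905, 9005], [910, 9010],
                    [400, 1]])
data = data[numpy.lexsort((data[:, 1], data[:, 0]))]  # sort as generate_clusters does
labels = DBSCAN.main(data, 50, 3)
print(labels)  # one label per row; -1 (if present) is skipped as noise downstream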