Skip to content

Commit

Permalink
modified: svdb/export_module.py
Browse files Browse the repository at this point in the history
	modified:   svdb/query_module.py
	modified:   svdb/readVCF.py
  • Loading branch information
J35P312 committed Apr 9, 2017
1 parent c4d2892 commit 9d974c6
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 39 deletions.
58 changes: 34 additions & 24 deletions svdb/export_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def expand_chain(chain,coordinates,chrA,chrB,distance,overlap,ci):
similar=overlap_module.isSameVariation(variant["posA"],variant["posB"],var["posA"],var["posB"],overlap,distance)
if similar:
chain_data[i].append(j)

chain_data[i]=np.array(chain_data[i])
return(chain_data)

def cluster_variants(variant_dictionary,similarity_matrix):
Expand All @@ -166,22 +166,30 @@ def cluster_variants(variant_dictionary,similarity_matrix):
clusters.append( [variant,cluster_dictionary] )
return(clusters)

def svdb_cluster_main(chrA,chrB,variant,sample_IDs,args,c,i):
f = open(args.prefix+".vcf",'a')
def fetch_variants(variant,chrA,chrB,c):
chr_db={}
chr_db[ variant ]={}

hits = c.execute('SELECT posA,posB,sample,idx,var FROM SVDB WHERE var == \'{}\'AND chrA == \'{}\' AND chrB == \'{}\''.format(variant,chrA,chrB)).fetchall()
if not hits:
f.close()
return i
return False

x=[v[0] for v in hits]
y=[v[1] for v in hits]

chr_db[variant]["coordinates"]=np.column_stack((x,y))
chr_db[variant]["var_info"]=np.array([v[2] for v in hits])
chr_db[variant]["index"]=np.array([v[3] for v in hits])
return chr_db


def svdb_cluster_main(chrA,chrB,variant,sample_IDs,args,c,i):
f = open(args.prefix+".vcf",'a')

chr_db=fetch_variants(variant,chrA,chrB,c)
if not chr_db:
f.close()
return i

if args.DBSCAN:
db = DBSCAN(eps=args.epsilon, min_samples=args.min_pts).fit(chr_db[variant]["coordinates"])
Expand All @@ -198,7 +206,7 @@ def svdb_cluster_main(chrA,chrB,variant,sample_IDs,args,c,i):
#print the unique variants
unique_xy=chr_db[variant]["coordinates"][ labels == -1 ]
unique_index=chr_db[variant]["index"][ labels == -1 ]

del db
for j in range(0,len( chr_db[variant]["coordinates"][ labels == -1] ) ):
xy = unique_xy[j]
indexes=[ unique_index[j] ]
Expand All @@ -221,7 +229,9 @@ def svdb_cluster_main(chrA,chrB,variant,sample_IDs,args,c,i):
cluster=[representing_var,variant_dictionary]
f.write( vcf_line(cluster,"cluster_{}".format(i),sample_IDs )+"\n")
i += 1

del unique_xy
del unique_index

#print the clusters
for k in unique_labels:
class_member_mask = (labels == k)
Expand All @@ -230,28 +240,28 @@ def svdb_cluster_main(chrA,chrB,variant,sample_IDs,args,c,i):
if k == -1:
continue
elif args.DBSCAN:
avg_point=np.array([np.mean(xy[:,0]),np.mean(xy[:,1])])
avg_point=np.array([np.mean(xy[:,0]),np.mean(xy[:,1])])

distance=[]
for point in xy:
distance=[]
for point in xy:
distance.append( ( sum((avg_point-point)**2))**0.5 )

variant_dictionary=fetch_cluster_variant(c,indexes)
variant_dictionary=fetch_cluster_variant(c,indexes)

representing_var={}
representing_var["type"]=variant
representing_var["chrA"]=chrA
representing_var["chrB"]=chrB
representing_var["posA"]= int(avg_point[0])
representing_var["ci_A_start"]= np.amin(xy[:,0])
representing_var["ci_A_end"]= np.amax(xy[:,0])
representing_var["posB"]= int(avg_point[1])
representing_var["ci_B_start"]= np.amin(xy[:,1])
representing_var["ci_B_end"]= np.amax(xy[:,1])
representing_var={}
representing_var["type"]=variant
representing_var["chrA"]=chrA
representing_var["chrB"]=chrB
representing_var["posA"]= int(avg_point[0])
representing_var["ci_A_start"]= np.amin(xy[:,0])
representing_var["ci_A_end"]= np.amax(xy[:,0])
representing_var["posB"]= int(avg_point[1])
representing_var["ci_B_start"]= np.amin(xy[:,1])
representing_var["ci_B_end"]= np.amax(xy[:,1])

cluster=[representing_var,variant_dictionary]
f.write( vcf_line(cluster,"cluster_{}".format(i),sample_IDs )+"\n")
i += 1
cluster=[representing_var,variant_dictionary]
f.write( vcf_line(cluster,"cluster_{}".format(i),sample_IDs )+"\n")
i += 1
else:
variant_dictionary,coordinates=fetch_index_variant(c,indexes)
similarity_matrix=expand_chain(variant_dictionary,coordinates,chrA,chrB,args.bnd_distance,args.overlap,args.ci)
Expand Down
44 changes: 30 additions & 14 deletions svdb/query_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,22 +67,32 @@ def main(args):
db_file=args.db
DBvariants={}
db_size=1

for line in open(db_file):

if line[0] == "#":
continue

chrA,posA,chrB,posB,event_type,INFO,FORMAT =readVCF.readVCFLine(line);
chrA,posA,chrB,posB,event_type,INFO,FORMAT = readVCF.readVCFLine(line);
if not chrA in DBvariants:
DBvariants[chrA]={}
if not chrB in DBvariants[chrA]:
DBvariants[chrA][chrB]=[]
info_field=line.split("\t")[7]
DBvariants[chrA][chrB].append([int(posA),int(posB),event_type,FORMAT])
DBvariants[chrA][chrB]={}
if not event_type in DBvariants[chrA][chrB]:
DBvariants[chrA][chrB][event_type]={}
DBvariants[chrA][chrB][event_type]["samples"]=[]
DBvariants[chrA][chrB][event_type]["coordinates"]=[]

DBvariants[chrA][chrB][event_type]["coordinates"].append(np.array([int(posA),int(posB)]))
DBvariants[chrA][chrB][event_type]["samples"].append(np.array(FORMAT["GT"]))
db_size=len(FORMAT["GT"])

for chrA in DBvariants:
for chrB in DBvariants[chrA]:
DBvariants[chrA][chrB]=np.array(DBvariants[chrA][chrB])

for var in DBvariants[chrA][chrB]:
DBvariants[chrA][chrB][var]["coordinates"]=np.array(DBvariants[chrA][chrB][var]["coordinates"])
DBvariants[chrA][chrB][var]["samples"]=np.array(DBvariants[chrA][chrB][var]["samples"])

for query in queries:
hits = queryVCFDB(DBvariants, query,args)
query[5] = hits
Expand Down Expand Up @@ -140,12 +150,19 @@ def queryVCFDB(DBvariants, Query_variant,args):
return 0
if not chrB in DBvariants[chrA]:
return 0
for var in DBvariants[chrA][chrB]:
if not args.no_var and variation_type != var:
continue

candidates=DBvariants[chrA][chrB][ ( args.bnd_distance >= abs(DBvariants[chrA][chrB][:,0] - chrApos) ) & ( args.bnd_distance >= abs(DBvariants[chrA][chrB][:,1] - chrBpos) ) ]
# check if this variation is already present
for event in candidates:
#check if the variant type of the events is the same
if(event[2] == variation_type or args.no_var):
#candidates=DBvariants[chrA][chrB][var]["coordinates"][ ( args.bnd_distance >= abs(DBvariants[chrA][chrB][var]["coordinates"][:,0] - chrApos) ) & ( args.bnd_distance >= abs(DBvariants[chrA][chrB][var]["coordinates"][:,1] - chrBpos) ) ]
candidates=np.where( ( args.bnd_distance >= abs(DBvariants[chrA][chrB][var]["coordinates"][:,0] - chrApos) ) & ( args.bnd_distance >= abs(DBvariants[chrA][chrB][var]["coordinates"][:,1] - chrBpos) ) )
if not len(candidates[0]):
return 0
# check if this variation is already present
for candidate in candidates[0]:
event=DBvariants[chrA][chrB][var]["coordinates"][candidate]
sample_list=DBvariants[chrA][chrB][var]["samples"][candidate]
#check if the variant type of the events is the same
hit_tmp = None
if not args.ci:
if not (chrA == chrB):
Expand All @@ -158,9 +175,8 @@ def queryVCFDB(DBvariants, Query_variant,args):
hit_tmp=overlap_module.ci_overlap(chrApos,chrBpos,ciA_query,ciB_query,event[0],event[1],[0,0],[0,0])

if hit_tmp:
genotype=event[-1]["GT"]
for i in range(0,len(genotype)):
GT=genotype[i]
for i in range(0,len(sample_list)):
GT=sample_list[i]
if not GT == "0|0" and not GT == "0/0":
samples = samples | set([i])
hits=len(samples)
Expand Down
2 changes: 1 addition & 1 deletion svdb/readVCF.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ def readVCFLine(line):
if line[0] == "#":
return(None)

variation = line.rstrip().split("\t")
variation = line.strip().split("\t")
event_type=""
chrA=variation[0].replace("chr","").replace("Chr","").replace("CHR","");
posA=int(variation[1]);
Expand Down

0 comments on commit 9d974c6

Please sign in to comment.