Skip to content

Commit

Permalink
modified: README.md
Browse files Browse the repository at this point in the history
	modified:   setup.py
	modified:   svdb/__main__.py
	modified:   svdb/export_module.py
  • Loading branch information
J35P312 committed Mar 8, 2017
1 parent d688531 commit 24672a7
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 45 deletions.
42 changes: 17 additions & 25 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,41 +91,33 @@ Hist: This module is used to compare structural variant vcf files, either by gen
on the confidence interval of the position of the
variants(0 if no CIPOS or CIEND is available)

Query: The query module is used to query a structural variant database. Typically a database is constructed using the build module. However, since this module utilizes the genotype field of the structural variant database vcf to compute the frequency of structural variants, a wide range of files could be used as a database. The query module requires a query vcf, as well as a database vcf:
Query: The query module is used to query a structural variant database. Typically a database is constructed using the build module. However, since this module utilizes the genotype field of the structural variant database vcf to compute the frequency of structural variants, a wide range of files could be used as a database. The query module requires a query vcf, as well as a database file (either a multisample vcf or an SVDB sqlite database):

print a help message
python SVDB.py --query --help
Query a structural variant database, using a vcf file as query:
svdb --query --query_vcf patient1.vcf --db control_db.vcf
The vcf may be an exported SVDB database or a multisample vcf. The frequencies used for each variant are computed from the format fields of the vcf.
The SVDB sqlite database may also be used for querying:
svdb --query --query_vcf patient1.vcf --sqdb control_db.db

optional arguments:
-h, --help show this help message and exit

--hit_tag HIT_TAG the tag used to describe the number of hits within the
info field of the output vcf(default=OCC)
--frequency_tag FREQUENCY_TAG the tag used to describe the frequency of the
variant(default=FRQ)
--prefix PREFIX the prefix of the output file, default = print to stdout
-h, --help show this help message and exit
--db DB path to a SVDB db vcf
--sqdb SQDB path to a SVDB sqlite db
--hit_tag HIT_TAG the tag used to describe the number of hits within the info field of the output vcf(default=OCC)
--frequency_tag FREQUENCY_TAG the tag used to describe the frequency of the variant(default=FRQ)
--prefix PREFIX the prefix of the output file, default = print to stdout
--bnd_distance BND_DISTANCE the maximum distance between the breakpoints of two variants(default = 10000)
--overlap OVERLAP the overlap threshold for deciding if two variants are similar(0 means anything that touches will be merged, 1 means that two events must be identical to be merged), default = 0.6
--DBSCAN use dbscan to cluster the variants, only available for the sqlite db, upon choosing DBSCAN, the overlap
--epsilon EPSILON used together with --DBSCAN; sets the epsilon parameter(default = 500)
--min_pts MIN_PTS used together with --DBSCAN; sets the min_pts parameter(default = 2)
--memory load the database into memory: increases the memory requirements, but lowers the time consumption(may only be used with sqdb)
--no_var count overlapping variants of different type as hits in the db
--invert invert the sorting order so that high frequency samples are present on top of the output vcf
--ci overrides overlap and bnd_distance, determine hits based on the confidence interval of the position of the variants(0 if no CIPOS or CIEND is available)

--bnd_distance BND_DISTANCE the maximum distance between two similar precise breakpoints
(default = 10000)
--overlap OVERLAP the overlap required to merge two events(0 means
anything that touches will be merged, 1 means that two
events must be identical to be merged), default = 0.6
--no_var count overlapping variants of different type as hits
--invert invert the sorting order so that high frequency
samples are present on top of the output vcf
--ci overrides overlap and bnd_distance, determine hits based
on the confidence interval of the position of the
variants(0 if no CIPOS or CIEND is available)

Purge: The purge module is used to remove entries from a database:

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

setup(
name = 'svdb',
version = '0.1.0',
version = '0.1.1',
ext_modules = ext_modules,
packages = ['svdb'],
install_requires = ['numpy', 'scikit-learn==0.15.2', 'scipy'],
Expand Down
4 changes: 2 additions & 2 deletions svdb/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def main():
parser.add_argument('--bnd_distance', type=int,default= 10000,help="the maximum distance between two similar precise breakpoints(default = 10000)")
parser.add_argument('--overlap', type=float, default = 0.6,help="the overlap required to merge two events(0 means anything that touches will be merged, 1 means that two events must be identical to be merged), default = 0.6")
parser.add_argument('--DBSCAN' , help="use dbscan to cluster the variants, only available for the sqlite db", required=False, action="store_true")
parser.add_argument('--epsilon' ,type=int, default = 500, help="used together with --DBSCAN; sets the epsilon paramter(default = 500)", required=False)
parser.add_argument('--epsilon' ,type=float, default = 500, help="used together with --DBSCAN; sets the epsilon paramter(default = 500)", required=False)
parser.add_argument('--min_pts' ,type=int, default = 2, help="used together with 1--DBSCAN; sets the min_pts parameter(default = 2)", required=False)
parser.add_argument('--memory' , help="load the database into memory: increases the memory requirements, but lowers the time consumption(may only be used with sqdb)", required=False, action="store_true")

Expand Down Expand Up @@ -89,7 +89,7 @@ def main():
parser.add_argument('--bnd_distance', type=int,default= 2500,help="the maximum distance between two similar precise breakpoints(default = 2500)")
parser.add_argument('--overlap', type=float, default = 0.8,help="the overlap required to merge two events(0 means anything that touches will be merged, 1 means that two events must be identical to be merged), default = 0.8")
parser.add_argument('--DBSCAN' , help="use dbscan to cluster the variants", required=False, action="store_true")
parser.add_argument('--epsilon' ,type=int, default = 500, help="used together with --DBSCAN; sets the epsilon paramter(default = 500)", required=False)
parser.add_argument('--epsilon' ,type=float, default = 500, help="used together with --DBSCAN; sets the epsilon paramter(default = 500)", required=False)
parser.add_argument('--min_pts' ,type=int, default = 2, help="used together with 1--DBSCAN; sets the min_pts parameter(default = 2)", required=False)
parser.add_argument('--prefix', type=str,default="SVDB" ,help="the prefix of the output file, default = same as input")
parser.add_argument('--memory' , help="load the database into memory: increases the memory requirements, but lowers the time consumption", required=False, action="store_true")
Expand Down
37 changes: 20 additions & 17 deletions svdb/export_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,26 +217,29 @@ def dbscan_export(args,sample_IDs):
chrB_list=[]
for chrB in c.execute('SELECT DISTINCT chrB FROM SVDB'):
chrB_list.append(chrB[0])

var_list=[]
for variant in c.execute('SELECT DISTINCT var FROM SVDB'):
var_list.append(variant[0])

i=0;
for chrA in chrA_list:
for chrB in chrB_list:
chr_db={}
for hit in c.execute('SELECT posA,posB,sample,idx,var FROM SVDB WHERE chrA == \'{}\' AND chrB == \'{}\''.format(chrA,chrB)):
if not hit[-1] in chr_db:
chr_db[ hit[-1] ]={}
chr_db[ hit[-1] ]["coordinates"]=[]
chr_db[ hit[-1] ]["var_info"]=[]
chr_db[ hit[-1] ]["index"]=[]

chr_db[ hit[-1] ]["coordinates"].append([hit[0],hit[1]])
chr_db[ hit[-1] ]["var_info"].append(hit[2])
chr_db[hit[-1]]["index"].append(hit[-2])

for variant in chr_db:
chr_db[variant]["coordinates"]=np.array(chr_db[variant]["coordinates"])
chr_db[variant]["var_info"]=np.array(chr_db[variant]["var_info"])
chr_db[variant]["index"]=np.array(chr_db[variant]["index"])

for variant in var_list:
chr_db={}
chr_db[ variant ]={}

hits = c.execute('SELECT posA,posB,sample,idx,var FROM SVDB WHERE var == \'{}\'AND chrA == \'{}\' AND chrB == \'{}\''.format(variant,chrA,chrB)).fetchall()
if not hits:
continue

x=[v[0] for v in hits]
y=[v[1] for v in hits]

chr_db[variant]["coordinates"]=np.column_stack((x,y))
chr_db[variant]["var_info"]=np.array([v[2] for v in hits])
chr_db[variant]["index"]=np.array([v[3] for v in hits])

db = DBSCAN(eps=args.epsilon, min_samples=args.min_pts).fit(chr_db[variant]["coordinates"])

core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
Expand Down

0 comments on commit 24672a7

Please sign in to comment.