modified: README.md

modified: setup.py modified: svdb/__main__.py modified: svdb/export_module.py
J35P312 · Mar 8, 2017 · 24672a7 · 24672a7
1 parent d688531
commit 24672a7
Show file tree

Hide file tree

Showing 4 changed files with 40 additions and 45 deletions.
diff --git a/README.md b/README.md
@@ -91,41 +91,33 @@ Hist: This module is used to compare structural variant vcf files, either by gen
                                         on the confidence interval of the position of the
                                         variants(0 if no CIPOS or CIEND is available)
 
-Query: The query module is used to query a structural variant database. Typically a database is constructed using the build module. However, since this module utilize the genotype field of the sructural variant database vcf to compute the frequency of structural variants, a wide range of files could be used as database. The query module requires a query vcf, as well as a database vcf:
+Query: The query module is used to query a structural variant database. Typically a database is constructed using the build module. However, since this module utilizes the genotype field of the structural variant database vcf to compute the frequency of structural variants, a wide range of files could be used as database. The query module requires a query vcf, as well as a database file (either a multisample vcf or an SVDB sqlite database):
 
     print a help message
        python SVDB.py --query --help
     Query a structural variant database, using a vcf file as query:
         svdb --query --query_vcf patient1.vcf --db control_db.vcf
+	The vcf may be an exported SVDB database or a multisample vcf. The frequency used for each variant is computed from the format fields of the vcf.
+	The SVDB sqlite database may also be used for querying:
         svdb --query --query_vcf patient1.vcf --sqdb control_db.db
 
     optional arguments:
-        -h, --help                      show this help message and exit
 
-        --hit_tag HIT_TAG               the tag used to describe the number of hits within the
-                                        info field of the output vcf(default=OCC)
-                                        
-        --frequency_tag FREQUENCY_TAG   the tag used to describe the frequency of the
-                                        variant(defualt=FRQ)
-                        
-        --prefix PREFIX                 the prefix of the output file, default = print to stdout
+		-h, --help            show this help message and exit
+		--db DB               path to a SVDB db vcf
+		--sqdb SQDB           path to a SVDB sqlite db
+		--hit_tag HIT_TAG     the tag used to describe the number of hits within the info field of the output vcf(default=OCC)
+		--frequency_tag FREQUENCY_TAG the tag used to describe the frequency of the variant(default=FRQ)
+		--prefix PREFIX       the prefix of the output file, default = print to stdout --bnd_distance BND_DISTANCE the maximum distance between the breakpoints of two variants(default = 10000)
+		--overlap OVERLAP     the overlap threshold for deciding if two variants are similar(0 means anything that touches will be merged, 1 means that two events must be identical to be merged), default = 0.6
+		--DBSCAN              use dbscan to cluster the variants, only available for the sqlite db, upon choosing DBSCAN, the overlap
+		--epsilon EPSILON     used together with --DBSCAN; sets the epsilon parameter(default = 500)
+		--min_pts MIN_PTS     used together with --DBSCAN; sets the min_pts parameter(default = 2)
+		--memory              load the database into memory: increases the memory requirements, but lowers the time consumption(may only be used with sqdb)
+		--no_var              count overlapping variants of different type as hits in the db
+		--invert              invert the sorting order so that high frequency samples are present on top of the output vcf
+		--ci				  overrides overlap and bnd_distance, determine hits based on the confidence interval of the position of the variants(0 if no CIPOS or CIEND is available)
 
-        --bnd_distance BND_DISTANCE     the maximum distance between two similar precise breakpoints
-                                        (default = 10000)
-                        
-                        
-        --overlap OVERLAP               the overlap required to merge two events(0 means
-                                        anything that touches will be merged, 1 means that two
-                                        events must be identical to be merged), default = 0.6
-                                        
-        --no_var                        count overlaping variants of different type as hits
-        
-        --invert                        invert the sorting order so that high frequency
-                                        samples are present on top of the output vcf
-                              
-        --ci                            overides overlap and bnd_distance,determine hits based
-                                        on the confidence interval of the position fo the
-                                        variants(0 if no CIPOS or CIEND is vailable)
 
 Purge: The purge module is used to remove entries from a database:
 

diff --git a/setup.py b/setup.py
@@ -20,7 +20,7 @@
 
 setup(
     name = 'svdb',
-    version = '0.1.0',
+    version = '0.1.1',
     ext_modules = ext_modules,
     packages = ['svdb'],
     install_requires = ['numpy', 'scikit-learn==0.15.2', 'scipy'],

diff --git a/svdb/__main__.py b/svdb/__main__.py
@@ -52,7 +52,7 @@ def main():
         parser.add_argument('--bnd_distance', type=int,default= 10000,help="the maximum distance between two similar precise breakpoints(default = 10000)")
         parser.add_argument('--overlap', type=float, default = 0.6,help="the overlap required to merge two events(0 means anything that touches will be merged, 1 means that two events must be identical to be merged), default = 0.6")
         parser.add_argument('--DBSCAN'       , help="use dbscan to cluster the variants, only available for the sqlite db", required=False, action="store_true")
-        parser.add_argument('--epsilon'       ,type=int, default = 500, help="used together with --DBSCAN; sets the epsilon paramter(default = 500)", required=False)
+        parser.add_argument('--epsilon'       ,type=float, default = 500, help="used together with --DBSCAN; sets the epsilon paramter(default = 500)", required=False)
         parser.add_argument('--min_pts'       ,type=int, default = 2, help="used together with 1--DBSCAN; sets the min_pts parameter(default = 2)", required=False)  
         parser.add_argument('--memory'       , help="load the database into memory: increases the memory requirements, but lowers the time consumption(may only be used with sqdb)", required=False, action="store_true")        
 
@@ -89,7 +89,7 @@ def main():
         parser.add_argument('--bnd_distance', type=int,default= 2500,help="the maximum distance between two similar precise breakpoints(default = 2500)")
         parser.add_argument('--overlap', type=float, default = 0.8,help="the overlap required to merge two events(0 means anything that touches will be merged, 1 means that two events must be identical to be merged), default = 0.8")
         parser.add_argument('--DBSCAN'       , help="use dbscan to cluster the variants", required=False, action="store_true")
-        parser.add_argument('--epsilon'       ,type=int, default = 500, help="used together with --DBSCAN; sets the epsilon paramter(default = 500)", required=False)
+        parser.add_argument('--epsilon'       ,type=float, default = 500, help="used together with --DBSCAN; sets the epsilon paramter(default = 500)", required=False)
         parser.add_argument('--min_pts'       ,type=int, default = 2, help="used together with 1--DBSCAN; sets the min_pts parameter(default = 2)", required=False)              
         parser.add_argument('--prefix', type=str,default="SVDB" ,help="the prefix of the output file, default = same as input")
         parser.add_argument('--memory'       , help="load the database into memory: increases the memory requirements, but lowers the time consumption", required=False, action="store_true")

diff --git a/svdb/export_module.py b/svdb/export_module.py
@@ -217,26 +217,29 @@ def dbscan_export(args,sample_IDs):
     chrB_list=[]    
     for chrB in c.execute('SELECT DISTINCT chrB FROM SVDB'):
         chrB_list.append(chrB[0])
+
+    var_list=[]
+    for variant in c.execute('SELECT DISTINCT var FROM SVDB'):
+        var_list.append(variant[0])
+
     i=0;
     for chrA in chrA_list:
         for chrB in chrB_list:
-            chr_db={}            
-            for hit in c.execute('SELECT posA,posB,sample,idx,var FROM SVDB WHERE chrA == \'{}\' AND chrB == \'{}\''.format(chrA,chrB)):
-                if not hit[-1] in chr_db:
-                    chr_db[ hit[-1] ]={}
-                    chr_db[ hit[-1] ]["coordinates"]=[]
-                    chr_db[ hit[-1] ]["var_info"]=[]
-                    chr_db[ hit[-1] ]["index"]=[]
-
-                chr_db[ hit[-1] ]["coordinates"].append([hit[0],hit[1]])
-                chr_db[ hit[-1] ]["var_info"].append(hit[2])
-                chr_db[hit[-1]]["index"].append(hit[-2])
-
-            for variant in chr_db:
-                chr_db[variant]["coordinates"]=np.array(chr_db[variant]["coordinates"])
-                chr_db[variant]["var_info"]=np.array(chr_db[variant]["var_info"])
-                chr_db[variant]["index"]=np.array(chr_db[variant]["index"])
-
+            for variant in var_list:
+                chr_db={}
+                chr_db[ variant ]={}
+
+                hits = c.execute('SELECT posA,posB,sample,idx,var FROM SVDB WHERE var == \'{}\'AND chrA == \'{}\' AND chrB == \'{}\''.format(variant,chrA,chrB)).fetchall()
+                if not hits:
+                   continue
+
+                x=[v[0] for v in hits]
+                y=[v[1] for v in hits]
+
+                chr_db[variant]["coordinates"]=np.column_stack((x,y))
+                chr_db[variant]["var_info"]=np.array([v[2] for v in hits])
+                chr_db[variant]["index"]=np.array([v[3] for v in hits])
+
                 db = DBSCAN(eps=args.epsilon, min_samples=args.min_pts).fit(chr_db[variant]["coordinates"])
 
                 core_samples_mask = np.zeros_like(db.labels_, dtype=bool)