modified: setup.py

modified: svdb/__main__.py modified: svdb/merge_vcf_module.py modified: svdb/merge_vcf_module_cython.py
J35P312 · Sep 6, 2017 · 1aae0b5 · 1aae0b5
1 parent d11f26e
commit 1aae0b5
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 7 deletions.
diff --git a/setup.py b/setup.py
@@ -20,7 +20,7 @@
 
 setup(
     name = 'svdb',
-    version = '1.0.6',
+    version = '1.0.7',
     ext_modules = ext_modules,
     packages = ['svdb'],
     install_requires = ['numpy', 'scikit-learn==0.15.2', 'scipy'],

diff --git a/svdb/__main__.py b/svdb/__main__.py
@@ -9,7 +9,7 @@
 from . import bed_annotation_module
 
 def main():
-    version = "1.0.6"
+    version = "1.0.7"
     parser = argparse.ArgumentParser("""SVDB-{}, use the build module to construct databases, use the query module to query the database usign vcf files, or use the hist module to generate histograms""".format(version),add_help=False)
     parser.add_argument('--build'       , help="create a db", required=False, action="store_true")
     parser.add_argument('--hist'        , help="generate histograms o the performance of a db", required=False, action="store_true")
@@ -127,6 +127,7 @@ def main():
     elif args.merge:
         parser = argparse.ArgumentParser("""SVDB-{}: vcf_merge module""".format(version))
         parser.add_argument('--merge', help="merge structural variants", required=False, action="store_true")
+        parser.add_argument('--notag', help="Do not add the the VARID and set entries to the info field", required=False, action="store_true")
         parser.add_argument('--vcf', nargs='*', type=str, help="input vcf files, all input vcf files will be merged into one. Use the --prioriy flag to prioritize the callers/vcf files",required=True)
         parser.add_argument('--priority', type=str, help="prioritise the input files, using the following format --vcf caller1.vcf:2 caller2.vcf:1 --priority: 1,2")
         parser.add_argument('--bnd_distance', type=int,default= 2000,help="the maximum distance between two similar precise breakpoints(default = 2000)")

diff --git a/svdb/merge_vcf_module.py b/svdb/merge_vcf_module.py
@@ -37,7 +37,8 @@ def print_header(vcf_list,vcf_dictionary,args,command_line):
                         for sample in vcf_columns[9:]:                           
                             sample_order[sample][vcf_dictionary[vcf]]=i
                             i += 1
-
+                elif "<ID=VARID," in line or "<ID=set," in line:
+                    continue
                 elif line[0] == line[1] and "=" in line:
                     if("ID=" in line and not "##contig=<ID=" in line):
                         field=line.split("=")[2].split(",")[0]
@@ -90,7 +91,9 @@ def print_header(vcf_list,vcf_dictionary,args,command_line):
     #print subheaders
     for entry in sorted(subheader):
         print(subheader[entry].strip())
-    print("##INFO=<ID=VARID,Number=1,Type=String,Description=\"The variant ID of merged samples\">")
+    if not args.notag:
+        print("##INFO=<ID=VARID,Number=1,Type=String,Description=\"The variant ID of merged samples\">")
+        print("##INFO=<ID=set,Number=1,Type=String,Description=\"Source VCF for the merged record in SVDB\">")
     print("##svdbcmdline={}".format(" ".join(command_line)))
     sample_print_order={}
     if sample_ids:

diff --git a/svdb/merge_vcf_module_cython.py b/svdb/merge_vcf_module_cython.py
@@ -9,6 +9,31 @@ def retrieve_key(line,key):
             return(False)
     return(item)
 
+def determine_set_tag(priority_order,files):  # Build the string that merge() appends to the INFO column as "set=<value>".
+    n_filtered=0  # entries of priority_order present in files whose field 6 is neither "PASS" nor "."
+    n_pass=0  # entries of priority_order present in files whose field 6 is "PASS" or "."
+    # files maps an entry name to a tab-separated record line; index 6 is presumably the VCF FILTER column (confirm).
+    filtered=[]  # labels for the mixed case; despite the name it also receives the passing entries
+    for sample in priority_order:
+        if sample in files:  # entries missing from files are counted in neither tally
+            if files[sample].split("\t")[6] == "PASS" or files[sample].split("\t")[6] == ".":
+                n_pass+=1
+            else:
+                n_filtered += 1
+    if n_pass == len(priority_order):  # every entry is present and passing (also true when priority_order is empty)
+        return("Intersection")
+    elif n_filtered == len(priority_order):  # every entry is present and none of them passes
+        return("FilteredInAll")
+    else:  # mixed outcome or some entries absent: label each present entry, keeping priority order
+        for sample in priority_order:
+            if not sample in files:
+                continue
+            elif files[sample].split("\t")[6] == "PASS" or files[sample].split("\t")[6] == ".":
+                filtered.append(sample)                
+            else:
+                filtered.append("filterIn" + sample)  # "filterIn" prefix marks an entry whose record did not pass
+        return("-".join(filtered))  # e.g. "a-filterInb" when a passes and b is filtered
+
 def get_CIPOS_CEND(query_variant):
     ciA_query=[0,0]
     CIPOS=retrieve_key(query_variant[-1],"CIPOS")
@@ -105,7 +130,7 @@ def sort_format_field(line,samples,sample_order,sample_print_order,priority_orde
                         format_string.append(",".join(sub_entry))
                 else:
                     if entry == "GT":
-                        format_string.append("0/0")
+                        format_string.append("./.")
                     else:
                         sub_entry=[]
                         for i in range(0,format_entry_length[j]+1):
@@ -229,8 +254,16 @@ def merge(variants,samples,sample_order,sample_print_order,priority_order,args):
                     representing_file = variants[chrA][i][-3].replace(".vcf","").split("/")[-1]
 
                 line=sort_format_field(line,samples,sample_order,sample_print_order,priority_order,files, representing_file,args)
-                if merge:
-                    line[7] += ";VARID=" + "|".join(merge)                
+                if merge and not args.notag:
+                    line[7] += ";VARID=" + "|".join(merge)
+                #print "printing"
+                #print samples
+                #print priority_order
+                #print files
+                #print representing_file
+                if not args.notag:
+                    set_tag=determine_set_tag(priority_order,files)
+                    line[7] += ";set={}".format(set_tag);                
                 to_be_printed[line[0]].append(line)
 
             analysed_variants.add(i)