diff --git a/README.md b/README.md index f016a73..2103383 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ SV2 can be installed with `pip install` or can be [manually installed ### Install with `pip` ``` -pip install http://downloads.sourceforge.net/project/sv2/sv2-1.2.tar.gz +pip install http://downloads.sourceforge.net/project/sv2/sv2-1.3.tar.gz ``` ## Configure SV2 @@ -85,6 +85,7 @@ Genotyping Option | Description -c \| -cpu | Parallelize sample-wise. 1 per CPU -g \| -genome | Reference genome build [hg19, hg38]. Default: hg19 -pcrfree | GC content normalization for PCR-free libraries +-M | bwa mem -M compatibility. Split-reads flagged as secondary instead of supplementary -s \| -seed | Random seed for genome shuffling in preprocessing. Default: 42 -o \| -out | Output name -pre | Preprocessing output directory. *Skips preprocessing* diff --git a/doc/README.md b/doc/README.md index bed4364..ea5d34a 100644 --- a/doc/README.md +++ b/doc/README.md @@ -114,7 +114,7 @@ Each classifier, with the exception of Duplication SNV implements depth of cover *Recommended* ``` -pip install http://downloads.sourceforge.net/project/sv2/sv2-1.2.tar.gz +pip install http://downloads.sourceforge.net/project/sv2/sv2-1.3.tar.gz ``` ### Manual Install @@ -122,9 +122,9 @@ pip install http://downloads.sourceforge.net/project/sv2/sv2-1.2.tar.gz > [Source Files :floppy_disk:](#source-files) ``` -wget http://downloads.sourceforge.net/project/sv2/sv2-1.2.zip # sv2-1.2.tar.gz also available -unzip sv2-1.2.zip -cd sv2-1.2/ +wget http://downloads.sourceforge.net/project/sv2/sv2-1.3.zip # sv2-1.3.tar.gz also available +unzip sv2-1.3.zip +cd sv2-1.3/ python setup.py install [--prefix ] ``` @@ -145,9 +145,10 @@ sv2 -hg19 -hg38 ### Check Installation - ``` -usage: sv2 [-h] [-i I] [-r R] [-c C] [-g G] [-pcrfree] [-s S] [-o O] +$ sv2 -help + +usage: sv2 [-h] [-i I] [-r R] [-c C] [-g G] [-pcrfree] [-M] [-s S] [-o O] [-pre PRE] [-feats FEATS] [-hg19 HG19] [-hg38 HG38] ____ @@ -157,7 +158,7 @@ usage: sv2 [-h] [-i I] [-r R] [-c C] [-g G] [-pcrfree] [-s S] [-o O] / \ \ / /_________/ \___/ Support Vector Structural Variation Genotyper -Version 1.2 Author: Danny Antaki +Version 1.3 Author: Danny Antaki optional arguments: -h, --help show this help message and exit @@ -168,6 +169,7 @@ genotype arguments: -c C, -cpu C Parallelize sample-wise. 1 per cpu -g G, -genome G Reference genome build [ hg19, hg38 ] -pcrfree GC content normalization for PCR free libraries + -M bwa mem -M compatibility. Split-reads flagged as secondary instead of supplementary -s S, -seed S Preprocessing: integer seed for genome shuffling -o O, -out O output -pre PRE Preprocessing output directory @@ -382,14 +384,16 @@ ls *sv2_input.txt ##### Run SV2 for individual samples ``` -sv2 -i HG00096_sv2_input.txt -r chr21_forestSV.bed -o HG00096_sv2 -sv2 -i HG01051_sv2_input.txt -r chr21_forestSV.bed -o HG01051_sv2 +sv2 -i HG00096_sv2_input.txt -r chr21_forestSV.bed -M -o HG00096_sv2 +sv2 -i HG01051_sv2_input.txt -r chr21_forestSV.bed -M -o HG01051_sv2 ``` +**Note**: since the BAM files were generated with the legacy `-M` option that flags split-reads as secondary, supply the `-M` flag to SV2 + ##### Run SV2 for multiple samples ``` -sv2 -i sv2_input.txt -r chr21_forestSV.bed -o sv2_forestSV +sv2 -i sv2_input.txt -r chr21_forestSV.bed -M -o sv2_forestSV ``` ##### Parallelize by Sample @@ -397,7 +401,7 @@ sv2 -i sv2_input.txt -r chr21_forestSV.bed -o sv2_forestSV When given multiple samples, SV2 can run samples in parallel reducing run time. ``` -sv2 -i sv2_input.txt -r chr21_forestSV.bed -c 2 -o sv2_forestSV +sv2 -i sv2_input.txt -r chr21_forestSV.bed -M -c 2 -o sv2_forestSV ``` #### Genotype SV: Skip Preprocessing @@ -411,7 +415,7 @@ Skipping preprocessing allows the user to genotype another set of SVs with the s mv sv2_features/ sv2_forestSV_features/ -sv2 -i sv2_input.txt -r ALL.wgs.integrated_sv_map_v1.20130502.chr21.sv.genotypes.vcf -pre sv2_preprocessing/ -o sv2_tut_1KGP +sv2 -i sv2_input.txt -r ALL.wgs.integrated_sv_map_v1.20130502.chr21.sv.genotypes.vcf -M -pre sv2_preprocessing/ -o sv2_tut_1KGP ``` @@ -422,7 +426,7 @@ Skipping feature extraction allows the user to combine or subset samples in the ``` head -n 1 sv2_input.txt >sv2_subset.txt -sv2 -i sv2_input.txt -r chr21_forestSV.bed -pre sv2_preprocessing/ -feats sv2_forestSV_features/ -o sv2_tut_forestSV_subset +sv2 -i sv2_input.txt -r chr21_forestSV.bed -pre sv2_preprocessing/ -feats sv2_forestSV_features/ -M -o sv2_tut_forestSV_subset ``` diff --git a/sv2/core.pyx b/sv2/core.pyx index c549402..6a2395d 100644 --- a/sv2/core.pyx +++ b/sv2/core.pyx @@ -290,7 +290,7 @@ cdef GC(nuc): cdef MAD(a, double c=0.6745): return np.around(np.median(np.fabs(a-np.median(a))/c),decimals=3) cdef normalize_chr_cov(double read_count, double chr_size, read_length): return np.around( (read_count/chr_size)*np.median(read_length),decimals=2) cdef normalize_coverage(double read_count, double span, double chr_cov, double read_length): return (((read_count/span)*read_length)/chr_cov) -cdef count_reads(cnv_list,bam,ci,chrFlag): +cdef count_reads(cnv_list,bam,ci,chrFlag,legacyM): """count reads within SV span for coverage estimation""" cdef unsigned int read_count cdef unsigned int bp_span @@ -309,7 +309,7 @@ cdef count_reads(cnv_list,bam,ci,chrFlag): or read.is_proper_pair == False or read.is_qcfail == True or read.mapping_quality < 10 - or read.is_secondary + or ( (read.is_secondary and legacyM == True) or (read.is_supplementary and legacyM == False) ) or read.is_unmapped or read.mate_is_unmapped or read.is_duplicate @@ -349,23 +349,26 @@ cdef is_discordant(read,windows,ci,matepos): return True else: return False -cdef is_split(read,windows,c): +cdef is_split(read,windows,c,legacyM): """returns True if read is split""" ((s1,e1),(s2,e2)) = windows - if read.is_secondary == True: - second_align = read.get_tag("SA").split(',') - if second_align[0] != c: - """return False if maps to other chromosome""" - return False + second_align=[] + if legacyM == True: + if read.is_secondary == True: second_align = read.get_tag("SA").split(',') + else: + if read.is_supplementary == True: second_align = read.get_tag("SA").split(',') + if len(second_align) == 0: return False + else: + if second_align[0] != c: return False else: second_align[1] = int(second_align[1]) if ( ((int(s1) <= read.pos+1 <= int(e1)) and (int(s2) <= second_align[1] <= int(e2))) or ((int(s2) <= read.pos+1 <= int(e2)) and (int(s1) <= second_align[1] <= int(e1)))): - """secondary alignment must be near the opposite breakpoint of the primary alignment""" + """supplementary/secondary alignment must be near the opposite breakpoint of the primary alignment""" return True else: return False -cdef discordant_split_cnv(flank_list,bam,size,ci,windows,chrFlag): +cdef discordant_split_cnv(flank_list,bam,size,ci,windows,chrFlag,legacyM): """count discordant paired ends and split reads""" cdef unsigned int discordant_count cdef unsigned int split_count @@ -390,10 +393,10 @@ cdef discordant_split_cnv(flank_list,bam,size,ci,windows,chrFlag): else: mate_pos = read.pnext if is_discordant(read,windows,ci,mate_pos) == True: discordant_count+=1 - if is_split(read,windows,c) == True: split_count+=1 + if is_split(read,windows,c,legacyM) == True: split_count+=1 if (read.is_proper_pair and read.mapping_quality >= 10 - and not read.is_secondary + and ( (not read.is_secondary and legacyM == True) or (not read.is_supplementary and legacyM == False) ) and abs(read.tlen) < ci): concordant_count+=1 else: continue @@ -657,7 +660,7 @@ class Prepro(object): self.insert_mad=mad self.read_len=read_length self.snp_cov=snp_cov -def extract_feats(iid,bamfh,vcffh,cnv,prefh,gender,out,gen,pcrfree): +def extract_feats(iid,bamfh,vcffh,cnv,prefh,gender,out,gen,pcrfree,legacyM): master_cnv = filter_calls(cnv,gen) gc_dict = gc_norm(pcrfree) cnv_gc=gc_cnv(cnv,master_cnv,gen) @@ -682,10 +685,10 @@ def extract_feats(iid,bamfh,vcffh,cnv,prefh,gender,out,gen,pcrfree): (cnv_span,flank_span,windows) = master_cnv[call] size = int(e)-int(s)+1 ci = insert_size+(5*insert_mad) - discordant,split,concordant = discordant_split_cnv(flank_span,bam,size,ci,windows,chrFlag) + discordant,split,concordant = discordant_split_cnv(flank_span,bam,size,ci,windows,chrFlag,legacyM) if size > 1000: """read count coverage estimation""" - (read_count,bp_span) = count_reads(cnv_span,bam,ci,chrFlag) + (read_count,bp_span) = count_reads(cnv_span,bam,ci,chrFlag,legacyM) cov=float('nan') normc=c if gender == 'M' and checkPAR('{} {} {}'.format(c,s,e),gen)== True: normc='GENOME' diff --git a/sv2/sv2.py b/sv2/sv2.py index 6fbcaac..5688b53 100755 --- a/sv2/sv2.py +++ b/sv2/sv2.py @@ -1,4 +1,4 @@ -__version__='1.2' +__version__='1.3' import sys,os,argparse from .core import writeConfig,checkConfig,check_in,Bed,check_cnv,errFH,reportTime,preprocess,extract_feats,genotype,annotate from argparse import RawTextHelpFormatter @@ -7,7 +7,7 @@ from time import time def main(): init_time = int(time()) - splash='\n ____\n _____________ ___ |___ \\\n / _____/\ \ / // ___/\n \_____ \ \ Y //_____)\n / \ \ /\n/_________/ \___/\nSupport Vector Structural Variation Genotyper\nVersion 1.2 Author: Danny Antaki \n' + splash='\n ____\n _____________ ___ |___ \\\n / _____/\ \ / // ___/\n \_____ \ \ Y //_____)\n / \ \ /\n/_________/ \___/\nSupport Vector Structural Variation Genotyper\nVersion 1.3 Author: Danny Antaki \n' parser = argparse.ArgumentParser(description=splash,formatter_class=RawTextHelpFormatter) genoArgs = parser.add_argument_group('genotype arguments') configArgs = parser.add_argument_group('configure arguments') @@ -16,6 +16,7 @@ def main(): genoArgs.add_argument('-c','-cpu', help='Parallelize sample-wise. 1 per cpu',required=False,default=1,type=int) genoArgs.add_argument('-g','-genome', help='Reference genome build [ hg19, hg38 ]',required=False,default='hg19',type=str) genoArgs.add_argument('-pcrfree', help='GC content normalization for PCR free libraries',required=False,default=False,action="store_true") + genoArgs.add_argument('-M', help='bwa mem -M compatibility. Split-reads flagged as secondary instead of supplementary',default=False,required=False,action="store_true") genoArgs.add_argument('-s','-seed', help='Preprocessing: integer seed for genome shuffling',required=False,default=42,type=int) genoArgs.add_argument('-o','-out', help='output',required=False,default="sv2_genotypes.vcf",type=str) genoArgs.add_argument('-pre', help='Preprocessing output directory',required=False,default=None) @@ -28,6 +29,7 @@ def main(): cores = args.c gen = args.g pcrfree = args.pcrfree + legacyM = args.M ofh = args.o seed = args.s predir= args.pre @@ -93,7 +95,7 @@ def main(): prefh = preprocess_files[bam_id] gtofh = bam_id+'_sv2_features.txt' feats_files[bam_id]=os.getcwd()+'/sv2_features/'+gtofh - pool.apply_async(extract_feats, args=(bam_id,bam_dict[bam_id],vcf_dict[bam_id],cnv,prefh,gender_dict[bam_id],gtofh,gen,pcrfree) ) + pool.apply_async(extract_feats, args=(bam_id,bam_dict[bam_id],vcf_dict[bam_id],cnv,prefh,gender_dict[bam_id],gtofh,gen,pcrfree,legacyM) ) pool.close() pool.join() else: @@ -104,7 +106,7 @@ def main(): prefh = preprocess_files[bam_id] gtofh = bam_id+'_sv2_features.txt' feats_files[bam_id]=os.getcwd()+'/sv2_features/'+gtofh - extract_feats(bam_id,bam_dict[bam_id],vcf_dict[bam_id],cnv,prefh,gender_dict[bam_id],gtofh,gen,pcrfree) + extract_feats(bam_id,bam_dict[bam_id],vcf_dict[bam_id],cnv,prefh,gender_dict[bam_id],gtofh,gen,pcrfree,legacyM) else: if not featsdir.endswith('/'): featsdir=featsdir+'/' if not os.path.isdir(featsdir): errFH(featsdir)