sv2 v1.3 with -M option for legacy bwa mem -M BAM files: split-reads …

…are flagged as secondary instead of supplementary
dantaki · Jun 23, 2017 · e3e1d01 · e3e1d01
1 parent e243931
commit e3e1d01
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 33 deletions.
diff --git a/README.md b/README.md
@@ -65,7 +65,7 @@ SV<sup>2</sup> can be installed with `pip install` or can be [manually installed
 ### Install with `pip`
 
 ```
-pip install http://downloads.sourceforge.net/project/sv2/sv2-1.2.tar.gz
+pip install http://downloads.sourceforge.net/project/sv2/sv2-1.3.tar.gz
 ```
 
 ## Configure SV<sup>2</sup>
@@ -85,6 +85,7 @@ Genotyping Option | Description
 -c \| -cpu | Parallelize sample-wise. 1 per CPU 
 -g \| -genome | Reference genome build [hg19, hg38]. Default: hg19
 -pcrfree | GC content normalization for PCR-free libraries
+-M | bwa mem -M compatibility. Split-reads flagged as secondary instead of supplementary
 -s \| -seed | Random seed for genome shuffling in preprocessing. Default: 42
 -o \| -out | Output name
 -pre | Preprocessing output directory. *Skips preprocessing*

diff --git a/doc/README.md b/doc/README.md
@@ -114,17 +114,17 @@ Each classifier, with the exception of Duplication SNV implements depth of cover
 *Recommended*
 
 ```
-pip install http://downloads.sourceforge.net/project/sv2/sv2-1.2.tar.gz
+pip install http://downloads.sourceforge.net/project/sv2/sv2-1.3.tar.gz
 ```
 
 ### Manual Install
 
 > [Source Files :floppy_disk:](#source-files)
 
 ```
-wget http://downloads.sourceforge.net/project/sv2/sv2-1.2.zip # sv2-1.2.tar.gz also available
-unzip sv2-1.2.zip
-cd sv2-1.2/
+wget http://downloads.sourceforge.net/project/sv2/sv2-1.3.zip # sv2-1.3.tar.gz also available
+unzip sv2-1.3.zip
+cd sv2-1.3/
 
 python setup.py install [--prefix <PYTHONPATH>]
 ```
@@ -145,9 +145,10 @@ sv2 -hg19 <hg19.fasta> -hg38 <hg38.fasta>
 
 ### Check Installation
 
-
 ```
-usage: sv2 [-h] [-i I] [-r R] [-c C] [-g G] [-pcrfree] [-s S] [-o O]
+$ sv2 -help 
+
+usage: sv2 [-h] [-i I] [-r R] [-c C] [-g G] [-pcrfree] [-M] [-s S] [-o O]
            [-pre PRE] [-feats FEATS] [-hg19 HG19] [-hg38 HG38]
 
                        ____
@@ -157,7 +158,7 @@ usage: sv2 [-h] [-i I] [-r R] [-c C] [-g G] [-pcrfree] [-s S] [-o O]
  /        \  \     /
 /_________/   \___/
 Support Vector Structural Variation Genotyper
-Version 1.2        Author: Danny Antaki <dantaki at ucsd dot edu>
+Version 1.3        Author: Danny Antaki <dantaki at ucsd dot edu>
 
 optional arguments:
   -h, --help       show this help message and exit
@@ -168,6 +169,7 @@ genotype arguments:
   -c C, -cpu C     Parallelize sample-wise. 1 per cpu
   -g G, -genome G  Reference genome build [ hg19, hg38 ]
   -pcrfree         GC content normalization for PCR free libraries
+  -M               bwa mem -M compatibility. Split-reads flagged as secondary instead of supplementary
   -s S, -seed S    Preprocessing: integer seed for genome shuffling
   -o O, -out O     output
   -pre PRE         Preprocessing output directory
@@ -382,22 +384,24 @@ ls *sv2_input.txt
 ##### Run SV<sup>2</sup> for individual samples
 
 ```
-sv2 -i HG00096_sv2_input.txt -r chr21_forestSV.bed -o HG00096_sv2
-sv2 -i HG01051_sv2_input.txt -r chr21_forestSV.bed -o HG01051_sv2
+sv2 -i HG00096_sv2_input.txt -r chr21_forestSV.bed -M -o HG00096_sv2
+sv2 -i HG01051_sv2_input.txt -r chr21_forestSV.bed -M -o HG01051_sv2
 ```
 
+**Note**: since the BAM files were generated with the legacy `-M` option that flags split-reads as secondary, supply the `-M` flag to SV<sup>2</sup>
+
 ##### Run SV<sup>2</sup> for multiple samples
 
 ```
-sv2 -i sv2_input.txt -r chr21_forestSV.bed -o sv2_forestSV
+sv2 -i sv2_input.txt -r chr21_forestSV.bed -M -o sv2_forestSV
 ```
 
 ##### Parallelize by Sample
 
 When given multiple samples, SV<sup>2</sup> can run samples in parallel reducing run time.
 
 ```
-sv2 -i sv2_input.txt -r chr21_forestSV.bed -c 2 -o sv2_forestSV
+sv2 -i sv2_input.txt -r chr21_forestSV.bed -M -c 2 -o sv2_forestSV
 ```
 
 #### Genotype SV: Skip Preprocessing
@@ -411,7 +415,7 @@ Skipping preprocessing allows the user to genotype another set of SVs with the s
 
 mv sv2_features/ sv2_forestSV_features/
 
-sv2 -i sv2_input.txt -r ALL.wgs.integrated_sv_map_v1.20130502.chr21.sv.genotypes.vcf -pre sv2_preprocessing/ -o sv2_tut_1KGP
+sv2 -i sv2_input.txt -r ALL.wgs.integrated_sv_map_v1.20130502.chr21.sv.genotypes.vcf -M -pre sv2_preprocessing/ -o sv2_tut_1KGP
 
 ```
 
@@ -422,7 +426,7 @@ Skipping feature extraction allows the user to combine or subset samples in the
 ```
 head -n 1 sv2_input.txt >sv2_subset.txt
 
-sv2 -i sv2_input.txt -r chr21_forestSV.bed -pre sv2_preprocessing/ -feats sv2_forestSV_features/ -o sv2_tut_forestSV_subset
+sv2 -i sv2_input.txt -r chr21_forestSV.bed -pre sv2_preprocessing/ -feats sv2_forestSV_features/ -M -o sv2_tut_forestSV_subset
 
 ```
 

diff --git a/sv2/core.pyx b/sv2/core.pyx
@@ -290,7 +290,7 @@ cdef GC(nuc):
 cdef MAD(a, double c=0.6745): return np.around(np.median(np.fabs(a-np.median(a))/c),decimals=3)
 cdef normalize_chr_cov(double read_count, double chr_size, read_length): return np.around( (read_count/chr_size)*np.median(read_length),decimals=2)
 cdef normalize_coverage(double read_count, double span, double chr_cov, double read_length): return (((read_count/span)*read_length)/chr_cov)
-cdef count_reads(cnv_list,bam,ci,chrFlag):
+cdef count_reads(cnv_list,bam,ci,chrFlag,legacyM):
 	"""count reads within SV span for coverage estimation"""
 	cdef unsigned int read_count
 	cdef unsigned int bp_span
@@ -309,7 +309,7 @@ cdef count_reads(cnv_list,bam,ci,chrFlag):
 			   or read.is_proper_pair == False
 			   or read.is_qcfail == True
 			   or read.mapping_quality < 10
-			   or read.is_secondary
+			   or ( (read.is_secondary and legacyM == True) or (read.is_supplementary and legacyM == False) )
 			   or read.is_unmapped
 			   or read.mate_is_unmapped
 			   or read.is_duplicate
@@ -349,23 +349,26 @@ cdef is_discordant(read,windows,ci,matepos):
 			return True
 		else:
 			return False
-cdef is_split(read,windows,c):
+cdef is_split(read,windows,c,legacyM):
 	"""returns True if read is split"""
 	((s1,e1),(s2,e2)) = windows
-	if read.is_secondary == True:
-		second_align = read.get_tag("SA").split(',')
-		if second_align[0] != c:
-			"""return False if maps to other chromosome"""
-			return False
+	second_align=[]
+	if legacyM == True:
+		if read.is_secondary == True: second_align = read.get_tag("SA").split(',')
+	else: 
+		if read.is_supplementary == True: second_align = read.get_tag("SA").split(',')
+	if len(second_align) == 0: return False
+	else: 
+		if second_align[0] != c: return False
 		else:
 			second_align[1] = int(second_align[1])
 			if ( ((int(s1) <= read.pos+1 <= int(e1)) and (int(s2) <= second_align[1] <= int(e2)))
 			 or ((int(s2) <= read.pos+1 <= int(e2)) and (int(s1) <= second_align[1] <= int(e1)))):
-				"""secondary alignment must be near the opposite breakpoint of the primary alignment"""
+				"""supplementary/secondary alignment must be near the opposite breakpoint of the primary alignment"""
 				return True
 			else:
 				return False
-cdef discordant_split_cnv(flank_list,bam,size,ci,windows,chrFlag):
+cdef discordant_split_cnv(flank_list,bam,size,ci,windows,chrFlag,legacyM):
 	"""count discordant paired ends and split reads"""
 	cdef unsigned int discordant_count
 	cdef unsigned int split_count
@@ -390,10 +393,10 @@ cdef discordant_split_cnv(flank_list,bam,size,ci,windows,chrFlag):
 				else:
 					mate_pos = read.pnext
 					if is_discordant(read,windows,ci,mate_pos) == True: discordant_count+=1
-					if is_split(read,windows,c)  == True: split_count+=1
+					if is_split(read,windows,c,legacyM)  == True: split_count+=1
 					if (read.is_proper_pair
 					and read.mapping_quality >= 10
-					and not read.is_secondary
+					and ( (not read.is_secondary and legacyM == True) or (not read.is_supplementary and legacyM == False) ) 
 					and abs(read.tlen) < ci):
 						concordant_count+=1
 					else: continue
@@ -657,7 +660,7 @@ class Prepro(object):
 		self.insert_mad=mad
 		self.read_len=read_length
 		self.snp_cov=snp_cov
-def extract_feats(iid,bamfh,vcffh,cnv,prefh,gender,out,gen,pcrfree):
+def extract_feats(iid,bamfh,vcffh,cnv,prefh,gender,out,gen,pcrfree,legacyM):
 	master_cnv = filter_calls(cnv,gen)
 	gc_dict = gc_norm(pcrfree)
 	cnv_gc=gc_cnv(cnv,master_cnv,gen)
@@ -682,10 +685,10 @@ def extract_feats(iid,bamfh,vcffh,cnv,prefh,gender,out,gen,pcrfree):
 		(cnv_span,flank_span,windows) = master_cnv[call]
 		size = int(e)-int(s)+1
 		ci = insert_size+(5*insert_mad)
-		discordant,split,concordant = discordant_split_cnv(flank_span,bam,size,ci,windows,chrFlag)
+		discordant,split,concordant = discordant_split_cnv(flank_span,bam,size,ci,windows,chrFlag,legacyM)
 		if size > 1000:
 			"""read count coverage estimation"""
-			(read_count,bp_span) = count_reads(cnv_span,bam,ci,chrFlag)
+			(read_count,bp_span) = count_reads(cnv_span,bam,ci,chrFlag,legacyM)
 			cov=float('nan')
 			normc=c
 			if gender == 'M' and checkPAR('{} {} {}'.format(c,s,e),gen)== True: normc='GENOME'

diff --git a/sv2/sv2.py b/sv2/sv2.py
@@ -1,4 +1,4 @@
-__version__='1.2'
+__version__='1.3'
 import sys,os,argparse
 from .core import writeConfig,checkConfig,check_in,Bed,check_cnv,errFH,reportTime,preprocess,extract_feats,genotype,annotate
 from argparse import RawTextHelpFormatter
@@ -7,7 +7,7 @@
 from time import time
 def main():
 	init_time = int(time())
-	splash='\n                       ____\n  _____________   ___ |___ \\\n /   _____/\   \ /   // ___/\n \_____  \  \   Y   //_____)\n /        \  \     /\n/_________/   \___/\nSupport Vector Structural Variation Genotyper\nVersion 1.2        Author: Danny Antaki <dantaki at ucsd dot edu>\n'
+	splash='\n                       ____\n  _____________   ___ |___ \\\n /   _____/\   \ /   // ___/\n \_____  \  \   Y   //_____)\n /        \  \     /\n/_________/   \___/\nSupport Vector Structural Variation Genotyper\nVersion 1.3        Author: Danny Antaki <dantaki at ucsd dot edu>\n'
 	parser = argparse.ArgumentParser(description=splash,formatter_class=RawTextHelpFormatter)
 	genoArgs = parser.add_argument_group('genotype arguments')
 	configArgs = parser.add_argument_group('configure arguments')
@@ -16,6 +16,7 @@ def main():
 	genoArgs.add_argument('-c','-cpu', help='Parallelize sample-wise. 1 per cpu',required=False,default=1,type=int)
 	genoArgs.add_argument('-g','-genome',  help='Reference genome build [ hg19, hg38 ]',required=False,default='hg19',type=str)
 	genoArgs.add_argument('-pcrfree',  help='GC content normalization for PCR free libraries',required=False,default=False,action="store_true")
+	genoArgs.add_argument('-M', help='bwa mem -M compatibility. Split-reads flagged as secondary instead of supplementary',default=False,required=False,action="store_true")
 	genoArgs.add_argument('-s','-seed', help='Preprocessing: integer seed for genome shuffling',required=False,default=42,type=int)
 	genoArgs.add_argument('-o','-out', help='output',required=False,default="sv2_genotypes.vcf",type=str)
 	genoArgs.add_argument('-pre', help='Preprocessing output directory',required=False,default=None)
@@ -28,6 +29,7 @@ def main():
 	cores = args.c
 	gen = args.g
 	pcrfree = args.pcrfree
+	legacyM = args.M
 	ofh = args.o
 	seed = args.s
 	predir= args.pre
@@ -93,7 +95,7 @@ def main():
 				prefh = preprocess_files[bam_id]
 				gtofh = bam_id+'_sv2_features.txt'
 				feats_files[bam_id]=os.getcwd()+'/sv2_features/'+gtofh
-				pool.apply_async(extract_feats, args=(bam_id,bam_dict[bam_id],vcf_dict[bam_id],cnv,prefh,gender_dict[bam_id],gtofh,gen,pcrfree) )
+				pool.apply_async(extract_feats, args=(bam_id,bam_dict[bam_id],vcf_dict[bam_id],cnv,prefh,gender_dict[bam_id],gtofh,gen,pcrfree,legacyM) )
 			pool.close()
 			pool.join()
 		else:
@@ -104,7 +106,7 @@ def main():
 				prefh = preprocess_files[bam_id]
 				gtofh = bam_id+'_sv2_features.txt'
 				feats_files[bam_id]=os.getcwd()+'/sv2_features/'+gtofh
-				extract_feats(bam_id,bam_dict[bam_id],vcf_dict[bam_id],cnv,prefh,gender_dict[bam_id],gtofh,gen,pcrfree)
+				extract_feats(bam_id,bam_dict[bam_id],vcf_dict[bam_id],cnv,prefh,gender_dict[bam_id],gtofh,gen,pcrfree,legacyM)
 	else: 
 		if not featsdir.endswith('/'): featsdir=featsdir+'/'
 		if not os.path.isdir(featsdir): errFH(featsdir)