diff --git a/README.md b/README.md
index f016a73..2103383 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ SV2 can be installed with `pip install` or can be [manually installed
### Install with `pip`
```
-pip install http://downloads.sourceforge.net/project/sv2/sv2-1.2.tar.gz
+pip install http://downloads.sourceforge.net/project/sv2/sv2-1.3.tar.gz
```
## Configure SV2
@@ -85,6 +85,7 @@ Genotyping Option | Description
-c \| -cpu | Parallelize sample-wise. 1 per CPU
-g \| -genome | Reference genome build [hg19, hg38]. Default: hg19
-pcrfree | GC content normalization for PCR-free libraries
+-M | bwa mem -M compatibility. Split-reads flagged as secondary instead of supplementary
-s \| -seed | Random seed for genome shuffling in preprocessing. Default: 42
-o \| -out | Output name
-pre | Preprocessing output directory. *Skips preprocessing*
diff --git a/doc/README.md b/doc/README.md
index bed4364..ea5d34a 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -114,7 +114,7 @@ Each classifier, with the exception of Duplication SNV implements depth of cover
*Recommended*
```
-pip install http://downloads.sourceforge.net/project/sv2/sv2-1.2.tar.gz
+pip install http://downloads.sourceforge.net/project/sv2/sv2-1.3.tar.gz
```
### Manual Install
@@ -122,9 +122,9 @@ pip install http://downloads.sourceforge.net/project/sv2/sv2-1.2.tar.gz
> [Source Files :floppy_disk:](#source-files)
```
-wget http://downloads.sourceforge.net/project/sv2/sv2-1.2.zip # sv2-1.2.tar.gz also available
-unzip sv2-1.2.zip
-cd sv2-1.2/
+wget http://downloads.sourceforge.net/project/sv2/sv2-1.3.zip # sv2-1.3.tar.gz also available
+unzip sv2-1.3.zip
+cd sv2-1.3/
python setup.py install [--prefix ]
```
@@ -145,9 +145,10 @@ sv2 -hg19 -hg38
### Check Installation
-
```
-usage: sv2 [-h] [-i I] [-r R] [-c C] [-g G] [-pcrfree] [-s S] [-o O]
+$ sv2 -help
+
+usage: sv2 [-h] [-i I] [-r R] [-c C] [-g G] [-pcrfree] [-M] [-s S] [-o O]
[-pre PRE] [-feats FEATS] [-hg19 HG19] [-hg38 HG38]
____
@@ -157,7 +158,7 @@ usage: sv2 [-h] [-i I] [-r R] [-c C] [-g G] [-pcrfree] [-s S] [-o O]
/ \ \ /
/_________/ \___/
Support Vector Structural Variation Genotyper
-Version 1.2 Author: Danny Antaki
+Version 1.3 Author: Danny Antaki
optional arguments:
-h, --help show this help message and exit
@@ -168,6 +169,7 @@ genotype arguments:
-c C, -cpu C Parallelize sample-wise. 1 per cpu
-g G, -genome G Reference genome build [ hg19, hg38 ]
-pcrfree GC content normalization for PCR free libraries
+ -M bwa mem -M compatibility. Split-reads flagged as secondary instead of supplementary
-s S, -seed S Preprocessing: integer seed for genome shuffling
-o O, -out O output
-pre PRE Preprocessing output directory
@@ -382,14 +384,16 @@ ls *sv2_input.txt
##### Run SV2 for individual samples
```
-sv2 -i HG00096_sv2_input.txt -r chr21_forestSV.bed -o HG00096_sv2
-sv2 -i HG01051_sv2_input.txt -r chr21_forestSV.bed -o HG01051_sv2
+sv2 -i HG00096_sv2_input.txt -r chr21_forestSV.bed -M -o HG00096_sv2
+sv2 -i HG01051_sv2_input.txt -r chr21_forestSV.bed -M -o HG01051_sv2
```
+**Note**: since the BAM files were generated with the legacy `-M` option that flags split-reads as secondary, supply the `-M` flag to SV2
+
##### Run SV2 for multiple samples
```
-sv2 -i sv2_input.txt -r chr21_forestSV.bed -o sv2_forestSV
+sv2 -i sv2_input.txt -r chr21_forestSV.bed -M -o sv2_forestSV
```
##### Parallelize by Sample
@@ -397,7 +401,7 @@ sv2 -i sv2_input.txt -r chr21_forestSV.bed -o sv2_forestSV
When given multiple samples, SV2 can run samples in parallel reducing run time.
```
-sv2 -i sv2_input.txt -r chr21_forestSV.bed -c 2 -o sv2_forestSV
+sv2 -i sv2_input.txt -r chr21_forestSV.bed -M -c 2 -o sv2_forestSV
```
#### Genotype SV: Skip Preprocessing
@@ -411,7 +415,7 @@ Skipping preprocessing allows the user to genotype another set of SVs with the s
mv sv2_features/ sv2_forestSV_features/
-sv2 -i sv2_input.txt -r ALL.wgs.integrated_sv_map_v1.20130502.chr21.sv.genotypes.vcf -pre sv2_preprocessing/ -o sv2_tut_1KGP
+sv2 -i sv2_input.txt -r ALL.wgs.integrated_sv_map_v1.20130502.chr21.sv.genotypes.vcf -M -pre sv2_preprocessing/ -o sv2_tut_1KGP
```
@@ -422,7 +426,7 @@ Skipping feature extraction allows the user to combine or subset samples in the
```
head -n 1 sv2_input.txt >sv2_subset.txt
-sv2 -i sv2_input.txt -r chr21_forestSV.bed -pre sv2_preprocessing/ -feats sv2_forestSV_features/ -o sv2_tut_forestSV_subset
+sv2 -i sv2_input.txt -r chr21_forestSV.bed -pre sv2_preprocessing/ -feats sv2_forestSV_features/ -M -o sv2_tut_forestSV_subset
```
diff --git a/sv2/core.pyx b/sv2/core.pyx
index c549402..6a2395d 100644
--- a/sv2/core.pyx
+++ b/sv2/core.pyx
@@ -290,7 +290,7 @@ cdef GC(nuc):
cdef MAD(a, double c=0.6745): return np.around(np.median(np.fabs(a-np.median(a))/c),decimals=3)
cdef normalize_chr_cov(double read_count, double chr_size, read_length): return np.around( (read_count/chr_size)*np.median(read_length),decimals=2)
cdef normalize_coverage(double read_count, double span, double chr_cov, double read_length): return (((read_count/span)*read_length)/chr_cov)
-cdef count_reads(cnv_list,bam,ci,chrFlag):
+cdef count_reads(cnv_list,bam,ci,chrFlag,legacyM):
"""count reads within SV span for coverage estimation"""
cdef unsigned int read_count
cdef unsigned int bp_span
@@ -309,7 +309,7 @@ cdef count_reads(cnv_list,bam,ci,chrFlag):
or read.is_proper_pair == False
or read.is_qcfail == True
or read.mapping_quality < 10
- or read.is_secondary
+ or ( (read.is_secondary and legacyM == True) or (read.is_supplementary and legacyM == False) )
or read.is_unmapped
or read.mate_is_unmapped
or read.is_duplicate
@@ -349,23 +349,26 @@ cdef is_discordant(read,windows,ci,matepos):
return True
else:
return False
-cdef is_split(read,windows,c):
+cdef is_split(read,windows,c,legacyM):
"""returns True if read is split"""
((s1,e1),(s2,e2)) = windows
- if read.is_secondary == True:
- second_align = read.get_tag("SA").split(',')
- if second_align[0] != c:
- """return False if maps to other chromosome"""
- return False
+ second_align=[]
+ if legacyM == True:
+ if read.is_secondary == True: second_align = read.get_tag("SA").split(',')
+ else:
+ if read.is_supplementary == True: second_align = read.get_tag("SA").split(',')
+ if len(second_align) == 0: return False
+ else:
+ if second_align[0] != c: return False
else:
second_align[1] = int(second_align[1])
if ( ((int(s1) <= read.pos+1 <= int(e1)) and (int(s2) <= second_align[1] <= int(e2)))
or ((int(s2) <= read.pos+1 <= int(e2)) and (int(s1) <= second_align[1] <= int(e1)))):
- """secondary alignment must be near the opposite breakpoint of the primary alignment"""
+ """supplementary/secondary alignment must be near the opposite breakpoint of the primary alignment"""
return True
else:
return False
-cdef discordant_split_cnv(flank_list,bam,size,ci,windows,chrFlag):
+cdef discordant_split_cnv(flank_list,bam,size,ci,windows,chrFlag,legacyM):
"""count discordant paired ends and split reads"""
cdef unsigned int discordant_count
cdef unsigned int split_count
@@ -390,10 +393,10 @@ cdef discordant_split_cnv(flank_list,bam,size,ci,windows,chrFlag):
else:
mate_pos = read.pnext
if is_discordant(read,windows,ci,mate_pos) == True: discordant_count+=1
- if is_split(read,windows,c) == True: split_count+=1
+ if is_split(read,windows,c,legacyM) == True: split_count+=1
if (read.is_proper_pair
and read.mapping_quality >= 10
- and not read.is_secondary
+ and ( (not read.is_secondary and legacyM == True) or (not read.is_supplementary and legacyM == False) )
and abs(read.tlen) < ci):
concordant_count+=1
else: continue
@@ -657,7 +660,7 @@ class Prepro(object):
self.insert_mad=mad
self.read_len=read_length
self.snp_cov=snp_cov
-def extract_feats(iid,bamfh,vcffh,cnv,prefh,gender,out,gen,pcrfree):
+def extract_feats(iid,bamfh,vcffh,cnv,prefh,gender,out,gen,pcrfree,legacyM):
master_cnv = filter_calls(cnv,gen)
gc_dict = gc_norm(pcrfree)
cnv_gc=gc_cnv(cnv,master_cnv,gen)
@@ -682,10 +685,10 @@ def extract_feats(iid,bamfh,vcffh,cnv,prefh,gender,out,gen,pcrfree):
(cnv_span,flank_span,windows) = master_cnv[call]
size = int(e)-int(s)+1
ci = insert_size+(5*insert_mad)
- discordant,split,concordant = discordant_split_cnv(flank_span,bam,size,ci,windows,chrFlag)
+ discordant,split,concordant = discordant_split_cnv(flank_span,bam,size,ci,windows,chrFlag,legacyM)
if size > 1000:
"""read count coverage estimation"""
- (read_count,bp_span) = count_reads(cnv_span,bam,ci,chrFlag)
+ (read_count,bp_span) = count_reads(cnv_span,bam,ci,chrFlag,legacyM)
cov=float('nan')
normc=c
if gender == 'M' and checkPAR('{} {} {}'.format(c,s,e),gen)== True: normc='GENOME'
diff --git a/sv2/sv2.py b/sv2/sv2.py
index 6fbcaac..5688b53 100755
--- a/sv2/sv2.py
+++ b/sv2/sv2.py
@@ -1,4 +1,4 @@
-__version__='1.2'
+__version__='1.3'
import sys,os,argparse
from .core import writeConfig,checkConfig,check_in,Bed,check_cnv,errFH,reportTime,preprocess,extract_feats,genotype,annotate
from argparse import RawTextHelpFormatter
@@ -7,7 +7,7 @@
from time import time
def main():
init_time = int(time())
- splash='\n ____\n _____________ ___ |___ \\\n / _____/\ \ / // ___/\n \_____ \ \ Y //_____)\n / \ \ /\n/_________/ \___/\nSupport Vector Structural Variation Genotyper\nVersion 1.2 Author: Danny Antaki \n'
+ splash='\n ____\n _____________ ___ |___ \\\n / _____/\ \ / // ___/\n \_____ \ \ Y //_____)\n / \ \ /\n/_________/ \___/\nSupport Vector Structural Variation Genotyper\nVersion 1.3 Author: Danny Antaki \n'
parser = argparse.ArgumentParser(description=splash,formatter_class=RawTextHelpFormatter)
genoArgs = parser.add_argument_group('genotype arguments')
configArgs = parser.add_argument_group('configure arguments')
@@ -16,6 +16,7 @@ def main():
genoArgs.add_argument('-c','-cpu', help='Parallelize sample-wise. 1 per cpu',required=False,default=1,type=int)
genoArgs.add_argument('-g','-genome', help='Reference genome build [ hg19, hg38 ]',required=False,default='hg19',type=str)
genoArgs.add_argument('-pcrfree', help='GC content normalization for PCR free libraries',required=False,default=False,action="store_true")
+ genoArgs.add_argument('-M', help='bwa mem -M compatibility. Split-reads flagged as secondary instead of supplementary',default=False,required=False,action="store_true")
genoArgs.add_argument('-s','-seed', help='Preprocessing: integer seed for genome shuffling',required=False,default=42,type=int)
genoArgs.add_argument('-o','-out', help='output',required=False,default="sv2_genotypes.vcf",type=str)
genoArgs.add_argument('-pre', help='Preprocessing output directory',required=False,default=None)
@@ -28,6 +29,7 @@ def main():
cores = args.c
gen = args.g
pcrfree = args.pcrfree
+ legacyM = args.M
ofh = args.o
seed = args.s
predir= args.pre
@@ -93,7 +95,7 @@ def main():
prefh = preprocess_files[bam_id]
gtofh = bam_id+'_sv2_features.txt'
feats_files[bam_id]=os.getcwd()+'/sv2_features/'+gtofh
- pool.apply_async(extract_feats, args=(bam_id,bam_dict[bam_id],vcf_dict[bam_id],cnv,prefh,gender_dict[bam_id],gtofh,gen,pcrfree) )
+ pool.apply_async(extract_feats, args=(bam_id,bam_dict[bam_id],vcf_dict[bam_id],cnv,prefh,gender_dict[bam_id],gtofh,gen,pcrfree,legacyM) )
pool.close()
pool.join()
else:
@@ -104,7 +106,7 @@ def main():
prefh = preprocess_files[bam_id]
gtofh = bam_id+'_sv2_features.txt'
feats_files[bam_id]=os.getcwd()+'/sv2_features/'+gtofh
- extract_feats(bam_id,bam_dict[bam_id],vcf_dict[bam_id],cnv,prefh,gender_dict[bam_id],gtofh,gen,pcrfree)
+ extract_feats(bam_id,bam_dict[bam_id],vcf_dict[bam_id],cnv,prefh,gender_dict[bam_id],gtofh,gen,pcrfree,legacyM)
else:
if not featsdir.endswith('/'): featsdir=featsdir+'/'
if not os.path.isdir(featsdir): errFH(featsdir)