Skip to content

Commit

Permalink
Add SR1POS and SR2POS to gatk format to recover INS END coordinate
Browse files Browse the repository at this point in the history
  • Loading branch information
mwalker174 committed Oct 7, 2024
1 parent 72715e6 commit 3c4ac57
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 6 deletions.
6 changes: 6 additions & 0 deletions src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ def convert(record: pysam.VariantRecord,
if svtype == 'INS':
new_record.info['SVLEN'] = record.info.get('SVLEN', -1)
new_record.info['STRANDS'] = '+-'
# END information is lost when setting POS=END, so we need to set it to SR2POS if it's available
# Note that the END position is only important following SR breakpoint refinement in FilterBatch
if record.info.get('SR2POS') is not None:
new_record.stop = record.info['SR2POS']
elif svtype == 'BND' or svtype == 'CTX':
new_record.stop = record.info['END2']
new_record.info['SVLEN'] = -1
Expand Down Expand Up @@ -208,6 +212,8 @@ def __parse_arguments(argv: List[Text]) -> argparse.Namespace:
help="Comma-delimited list of FORMAT fields to remove")
parser.add_argument("--remove-infos", type=str,
help="Comma-delimited list of INFO fields to remove")
parser.add_argument("--intervaled-ins", default=False, action='store_true',
help="Set INS record END to POS+SVLEN")
parser.add_argument("--set-pass", default=False, action='store_true',
help="Set empty FILTER fields (\".\") to PASS")
if len(argv) <= 1:
Expand Down
28 changes: 24 additions & 4 deletions src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,21 +70,27 @@ def _parse_ploidy_table(path: Text) -> Dict[Text, Dict[Text, int]]:
return ploidy_dict


def update_header(header: pysam.VariantHeader) -> None:
def update_header(header: pysam.VariantHeader,
add_sr_pos: bool) -> None:
"""
Ingests the given header, removes specified fields, and adds necessary fields.
Parameters
----------
header: pysam.VariantHeader
input header
add_sr_pos: bool
add SR1POS and SR2POS INFO fields
"""
header.add_line('##FORMAT=<ID=ECN,Number=1,Type=Integer,Description="Expected copy number for ref genotype">')
# Add these just in case (no effect if they exist)
header.add_line('##INFO=<ID=END2,Number=1,Type=Integer,Description="Second position">')
header.add_line('##INFO=<ID=CHR2,Number=1,Type=String,Description="Second contig">')
header.add_line('##INFO=<ID=BOTHSIDES_SUPPORT,Number=0,Type=Flag,Description="Variant has read-level support for both sides of breakpoint">')
header.add_line('##INFO=<ID=HIGH_SR_BACKGROUND,Number=0,Type=Flag,Description="High number of SR splits in background samples indicating messy region">')
if add_sr_pos:
header.add_line('##INFO=<ID=SR1POS,Number=1,Type=Integer,Description="Split read position at start">')
header.add_line('##INFO=<ID=SR2POS,Number=1,Type=Integer,Description="Split read position at end">')


def rescale_gq(record):
Expand All @@ -99,7 +105,8 @@ def convert(record: pysam.VariantRecord,
ploidy_dict: Dict[Text, Dict[Text, int]],
scale_down_gq: bool,
bothside_pass_vid_set: Set[Text],
background_fail_vid_set: Set[Text]) -> pysam.VariantRecord:
background_fail_vid_set: Set[Text],
add_sr_pos: bool) -> pysam.VariantRecord:
"""
Converts a record from svtk to gatk style. This includes updating END/END2 and adding
necessary fields such as ECN.
Expand All @@ -118,6 +125,8 @@ def convert(record: pysam.VariantRecord,
set of variant IDs with bothside SR pass flag
background_fail_vid_set: Set[Text]
set of variant Ids with background SR fail flag
add_sr_pos: bool
add SR1POS and SR2POS INFO fields
Returns
-------
Expand All @@ -138,6 +147,13 @@ def is_null(val):
if svtype == 'BND' or svtype == 'CTX':
record.info['END2'] = bnd_end_dict[record.id] if bnd_end_dict is not None \
else record.info.get('END2', record.stop)
# Add SR1POS/SR2POS, CPX not supported
if add_sr_pos and svtype != 'CPX':
record.info['SR1POS'] = record.pos
if svtype == 'BND' or svtype == 'CTX':
record.info['SR2POS'] = record.info['END2']
else:
record.info['SR2POS'] = record.stop
# Fix this weird edge case (may be from CPX review workflow)
if svtype == 'INV' and '<CPX>' in record.alleles[1]:
svtype = 'CPX'
Expand Down Expand Up @@ -208,7 +224,8 @@ def _process(vcf_in: pysam.VariantFile,
out = convert(record=record, bnd_end_dict=bnd_end_dict,
ploidy_dict=ploidy_dict, scale_down_gq=arguments.scale_down_gq,
bothside_pass_vid_set=bothside_pass_vid_set,
background_fail_vid_set=background_fail_vid_set)
background_fail_vid_set=background_fail_vid_set,
add_sr_pos=arguments.add_sr_pos)
vcf_out.write(out)


Expand All @@ -235,6 +252,8 @@ def _parse_arguments(argv: List[Text]) -> argparse.Namespace:
help="Path to bothside SR pass flag variant list")
parser.add_argument("--background-fail-list", type=str,
help="Path to background SR fail flag variant list")
parser.add_argument("--add-sr-pos", action='store_true',
help="Add SR1POS and SR2POS INFO fields")
if len(argv) <= 1:
parser.parse_args(["--help"])
sys.exit(0)
Expand All @@ -250,7 +269,8 @@ def main(argv: Optional[List[Text]] = None):
# convert vcf header and records
with pysam.VariantFile(arguments.vcf) as vcf_in:
update_header(
header=vcf_in.header
header=vcf_in.header,
add_sr_pos=arguments.add_sr_pos
)
with pysam.VariantFile(arguments.out, mode='w', header=vcf_in.header) as vcf_out:
_process(vcf_in, vcf_out, arguments)
Expand Down
4 changes: 2 additions & 2 deletions wdl/CombineBatches.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ workflow CombineBatches {
input:
vcf=reformatted_vcf,
ploidy_table=CreatePloidyTableFromPed.out,
args="--fix-end",
args="--fix-end --add-sr-pos",
output_prefix=basename(vcf, ".vcf.gz") + ".reformat_gatk",
bothside_pass_list=CombineSRBothsidePass.out,
background_fail_list=CombineBackgroundFail.outfile,
Expand Down Expand Up @@ -231,7 +231,7 @@ workflow CombineBatches {
source="depth",
contig_list=contig_list,
remove_formats="CN",
remove_infos="END2,AC,AF,AN,HIGH_SR_BACKGROUND,BOTHSIDES_SUPPORT",
remove_infos="END2,AC,AF,AN,HIGH_SR_BACKGROUND,BOTHSIDES_SUPPORT,SR1POS,SR2POS",
set_pass=true,
sv_pipeline_docker=sv_pipeline_docker,
runtime_attr_override=runtime_attr_gatk_to_svtk_vcf
Expand Down

0 comments on commit 3c4ac57

Please sign in to comment.