diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index 3bcad4198..974fdeee6 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -4,7 +4,7 @@ workflows: name: GatherSampleEvidence primaryDescriptorPath: /wdl/GatherSampleEvidence.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -13,7 +13,7 @@ workflows: name: EvidenceQC primaryDescriptorPath: /wdl/EvidenceQC.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -22,7 +22,7 @@ workflows: name: TrainGCNV primaryDescriptorPath: /wdl/TrainGCNV.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -31,7 +31,7 @@ workflows: name: GatherBatchEvidence primaryDescriptorPath: /wdl/GatherBatchEvidence.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -40,7 +40,7 @@ workflows: name: ClusterBatch primaryDescriptorPath: /wdl/ClusterBatch.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -49,7 +49,7 @@ workflows: name: GenerateBatchMetrics primaryDescriptorPath: /wdl/GenerateBatchMetrics.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -58,7 +58,7 @@ workflows: name: FilterBatchSites primaryDescriptorPath: /wdl/FilterBatchSites.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -67,7 +67,7 @@ workflows: name: PlotSVCountsPerSample primaryDescriptorPath: /wdl/PlotSVCountsPerSample.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -76,7 +76,7 @@ workflows: name: FilterBatchSamples primaryDescriptorPath: /wdl/FilterBatchSamples.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -85,7 +85,7 @@ workflows: name: MergeBatchSites primaryDescriptorPath: /wdl/MergeBatchSites.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -94,7 +94,7 @@ workflows: name: GenotypeBatch primaryDescriptorPath: /wdl/GenotypeBatch.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -103,7 +103,7 @@ workflows: name: RegenotypeCNVs primaryDescriptorPath: /wdl/RegenotypeCNVs.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -112,7 +112,7 @@ workflows: name: CombineBatches primaryDescriptorPath: /wdl/CombineBatches.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -121,7 +121,7 @@ workflows: name: ResolveComplexVariants primaryDescriptorPath: /wdl/ResolveComplexVariants.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -130,7 +130,7 @@ workflows: name: GenotypeComplexVariants primaryDescriptorPath: /wdl/GenotypeComplexVariants.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -139,7 +139,43 @@ workflows: name: CleanVcf primaryDescriptorPath: /wdl/CleanVcf.wdl filters: - branches: + branches: + - main + tags: + - /.*/ + + - subclass: WDL + name: RefineComplexVariants + primaryDescriptorPath: /wdl/RefineComplexVariants.wdl + filters: + branches: + - main + tags: + - /.*/ + + - subclass: WDL + name: JoinRawCalls + primaryDescriptorPath: /wdl/JoinRawCalls.wdl + filters: + branches: + - main + tags: + - /.*/ + + - subclass: WDL + name: SVConcordance + primaryDescriptorPath: /wdl/SVConcordance.wdl + filters: + branches: + - main + tags: + - /.*/ + + - subclass: WDL + name: FilterGenotypes + primaryDescriptorPath: /wdl/FilterGenotypes.wdl + filters: + branches: - main tags: - /.*/ @@ -148,7 +184,7 @@ workflows: name: MainVcfQc primaryDescriptorPath: /wdl/MainVcfQc.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -157,7 +193,7 @@ workflows: name: AnnotateVcf primaryDescriptorPath: /wdl/AnnotateVcf.wdl filters: - branches: + branches: - main tags: - /.*/ diff --git a/README.md b/README.md index cb8808221..09146ef12 100644 
--- a/README.md +++ b/README.md @@ -27,4 +27,4 @@ continuous integration (CI) and continuous delivery (CD). GATK-SV CI/CD is developed as a set of Github Actions workflows that are available under the `.github/workflows` directory. Please refer to the [workflow's README](.github/workflows/README.md) -for their current coverage and setup. +for their current coverage and setup. \ No newline at end of file diff --git a/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl b/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl index 224f83af9..c22618380 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl @@ -61,10 +61,11 @@ The following workflows are included in this workspace, to be executed in this o 13. `13-ResolveComplexVariants`: Complex variant resolution 14. `14-GenotypeComplexVariants`: Complex variant re-genotyping 15. `15-CleanVcf`: VCF cleanup -16. `16-JoinRawCalls`: Combines unfiltered calls (from step 5) across batches -17. `17-SVConcordance`: Annotates variants with genotype concordance against raw calls -18. `18-FilterGenotypes`: Performs genotype filtering to improve precision and generates QC plots -19. `19-AnnotateVcf`: Cohort VCF annotations, including functional annotation, allele frequency (AF) annotation, and AF annotation with external population callsets +16. `16-RefineComplexVariants`: Complex variant and translocation refinement +17. `17-JoinRawCalls`: Combines unfiltered calls (from step 5) across batches +18. `18-SVConcordance`: Annotates variants with genotype concordance against raw calls +19. `19-FilterGenotypes`: Performs genotype filtering to improve precision and generates QC plots +20. `20-AnnotateVcf`: Cohort VCF annotations, including functional annotation, allele frequency (AF) annotation, and AF annotation with external population callsets Additional downstream modules, such as those for visualization, are under development. They are not included in this workspace at this time, but the source code can be found in the [GATK-SV GitHub repository](https://github.com/broadinstitute/gatk-sv). See **Downstream steps** towards the bottom of this page for more information. @@ -205,9 +206,9 @@ Read the full MergeBatchSites documentation [here](https://github.com/broadinsti Read the full GenotypeBatch documentation [here](https://github.com/broadinstitute/gatk-sv#genotype-batch). * Use the same `sample_set` definitions you used for `03-TrainGCNV` through `08-FilterBatchSamples`. 
-#### 11-RegenotypeCNVs, 12-CombineBatches, 13-ResolveComplexVariants, 14-GenotypeComplexVariants, 15-CleanVcf, 16-JoinRawCalls, 17-SVConcordance, 18-FilterGenotypes, and 19-AnnotateVcf +#### 11-RegenotypeCNVs, 12-CombineBatches, 13-ResolveComplexVariants, 14-GenotypeComplexVariants, 15-CleanVcf, 16-RefineComplexVariants, 17-JoinRawCalls, 18-SVConcordance, 19-FilterGenotypes, and 20-AnnotateVcf -Read the full documentation for [RegenotypeCNVs](https://github.com/broadinstitute/gatk-sv#regenotype-cnvs), [MakeCohortVcf](https://github.com/broadinstitute/gatk-sv#make-cohort-vcf) (which includes `CombineBatches`, `ResolveComplexVariants`, `GenotypeComplexVariants`, `CleanVcf`), [`JoinRawCalls`](https://github.com/broadinstitute/gatk-sv#join-raw-calls), [`SVConcordance`](https://github.com/broadinstitute/gatk-sv#svconcordance), [`FilterGenotypes`](https://github.com/broadinstitute/gatk-sv#filter-genotypes), and [AnnotateVcf](https://github.com/broadinstitute/gatk-sv#annotate-vcf) on the README. +Read the full documentation for [RegenotypeCNVs](https://github.com/broadinstitute/gatk-sv#regenotype-cnvs), [MakeCohortVcf](https://github.com/broadinstitute/gatk-sv#make-cohort-vcf) (which includes `CombineBatches`, `ResolveComplexVariants`, `GenotypeComplexVariants`, `CleanVcf`), [`RefineComplexVariants`](https://github.com/broadinstitute/gatk-sv#refine-complex), [`JoinRawCalls`](https://github.com/broadinstitute/gatk-sv#join-raw-calls), [`SVConcordance`](https://github.com/broadinstitute/gatk-sv#svconcordance), [`FilterGenotypes`](https://github.com/broadinstitute/gatk-sv#filter-genotypes), and [AnnotateVcf](https://github.com/broadinstitute/gatk-sv#annotate-vcf) on the README. * Use the same cohort `sample_set_set` you created and used for `09-MergeBatchSites`. 
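For orientation, the complex-variant refinement performed by `16-RefineComplexVariants` starts from the `CPX_INTERVALS`/`SOURCE` annotations on each record, splitting every segment such as `DUP_chr1:229987763-230011157` into its SV type and coordinates before building the PE-evidence queries (see `reformat_CPX_bed_and_generate_script.py` further down in this diff). A minimal, self-contained sketch of that segment parsing, mirroring the `parse_segment` helper added in this PR; the example segment strings are purely illustrative:

```python
# Minimal sketch of CPX_INTERVALS segment parsing, mirroring the parse_segment
# helper added in this PR; the example segments below are hypothetical.
def parse_segment(segment):
    """Split e.g. 'DUP_chr1:100-200' into (svtype, chrom, start, end)."""
    svtype, interval = segment.split('_')
    chrom, coords = interval.split(':')
    start, end = coords.split('-')
    return svtype, chrom, int(start), int(end)


if __name__ == "__main__":
    for seg in "DEL_chr2:1000-5000,INV_chr2:5000-9000".split(","):
        print(parse_segment(seg))  # -> ('DEL', 'chr2', 1000, 5000), ('INV', 'chr2', 5000, 9000)
```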
#### Downstream steps diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl index e7c1bc7a0..93c172d8c 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl @@ -1,6 +1,8 @@ { "CleanVcf.contig_list": "${workspace.primary_contigs_fai}", "CleanVcf.allosome_fai": "${workspace.allosome_file}", + "CleanVcf.HERVK_reference": "${workspace.HERVK_reference}", + "CleanVcf.LINE1_reference": "${workspace.LINE1_reference}", "CleanVcf.chr_x": "${workspace.chr_x}", "CleanVcf.chr_y": "${workspace.chr_y}", diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl index ccab739da..f3fbfb018 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl @@ -8,6 +8,8 @@ "MakeCohortVcf.depth_exclude_list": "${workspace.depth_exclude_list}", "MakeCohortVcf.empty_file" : "${workspace.empty_file}", "MakeCohortVcf.ref_dict": "${workspace.reference_dict}", + "MakeCohortVcf.HERVK_reference": "${workspace.HERVK_reference}", + "MakeCohortVcf.LINE1_reference": "${workspace.LINE1_reference}", "MakeCohortVcf.site_level_comparison_datasets": [ {{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }}, diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RefineComplexVariants.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RefineComplexVariants.json.tmpl new file mode 100644 index 000000000..fb95e1ce4 --- /dev/null +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RefineComplexVariants.json.tmpl @@ -0,0 +1,17 @@ +{ + "RefineComplexVariants.vcf": "${this.cleaned_vcf}", + "RefineComplexVariants.prefix": "${this.sample_set_set_id}", + + "RefineComplexVariants.batch_name_list": "${this.sample_sets.sample_set_id}", + "RefineComplexVariants.batch_sample_lists": "${this.sample_sets.filtered_batch_samples_file}", + "RefineComplexVariants.PE_metrics": "${this.sample_sets.merged_PE}", + "RefineComplexVariants.PE_metrics_indexes": "${this.sample_sets.merged_PE_index}", + "RefineComplexVariants.Depth_DEL_beds": "${this.sample_sets.merged_dels}", + "RefineComplexVariants.Depth_DUP_beds": "${this.sample_sets.merged_dups}", + + "RefineComplexVariants.n_per_split": "15000", + + "RefineComplexVariants.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", + "RefineComplexVariants.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", + "RefineComplexVariants.linux_docker": "${workspace.linux_docker}" +} diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/SVConcordance.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/SVConcordance.json.tmpl index ff48489ac..1c9dfed26 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/SVConcordance.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/SVConcordance.json.tmpl @@ -2,7 +2,7 @@ "SVConcordance.gatk_docker": "${workspace.gatk_docker}", "SVConcordance.sv_base_mini_docker": 
"${workspace.sv_base_mini_docker}", - "SVConcordance.eval_vcf" : "${this.cleaned_vcf}", + "SVConcordance.eval_vcf" : "${this.cpx_refined_vcf}", "SVConcordance.truth_vcf" : "${this.joined_raw_calls_vcf}", "SVConcordance.output_prefix": "${this.sample_set_set_id}", diff --git a/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl index b9d5f26d5..6ee7a177d 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl @@ -29,6 +29,8 @@ exclude_intervals_for_gcnv_filter_intervals {{ reference_resources.exclude_inter external_af_ref_bed {{ reference_resources.external_af_ref_bed }} external_af_ref_bed_prefix {{ reference_resources.external_af_ref_bed_prefix }} genome_file {{ reference_resources.genome_file }} +HERVK_reference {{ reference_resources.hervk_reference }} +LINE1_reference {{ reference_resources.line1_reference }} manta_region_bed {{ reference_resources.manta_region_bed }} manta_region_bed_index {{ reference_resources.manta_region_bed_index }} mei_bed {{ reference_resources.mei_bed }} diff --git a/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl b/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl index e5a920a0c..88c735b2d 100644 --- a/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl +++ b/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl @@ -97,6 +97,9 @@ "GATKSVPipelineSingleSample.clean_vcf_random_seed": 0, "GATKSVPipelineSingleSample.run_vcf_qc" : false, + "GATKSVPipelineSingleSample.MakeCohortVcf.HERVK_reference": "${workspace.reference_HERVK}", + "GATKSVPipelineSingleSample.MakeCohortVcf.LINE1_reference": "${workspace.reference_LINE1}", + "GATKSVPipelineSingleSample.protein_coding_gtf" : "${workspace.reference_protein_coding_gtf}", "GATKSVPipelineSingleSample.noncoding_bed" : "${workspace.reference_noncoding_bed}", "GATKSVPipelineSingleSample.external_af_ref_bed" : "${workspace.reference_external_af_ref_bed}", diff --git a/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl b/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl index 25e13568f..2bcef0ad2 100644 --- a/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl +++ b/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl @@ -55,6 +55,8 @@ reference_exclude_intervals_for_gcnv_filter_intervals {{ reference_resources.exc reference_external_af_ref_bed {{ reference_resources.external_af_ref_bed }} reference_external_af_ref_bed_prefix {{ reference_resources.external_af_ref_bed_prefix }} reference_genome_file {{ reference_resources.genome_file }} +reference_HERVK {{ reference_resources.hervk_reference }} +reference_LINE1 {{ reference_resources.line1_reference }} reference_manta_region_bed {{ reference_resources.manta_region_bed }} reference_manta_region_bed_index {{ reference_resources.manta_region_bed_index }} reference_mei_bed {{ reference_resources.mei_bed }} diff --git a/inputs/templates/test/FilterGenotypes/FilterGenotypes.fixed_cutoffs.json.tmpl b/inputs/templates/test/FilterGenotypes/FilterGenotypes.fixed_cutoffs.json.tmpl index 5035f0d0b..1025504cf 100644 --- a/inputs/templates/test/FilterGenotypes/FilterGenotypes.fixed_cutoffs.json.tmpl +++ b/inputs/templates/test/FilterGenotypes/FilterGenotypes.fixed_cutoffs.json.tmpl @@ -5,7 +5,13 @@ 
"FilterGenotypes.gq_recalibrator_model_file": {{ reference_resources.aou_recalibrate_gq_model_file | tojson }}, "FilterGenotypes.sl_filter_args": "--small-del-threshold 93 --medium-del-threshold 150 --small-dup-threshold -51 --medium-dup-threshold -4 --ins-threshold -13 --inv-threshold -19", - "FilterGenotypes.genome_tracks": {{ reference_resources.recalibrate_gq_genome_tracks | tojson }}, + "FilterGenotypes.genome_tracks": [ + {{ reference_resources.recalibrate_gq_genome_track_repeatmasker | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_segdup | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_simple_repeats | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_umap100 | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_umap24 | tojson }} + ], "FilterGenotypes.recalibrate_gq_args": [ "--keep-homvar false", "--keep-homref true", diff --git a/inputs/templates/test/FilterGenotypes/FilterGenotypes.optimize_cutoffs.json.tmpl b/inputs/templates/test/FilterGenotypes/FilterGenotypes.optimize_cutoffs.json.tmpl index b11a12812..53e50c1ba 100644 --- a/inputs/templates/test/FilterGenotypes/FilterGenotypes.optimize_cutoffs.json.tmpl +++ b/inputs/templates/test/FilterGenotypes/FilterGenotypes.optimize_cutoffs.json.tmpl @@ -6,7 +6,13 @@ "FilterGenotypes.primary_contigs_fai": {{ reference_resources.primary_contigs_fai | tojson }}, "FilterGenotypes.gq_recalibrator_model_file": {{ reference_resources.aou_recalibrate_gq_model_file | tojson }}, - "FilterGenotypes.genome_tracks": {{ reference_resources.recalibrate_gq_genome_tracks | tojson }}, + "FilterGenotypes.genome_tracks": [ + {{ reference_resources.recalibrate_gq_genome_track_repeatmasker | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_segdup | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_simple_repeats | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_umap100 | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_umap24 | tojson }} + ], "FilterGenotypes.recalibrate_gq_args": [ "--keep-homvar false", "--keep-homref true", diff --git a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl index 2f5ba41c4..2bfbc4f1b 100644 --- a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl @@ -134,6 +134,8 @@ "GATKSVPipelineBatch.MakeCohortVcf.mei_bed": {{ reference_resources.mei_bed | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.pe_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.min_sr_background_fail_batches": 0.5, "GATKSVPipelineBatch.MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineBatch.MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, diff --git a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl index 5412fd79e..9ba04d229 100644 --- 
a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl @@ -127,6 +127,8 @@ "GATKSVPipelineBatch.MakeCohortVcf.mei_bed": {{ reference_resources.mei_bed | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.pe_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.min_sr_background_fail_batches": 0.5, "GATKSVPipelineBatch.MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineBatch.MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, diff --git a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl index f08b38902..870b5b82f 100644 --- a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl @@ -101,6 +101,9 @@ "GATKSVPipelineSingleSample.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineSingleSample.run_vcf_qc" : false, + "GATKSVPipelineSingleSample.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "GATKSVPipelineSingleSample.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, + "GATKSVPipelineSingleSample.protein_coding_gtf" : {{ reference_resources.protein_coding_gtf | tojson }}, "GATKSVPipelineSingleSample.noncoding_bed" : {{ reference_resources.noncoding_bed | tojson }}, "GATKSVPipelineSingleSample.external_af_ref_bed" : {{ reference_resources.external_af_ref_bed | tojson }}, diff --git a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl index 4d35ca4ae..8352403df 100644 --- a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl @@ -90,6 +90,10 @@ "GATKSVPipelineSingleSample.depth_clustering_algorithm": "SINGLE_LINKAGE", "GATKSVPipelineSingleSample.ref_copy_number_autosomal_contigs" : 2, + + "GATKSVPipelineSingleSample.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "GATKSVPipelineSingleSample.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, + "GATKSVPipelineSingleSample.clean_vcf_min_sr_background_fail_batches": 0.5, "GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, "GATKSVPipelineSingleSample.clean_vcf_max_shards_per_chrom_clean_vcf_step1": 200, diff --git a/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl b/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl index 8606a4a50..196505724 100644 --- a/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl +++ b/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl @@ -1,6 +1,8 @@ { "CleanVcf.contig_list": {{ reference_resources.primary_contigs_fai | tojson }}, "CleanVcf.allosome_fai": {{ reference_resources.allosome_file | tojson }}, + "CleanVcf.HERVK_reference": {{ 
reference_resources.hervk_reference | tojson }}, + "CleanVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "CleanVcf.chr_x": {{ reference_resources.chr_x | tojson }}, "CleanVcf.chr_y": {{ reference_resources.chr_y | tojson }}, diff --git a/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl b/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl index 8c27a74cf..cad2a9e03 100644 --- a/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl +++ b/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl @@ -8,6 +8,8 @@ "MakeCohortVcf.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, "MakeCohortVcf.empty_file" : {{ reference_resources.empty_file | tojson }}, "MakeCohortVcf.ref_dict": {{ reference_resources.reference_dict | tojson }}, + "MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "MakeCohortVcf.site_level_comparison_datasets": [ {{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }}, diff --git a/inputs/templates/test/RefineComplexVariants/RefineComplexVariants.json.tmpl b/inputs/templates/test/RefineComplexVariants/RefineComplexVariants.json.tmpl new file mode 100644 index 000000000..c9feb45a2 --- /dev/null +++ b/inputs/templates/test/RefineComplexVariants/RefineComplexVariants.json.tmpl @@ -0,0 +1,17 @@ +{ + "RefineComplexVariants.vcf": {{ test_batch.clean_vcf | tojson }}, + "RefineComplexVariants.prefix": {{ test_batch.name | tojson }}, + + "RefineComplexVariants.batch_name_list": [{{ test_batch.name | tojson }}], + "RefineComplexVariants.batch_sample_lists": [ {{ test_batch.final_sample_list | tojson }} ], + "RefineComplexVariants.PE_metrics": [ {{ test_batch.merged_disc_file | tojson }} ], + "RefineComplexVariants.PE_metrics_indexes": [ {{ test_batch.merged_disc_file_index | tojson }} ], + "RefineComplexVariants.Depth_DEL_beds": [ {{ test_batch.del_bed | tojson }} ], + "RefineComplexVariants.Depth_DUP_beds": [ {{ test_batch.dup_bed | tojson }} ], + + "RefineComplexVariants.n_per_split": "15000", + + "RefineComplexVariants.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, + "RefineComplexVariants.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "RefineComplexVariants.linux_docker": {{ dockers.linux_docker | tojson }} +} diff --git a/inputs/templates/test/SVConcordance/SVConcordance.json.tmpl b/inputs/templates/test/SVConcordance/SVConcordance.json.tmpl index 0540305cd..de29c485f 100644 --- a/inputs/templates/test/SVConcordance/SVConcordance.json.tmpl +++ b/inputs/templates/test/SVConcordance/SVConcordance.json.tmpl @@ -2,7 +2,7 @@ "SVConcordance.gatk_docker": {{ dockers.gatk_docker | tojson }}, "SVConcordance.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, - "SVConcordance.eval_vcf" : {{ test_batch.gatk_formatted_vcf | tojson }}, + "SVConcordance.eval_vcf" : {{ test_batch.complex_refined_vcf | tojson }}, "SVConcordance.truth_vcf" : {{ test_batch.joined_raw_calls_vcf | tojson }}, "SVConcordance.output_prefix": {{ test_batch.name | tojson }}, diff --git a/inputs/values/ref_panel_1kg.json b/inputs/values/ref_panel_1kg.json index 4ee29e419..f317967a5 100644 --- a/inputs/values/ref_panel_1kg.json +++ b/inputs/values/ref_panel_1kg.json @@ -1166,8 +1166,8 @@ 
"gs://gatk-sv-ref-panel-1kg/outputs/mw-make-cohort-vcf-templates/ResolveComplexVariants/breakpoint-overlap-dropped-record-vcfs/ref_panel_1kg.chrX.breakpoint_overlap.dropped_records.vcf.gz.tbi", "gs://gatk-sv-ref-panel-1kg/outputs/mw-make-cohort-vcf-templates/ResolveComplexVariants/breakpoint-overlap-dropped-record-vcfs/ref_panel_1kg.chrY.breakpoint_overlap.dropped_records.vcf.gz.tbi" ], - "clean_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.cleaned.vcf.gz", - "clean_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.cleaned.vcf.gz.tbi", + "clean_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.cleaned.vcf.gz", + "clean_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.cleaned.vcf.gz.tbi", "clean_vcf_gatk_formatter_args": "", "cluster_background_fail_lists": [ "gs://gatk-sv-ref-panel-1kg/outputs/mw-make-cohort-vcf-templates/CombineBatches/cluster-background-fail-lists/ref_panel_1kg.chr1.sr_background_fail.updated2.txt", @@ -1329,6 +1329,8 @@ "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.chrX.regenotyped.vcf.gz.tbi", "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.chrY.regenotyped.vcf.gz.tbi" ], + "complex_refined_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.cpx_refined.vcf.gz", + "complex_refined_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.cpx_refined.vcf.gz", "complex_resolve_bothside_pass_list": "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.all.sr_bothside_pass.updated3.txt", "complex_resolve_background_fail_list": "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.all.sr_background_fail.updated3.txt", "complex_resolve_vcfs": [ @@ -1383,8 +1385,8 @@ "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.reshard_vcf.chrX.resharded.vcf.gz.tbi", "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.reshard_vcf.chrY.resharded.vcf.gz.tbi" ], - "concordance_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.concordance.vcf.gz", - "concordance_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.concordance.vcf.gz.tbi", + "concordance_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.concordance.vcf.gz", + "concordance_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.concordance.vcf.gz.tbi", "contig_ploidy_model_tar": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/ref-panel/1KG/v2/gcnv/ref_panel_1kg_v2-contig-ploidy-model.tar.gz", "counts": [ "gs://gatk-sv-ref-panel-1kg/outputs/tws-no-cram-conversion/GatherSampleEvidenceBatch/HG00096.counts.tsv.gz", @@ -1852,9 +1854,9 @@ "genotype_pesr_pesr_sepcutoff": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypePESRPart1/GenotypePESRPart1/40ec6d76-dd1c-432d-bfab-bc4426d0b1ec/call-TrainRDGenotyping/TrainRDGenotyping/e5540a96-9072-4719-bcfb-afccdfec15c6/call-UpdateCutoff/cacheCopy/ref_panel_1kg.pesr.pesr_sepcutoff.txt", "genotyped_depth_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypeDepthPart2/GenotypeDepthPart2/0aafd752-e606-4196-86ac-41c1c3ce1eb2/call-ConcatGenotypedVcfs/cacheCopy/ref_panel_1kg.depth.vcf.gz", "genotyped_pesr_vcf": 
"gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypePESRPart2/GenotypePESRPart2/ce1f4075-1a3e-44b5-9cfe-bfb701327616/call-ConcatGenotypedVcfs/cacheCopy/ref_panel_1kg.pesr.vcf.gz", - "genotype_filtered_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes.vcf.gz", - "genotype_filtered_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes.vcf.gz.tbi", - "genotype_filtered_qc_tarball": "gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes_SV_VCF_QC_output.tar.gz", + "genotype_filtered_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz", + "genotype_filtered_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz.tbi", + "genotype_filtered_qc_tarball": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes_SV_VCF_QC_output.tar.gz", "joined_raw_calls_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz", "joined_raw_calls_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz.tbi", "manta_vcfs": [ diff --git a/inputs/values/resources_hg38.json b/inputs/values/resources_hg38.json index 73484d5b3..545d4b2b0 100644 --- a/inputs/values/resources_hg38.json +++ b/inputs/values/resources_hg38.json @@ -21,6 +21,8 @@ "external_af_ref_bed" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/gnomad_AF/gnomad_v2.1_sv.sites.GRCh38.bed.gz", "external_af_ref_bed_prefix" : "gnomad_v2.1_sv", "genome_file" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/hg38.genome", + "hervk_reference": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/HERVK.sorted.bed.gz", + "line1_reference": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/LINE1.sorted.bed.gz", "manta_region_bed" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/primary_contigs_plus_mito.bed.gz", "manta_region_bed_index" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/primary_contigs_plus_mito.bed.gz.tbi", "mei_bed" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/hg38.repeatmasker.mei.with_SVA.pad_50_merged.bed.gz", diff --git a/scripts/test/terra_validation.py b/scripts/test/terra_validation.py index 2e9a64756..aa9079d19 100644 --- a/scripts/test/terra_validation.py +++ b/scripts/test/terra_validation.py @@ -113,7 +113,7 @@ def main(): parser.add_argument("-j", "--womtool-jar", help="Path to womtool jar", required=True) parser.add_argument("-n", "--num-input-jsons", help="Number of Terra input JSONs expected", - required=False, default=24, type=int) + required=False, default=25, type=int) parser.add_argument("--log-level", help="Specify level of logging information, ie. 
info, warning, error (not case-sensitive)", required=False, default="INFO") diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py index ce0151e92..917b605fb 100644 --- a/src/sv-pipeline/scripts/apply_sl_filter.py +++ b/src/sv-pipeline/scripts/apply_sl_filter.py @@ -4,6 +4,7 @@ import sys import pysam import math +from numpy import median from typing import List, Text, Dict, Optional _gt_no_call_map = dict() @@ -78,6 +79,24 @@ def _fails_filter(sl, gt_is_ref, cutoff): return sl < cutoff +def recal_qual_score(record): + """ + Recalibrate quality score for a single variant + """ + quals = [] + for s in [s for s in record.samples]: + gt = record.samples[s]['GT'] + if _is_hom_ref(gt) or _is_no_call(gt): + continue + elif _is_hom_var(gt): + quals.append(99) + else: + quals.append(record.samples[s]['GQ']) + + if len(quals) > 0: + return int(median(quals)) + + def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshold, keep_gq, gq_scale_factor, upper_sl_cap, lower_sl_cap, sl_shift, max_gq): record.info['MINSL'] = sl_threshold @@ -109,15 +128,19 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol gt['GT_FILTER'] = _gt_to_filter_status(gt['GT'], fails_filter) if fails_filter: gt['GT'] = _filter_gt(gt['GT'], allele) + if record.info['SVTYPE'] != 'CNV' and _is_no_call(gt['GT']): n_no_call += 1 # Annotate metrics record.info['NCN'] = n_no_call record.info['NCR'] = n_no_call / n_samples if n_samples > 0 else None if ncr_threshold is not None and record.info['NCR'] is not None and record.info['NCR'] >= ncr_threshold: record.filter.add(_HIGH_NCR_FILTER) + if len(record.filter) == 0: + record.filter.add('PASS') n_sl = len(sl_list) record.info['SL_MEAN'] = sum(sl_list) / n_sl if n_sl > 0 else None record.info['SL_MAX'] = max(sl_list) if n_sl > 0 else None + record.qual = recal_qual_score(record) # Clean out AF metrics since they're no longer valid for field in ['AC', 'AF']: if field in record.info: diff --git a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py new file mode 100644 index 000000000..8b4b8cef8 --- /dev/null +++ b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py @@ -0,0 +1,499 @@ +#!python +# this script takes in the CPX SVs, reformats in svelter format, and generates the script to collect PE metrics +import os +import argparse + + +def header_pos_readin(input_bed): + fin = os.popen(r'''zcat %s''' % input_bed) + pin = fin.readline().strip().split() + out = {} + for i in range(len(pin)): + out[pin[i]] = i + fin.close() + return out + + +def sample_pe_readin(sample_pe_file): + # sample_pe_file is a 2 column file with sample ID and PE metrics on both columns + out = {} + fin = open(sample_pe_file) + for line in fin: + pin = line.strip().split() + if not pin[0] in out.keys(): + out[pin[0]] = pin[1] + fin.close() + return out + + +def svid_sample_readin(input_bed, header_pos): + out = {} + fin = os.popen(r'''zcat %s''' % input_bed) + for line in fin: + pin = line.strip().split('\t') + if not pin[0][0] == '#': + out[pin[header_pos['name']]] = pin[header_pos['samples']] + fin.close() + return out + + +def parse_segment(segment): + svtype, interval = segment.split('_') + chrom, coords = interval.split(':') + coord1, coord2 = coords.split('-') + return svtype, chrom, int(coord1), int(coord2) + + +# segments: CPX_INTERVALS split by ',' +def extract_bp_list_for_inv_cnv_events(segments, cpx_type): + s0_svtype, 
s0_chrom, s0_c1, s0_c2 = parse_segment(segments[0]) + breakpoints = [s0_chrom] + [s0_c1, s0_c2] + if 'INVdup' not in cpx_type: + for seg in segments[1:]: + seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(seg) + breakpoints.append(seg_c2) + else: + if cpx_type == 'INVdup': + s1_svtype, s1_chrom, s1_c1, s1_c2 = parse_segment(segments[1]) + if s1_c1 < breakpoints[2]: + breakpoints = breakpoints[:2] + [s1_c1, breakpoints[2]] + elif cpx_type == 'dupINVdup' or cpx_type == 'delINVdup': + s2_svtype, s2_chrom, s2_c1, s2_c2 = parse_segment(segments[2]) + breakpoints += [s2_c1, s2_c2] + return breakpoints + + +def extract_bp_list_for_ddup_events(coordinates, segments, small_sv_size_threshold): + # example of coordinates: ["chr1",1154129,1154133] + # example of segments: DUP_chr1:229987763-230011157 + del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] + # CW: this looks like it only gets the coordinates from the first DUP segment? + # dup_seg = [i for i in segments if 'DUP_' in i][0].split('_')[1].split(':') + # dup_bp = [int(i) for i in dup_seg[1].split('-')] + # inv_flag = len([i for i in segments if "INV_" in i]) + # rewrite as: + inv_flag = 0 + dup_bp = [] + for segment in segments: + if 'DUP_' in segment: + seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(segment) + dup_bp = [seg_c1, seg_c2] + if 'INV_' in segment: + inv_flag = inv_flag + 1 + if 'DEL_' in segment: + seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(segment) + del_bp = [seg_chrom, seg_c1, seg_c2] + # inv_flag == 0 : no INV involved in the insertion + # inv_flag > 0: INV involved + structures = [] + breakpoints = [] + if inv_flag == 0: + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'cbc'] + else: + structures = ['ab', 'bab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba'] + else: + structures = ['ab', 'aba'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpoints = del_bp[:2] + dup_bp + structures = ['ab', 'bb'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + [del_bp[2]] + structures = ['ab', 'aa'] + else: + return 'unresolved' + elif inv_flag > 0: + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'c^bc'] + else: + structures = ['ab', 'b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba^'] + else: + structures = ['ab', 'aba^'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpoints = del_bp[:2] + dup_bp + structures = ['ab', 'b^b'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + [del_bp[2]] + structures = ['ab', 'aa^'] + else: + return 'unresolved' + return [breakpoints, structures] + + +def extract_bp_list_for_ins_idel(coordinates, segments, small_sv_size_threshold): + # example of coordinates: ["chr1",1154129,1154133] + # example of segments: DUP_chr1:229987763-230011157 + del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] + dup_seg = [i for i in segments if 'INS_' in i][0].split('_')[1].split(':') + dup_bp = [int(i) for i in dup_seg[1].split('-')] + inv_flag = len([i for i in segments if "INV_" in i]) + # inv_flag == 0 : no INV involved in the 
insertion + # inv_flag > 0: INV involved + structures = [] + breakpoints = [] + if inv_flag == 0: + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'cbc'] + else: + structures = ['ab', 'bab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba'] + else: + structures = ['ab', 'aba'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpoints = del_bp[:2] + dup_bp + structures = ['ab', 'bb'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + [del_bp[2]] + structures = ['ab', 'aa'] + elif inv_flag > 0: + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'c^bc'] + else: + structures = ['ab', 'b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba^'] + else: + structures = ['ab', 'aba^'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpoints = del_bp[:2] + dup_bp + structures = ['ab', 'b^b'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + [del_bp[2]] + structures = ['ab', 'aa^'] + return [breakpoints, structures] + + +def extract_bp_list_v4(coordinates, segments, small_sv_size_threshold): + del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] + dup_seg = [i for i in segments if 'INV_' in i][0].split('_')[1].split(':') + dup_bp = [int(i) for i in dup_seg[1].split('-')] + structures = [] + breakpoints = [] + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'c^bc'] + else: + structures = ['ab', 'b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba^'] + else: + structures = ['ab', 'aba^'] + return [breakpoints, structures] + + +def is_interchromosomal(pin, header_pos): + chrom = pin[0] + if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in i][0].split('_')[1].split(':') + if seg2[0] != chrom: + return True + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + if seg2[0] != chrom: + return True + elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: + return True + elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']] == "INS": + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]] + seg2[1].split('-') + if seg2[0] != chrom: + return True + return False + + +def cpx_sv_readin(input_bed, header_pos, unresolved): + unresolved_svids = [] + fin = os.popen(r'''zcat %s''' % input_bed) + out = [] + small_sv_size_threshold = 250 + ref_alt = [] + for line in fin: + pin = line.strip().split('\t') + # label pidups as unresolved since manual inspection indicated low quality + if not pin[0][0] == "#": + if pin[header_pos['CPX_TYPE']] in ['piDUP_RF', 'piDUP_FR']: + unresolved_svids.append(pin[header_pos['name']]) + elif not is_interchromosomal(pin, header_pos): + if 
pin[header_pos['CPX_TYPE']] in ['delINV', 'INVdel', 'dupINV', 'INVdup', 'delINVdel', 'delINVdup', + 'dupINVdel', 'dupINVdup']: + segments = pin[header_pos['CPX_INTERVALS']].split(',') + breakpoints = extract_bp_list_for_inv_cnv_events(segments, pin[header_pos['CPX_TYPE']]) + if pin[header_pos['CPX_TYPE']] == 'delINV': + ref_alt = ['ab', 'b^'] + if pin[header_pos['CPX_TYPE']] == 'INVdel': + ref_alt = ['ab', 'a^'] + if pin[header_pos['CPX_TYPE']] == 'dupINV': + ref_alt = ['ab', 'aba^'] + if pin[header_pos['CPX_TYPE']] == 'INVdup': + ref_alt = ['ab', 'b^ab'] + if pin[header_pos['CPX_TYPE']] == 'delINVdel': + ref_alt = ['abc', 'b^'] + if pin[header_pos['CPX_TYPE']] == 'delINVdup': + ref_alt = ['abc', 'c^bc'] + if pin[header_pos['CPX_TYPE']] == 'dupINVdel': + ref_alt = ['abc', 'aba^'] + if pin[header_pos['CPX_TYPE']] == 'dupINVdup': + ref_alt = ['abc', 'ac^b^a^c'] + elif pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + segments = pin[header_pos['CPX_INTERVALS']].split(',') + cpx_info = extract_bp_list_for_ddup_events(pin[:3], segments, small_sv_size_threshold) + if cpx_info == 'unresolved': + unresolved_svids.append(pin[header_pos['name']]) + continue + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + segments = pin[header_pos['SOURCE']].split(',') + cpx_info = extract_bp_list_for_ins_idel(pin[:3], segments, small_sv_size_threshold) + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + else: + segments = pin[header_pos['SOURCE']].split(',') + cpx_info = extract_bp_list_v4(pin[:3], segments, small_sv_size_threshold) + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + out.append([breakpoints, ref_alt, pin[3]]) + else: + continue + fin.close() + with open(unresolved, 'w') as unres: + for svid in unresolved_svids: + unres.write(svid + "\n") + return out + + +def cpx_inter_chromo_sv_readin(input_bed, header_pos): + fin = os.popen(r'''zcat %s''' % input_bed) + out = [] + chr_list = ['chr' + str(i) for i in range(1, 23)] + ['chrX', 'chrY'] + bp = [] + ref_alt = [] + for line in fin: + pin = line.strip().split('\t') + if not pin[0][0] == "#": + if is_interchromosomal(pin, header_pos): + if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]] + seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]] + [int(i) for i in seg1[1:]], [seg2[0]] + [int(i) for i in seg2[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['ab_c', 'cb_c'] + else: + ref_alt = ['a_b', 'ba_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]] + [int(i) for i in seg2[1:]], [seg1[0]] + [int(i) for i in seg1[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['a_bc', 'a_ba'] + else: + ref_alt = ['a_b', 'a_ba'] + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]] + seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]] + [int(i) for i in seg1[1:]], [seg2[0]] + [int(i) for i in seg2[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['ab_c', 'cb_c'] + else: + ref_alt = ['a_b', 'ba_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]] + [int(i) for i in seg2[1:]], [seg1[0]] + [int(i) for i in seg1[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt 
= ['a_bc', 'a_ba'] + else: + ref_alt = ['a_b', 'a_ba'] + elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: + seg1 = pin[:3] + seg2 = [pin[header_pos['CHR2']], pin[header_pos['END2']], pin[header_pos['END2']]] + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]] + [int(i) for i in seg1[1:]], [seg2[0]] + [int(i) for i in seg2[1:]]] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]] + [int(i) for i in seg2[1:]], [seg1[0]] + [int(i) for i in seg1[1:]]] + ref_alt = [pin[header_pos['CPX_TYPE']]] + elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']] == "INS": + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]] + seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]] + [int(i) for i in seg1[1:]], [seg2[0]] + [int(i) for i in seg2[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['ab_c', 'c^b_c'] + else: + ref_alt = ['a_b', 'b^a_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]] + [int(i) for i in seg2[1:]], [seg1[0]] + [int(i) for i in seg1[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['a_bc', 'a_ba^'] + else: + ref_alt = ['a_b', 'a_ba^'] + out.append([bp, ref_alt, pin[3]]) + fin.close() + return out + + +def cpx_sample_batch_readin(cpx_sv, svid_sample, sample_pe, pe_evidence, out_file): + out = [] + flank_back = 1000 + flank_front = 100 + fo = open(out_file, 'w') + for info in cpx_sv: + breakpoints = info[0] + if info[1][0] == 'ab' and info[1][1] == 'b^': # delINV + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'b^': # delINVdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[4] - flank_front), '&&', '$5<' + str(breakpoints[4] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'c^bc': # delINVdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[4] - flank_back), '&&', '$5<' 
+ str(breakpoints[4] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'aba^': # dupINV + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'ac^b^a^c': # dupINVdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[4] - flank_back), '&&', '$5<' + str(breakpoints[4] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'aba^': # dupINVdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[4] - flank_front), '&&', '$5<' + str(breakpoints[4] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'a^': # INVdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[2] - flank_back), '&&', '$5<' + str(breakpoints[2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + 
str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'b^ab': # INVdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[2] - flank_front), '&&', '$5<' + str(breakpoints[2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'aba': # dupINS + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'aba': # dupINSdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[4] - flank_front), '&&', '$5<' + str(breakpoints[4] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'aa': # dupdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[2] - flank_back), '&&', '$5<' + str(breakpoints[2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'bab': # INSdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + 
flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[2] - flank_front), '&&', '$5<' + str(breakpoints[2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'cbc': # delINSdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[4] - flank_back), '&&', '$5<' + str(breakpoints[4] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'bb': # deldup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[2] - flank_front), '&&', '$5<' + str(breakpoints[2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['a_b', 'ba_b'] or info[1] == ['ab_c', 'cb_c']: # ddup or ddup_iDEL, insertion from the smaller chromosome to the larger chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_back) + '-' + str(breakpoints[0][1] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_front), '&&', '$5<' + str(breakpoints[1][1] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_front) + '-' + str(breakpoints[0][2] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_back), '&&', '$5<' + str(breakpoints[1][2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['a_b', 'a_ba'] or info[1] == ['a_bc', 'a_ba']: # ddup or ddup_iDEL, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_front) + '-' + str(breakpoints[0][1] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && 
$6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_back), '&&', '$5<' + str(breakpoints[1][1] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_back) + '-' + str(breakpoints[0][2] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_front), '&&', '$5<' + str(breakpoints[1][2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['a_b', 'b^a_b'] or info[1] == ['ab_c', 'c^b_c']: # inverted insertion, insertion from the smaller chromosome to the larger chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_back) + '-' + str(breakpoints[0][1] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_back), '&&', '$5<' + str(breakpoints[1][2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_front) + '-' + str(breakpoints[0][2] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_front), '&&', '$5<' + str(breakpoints[1][1] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['a_b', 'a_ba^'] or info[1] == ['a_bc', 'a_ba^']: # inverted insertion, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_front) + '-' + str(breakpoints[0][1] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_front), '&&', '$5<' + str(breakpoints[1][2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_back) + '-' + str(breakpoints[0][2] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_back), '&&', '$5<' + str(breakpoints[1][1] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['CTX_PQ/QP']: # inverted insertion, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_back) + '-' + str(breakpoints[0][1] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_back), '&&', '$5<' + str(breakpoints[1][1] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_front) + '-' + str(breakpoints[0][2] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_front), '&&', '$5<' + str(breakpoints[1][2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == 
['CTX_PP/QQ']: # CTX_PP/QQ translocation + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_back) + '-' + str(breakpoints[0][1] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_front), '&&', '$5<' + str(breakpoints[1][1] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_front) + '-' + str(breakpoints[0][2] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_back), '&&', '$5<' + str(breakpoints[1][2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + else: + print(info) + common_1 = None + common_2 = None + samples = svid_sample[info[2]] + if not samples == '' and not samples == 'NA': + sample_list = samples.split(',') + pe_metrics_list = [sample_pe[samp] for samp in sample_list] + for num in range(len(sample_list)): + common_1[1] = pe_metrics_list[num] + common_2[1] = pe_metrics_list[num] + common_1[4] = sample_list[num] + common_2[4] = sample_list[num] + write_1 = ' '.join(common_1) + write_2 = ' '.join(common_2) + print(write_1, file=fo) + print(write_2, file=fo) + + fo.close() + return out + + +def write_cpx_command(cpx_command, out_file): + fo = open(out_file, 'w') + for i in cpx_command: + print(i, file=fo) + fo.close() + + +def write_cpx_svs(cpx_sv, out_file): + fo = open(out_file, 'w') + for i in cpx_sv: + out = [i[2], ':'.join([str(j) for j in i[0]])] + i[1] + print('\t'.join(out), file=fo) + fo.close() + + +def main(): + """ + Command-line main block + """ + + # Parse command line arguments and options + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('-i', '--input', required=True, help='input file of CPX events in bed format') + parser.add_argument('-s', '--sample_pe', required=True, help='2 column file with sample ID and PE metrics information') + parser.add_argument('-p', '--pe_evidence', required=True, help='name of file to store collected PE metrics') + parser.add_argument('-c', '--command_script', required=True, help='name of file that has scripts to collect PE evidences') + parser.add_argument('-r', '--reformat_SV', required=True, help='reformatted SV in svelter format') + parser.add_argument('-u', '--unresolved', required=True, help='list of SVIDs to mark unresolved without evaluating (temporary workaround)') # TODO + args = parser.parse_args() + + input_bed = args.input + sample_pe_file = args.sample_pe + pe_evidence = args.pe_evidence + command_script = args.command_script + reformatted_sv = args.reformat_SV + + header_pos = header_pos_readin(input_bed) + sample_pe = sample_pe_readin(sample_pe_file) + svid_sample = svid_sample_readin(input_bed, header_pos) + + cpx_sv = cpx_sv_readin(input_bed, header_pos, args.unresolved) + cpx_inter_chromo_sv = cpx_inter_chromo_sv_readin(input_bed, header_pos) + write_cpx_svs(cpx_sv + cpx_inter_chromo_sv, reformatted_sv) + + cpx_sample_batch_readin(cpx_sv + cpx_inter_chromo_sv, svid_sample, sample_pe, pe_evidence, command_script) + + +if __name__ == '__main__': + main() diff --git a/terra_pipeline_diagram.jpg b/terra_pipeline_diagram.jpg index e3af1c2f7..226f13c2c 100644 ---
a/terra_pipeline_diagram.jpg +++ b/terra_pipeline_diagram.jpg @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7312ab3c1e136b98f3721606ecec017a1a2f6787144aaaf12fe16ab00a9a6a53 -size 295450 +oid sha256:e7b763bd8386fb082264ae43a5ed9ca922b4dd9763f58e079d2028375fb8e8af +size 307785 diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index c86e88500..ab228ccf7 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -24,6 +24,9 @@ workflow CleanVcf { Int clean_vcf1b_records_per_shard Int clean_vcf5_records_per_shard + File HERVK_reference + File LINE1_reference + String chr_x String chr_y @@ -67,6 +70,7 @@ workflow CleanVcf { RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup RuntimeAttr? runtime_attr_format + RuntimeAttr? runtime_override_rescue_me_dels # Clean vcf 1b RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b @@ -133,6 +137,8 @@ workflow CleanVcf { clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, clean_vcf5_records_per_shard=clean_vcf5_records_per_shard, ploidy_table=CreatePloidyTableFromPed.out, + HERVK_reference=HERVK_reference, + LINE1_reference=LINE1_reference, chr_x=chr_x, chr_y=chr_y, linux_docker=linux_docker, @@ -170,6 +176,7 @@ workflow CleanVcf { runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b, runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b, runtime_attr_format=runtime_attr_format, + runtime_override_rescue_me_dels=runtime_override_rescue_me_dels } } diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 29b3bcc8e..a14ffa8c4 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -25,6 +25,9 @@ workflow CleanVcfChromosome { File? outlier_samples_list Int? max_samples_per_shard_step3 + File HERVK_reference + File LINE1_reference + File ploidy_table String chr_x String chr_y @@ -49,6 +52,7 @@ workflow CleanVcfChromosome { RuntimeAttr? runtime_override_clean_vcf_5_polish RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup + RuntimeAttr? runtime_override_rescue_me_dels # Clean vcf 1b RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b @@ -285,9 +289,19 @@ workflow CleanVcfChromosome { runtime_attr_override=runtime_override_stitch_fragmented_cnvs } + call RescueMobileElementDeletions { + input: + vcf = StitchFragmentedCnvs.stitched_vcf_shard, + prefix = "~{prefix}.rescue_me_dels", + LINE1 = LINE1_reference, + HERVK = HERVK_reference, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_override_rescue_me_dels + } + call FinalCleanup { input: - vcf=StitchFragmentedCnvs.stitched_vcf_shard, + vcf=RescueMobileElementDeletions.out, contig=contig, prefix="~{prefix}.final_cleanup", sv_pipeline_docker=sv_pipeline_docker, @@ -595,6 +609,99 @@ task CleanVcf4 { } } +task RescueMobileElementDeletions { + input { + File vcf + String prefix + File LINE1 + File HERVK + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GiB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75 + input_size * 1.5, + disk_gb: ceil(100.0 + input_size * 3.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + + python <.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv + bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv + + python <',) + if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK': + record.alts = ('',) + fo.write(record) +fin.close() +fo.close() +CODE + >>> + + output { + File out = "~{prefix}.vcf.gz" + } +} + # Remove CNVs that are redundant with CPX events or other CNVs task DropRedundantCnvs { diff --git a/wdl/CollectLargeCNVSupportForCPX.wdl b/wdl/CollectLargeCNVSupportForCPX.wdl new file mode 100644 index 000000000..6c943238a --- /dev/null +++ b/wdl/CollectLargeCNVSupportForCPX.wdl @@ -0,0 +1,284 @@ +version 1.0 + +import "Structs.wdl" +import "TasksMakeCohortVcf.wdl" as MiniTasks + +workflow CollectLargeCNVSupportForCPX { + input { + File cpx_ctx_bed + String prefix + + File sample_batch_pe_map + Array[String] batch_name_list + Array[File] Depth_DEL_beds + Array[File] Depth_DUP_beds + + String sv_base_mini_docker + String sv_pipeline_docker + + RuntimeAttr? runtime_attr_generate_cnv_segments_from_cpx + RuntimeAttr? runtime_attr_extract_cpx_lg_cnv_by_batch + RuntimeAttr? runtime_attr_seek_depth_supp_for_cpx + RuntimeAttr? runtime_attr_concat_bed_Step1 + RuntimeAttr? 
runtime_attr_concat_bed_Step2 + + } + + call GenerateCnvSegmentFromCpx { + input: + bed = cpx_ctx_bed, #input file including cpx calls in bed format; + sample_batch_pe_map = sample_batch_pe_map, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_generate_cnv_segments_from_cpx + } + + scatter (i in range(length(batch_name_list))){ + call ExtractCpxLgCnvByBatch { + input: + bed_gz = GenerateCnvSegmentFromCpx.cpx_lg_cnv, + batch_name = batch_name_list[i], + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_extract_cpx_lg_cnv_by_batch + } + + call SeekDepthSuppForCpx as seek_depth_supp_for_cpx_del { + input: + cpx_lg_cnv = ExtractCpxLgCnvByBatch.lg_cnv_del, + raw_depth_bed = Depth_DEL_beds[i], + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_seek_depth_supp_for_cpx + } + + call SeekDepthSuppForCpx as seek_depth_supp_for_cpx_dup { + input: + cpx_lg_cnv = ExtractCpxLgCnvByBatch.lg_cnv_dup, + raw_depth_bed = Depth_DUP_beds[i], + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_seek_depth_supp_for_cpx + } + + call MiniTasks.ConcatBeds as concat_beds_svtype { + input: + shard_bed_files = [seek_depth_supp_for_cpx_del.cpx_cnv_depth_supp, seek_depth_supp_for_cpx_dup.cpx_cnv_depth_supp], + prefix = "~{batch_name_list[i]}.lg_cnv.depth_supp", + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_concat_bed_Step1 + } + } + + + call MiniTasks.ConcatBeds as concat_beds_across_batches { + input: + shard_bed_files = concat_beds_svtype.merged_bed_file, + prefix = "~{prefix}.lg_cnv.depth_supp", + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_concat_bed_Step2 + } + + output { + File lg_cnv_depth_supp = concat_beds_across_batches.merged_bed_file + } +} + +task SeekDepthSuppForCpx { + input { + File cpx_lg_cnv + File raw_depth_bed + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + String prefix = basename(cpx_lg_cnv, '.bed.gz') + + output { + File cpx_cnv_depth_supp = "~{prefix}.depth_supp.bed.gz" + } + command <<< + + set -e pipefail + + zcat ~{cpx_lg_cnv} | awk '{print $6}' | sort | uniq > sample_list.tsv + + echo -e '#chr\tpos\tend\tSVTYPE\tSVID\tsample\tbatch\tdepth_cov' > ~{prefix}.depth_supp.bed + + while read sample_name; do + zcat ~{cpx_lg_cnv} | awk -v sample="$sample_name" '{if ($6==sample) print}' > query.bed + zcat ~{raw_depth_bed} | awk -v sample="$sample_name" '{if ($5==sample) print}' > ref.bed + bedtools coverage -a query.bed -b ref.bed \ + | awk '{print $1,$2,$3,$4,$5,$6,$7,$NF}' \ + | sed -e 's/ /\t/g' \ + >> ~{prefix}.depth_supp.bed + done < sample_list.tsv + + bgzip ~{prefix}.depth_supp.bed + + >>> + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +task ExtractCpxLgCnvByBatch { + input { + File bed_gz + String batch_name + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File lg_cnv_del = "~{batch_name}.lg_cnv.DEL.bed.gz" + File lg_cnv_dup = "~{batch_name}.lg_cnv.DUP.bed.gz" + } + command <<< + + set -e pipefail + + zcat ~{bed_gz} | awk '{if ($4=="DEL" && $7=="~{batch_name}") print}' > ~{batch_name}.lg_cnv.DEL.bed + zcat ~{bed_gz} | awk '{if ($4=="DUP" && $7=="~{batch_name}") print}' > ~{batch_name}.lg_cnv.DUP.bed + bgzip ~{batch_name}.lg_cnv.DEL.bed + bgzip ~{batch_name}.lg_cnv.DUP.bed + + >>> + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_base_mini_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task GenerateCnvSegmentFromCpx { + input { + File bed + File sample_batch_pe_map + String sv_pipeline_docker + Int min_depth_size = 5000 + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + + String prefix = basename(bed, ".bed.gz") + + command <<< + set -euo pipefail + + python <0: + for i in interval: + if i[2]-i[1]>=min_depth_size: + sample_names = pin[pos_SAMPLES].split(',') + if i[3]=="DEL": + for j in sample_names: + CPX_CNV.append(i+[j, sample_to_batch[j]]) + if i[3]=="DUP": + for j in sample_names: + CPX_CNV.append(i+[j, sample_to_batch[j]]) + f_bed.close() + return CPX_CNV + + def write_cpx_cnv(CPX_CNV, fileout): + fo=open(fileout, 'w') + for info in CPX_CNV: + print('\t'.join([str(i) for i in info]), file=fo) + fo.close() + + bed_input = "~{bed}" + fileout = "~{prefix}.lg_CNV.bed" + min_depth_size = ~{min_depth_size} + sample_to_batch = get_sample_batch_map("~{sample_batch_pe_map}") + CPX_CNV = readin_cpx_cnv(bed_input, sample_to_batch, min_depth_size) + write_cpx_cnv(CPX_CNV, fileout) + CODE + + bgzip "~{prefix}.lg_CNV.bed" + >>> + + output { + File cpx_lg_cnv = "~{prefix}.lg_CNV.bed.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + diff --git a/wdl/CollectPEMetricsForCPX.wdl b/wdl/CollectPEMetricsForCPX.wdl new file mode 100644 index 000000000..9a6fbd0c1 --- /dev/null +++ b/wdl/CollectPEMetricsForCPX.wdl @@ -0,0 +1,102 @@ +version 1.0 + +import "Structs.wdl" +import "CollectPEMetricsPerBatchCPX.wdl" as per_batch + +workflow CollectPEMetricsForCPX { + input { + Array[String] batch_name_list + Array[File] PE_metrics + Array[File] PE_metrics_idxes + File PE_collect_script + String prefix + Int n_per_split + String sv_pipeline_docker + String sv_base_mini_docker + RuntimeAttr? runtime_attr_collect_pe + RuntimeAttr? runtime_attr_split_script + RuntimeAttr? runtime_attr_calcu_pe_stat + RuntimeAttr? 
runtime_attr_concat_evidence + } + + scatter (i in range(length(batch_name_list))) { + call per_batch.CollectPEMetricsPerBatchCPX { + input: + n_per_split = n_per_split, + prefix = "~{prefix}.~{batch_name_list[i]}", + batch_name = batch_name_list[i], + PE_metric = PE_metrics[i], + PE_metrics_idx = PE_metrics_idxes[i], + PE_collect_script = PE_collect_script, + sv_pipeline_docker = sv_pipeline_docker, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_collect_pe = runtime_attr_collect_pe, + runtime_attr_split_script = runtime_attr_split_script, + runtime_attr_calcu_pe_stat = runtime_attr_calcu_pe_stat, + runtime_attr_concat_evidence = runtime_attr_concat_evidence + } + } + + call per_batch.ConcatEvidences { + input: + evidences = CollectPEMetricsPerBatchCPX.evidence, + prefix = prefix, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_concat_evidence + } + + call CalcuPEStat { + input: + evidence = ConcatEvidences.concat_evidence, + prefix = prefix, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_calcu_pe_stat + } + + output { + File evidence = ConcatEvidences.concat_evidence + File evi_stat = CalcuPEStat.evi_stat + } +} + + +task CalcuPEStat { + input { + File evidence + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + + zcat ~{evidence} | cut -f3,6- | uniq -c > ~{prefix}.evi_stat + bgzip ~{prefix}.evi_stat + >>> + + output { + File evi_stat = "~{prefix}.evi_stat.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_base_mini_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} diff --git a/wdl/CollectPEMetricsPerBatchCPX.wdl b/wdl/CollectPEMetricsPerBatchCPX.wdl new file mode 100644 index 000000000..3cc65dbeb --- /dev/null +++ b/wdl/CollectPEMetricsPerBatchCPX.wdl @@ -0,0 +1,205 @@ +version 1.0 + +import "Structs.wdl" +workflow CollectPEMetricsPerBatchCPX { + input { + Int n_per_split + String batch_name + File PE_metric + File PE_metrics_idx + File PE_collect_script + String prefix + String sv_base_mini_docker + String sv_pipeline_docker + RuntimeAttr? runtime_attr_collect_pe + RuntimeAttr? runtime_attr_concat_evidence + RuntimeAttr? runtime_attr_calcu_pe_stat + RuntimeAttr? 
runtime_attr_split_script + } + + call SplitScripts { + input: + script = PE_collect_script, + n_per_split = n_per_split, + batch_name = batch_name, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_split_script + } + + scatter (script in SplitScripts.script_splits ) { + call CollectPEMetrics { + input: + batch_name = batch_name, + PE_metric = PE_metric, + PE_metrics_idx = PE_metrics_idx, + PE_collect_script = script, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_collect_pe + } + } + + call ConcatEvidences { + input: + evidences = CollectPEMetrics.evidence, + prefix = prefix, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_concat_evidence + } + + output { + File evidence = ConcatEvidences.concat_evidence + } +} + + + +# collect PE metrics +task CollectPEMetrics { + input { + String batch_name + File PE_metric + File PE_metrics_idx + File PE_collect_script + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: ceil(30.0 + size(PE_metric, "GiB") * 3), + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + mkdir PE_metrics/ + gsutil cp ~{PE_metric} ./ + gsutil cp ~{PE_metrics_idx} ./ + echo "Metrics successfully downloaded." + echo "Starting the evidence collection ..." + + if [ $(wc -c < "~{PE_collect_script}") -gt 0 ]; then + bash ~{PE_collect_script} + fi + + touch ~{batch_name}.evidence + touch ~{batch_name}.0.PE_evidences + for peEvFile in *.PE_evidences + do + cat ${peEvFile} >> ~{batch_name}.evidence + done + + bgzip ~{batch_name}.evidence + + >>> + + output { + File evidence = "~{batch_name}.evidence.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task ConcatEvidences { + input { + Array[File] evidences + String prefix + String sv_base_mini_docker + RuntimeAttr?
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + + while read SPLIT; do + zcat $SPLIT + done < ~{write_lines(evidences)} \ + | bgzip -c \ + > ~{prefix}.evidence.gz + + >>> + + output { + File concat_evidence = "~{prefix}.evidence.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_base_mini_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task SplitScripts { + input { + File script + String batch_name + Int n_per_split + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + Array[File] script_splits = glob("collect_PE_evidences.*") + } + command <<< + + set -e pipefail + + awk '{if ($2~"~{batch_name}") print}' ~{script} > tmp_metrics.sh + + if [ $(wc -c < "tmp_metrics.sh") -gt 0 ]; then + split --additional-suffix ".sh" -l ~{n_per_split} -a 6 tmp_metrics.sh collect_PE_evidences. + else + touch collect_PE_evidences.aaaaaa.sh + fi + + >>> + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_base_mini_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} diff --git a/wdl/FilterGenotypes.wdl b/wdl/FilterGenotypes.wdl index 0603b5750..8926bc31e 100644 --- a/wdl/FilterGenotypes.wdl +++ b/wdl/FilterGenotypes.wdl @@ -25,6 +25,9 @@ workflow FilterGenotypes { Int optimize_vcf_records_per_shard = 50000 Int filter_vcf_records_per_shard = 20000 + # For SanitizeHeader - bcftools annotate -x ~{header_drop_fields} + String header_drop_fields = "FILTER/LOW_QUALITY,FORMAT/TRUTH_CN_EQUAL,FORMAT/GT_FILTER,FORMAT/CONC_ST,INFO/STATUS,INFO/TRUTH_AC,INFO/TRUTH_AN,INFO/TRUTH_AF,INFO/TRUTH_VID,INFO/CNV_CONCORDANCE,INFO/GENOTYPE_CONCORDANCE,INFO/HET_PPV,INFO/HET_SENSITIVITY,INFO/HOMVAR_PPV,INFO/HOMVAR_SENSITIVITY,INFO/MINSL,INFO/NON_REF_GENOTYPE_CONCORDANCE,INFO/SL_MAX,INFO/SL_MEAN,INFO/VAR_PPV,INFO/VAR_SENSITIVITY,INFO/VAR_SPECIFICITY" + # For MainVcfQc File primary_contigs_fai File? ped_file @@ -32,7 +35,7 @@ workflow FilterGenotypes { Array[Array[String]]? sample_level_comparison_datasets # Array of two-element arrays, one per dataset, each of format [prefix, gs:// path to per-sample tarballs] File? 
sample_renaming_tsv # File with mapping to rename per-sample benchmark sample IDs for compatibility with cohort Boolean run_qc = true - String qc_bcftools_preprocessing_options = "-e 'FILTER~\"UNRESOLVED\" || FILTER~\"HIGH_NCR\"'" + String qc_bcftools_preprocessing_options = "-i 'FILTER=\"PASS\" || FILTER=\"MULTIALLELIC\"'" Int qc_sv_per_shard = 2500 Int qc_samples_per_shard = 600 RuntimeAttr? runtime_override_plot_qc_per_family @@ -120,10 +123,19 @@ workflow FilterGenotypes { sv_base_mini_docker=sv_base_mini_docker } + call SanitizeHeader { + input: + vcf = ConcatVcfs.concat_vcf, + vcf_index = ConcatVcfs.concat_vcf_idx, + drop_fields = header_drop_fields, + prefix = "~{output_prefix_}.filter_genotypes.sanitized", + sv_pipeline_docker = sv_pipeline_docker + } + if (run_qc) { call qc.MainVcfQc { input: - vcfs=[ConcatVcfs.concat_vcf], + vcfs=[SanitizeHeader.out], prefix="~{output_prefix_}.filter_genotypes", ped_file=ped_file, bcftools_preprocessing_options=qc_bcftools_preprocessing_options, @@ -142,8 +154,8 @@ workflow FilterGenotypes { } output { - File filtered_vcf = ConcatVcfs.concat_vcf - File filtered_vcf_index = ConcatVcfs.concat_vcf_idx + File filtered_vcf = SanitizeHeader.out + File filtered_vcf_index = SanitizeHeader.out_index File? main_vcf_qc_tarball = MainVcfQc.sv_vcf_qc_output # For optional analysis @@ -295,3 +307,59 @@ task FilterVcf { maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } } + +task SanitizeHeader { + input { + File vcf + File vcf_index + String prefix + String drop_fields + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GiB") * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + + bcftools view --no-version -h ~{vcf} > header.vcf + + grep -v -e ^"##bcftools" header.vcf \ + -e ^"##source=depth" \ + -e ^"##source=cleanvcf" \ + -e ^"##ALT=/##ALT=/g' > newheader.vcf + + bcftools reheader -h newheader.vcf ~{vcf} \ + | bcftools annotate -x ~{drop_fields} \ + --no-version \ + -O z \ + -o ~{prefix}.vcf.gz + + tabix ~{prefix}.vcf.gz + >>> + + output { + File out = "~{prefix}.vcf.gz" + File out_index = "~{prefix}.vcf.gz.tbi" + } +} diff --git a/wdl/MakeCohortVcf.wdl b/wdl/MakeCohortVcf.wdl index 5ea66ea50..f3891fc8d 100644 --- a/wdl/MakeCohortVcf.wdl +++ b/wdl/MakeCohortVcf.wdl @@ -44,6 +44,8 @@ workflow MakeCohortVcf { File pe_exclude_list File depth_exclude_list File ref_dict + File HERVK_reference + File LINE1_reference Int max_shard_size_resolve Int max_shards_per_chrom_clean_vcf_step1 Int min_records_per_shard_clean_vcf_step1 @@ -378,6 +380,8 @@ workflow MakeCohortVcf { allosome_fai=allosome_fai, chr_x=chr_x, chr_y=chr_y, + HERVK_reference=HERVK_reference, + LINE1_reference=LINE1_reference, 
max_shards_per_chrom_step1=max_shards_per_chrom_clean_vcf_step1, min_records_per_shard_step1=min_records_per_shard_clean_vcf_step1, clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl new file mode 100644 index 000000000..4340ef253 --- /dev/null +++ b/wdl/RefineComplexVariants.wdl @@ -0,0 +1,800 @@ +version 1.0 + +import "Structs.wdl" +import "HailMerge.wdl" as hail +import "TasksMakeCohortVcf.wdl" as MiniTasks +import "CollectPEMetricsForCPX.wdl" as collect_pe_metrics_for_cpx +import "CollectLargeCNVSupportForCPX.wdl" as collect_lg_cnv_supp_for_cpx +import "Utils.wdl" as util + +workflow RefineComplexVariants { + input { + File vcf + String prefix + + Array[String] batch_name_list + Array[File] batch_sample_lists + Array[File] PE_metrics + Array[File] PE_metrics_indexes + Array[File] Depth_DEL_beds + Array[File] Depth_DUP_beds + + Int n_per_split + Int min_pe_cpx = 3 + Int min_pe_ctx = 3 + + Boolean use_hail = false + String? gcs_project # required if use_hail = true + + String sv_base_mini_docker + String sv_pipeline_docker + String linux_docker + + RuntimeAttr? runtime_attr_sample_batch + RuntimeAttr? runtime_attr_vcf2bed + RuntimeAttr? runtime_attr_collect_pe + RuntimeAttr? runtime_attr_scatter_vcf + RuntimeAttr? runtime_attr_split_script + RuntimeAttr? runtime_attr_calcu_pe_stat + RuntimeAttr? runtime_attr_split_cpx_ctx + RuntimeAttr? runtime_attr_concat_evidence + RuntimeAttr? runtime_attr_concat + RuntimeAttr? runtime_attr_preconcat + RuntimeAttr? runtime_attr_hail_merge + RuntimeAttr? runtime_attr_fix_header + RuntimeAttr? runtime_attr_generate_cpx_review_script + RuntimeAttr? runtime_attr_generate_cnv_segments_from_cpx + RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line + RuntimeAttr? runtime_attr_extract_cpx_lg_cnv_by_batch + RuntimeAttr? runtime_attr_seek_depth_supp_for_cpx + RuntimeAttr? runtime_attr_concat_bed_Step1 + RuntimeAttr? runtime_attr_concat_bed_Step2 + RuntimeAttr? 
runtime_attr_calcu_cpx_evidences + } + + call GetSampleBatchPEMap { + input: + batch_name_list = batch_name_list, + batch_sample_lists = batch_sample_lists, + batch_pe_files = write_lines(PE_metrics), + prefix = prefix, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_sample_batch + } + + call MiniTasks.ScatterVcf { + input: + vcf = vcf, + prefix = prefix, + records_per_shard = n_per_split, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_scatter_vcf + } + + scatter (i in range(length(ScatterVcf.shards))) { + + call util.VcfToBed { + input: + vcf_file = ScatterVcf.shards[i], + args = "-i ALL --include-filters", + variant_interpretation_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_vcf2bed + } + + call SplitCpxCtx { + input: + bed = VcfToBed.bed_output, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_split_cpx_ctx + } + + call collect_lg_cnv_supp_for_cpx.CollectLargeCNVSupportForCPX { + input: + cpx_ctx_bed = SplitCpxCtx.cpx_ctx_bed, + prefix = "~{prefix}.~{i}", + + sample_batch_pe_map = GetSampleBatchPEMap.sample_batch_pe_map, + batch_name_list = batch_name_list, + Depth_DEL_beds = Depth_DEL_beds, + Depth_DUP_beds = Depth_DUP_beds, + + sv_base_mini_docker = sv_base_mini_docker, + sv_pipeline_docker = sv_pipeline_docker, + + runtime_attr_concat_bed_Step1 = runtime_attr_concat_bed_Step1, + runtime_attr_concat_bed_Step2 = runtime_attr_concat_bed_Step2, + runtime_attr_seek_depth_supp_for_cpx = runtime_attr_seek_depth_supp_for_cpx, + runtime_attr_extract_cpx_lg_cnv_by_batch = runtime_attr_extract_cpx_lg_cnv_by_batch, + runtime_attr_generate_cnv_segments_from_cpx = runtime_attr_generate_cnv_segments_from_cpx + } + + call GenerateCpxReviewScript { + input: + bed = SplitCpxCtx.cpx_ctx_bed, + sample_batch_pe_map = GetSampleBatchPEMap.sample_batch_pe_map, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_generate_cpx_review_script + } + + call collect_pe_metrics_for_cpx.CollectPEMetricsForCPX { + input: + batch_name_list = batch_name_list, + PE_metrics = PE_metrics, + PE_metrics_idxes = PE_metrics_indexes, + PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script, + prefix = "~{prefix}.~{i}", + n_per_split = n_per_split, + sv_pipeline_docker = sv_pipeline_docker, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_collect_pe = runtime_attr_collect_pe, + runtime_attr_split_script = runtime_attr_split_script, + runtime_attr_calcu_pe_stat = runtime_attr_calcu_pe_stat, + runtime_attr_concat_evidence = runtime_attr_concat_evidence + } + + call CalculateCpxEvidences { + input: + PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script, + PE_supp = CollectPEMetricsForCPX.evi_stat, + depth_supp = CollectLargeCNVSupportForCPX.lg_cnv_depth_supp, + min_pe_cpx = min_pe_cpx, + prefix = "~{prefix}.~{i}", + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_calcu_cpx_evidences + } + + call CalculateCtxEvidences{ + input: + PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script, + PE_supp = CollectPEMetricsForCPX.evi_stat, + min_pe_ctx = min_pe_ctx, + prefix = "~{prefix}.~{i}", + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_calcu_cpx_evidences + } + + call ReviseVcf { + input: + vcf_file = ScatterVcf.shards[i], + CPX_manual = CalculateCpxEvidences.manual_revise_CPX_results, + CTX_manual = CalculateCtxEvidences.manual_revise_CTX_results, + 
unresolved_svids = GenerateCpxReviewScript.unresolved_svids, + prefix = "~{prefix}.~{i}", + sv_pipeline_docker = sv_pipeline_docker + } + } + + if (use_hail) { + call hail.HailMerge { + input: + vcfs=ReviseVcf.revised_vcf, + prefix="~{prefix}.cpx_refined", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + runtime_override_preconcat=runtime_attr_preconcat, + runtime_override_hail_merge=runtime_attr_hail_merge, + runtime_override_fix_header=runtime_attr_fix_header + } + } + + if (!use_hail) { + call MiniTasks.ConcatVcfs { + input: + vcfs=ReviseVcf.revised_vcf, + vcfs_idx=ReviseVcf.revised_vcf_idx, + allow_overlaps=true, + outfile_prefix="~{prefix}.cpx_refined", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_concat + } + } + + call MiniTasks.ConcatHeaderedTextFiles { + input: + text_files = CalculateCpxEvidences.manual_revise_CPX_results, + output_filename = "~{prefix}.CPX_evidence.txt" , + linux_docker = linux_docker + + } + + output { + File cpx_refined_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) + File cpx_refined_vcf_index = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) + File cpx_evidences = ConcatHeaderedTextFiles.out + } +} + + +task GetSampleBatchPEMap { + input { + Array[File] batch_sample_lists + Array[String] batch_name_list + File batch_pe_files + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10 + 2*ceil(size(flatten([batch_sample_lists]), "GB")), + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + + python <>> + + output { + File sample_batch_pe_map = "~{prefix}.sample_batch_pe_map.tsv" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +task ReviseVcf { + input { + File vcf_file + File CPX_manual + File CTX_manual + File unresolved_svids + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: ceil(5.0 + size(vcf_file, "GB")*3), + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command<<< + set -euo pipefail + + python <',) + del_section = record.stop - record.pos + if del_section<50: + record.info['CPX_TYPE']="dDUP" + record.info['CPX_INTERVALS'] = record.info['SOURCE'].replace('INV','DUP')+','+record.info['SOURCE'] + else: + record.info['CPX_TYPE']="dDUP_iDEL" + record.info['CPX_INTERVALS'] = record.info['SOURCE'].replace('INV','DUP')+','+record.info['SOURCE']+','+"DEL_"+record.chrom+":"+str(record.pos)+'-'+str(record.stop) + del record.info['SOURCE'] + fo.write(record) # write out every record that was in the input - NCR will remove ones with no carriers left + fin.close() + fo.close() + +import pysam + +unresolved_svids = unresolved_readin("~{unresolved_svids}") +hash_CPX_manual = CPX_manual_readin("~{CPX_manual}") +hash_CTX_manual = CTX_manual_readin("~{CTX_manual}") +print(len(hash_CPX_manual.keys())) +revise_vcf("~{vcf_file}", "~{prefix}.Manual_Revised.vcf.gz", hash_CPX_manual, unresolved_svids, hash_CTX_manual) + +CODE + + tabix ~{prefix}.Manual_Revised.vcf.gz + >>> + + output { + File revised_vcf = "~{prefix}.Manual_Revised.vcf.gz" + File revised_vcf_idx = "~{prefix}.Manual_Revised.vcf.gz.tbi" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task SplitCpxCtx { + input { + File bed + String sv_base_mini_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + + String prefix = basename(bed, ".bed.gz") + + command <<< + set -eu + + zcat ~{bed} | head -1 > ~{prefix}.cpx_ctx.bed + + filterColumn=$(zcat ~{bed} | head -1 | tr "\t" "\n" | awk '$1=="FILTER" {print NR}') + + set -o pipefail + + zcat ~{bed} | awk 'NR > 1' | { grep CPX || true; } | awk -v filter_column=${filterColumn} '$filter_column !~ /UNRESOLVED/' >> ~{prefix}.cpx_ctx.bed + + zcat ~{bed} | awk 'NR > 1' | { grep CTX || true; } >> ~{prefix}.cpx_ctx.bed + + # INS with INV in SOURCE - will be converted to CPX later so need to evaluate evidence + zcat ~{bed} | awk 'NR > 1' | { grep INS || true; } | { grep INV || true; } >> ~{prefix}.cpx_ctx.bed + + bgzip ~{prefix}.cpx_ctx.bed + >>> + + output { + File cpx_ctx_bed = "~{prefix}.cpx_ctx.bed.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_base_mini_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task GenerateCpxReviewScript { + input { + File bed + File sample_batch_pe_map + File? script + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + + String prefix = basename(bed, ".bed.gz") + + command <<< + set -euo pipefail + + cut -f1,3 ~{sample_batch_pe_map} > sample_PE_metrics.tsv + + python ~{default="/opt/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py" script} \ + -i ~{bed} \ + -s sample_PE_metrics.tsv \ + -p CPX_CTX_disINS.PASS.PE_evidences \ + -c collect_PE_evidences.CPX_CTX_disINS.PASS.sh \ + -r ~{prefix}.svelter \ + -u ~{prefix}.unresolved_svids.txt + + >>> + + output { + File pe_evidence_collection_script = "collect_PE_evidences.CPX_CTX_disINS.PASS.sh" + File svelter = "~{prefix}.svelter" + File unresolved_svids = "~{prefix}.unresolved_svids.txt" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task CalculateCpxEvidences { + input { + File PE_collect_script + File PE_supp + File depth_supp + Int min_pe_cpx + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + + set -eu + awk '{print $6, $(NF-3)}' ~{PE_collect_script} | grep -v "_CTX_"| uniq > sample_SVID.tsv + + set -euo pipefail + + + python <0 or PE_supp[svid][sample][1][0]>0: + pe_supp = 'partial_PE' + if PE_supp[svid][sample][0][0]>0 and PE_supp[svid][sample][1][0]>0: + pe_supp = 'low_PE' + if PE_supp[svid][sample][0][0]>=~{min_pe_cpx} and PE_supp[svid][sample][1][0]>=~{min_pe_cpx}: + pe_supp = 'high_PE' + out[svid][sample] = [pe_supp] + else: + out[svid][sample] = ['no_PE'] + else: + for sample in sample_svid[svid]: + out[svid][sample] = ['no_PE'] + return out + + def add_depth_supp(sample_svid_pe, Depth_supp): + for svid in sample_svid_pe.keys(): + if svid in Depth_supp.keys(): + for sample in sample_svid_pe[svid].keys(): + if sample in Depth_supp[svid].keys(): + depth_supp = 'lack_depth' + if Depth_supp[svid][sample][0]>.5: + depth_supp = 'depth' + if len(Depth_supp[svid][sample])>1: + if not Depth_supp[svid][sample][1]>.5: + depth_supp+=',lack_depth' + else: + if len(Depth_supp[svid][sample])>1: + if Depth_supp[svid][sample][1]>.5: + depth_supp+=',depth' + sample_svid_pe[svid][sample].append(depth_supp) + else: + sample_svid_pe[svid][sample].append('NA') + else: + for sample in sample_svid_pe[svid].keys(): + sample_svid_pe[svid][sample].append('NA') + return sample_svid_pe + + def write_pe_depth_supp(sample_svid_pe_depth, fileout): + fo=open(fileout,'w') + print('\t'.join(['VID','sample','supportive_PE_counts','depth_supp']), file=fo) + for svid in sample_svid_pe_depth.keys(): + for sample in sample_svid_pe_depth[svid].keys(): + print('\t'.join([svid,sample] + sample_svid_pe_depth[svid][sample]), file=fo) + fo.close() + + import os + sample_svid = sample_svid_readin("sample_SVID.tsv") + PE_supp = PE_supp_readin("~{PE_supp}") + Depth_supp = Depth_supp_readin("~{depth_supp}") + sample_svid_pe = add_PE_supp(sample_svid, PE_supp) + sample_svid_pe_depth = add_depth_supp(sample_svid_pe, Depth_supp) + write_pe_depth_supp(sample_svid_pe_depth, "~{prefix}.manual_revise.CPX_results") + + CODE + >>> + + output { + File manual_revise_CPX_results = "~{prefix}.manual_revise.CPX_results" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task CalculateCtxEvidences { + input { + File PE_collect_script + File PE_supp + Int min_pe_ctx + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command<<< + + set -eu + + awk '{print $6, $(NF-3)}' ~{PE_collect_script} | grep "_CTX_"| uniq > sample_SVID.tsv + + set -eu + + zcat ~{PE_supp} | grep "_CTX_"| uniq > PE_supp.tsv + + set -euo pipefail + + python <0 or PE_supp[svid][sample][1][0]>0: + pe_supp = 'partial_PE' + if PE_supp[svid][sample][0][0]>0 and PE_supp[svid][sample][1][0]>0: + pe_supp = 'low_PE' + if PE_supp[svid][sample][0][0]>=~{min_pe_ctx} and PE_supp[svid][sample][1][0]>=~{min_pe_ctx}: + pe_supp = 'high_PE' + out[svid][sample] = [pe_supp] + else: + out[svid][sample] = ['no_PE'] + else: + for sample in sample_svid[svid]: + out[svid][sample] = ['no_PE'] + return out + + def write_pe_depth_supp(sample_svid_pe_depth, fileout): + fo=open(fileout,'w') + print('\t'.join(['VID','sample','supportive_PE_counts']), file=fo) + for svid in sample_svid_pe_depth.keys(): + for sample in sample_svid_pe_depth[svid].keys(): + print('\t'.join([svid,sample] + sample_svid_pe_depth[svid][sample]), file=fo) + fo.close() + + import os + sample_svid = sample_svid_readin("sample_SVID.tsv") + PE_supp = PE_supp_readin("PE_supp.tsv") + sample_svid_pe = add_PE_supp(sample_svid, PE_supp) + write_pe_depth_supp(sample_svid_pe, "~{prefix}.manual_revise.CTX_results") + + CODE + >>> + + output { + File manual_revise_CTX_results = "~{prefix}.manual_revise.CTX_results" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} +
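For reference, the PE-support labels written into the manual_revise.CPX_results and manual_revise.CTX_results tables above follow a simple threshold cascade over the two junction read-pair counts collected per variant and sample, with min_pe_cpx and min_pe_ctx both defaulting to 3. A minimal sketch of that classification, using hypothetical names (junction1_pe, junction2_pe) for the two per-junction counts rather than the pipeline's own parsing code:

# Minimal sketch (not pipeline code) of the PE-support cascade in
# CalculateCpxEvidences / CalculateCtxEvidences; junction1_pe and junction2_pe
# are hypothetical names for the supporting read-pair counts at the two
# breakpoint junctions of one variant-sample pair.
def classify_pe_support(junction1_pe, junction2_pe, min_pe=3):
    if junction1_pe <= 0 and junction2_pe <= 0:
        return 'no_PE'        # no supporting read pairs at either junction
    if junction1_pe > 0 and junction2_pe > 0:
        if junction1_pe >= min_pe and junction2_pe >= min_pe:
            return 'high_PE'  # both junctions meet min_pe_cpx / min_pe_ctx
        return 'low_PE'       # both junctions supported, but below the threshold
    return 'partial_PE'       # only one junction has supporting read pairs

# e.g. with the default threshold of 3:
#   classify_pe_support(4, 5)  -> 'high_PE'
#   classify_pe_support(1, 2)  -> 'low_PE'
#   classify_pe_support(0, 6)  -> 'partial_PE'

ReviseVcf then consumes these tables (the CPX table additionally carries a depth-support column derived from CollectLargeCNVSupportForCPX) when revising each VCF shard.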