From 4f8ffe512596244be6520cfbe7f8fe260fd5c320 Mon Sep 17 00:00:00 2001
From: Paul Sud <41386393+paul-sud@users.noreply.github.com>
Date: Fri, 11 Mar 2022 10:07:39 -0800
Subject: [PATCH] PIP-1666-megamap-from-bams (#138)

---
 docker/hic-pipeline/Dockerfile                 |   4 +
 genophase.wdl                                  |  10 +-
 hic.wdl                                        |  52 +++-
 hic_pipeline/__init__.py                       |   2 +-
 make_restriction_site_locations.wdl            |  10 +-
 megamap.wdl                                    | 253 ++++++++++++++++++
 .../make_megamap_input_json_from_portal.py     | 136 ++++++++++
 7 files changed, 442 insertions(+), 25 deletions(-)
 create mode 100644 megamap.wdl
 create mode 100644 scripts/make_megamap_input_json_from_portal.py

diff --git a/docker/hic-pipeline/Dockerfile b/docker/hic-pipeline/Dockerfile
index 80f33633..572e7496 100644
--- a/docker/hic-pipeline/Dockerfile
+++ b/docker/hic-pipeline/Dockerfile
@@ -127,6 +127,10 @@ RUN curl \
     chmod 666 /opt/MixerTools.4.8.2.jar && \
     ln -s /opt/MixerTools.4.8.2.jar /opt/MixerTools.jar
 
+RUN curl \
+    -LO \
+    https://github.com/sa501428/merge-stats/releases/download/v0.3/merge-stats.jar
+
 RUN curl \
     -LO \
     https://github.com/broadinstitute/gatk/releases/download/4.2.2.0/gatk-4.2.2.0.zip && \
diff --git a/genophase.wdl b/genophase.wdl
index f26e0237..c103e387 100644
--- a/genophase.wdl
+++ b/genophase.wdl
@@ -4,9 +4,9 @@ import "./hic.wdl"
 
 workflow genophase {
     meta {
-        version: "1.12.2"
-        caper_docker: "encodedcc/hic-pipeline:1.12.2"
-        caper_singularity: "docker://encodedcc/hic-pipeline:1.12.2"
+        version: "1.13.0"
+        caper_docker: "encodedcc/hic-pipeline:1.13.0"
+        caper_singularity: "docker://encodedcc/hic-pipeline:1.13.0"
         croo_out_def: "https://raw.githubusercontent.com/ENCODE-DCC/hic-pipeline/dev/croo_out_def.json"
     }
 
@@ -25,8 +25,8 @@ workflow genophase {
         Int? run_3d_dna_ram_gb
         Boolean no_phasing = false
 
-        String docker = "encodedcc/hic-pipeline:1.12.2"
-        String singularity = "docker://encodedcc/hic-pipeline:1.12.2"
+        String docker = "encodedcc/hic-pipeline:1.13.0"
+        String singularity = "docker://encodedcc/hic-pipeline:1.13.0"
     }
 
     RuntimeEnvironment runtime_environment = {
diff --git a/hic.wdl b/hic.wdl
index da950448..36e7ceba 100644
--- a/hic.wdl
+++ b/hic.wdl
@@ -19,9 +19,9 @@ struct RuntimeEnvironment {
 
 workflow hic {
     meta {
-        version: "1.12.2"
-        caper_docker: "encodedcc/hic-pipeline:1.12.2"
-        caper_singularity: "docker://encodedcc/hic-pipeline:1.12.2"
+        version: "1.13.0"
+        caper_docker: "encodedcc/hic-pipeline:1.13.0"
+        caper_singularity: "docker://encodedcc/hic-pipeline:1.13.0"
         croo_out_def: "https://raw.githubusercontent.com/ENCODE-DCC/hic-pipeline/dev/croo_out_def.json"
         description: "ENCODE Hi-C pipeline, see https://github.com/ENCODE-DCC/hic-pipeline for details."
     }
@@ -41,13 +41,20 @@ workflow hic {
         # Parameters controlling delta calls
         Boolean no_delta = false
 
+        # Parameters
         Boolean intact = false
         Array[String] normalization_methods = []
+        Array[Int] create_hic_in_situ_resolutions = [2500000, 1000000, 500000, 250000, 100000, 50000, 25000, 10000, 5000, 2000, 1000, 500, 200, 100]
+        Array[Int] create_hic_intact_resolutions = [2500000, 1000000, 500000, 250000, 100000, 50000, 25000, 10000, 5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
+
+        # Feature flags
         Boolean no_pairs = false
         Boolean no_call_loops = false
         Boolean no_call_tads = false
         Boolean no_eigenvectors = false
         Boolean no_slice = false
+
+        # Resource parameters
         Int align_num_cpus = 32
         Int align_ram_gb_in_situ = 64
         Int align_ram_gb_intact = 88
         Int align_disk_size_gb_in_situ = 1000
         Int align_disk_size_gb_intact = 1500
         Int chimeric_sam_nonspecific_disk_size_gb = 7000
         Int chimeric_sam_specific_disk_size_gb = 1500
         Int dedup_ram_gb_in_situ = 32
         Int dedup_ram_gb_intact = 48
         Int dedup_disk_size_gb_in_situ = 5000
         Int dedup_disk_size_gb_intact = 7500
         Int? create_hic_num_cpus
+        Int? create_hic_ram_gb
+        Int? create_hic_juicer_tools_heap_size_gb
+        Int? create_hic_disk_size_gb
         Int? add_norm_num_cpus
+        Int? add_norm_ram_gb
+        Int? add_norm_disk_size_gb
         Int? create_accessibility_track_ram_gb
         Int? create_accessibility_track_disk_size_gb
         String assembly_name = "undefined"
 
-        String docker = "encodedcc/hic-pipeline:1.12.2"
-        String singularity = "docker://encodedcc/hic-pipeline:1.12.2"
-        String delta_docker = "encodedcc/hic-pipeline:1.12.2_delta"
-        String hiccups_docker = "encodedcc/hic-pipeline:1.12.2_hiccups"
+        String docker = "encodedcc/hic-pipeline:1.13.0"
+        String singularity = "docker://encodedcc/hic-pipeline:1.13.0"
+        String delta_docker = "encodedcc/hic-pipeline:1.13.0_delta"
+        String hiccups_docker = "encodedcc/hic-pipeline:1.13.0_hiccups"
     }
 
     RuntimeEnvironment runtime_environment = {
@@ -92,8 +104,6 @@ workflow hic {
     Int dedup_disk_size_gb = if intact then dedup_disk_size_gb_intact else dedup_disk_size_gb_in_situ
     String delta_models_path = if intact then "ultimate-models" else "beta-models"
     Array[Int] delta_resolutions = if intact then [5000, 2000, 1000] else [5000, 10000]
-    Array[Int] create_hic_in_situ_resolutions = [2500000, 1000000, 500000, 250000, 100000, 50000, 25000, 10000, 5000, 2000, 1000, 500, 200, 100]
-    Array[Int] create_hic_intact_resolutions = [2500000, 1000000, 500000, 250000, 100000, 50000, 25000, 10000, 5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
     Array[Int] create_hic_resolutions = if intact then create_hic_intact_resolutions else create_hic_in_situ_resolutions
 
     # Default MAPQ thresholds for generating .hic contact maps
@@ -273,6 +283,9 @@ workflow hic {
             resolutions = create_hic_resolutions,
             assembly_name = assembly_name,
             num_cpus = create_hic_num_cpus,
+            ram_gb = create_hic_ram_gb,
+            juicer_tools_heap_size_gb = create_hic_juicer_tools_heap_size_gb,
+            disk_size_gb = create_hic_disk_size_gb,
             runtime_environment = runtime_environment,
         }
     }
@@ -288,6 +301,9 @@ workflow hic {
             resolutions = create_hic_resolutions,
             assembly_name = normalize_assembly_name.normalized_assembly_name,
             num_cpus = create_hic_num_cpus,
+            ram_gb = create_hic_ram_gb,
+            juicer_tools_heap_size_gb = create_hic_juicer_tools_heap_size_gb,
+            disk_size_gb = create_hic_disk_size_gb,
             runtime_environment = runtime_environment,
         }
     }
@@ -303,6 +319,8 @@ workflow hic {
             normalization_methods = normalization_methods,
             quality = qualities[i],
             num_cpus = add_norm_num_cpus,
+            ram_gb = add_norm_ram_gb,
+            disk_size_gb = add_norm_disk_size_gb,
             runtime_environment = runtime_environment,
         }
 
@@ -845,6 +863,10 @@ task create_hic {
         File? chrsz
         File? restriction_sites
         Int num_cpus = 16
+        # Should always set juicer_tools_heap_size_gb < ram_gb
+        Int ram_gb = 384
+        Int juicer_tools_heap_size_gb = 370
+        Int disk_size_gb = 2000
         RuntimeEnvironment runtime_environment
     }
 
@@ -860,7 +882,7 @@ task create_hic {
         java \
         -Ddevelopment=false \
         -Djava.awt.headless=true \
-        -Xmx240g \
+        -Xmx~{juicer_tools_heap_size_gb}g \
         -jar /opt/scripts/common/juicer_tools.jar \
         pre \
         -n \
@@ -883,8 +905,8 @@ task create_hic {
 
     runtime {
         cpu : "~{num_cpus}"
-        disks: "local-disk 2000 HDD"
-        memory : "256 GB"
+        memory : "~{ram_gb} GB"
+        disks: "local-disk ~{disk_size_gb} HDD"
         docker: runtime_environment.docker
         singularity: runtime_environment.singularity
     }
@@ -897,6 +919,8 @@ task add_norm {
         File hic
         Array[String] normalization_methods = []
         Int quality
         Int num_cpus = 24
+        Int disk_size_gb = 256
+        Int ram_gb = 72
         RuntimeEnvironment runtime_environment
     }
 
@@ -920,8 +944,8 @@ task add_norm {
 
     runtime {
         cpu : "~{num_cpus}"
-        disks: "local-disk 128 HDD"
-        memory : "72 GB"
+        disks: "local-disk ~{disk_size_gb} HDD"
+        memory : "~{ram_gb} GB"
         docker: runtime_environment.docker
         singularity: runtime_environment.singularity
     }
diff --git a/hic_pipeline/__init__.py b/hic_pipeline/__init__.py
index d785468d..d8e4391b 100644
--- a/hic_pipeline/__init__.py
+++ b/hic_pipeline/__init__.py
@@ -1,5 +1,5 @@
 __title__ = "hic-pipeline"
-__version__ = "1.12.2"
+__version__ = "1.13.0"
 __description__ = "ENCODE Hi-C uniform processing pipeline."
 __url__ = "https://github.com/ENCODE-DCC/hic-pipeline"
 __uri__ = __url__
diff --git a/make_restriction_site_locations.wdl b/make_restriction_site_locations.wdl
index efbc5813..63cf16e6 100644
--- a/make_restriction_site_locations.wdl
+++ b/make_restriction_site_locations.wdl
@@ -7,9 +7,9 @@ struct RuntimeEnvironment {
 
 workflow make_restriction_site_locations {
     meta {
-        version: "1.12.2"
-        caper_docker: "encodedcc/hic-pipeline:1.12.2"
-        caper_singularity: "docker://encodedcc/hic-pipeline:1.12.2"
+        version: "1.13.0"
+        caper_docker: "encodedcc/hic-pipeline:1.13.0"
+        caper_singularity: "docker://encodedcc/hic-pipeline:1.13.0"
     }
 
     parameter_meta {
@@ -22,8 +22,8 @@ workflow make_restriction_site_locations {
         File reference_fasta
         String assembly_name
         String restriction_enzyme
-        String docker = "encodedcc/hic-pipeline:1.12.2"
-        String singularity = "docker://encodedcc/hic-pipeline:1.12.2"
+        String docker = "encodedcc/hic-pipeline:1.13.0"
+        String singularity = "docker://encodedcc/hic-pipeline:1.13.0"
     }
 
diff --git a/megamap.wdl b/megamap.wdl
new file mode 100644
index 00000000..98f4874c
--- /dev/null
+++ b/megamap.wdl
@@ -0,0 +1,253 @@
+version 1.0
+
+import "./hic.wdl"
+
+workflow megamap {
+    meta {
+        version: "1.13.0"
+        caper_docker: "encodedcc/hic-pipeline:1.13.0"
+        caper_singularity: "docker://encodedcc/hic-pipeline:1.13.0"
+    }
+
+    input {
+        Array[File] bams
+        Array[File] hic_files
+        File? restriction_sites
+        File chrom_sizes
+        String assembly_name = "undefined"
+
+        # Parameters
+        Int quality = 30
+        Array[Int] create_hic_in_situ_resolutions = [2500000, 1000000, 500000, 250000, 100000, 50000, 25000, 10000, 5000, 2000, 1000, 500, 200, 100]
+        Array[Int] create_hic_intact_resolutions = [2500000, 1000000, 500000, 250000, 100000, 50000, 25000, 10000, 5000, 2000, 1000, 500, 200, 100, 50, 20, 10]
+        Boolean intact = true
+
+        # Resource parameters
+        Int? create_hic_num_cpus
+        Int? create_hic_ram_gb
+        Int? create_hic_juicer_tools_heap_size_gb
+        Int? create_hic_disk_size_gb
+        Int? add_norm_num_cpus
+        Int? add_norm_ram_gb
+        Int? add_norm_disk_size_gb
+        Int? create_accessibility_track_ram_gb
+        Int? create_accessibility_track_disk_size_gb
+
+        # Pipeline images
+        String docker = "encodedcc/hic-pipeline:1.13.0"
+        String singularity = "docker://encodedcc/hic-pipeline:1.13.0"
+        String delta_docker = "encodedcc/hic-pipeline:1.13.0_delta"
+        String hiccups_docker = "encodedcc/hic-pipeline:1.13.0_hiccups"
+    }
+
+    RuntimeEnvironment runtime_environment = {
+        "docker": docker,
+        "singularity": singularity
+    }
+
+    RuntimeEnvironment hiccups_runtime_environment = {
+        "docker": hiccups_docker,
+        "singularity": singularity
+    }
+
+    RuntimeEnvironment delta_runtime_environment = {
+        "docker": delta_docker,
+        "singularity": singularity
+    }
+
+    String delta_models_path = if intact then "ultimate-models" else "beta-models"
+    Array[Int] delta_resolutions = if intact then [5000, 2000, 1000] else [5000, 10000]
+    Array[Int] create_hic_resolutions = if intact then create_hic_intact_resolutions else create_hic_in_situ_resolutions
+
+    call hic.normalize_assembly_name as normalize_assembly_name { input:
+        assembly_name = assembly_name,
+        runtime_environment = runtime_environment,
+    }
+
+    call hic.merge as merge { input:
+        bams = bams,
+        runtime_environment = runtime_environment,
+    }
+
+    call hic.bam_to_pre as bam_to_pre { input:
+        bam = merge.bam,
+        quality = quality,
+        runtime_environment = runtime_environment,
+    }
+
+    call hic.create_accessibility_track as accessibility { input:
+        pre = bam_to_pre.pre,
+        chrom_sizes = chrom_sizes,
+        ram_gb = create_accessibility_track_ram_gb,
+        disk_size_gb = create_accessibility_track_disk_size_gb,
+        runtime_environment = runtime_environment,
+    }
+
+    call merge_stats_from_hic_files { input:
+        hic_files = hic_files,
+        runtime_environment = runtime_environment,
+    }
+
+    if (normalize_assembly_name.assembly_is_supported) {
+        call hic.create_hic as create_hic { input:
+            pre = bam_to_pre.pre,
+            pre_index = bam_to_pre.index,
+            restriction_sites = restriction_sites,
+            quality = quality,
+            stats = merge_stats_from_hic_files.merged_stats,
+            stats_hists = merge_stats_from_hic_files.merged_stats_hists,
+            resolutions = create_hic_resolutions,
+            assembly_name = normalize_assembly_name.normalized_assembly_name,
+            num_cpus = create_hic_num_cpus,
+            ram_gb = create_hic_ram_gb,
+            juicer_tools_heap_size_gb = create_hic_juicer_tools_heap_size_gb,
+            disk_size_gb = create_hic_disk_size_gb,
+            runtime_environment = runtime_environment,
+        }
+    }
+
+    if (!normalize_assembly_name.assembly_is_supported) {
+        call hic.create_hic as create_hic_with_chrom_sizes { input:
+            pre = bam_to_pre.pre,
+            pre_index = bam_to_pre.index,
+            restriction_sites = restriction_sites,
+            quality = quality,
+            stats = merge_stats_from_hic_files.merged_stats,
+            stats_hists = merge_stats_from_hic_files.merged_stats_hists,
+            resolutions = create_hic_resolutions,
+            assembly_name = assembly_name,
+            num_cpus = create_hic_num_cpus,
+            ram_gb = create_hic_ram_gb,
+            juicer_tools_heap_size_gb = create_hic_juicer_tools_heap_size_gb,
+            disk_size_gb = create_hic_disk_size_gb,
+            chrsz = chrom_sizes,
+            runtime_environment = runtime_environment,
+        }
+    }
+
+    File unnormalized_hic_file = select_first([
+        if (defined(create_hic.output_hic))
+        then create_hic.output_hic
+        else create_hic_with_chrom_sizes.output_hic
+    ])
+
+    call hic.add_norm as add_norm { input:
+        hic = unnormalized_hic_file,
+        quality = quality,
+        num_cpus = add_norm_num_cpus,
+        ram_gb = add_norm_ram_gb,
+        disk_size_gb = add_norm_disk_size_gb,
+        runtime_environment = runtime_environment,
+    }
+
+    call hic.arrowhead as arrowhead { input:
+        hic_file = add_norm.output_hic,
+        quality = quality,
+        runtime_environment = runtime_environment,
+    }
+
+    if (!intact) {
+        call hic.hiccups { input:
+            hic_file = add_norm.output_hic,
+            quality = quality,
+            runtime_environment = hiccups_runtime_environment,
+        }
+    }
+
+    if (intact) {
+        call hic.hiccups_2 { input:
+            hic = add_norm.output_hic,
+            quality = quality,
+            runtime_environment = hiccups_runtime_environment,
+        }
+
+        call hic.localizer as localizer_intact { input:
+            hic = add_norm.output_hic,
+            loops = hiccups_2.merged_loops,
+            quality = quality,
+            runtime_environment = runtime_environment,
+        }
+    }
+
+    call hic.create_eigenvector as create_eigenvector { input:
+        hic_file = add_norm.output_hic,
+        chrom_sizes = chrom_sizes,
+        output_filename_suffix = "_" + quality,
+        runtime_environment = runtime_environment,
+    }
+
+    call hic.create_eigenvector as create_eigenvector_10kb { input:
+        hic_file = add_norm.output_hic,
+        chrom_sizes = chrom_sizes,
+        resolution = 10000,
+        output_filename_suffix = "_" + quality,
+        runtime_environment = runtime_environment,
+    }
+
+    call hic.delta as delta { input:
+        hic = add_norm.output_hic,
+        resolutions = delta_resolutions,
+        models_path = delta_models_path,
+        runtime_environment = delta_runtime_environment,
+    }
+
+    call hic.localizer as localizer_delta { input:
+        hic = add_norm.output_hic,
+        loops = delta.loops,
+        runtime_environment = runtime_environment,
+    }
+
+    call hic.slice as slice_25kb { input:
+        hic_file = add_norm.output_hic,
+        resolution = 25000,
+        runtime_environment = runtime_environment,
+    }
+
+    call hic.slice as slice_50kb { input:
+        hic_file = add_norm.output_hic,
+        resolution = 50000,
+        runtime_environment = runtime_environment,
+    }
+
+    call hic.slice as slice_100kb { input:
+        hic_file = add_norm.output_hic,
+        resolution = 100000,
+        runtime_environment = runtime_environment,
+    }
+}
+
+
+task merge_stats_from_hic_files {
+    input {
+        Array[File] hic_files
+        Int quality = 30
+        RuntimeEnvironment runtime_environment
+    }
+
+    command <<<
+        set -euo pipefail
+        java \
+            -jar \
+            /opt/merge-stats.jar \
+            ~{"inter_" + quality} \
+            ~{sep=" " hic_files}
+        python3 \
+            "$(which jsonify_stats.py)" \
+            inter_~{quality}.txt \
+            stats_~{quality}.json
+    >>>
+
+    output {
+        File merged_stats = "inter_~{quality}.txt"
+        File merged_stats_hists = "inter_~{quality}_hists.m"
+        File stats_json = "stats_~{quality}.json"
+    }
+
+    runtime {
+        cpu : 1
+        memory: "8 GB"
+        disks: "local-disk 500 HDD"
+        docker: runtime_environment.docker
+        singularity: runtime_environment.singularity
+    }
+}
diff --git a/scripts/make_megamap_input_json_from_portal.py b/scripts/make_megamap_input_json_from_portal.py
new file mode 100644
index 00000000..5f78046e
--- /dev/null
+++ b/scripts/make_megamap_input_json_from_portal.py
@@ -0,0 +1,136 @@
+import argparse
+import json
+from pathlib import Path
+from urllib.parse import urljoin
+
+import requests
+
+_PORTAL_URL = "https://www.encodeproject.org"
+_ALLOWED_STATUSES = ("released", "in progress")
+_REFERENCE_FILES = {
+    "chrom_sizes": urljoin(
+        _PORTAL_URL,
+        "/files/GRCh38_EBV.chrom.sizes/@@download/GRCh38_EBV.chrom.sizes.tsv",
+    ),
+}
+
+
+def main():
+    parser = _get_parser()
+    args = parser.parse_args()
+    auth = _read_auth_from_file(args.keypair_file)
+    bams, hic_files = _get_bams_and_hic_files_from_experiment(
+        accessions=args.accessions,
+        auth=auth,
+        use_submitted_file_names=args.use_submitted_file_names,
+    )
+    assembly_name = _get_assembly_name(
+        experiment=_get_experiment(args.accessions[0], auth=auth)
+    )
+    input_json = _get_input_json(
+        bams=bams, hic_files=hic_files, assembly_name=assembly_name
+    )
+    _write_json_to_file(input_json, args.outfile)
+
+
+def _get_experiment(accession, auth=None):
+    response = requests.get(
+        urljoin(_PORTAL_URL, accession),
+        auth=auth,
+        headers={"Accept": "application/json"},
+    )
+    response.raise_for_status()
+    return response.json()
+
+
+def _get_assembly_name(experiment):
+    organism = experiment["replicates"][0]["library"]["biosample"]["organism"]["@id"]
+    if organism == "/organisms/mouse/":
+        assembly_name = "mm10"
+    elif organism == "/organisms/human/":
+        assembly_name = "GRCh38"
+    else:
+        raise ValueError(f"Organism {organism} not supported")
+    return assembly_name
+
+
+def _get_bams_and_hic_files_from_experiment(
+    accessions, auth=None, use_submitted_file_names=False
+):
+    bams = []
+    hic_files = []
+    for accession in accessions:
+        experiment = _get_experiment(accession, auth=auth)
+        analysis = None
+        for a in experiment["analyses"]:
+            if (
+                a["status"] in _ALLOWED_STATUSES
+                and a["lab"] == "/labs/encode-processing-pipeline/"
+            ):
+                analysis = a
+                break
+        if analysis is None:
+            raise ValueError("Could not find uniform processing pipeline analysis")
+        for file in experiment["files"]:
+            if file["status"] in _ALLOWED_STATUSES and file["@id"] in analysis["files"]:
+                if file["file_format"] == "bam":
+                    if use_submitted_file_names:
+                        bams.append(file["submitted_file_name"])
+                    else:
+                        bams.append(urljoin(_PORTAL_URL, file["href"]))
+                if (
+                    file["output_type"]
+                    == "mapping quality thresholded chromatin interactions"
+                ):
+                    if use_submitted_file_names:
+                        hic_files.append(file["submitted_file_name"])
+                    else:
+                        hic_files.append(urljoin(_PORTAL_URL, file["href"]))
+    return bams, hic_files
+
+
+def _get_input_json(bams, hic_files, assembly_name):
+    input_json = {
+        "megamap.bams": bams,
+        "megamap.hic_files": hic_files,
+        "megamap.chrom_sizes": _REFERENCE_FILES["chrom_sizes"],
+        "megamap.assembly_name": assembly_name,
+    }
+    return input_json
+
+
+def _write_json_to_file(data, outfile):
+    Path(outfile).write_text(json.dumps(data, indent=2, sort_keys=True))
+
+
+def _read_auth_from_file(keypair_file):
+    keypair_path = Path(keypair_file).expanduser()
+    if keypair_path.exists():
+        data = json.loads(keypair_path.read_text())
+        return (data["submit"]["key"], data["submit"]["secret"])
+    else:
+        return None
+
+
+def _get_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("outfile")
+    parser.add_argument(
+        "-a",
+        "--accessions",
+        nargs="+",
+        help="Experiments to pool for megamap",
+        required=True,
+    )
+    parser.add_argument(
+        "--keypair-file", help="Path to keypairs.json", default="~/keypairs.json"
+    )
+    parser.add_argument(
+        "-u",
+        "--use-submitted-file-names",
+        help="Use submitted file names (gs:// uris) for bams to avoid massive file download",
+        action="store_true",
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    main()
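
Usage sketch (the ENCSR accessions below are placeholders, and the Caper
invocation assumes the same setup used for the existing hic.wdl workflows):

    # Build the megamap input JSON from pooled portal experiments. The outfile
    # comes first so argparse does not fold it into the --accessions list;
    # --use-submitted-file-names keeps gs:// URIs so bams are not downloaded.
    python scripts/make_megamap_input_json_from_portal.py \
        megamap_input.json \
        --accessions ENCSR000AAA ENCSR000BBB \
        --use-submitted-file-names

    # Run the megamap workflow on the generated input JSON.
    caper run megamap.wdl -i megamap_input.json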