From 3338e6eae38213dd8711d6a7d4d77b9640c6b40a Mon Sep 17 00:00:00 2001 From: Paul Sud <41386393+paul-sud@users.noreply.github.com> Date: Fri, 11 Mar 2022 11:50:18 -0800 Subject: [PATCH] PIPE-49-diploidify (#145) --- diploidify.wdl | 221 ++++++++++++++++++++++++ docker/hic-pipeline/Dockerfile | 12 +- genophase.wdl | 10 +- hic.wdl | 20 ++- hic_pipeline/__init__.py | 2 +- hic_pipeline/filter_chrom_sizes.py | 29 ++++ make_restriction_site_locations.wdl | 10 +- megamap.wdl | 14 +- tests/python/test_filter_chrom_sizes.py | 21 +++ 9 files changed, 310 insertions(+), 29 deletions(-) create mode 100644 diploidify.wdl create mode 100755 hic_pipeline/filter_chrom_sizes.py create mode 100644 tests/python/test_filter_chrom_sizes.py diff --git a/diploidify.wdl b/diploidify.wdl new file mode 100644 index 00000000..5cbc5715 --- /dev/null +++ b/diploidify.wdl @@ -0,0 +1,221 @@ +version 1.0 + +import "./hic.wdl" + +workflow diploidify { + meta { + version: "1.14.0" + caper_docker: "encodedcc/hic-pipeline:1.14.0" + caper_singularity: "docker://encodedcc/hic-pipeline:1.14.0" + } + + input { + Array[File] bams + # This is from genophase, snp.out_HiC.vcf.gz + File vcf + File chrom_sizes + + # Parameters + Int quality = 30 + Array[Int] create_diploid_hic_resolutions = [2500000, 1000000, 500000, 250000, 100000, 50000, 25000, 10000, 5000, 2000, 1000, 500, 200, 100, 50, 20, 10] + + # Resource params, specify to override the defaults + Int? merge_num_cpus + Int? merge_ram_gb + Int? merge_disk_size_gb + Int? prepare_bam_num_cpus + Int? prepare_bam_ram_gb + Int? prepare_bam_disk_size_gb + Int? create_diploid_hic_num_cpus + Int? create_diploid_hic_ram_gb + Int? create_diploid_hic_disk_size_gb + Int? create_diploid_dhs_num_cpus + Int? create_diploid_dhs_ram_gb + Int? create_diploid_dhs_disk_size_gb + + String docker = "encodedcc/hic-pipeline:1.14.0" + String singularity = "docker://encodedcc/hic-pipeline:1.14.0" + } + + RuntimeEnvironment runtime_environment = { + "docker": docker, + "singularity": singularity + } + + call filter_chrom_sizes { input: + chrom_sizes = chrom_sizes, + } + + call hic.merge as merge { input: + bams = bams, + num_cpus = merge_num_cpus, + ram_gb = merge_ram_gb, + disk_size_gb = merge_disk_size_gb, + runtime_environment = runtime_environment, + } + + call prepare_bam { input: + bam = merge.bam, + quality = quality, + chrom_sizes = chrom_sizes, + num_cpus = prepare_bam_num_cpus, + ram_gb = prepare_bam_ram_gb, + disk_size_gb = prepare_bam_disk_size_gb, + } + + call create_diploid_hic { input: + bam = prepare_bam.filtered_bam, + bam_index = prepare_bam.bam_index, + vcf = vcf, + chrom_sizes = filter_chrom_sizes.filtered_chrom_sizes, + resolutions = create_diploid_hic_resolutions, + num_cpus = create_diploid_hic_num_cpus, + ram_gb = create_diploid_hic_ram_gb, + disk_size_gb = create_diploid_hic_disk_size_gb, + } + + call create_diploid_dhs { input: + bam = prepare_bam.filtered_bam, + bam_index = prepare_bam.bam_index, + chrom_sizes = chrom_sizes, + reads_to_homologs = create_diploid_hic.reads_to_homologs, + num_cpus = create_diploid_dhs_num_cpus, + ram_gb = create_diploid_dhs_ram_gb, + disk_size_gb = create_diploid_dhs_disk_size_gb, + } +} + +task filter_chrom_sizes { + input { + File chrom_sizes + String output_filename = "filtered.chrom.sizes" + } + + command <<< + python3 "$(which filter_chrom_sizes.py)" ~{chrom_sizes} ~{output_filename} + >>> + + output { + File filtered_chrom_sizes = output_filename + } +} + +task prepare_bam { + input { + File bam + File chrom_sizes + Int quality + Int num_cpus = 8 + Int ram_gb = 64 + Int disk_size_gb = 2000 + } + + command <<< + export CHROM_SIZES_FILENAME="assembly.chrom.sizes" + mv ~{chrom_sizes} $CHROM_SIZES_FILENAME + bash /opt/juicer/CPU/diploidify.sh \ + --from-stage prep \ + --to-stage prep \ + --chrom-sizes $CHROM_SIZES_FILENAME \ + --mapq ~{quality} \ + --juicer-dir /opt \ + --phaser-dir /opt/3d-dna \ + ~{bam} + >>> + + output { + File filtered_bam = "reads.sorted.bam" + File bam_index = "reads.sorted.bam.bai" + } + + runtime { + cpu: num_cpus + memory: "~{ram_gb} GB" + disks: "local-disk ~{disk_size_gb} HDD" + } +} + + +task create_diploid_hic { + input { + File bam + File bam_index + File vcf + File chrom_sizes + Array[Int] resolutions + Int num_cpus = 24 + Int ram_gb = 128 + Int disk_size_gb = 2000 + } + + command <<< + mv ~{bam} "reads.sorted.bam" + mv ~{bam_index} "reads.sorted.bam.bai" + export CHROM_SIZES_FILENAME="assembly.chrom.sizes" + mv ~{chrom_sizes} $CHROM_SIZES_FILENAME + export VCF_FILENAME="snp.vcf" + gzip -dc ~{vcf} > $VCF_FILENAME + bash /opt/juicer/CPU/diploidify.sh \ + --from-stage hic \ + --to-stage hic \ + --vcf $VCF_FILENAME \ + --chrom-sizes $CHROM_SIZES_FILENAME \ + --resolutions ~{sep="," resolutions} \ + --threads-hic ~{num_cpus} \ + --juicer-dir /opt \ + --phaser-dir /opt/3d-dna + >>> + + output { + # r = reference, a = alternate haplotype but it's arbitrary + File hic_r = "diploid_inter_r.hic" + File hic_a = "diploid_inter_a.hic" + File reads_to_homologs = "reads_to_homologs.txt" + } + + runtime { + cpu: num_cpus + memory: "~{ram_gb} GB" + disks: "local-disk ~{disk_size_gb} HDD" + } +} + +task create_diploid_dhs { + input { + File bam + File bam_index + File chrom_sizes + File reads_to_homologs + Int num_cpus = 2 + Int ram_gb = 128 + Int disk_size_gb = 1000 + } + + command <<< + mv ~{bam} "reads.sorted.bam" + mv ~{bam_index} "reads.sorted.bam.bai" + export CHROM_SIZES_FILENAME="assembly.chrom.sizes" + mv ~{chrom_sizes} $CHROM_SIZES_FILENAME + bash /opt/juicer/CPU/diploidify.sh \ + --from-stage dhs \ + --to-stage dhs \ + --chrom-sizes $CHROM_SIZES_FILENAME \ + --reads-to-homologs ~{reads_to_homologs} \ + --juicer-dir /opt \ + --phaser-dir /opt/3d-dna + >>> + + output { + # r = reference, a = alternate haplotype but it's arbitrary + File bigwig_raw_r = "diploid_inter_raw_r.bw" + File bigwig_raw_a = "diploid_inter_raw_a.bw" + File bigwig_corrected_r = "diploid_inter_corrected_r.bw" + File bigwig_corrected_a = "diploid_inter_corrected_a.bw" + } + + runtime { + cpu: num_cpus + memory: "~{ram_gb} GB" + disks: "local-disk ~{disk_size_gb} HDD" + } +} diff --git a/docker/hic-pipeline/Dockerfile b/docker/hic-pipeline/Dockerfile index 572e7496..d4e76ad8 100644 --- a/docker/hic-pipeline/Dockerfile +++ b/docker/hic-pipeline/Dockerfile @@ -38,6 +38,12 @@ RUN ln -s /usr/bin/python3 /usr/bin/python # Need to be sure we have this for stats RUN locale-gen en_US.UTF-8 +# Fix warning for diploidify +# https://www.educative.io/edpresso/error-mesg-ttyname-failed-inappropriate-ioctl-for-device +# Need to escape the &s +# https://unix.stackexchange.com/questions/32907/what-characters-do-i-need-to-escape-when-using-sed-in-a-sh-script +RUN sed -i 's/mesg n || true/tty -s \&\& mesg n/' /root/.profile + WORKDIR /opt/ # Install BWA @@ -105,7 +111,7 @@ RUN git clone https://github.com/ENCODE-DCC/kentUtils_bin_v381.git && \ # Install Juicer RUN git clone --branch encode https://github.com/theaidenlab/juicer.git && \ cd juicer && \ - git checkout 7b21fd620ee1f07266206caa2a7992d08d51ba8e && \ + git checkout 50d557f1d4725a475071fce5975839602bd311e5 && \ chmod +x CPU/* CPU/common/* misc/* && \ find -mindepth 1 -maxdepth 1 -type d -not -name "CPU" -not -name ".git" -not -name "misc" | xargs rm -rf @@ -115,7 +121,9 @@ RUN curl \ https://github.com/aidenlab/Juicebox/releases/download/v2.13.06/juicer_tools_2.13.06.jar \ -o /opt/juicer/CPU/common/juicer_tools.jar && \ chmod 666 /opt/juicer/CPU/common/juicer_tools.jar && \ - ln -s juicer/CPU scripts + ln -s juicer/CPU scripts && \ + ln -s /opt/juicer/CPU/common/juicer_tools /opt/juicer/CPU/juicer_tools && \ + ln -s /opt/juicer/CPU/common/juicer_tools.jar /opt/juicer/CPU/juicer_tools.jar RUN curl \ -LO \ diff --git a/genophase.wdl b/genophase.wdl index c103e387..b1f392a7 100644 --- a/genophase.wdl +++ b/genophase.wdl @@ -4,9 +4,9 @@ import "./hic.wdl" workflow genophase { meta { - version: "1.13.0" - caper_docker: "encodedcc/hic-pipeline:1.13.0" - caper_singularity: "docker://encodedcc/hic-pipeline:1.13.0" + version: "1.14.0" + caper_docker: "encodedcc/hic-pipeline:1.14.0" + caper_singularity: "docker://encodedcc/hic-pipeline:1.14.0" croo_out_def: "https://raw.githubusercontent.com/ENCODE-DCC/hic-pipeline/dev/croo_out_def.json" } @@ -25,8 +25,8 @@ workflow genophase { Int? run_3d_dna_ram_gb Boolean no_phasing = false - String docker = "encodedcc/hic-pipeline:1.13.0" - String singularity = "docker://encodedcc/hic-pipeline:1.13.0" + String docker = "encodedcc/hic-pipeline:1.14.0" + String singularity = "docker://encodedcc/hic-pipeline:1.14.0" } RuntimeEnvironment runtime_environment = { diff --git a/hic.wdl b/hic.wdl index 36e7ceba..5e7a5df8 100644 --- a/hic.wdl +++ b/hic.wdl @@ -19,9 +19,9 @@ struct RuntimeEnvironment { workflow hic { meta { - version: "1.13.0" - caper_docker: "encodedcc/hic-pipeline:1.13.0" - caper_singularity: "docker://encodedcc/hic-pipeline:1.13.0" + version: "1.14.0" + caper_docker: "encodedcc/hic-pipeline:1.14.0" + caper_singularity: "docker://encodedcc/hic-pipeline:1.14.0" croo_out_def: "https://raw.githubusercontent.com/ENCODE-DCC/hic-pipeline/dev/croo_out_def.json" description: "ENCODE Hi-C pipeline, see https://github.com/ENCODE-DCC/hic-pipeline for details." } @@ -77,10 +77,10 @@ workflow hic { Int? create_accessibility_track_disk_size_gb String assembly_name = "undefined" - String docker = "encodedcc/hic-pipeline:1.13.0" - String singularity = "docker://encodedcc/hic-pipeline:1.13.0" - String delta_docker = "encodedcc/hic-pipeline:1.13.0_delta" - String hiccups_docker = "encodedcc/hic-pipeline:1.13.0_hiccups" + String docker = "encodedcc/hic-pipeline:1.14.0" + String singularity = "docker://encodedcc/hic-pipeline:1.14.0" + String delta_docker = "encodedcc/hic-pipeline:1.14.0_delta" + String hiccups_docker = "encodedcc/hic-pipeline:1.14.0_hiccups" } RuntimeEnvironment runtime_environment = { @@ -654,6 +654,8 @@ task merge { input { Array[File] bams Int num_cpus = 8 + Int ram_gb = 16 + Int disk_size_gb = 6000 String output_bam_filename = "merged" RuntimeEnvironment runtime_environment } @@ -675,8 +677,8 @@ task merge { runtime { cpu : "~{num_cpus}" - memory: "16 GB" - disks: "local-disk 6000 HDD" + memory: "~{ram_gb} GB" + disks: "local-disk ~{disk_size_gb} HDD" docker: runtime_environment.docker singularity: runtime_environment.singularity } diff --git a/hic_pipeline/__init__.py b/hic_pipeline/__init__.py index d8e4391b..7b1330b1 100644 --- a/hic_pipeline/__init__.py +++ b/hic_pipeline/__init__.py @@ -1,5 +1,5 @@ __title__ = "hic-pipeline" -__version__ = "1.13.0" +__version__ = "1.14.0" __description__ = "ENCODE Hi-C uniform processing pipeline." __url__ = "https://github.com/ENCODE-DCC/hic-pipeline" __uri__ = __url__ diff --git a/hic_pipeline/filter_chrom_sizes.py b/hic_pipeline/filter_chrom_sizes.py new file mode 100755 index 00000000..ac3c78be --- /dev/null +++ b/hic_pipeline/filter_chrom_sizes.py @@ -0,0 +1,29 @@ +import argparse + + +def main(): + parser = get_parser() + args = parser.parse_args() + with open(args.chrom_sizes) as chrom_sizes, open( + args.output_file, "w" + ) as output_file: + filter_chrom_sizes(chrom_sizes=chrom_sizes, output_file=output_file) + + +def filter_chrom_sizes(chrom_sizes, output_file): + for line in chrom_sizes: + chrom_name = line.split()[0] + if "_" in chrom_name or chrom_name == "chrEBV": + continue + output_file.write(line) + + +def get_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("chrom_sizes") + parser.add_argument("output_file") + return parser + + +if __name__ == "__main__": + main() diff --git a/make_restriction_site_locations.wdl b/make_restriction_site_locations.wdl index 63cf16e6..ad13ecaf 100644 --- a/make_restriction_site_locations.wdl +++ b/make_restriction_site_locations.wdl @@ -7,9 +7,9 @@ struct RuntimeEnvironment { workflow make_restriction_site_locations { meta { - version: "1.13.0" - caper_docker: "encodedcc/hic-pipeline:1.13.0" - caper_singularity: "docker://encodedcc/hic-pipeline:1.13.0" + version: "1.14.0" + caper_docker: "encodedcc/hic-pipeline:1.14.0" + caper_singularity: "docker://encodedcc/hic-pipeline:1.14.0" } parameter_meta { @@ -22,8 +22,8 @@ workflow make_restriction_site_locations { File reference_fasta String assembly_name String restriction_enzyme - String docker = "encodedcc/hic-pipeline:1.13.0" - String singularity = "docker://encodedcc/hic-pipeline:1.13.0" + String docker = "encodedcc/hic-pipeline:1.14.0" + String singularity = "docker://encodedcc/hic-pipeline:1.14.0" } diff --git a/megamap.wdl b/megamap.wdl index 98f4874c..6dcf33a3 100644 --- a/megamap.wdl +++ b/megamap.wdl @@ -4,9 +4,9 @@ import "./hic.wdl" workflow megamap { meta { - version: "1.13.0" - caper_docker: "encodedcc/hic-pipeline:1.13.0" - caper_singularity: "docker://encodedcc/hic-pipeline:1.13.0" + version: "1.14.0" + caper_docker: "encodedcc/hic-pipeline:1.14.0" + caper_singularity: "docker://encodedcc/hic-pipeline:1.14.0" } input { @@ -34,10 +34,10 @@ workflow megamap { Int? create_accessibility_track_disk_size_gb # Pipeline images - String docker = "encodedcc/hic-pipeline:1.13.0" - String singularity = "docker://encodedcc/hic-pipeline:1.13.0" - String delta_docker = "encodedcc/hic-pipeline:1.13.0_delta" - String hiccups_docker = "encodedcc/hic-pipeline:1.13.0_hiccups" + String docker = "encodedcc/hic-pipeline:1.14.0" + String singularity = "docker://encodedcc/hic-pipeline:1.14.0" + String delta_docker = "encodedcc/hic-pipeline:1.14.0_delta" + String hiccups_docker = "encodedcc/hic-pipeline:1.14.0_hiccups" } RuntimeEnvironment runtime_environment = { diff --git a/tests/python/test_filter_chrom_sizes.py b/tests/python/test_filter_chrom_sizes.py new file mode 100644 index 00000000..4c25565e --- /dev/null +++ b/tests/python/test_filter_chrom_sizes.py @@ -0,0 +1,21 @@ +import textwrap +from io import StringIO + +from hic_pipeline.filter_chrom_sizes import filter_chrom_sizes + + +def test_filter_chrom_sizes(): + chrom_sizes_data = textwrap.dedent( + """\ + chr1\t248956422 + chrM\t16569 + chr1_GL383518v1_alt\t182439 + chr1_KI270707v1_random\t32032 + chrUn_GL000195v1\t182896 + chrEBV\t171823 + """ + ) + chrom_sizes = StringIO(initial_value=chrom_sizes_data) + output_file = StringIO() + filter_chrom_sizes(chrom_sizes, output_file) + assert output_file.getvalue() == "chr1\t248956422\nchrM\t16569\n"