PIPE-49-diploidify (#145)
paul-sud authored Mar 11, 2022
1 parent 4f8ffe5 commit 3338e6e
Showing 9 changed files with 310 additions and 29 deletions.
221 changes: 221 additions & 0 deletions diploidify.wdl
@@ -0,0 +1,221 @@
version 1.0

import "./hic.wdl"

workflow diploidify {
    meta {
        version: "1.14.0"
        caper_docker: "encodedcc/hic-pipeline:1.14.0"
        caper_singularity: "docker://encodedcc/hic-pipeline:1.14.0"
    }

    input {
        Array[File] bams
        # This is from genophase, snp.out_HiC.vcf.gz
        File vcf
        File chrom_sizes

        # Parameters
        Int quality = 30
        Array[Int] create_diploid_hic_resolutions = [2500000, 1000000, 500000, 250000, 100000, 50000, 25000, 10000, 5000, 2000, 1000, 500, 200, 100, 50, 20, 10]

        # Resource params, specify to override the defaults
        Int? merge_num_cpus
        Int? merge_ram_gb
        Int? merge_disk_size_gb
        Int? prepare_bam_num_cpus
        Int? prepare_bam_ram_gb
        Int? prepare_bam_disk_size_gb
        Int? create_diploid_hic_num_cpus
        Int? create_diploid_hic_ram_gb
        Int? create_diploid_hic_disk_size_gb
        Int? create_diploid_dhs_num_cpus
        Int? create_diploid_dhs_ram_gb
        Int? create_diploid_dhs_disk_size_gb

        String docker = "encodedcc/hic-pipeline:1.14.0"
        String singularity = "docker://encodedcc/hic-pipeline:1.14.0"
    }

    RuntimeEnvironment runtime_environment = {
        "docker": docker,
        "singularity": singularity
    }

    call filter_chrom_sizes { input:
        chrom_sizes = chrom_sizes,
    }

    call hic.merge as merge { input:
        bams = bams,
        num_cpus = merge_num_cpus,
        ram_gb = merge_ram_gb,
        disk_size_gb = merge_disk_size_gb,
        runtime_environment = runtime_environment,
    }
    call prepare_bam { input:
        bam = merge.bam,
        quality = quality,
        chrom_sizes = chrom_sizes,
        num_cpus = prepare_bam_num_cpus,
        ram_gb = prepare_bam_ram_gb,
        disk_size_gb = prepare_bam_disk_size_gb,
    }

    call create_diploid_hic { input:
        bam = prepare_bam.filtered_bam,
        bam_index = prepare_bam.bam_index,
        vcf = vcf,
        chrom_sizes = filter_chrom_sizes.filtered_chrom_sizes,
        resolutions = create_diploid_hic_resolutions,
        num_cpus = create_diploid_hic_num_cpus,
        ram_gb = create_diploid_hic_ram_gb,
        disk_size_gb = create_diploid_hic_disk_size_gb,
    }

    call create_diploid_dhs { input:
        bam = prepare_bam.filtered_bam,
        bam_index = prepare_bam.bam_index,
        chrom_sizes = chrom_sizes,
        reads_to_homologs = create_diploid_hic.reads_to_homologs,
        num_cpus = create_diploid_dhs_num_cpus,
        ram_gb = create_diploid_dhs_ram_gb,
        disk_size_gb = create_diploid_dhs_disk_size_gb,
    }
}

task filter_chrom_sizes {
    input {
        File chrom_sizes
        String output_filename = "filtered.chrom.sizes"
    }

    command <<<
        python3 "$(which filter_chrom_sizes.py)" ~{chrom_sizes} ~{output_filename}
    >>>

    output {
        File filtered_chrom_sizes = output_filename
    }
}

task prepare_bam {
    input {
        File bam
        File chrom_sizes
        Int quality
        Int num_cpus = 8
        Int ram_gb = 64
        Int disk_size_gb = 2000
    }

    command <<<
        export CHROM_SIZES_FILENAME="assembly.chrom.sizes"
        mv ~{chrom_sizes} $CHROM_SIZES_FILENAME
        bash /opt/juicer/CPU/diploidify.sh \
            --from-stage prep \
            --to-stage prep \
            --chrom-sizes $CHROM_SIZES_FILENAME \
            --mapq ~{quality} \
            --juicer-dir /opt \
            --phaser-dir /opt/3d-dna \
            ~{bam}
    >>>

    output {
        File filtered_bam = "reads.sorted.bam"
        File bam_index = "reads.sorted.bam.bai"
    }

    runtime {
        cpu: num_cpus
        memory: "~{ram_gb} GB"
        disks: "local-disk ~{disk_size_gb} HDD"
    }
}


task create_diploid_hic {
    input {
        File bam
        File bam_index
        File vcf
        File chrom_sizes
        Array[Int] resolutions
        Int num_cpus = 24
        Int ram_gb = 128
        Int disk_size_gb = 2000
    }

    command <<<
        mv ~{bam} "reads.sorted.bam"
        mv ~{bam_index} "reads.sorted.bam.bai"
        export CHROM_SIZES_FILENAME="assembly.chrom.sizes"
        mv ~{chrom_sizes} $CHROM_SIZES_FILENAME
        export VCF_FILENAME="snp.vcf"
        gzip -dc ~{vcf} > $VCF_FILENAME
        bash /opt/juicer/CPU/diploidify.sh \
            --from-stage hic \
            --to-stage hic \
            --vcf $VCF_FILENAME \
            --chrom-sizes $CHROM_SIZES_FILENAME \
            --resolutions ~{sep="," resolutions} \
            --threads-hic ~{num_cpus} \
            --juicer-dir /opt \
            --phaser-dir /opt/3d-dna
    >>>

    output {
        # r = reference haplotype, a = alternate haplotype; the assignment is arbitrary
        File hic_r = "diploid_inter_r.hic"
        File hic_a = "diploid_inter_a.hic"
        File reads_to_homologs = "reads_to_homologs.txt"
    }

    runtime {
        cpu: num_cpus
        memory: "~{ram_gb} GB"
        disks: "local-disk ~{disk_size_gb} HDD"
    }
}

task create_diploid_dhs {
    input {
        File bam
        File bam_index
        File chrom_sizes
        File reads_to_homologs
        Int num_cpus = 2
        Int ram_gb = 128
        Int disk_size_gb = 1000
    }

    command <<<
        mv ~{bam} "reads.sorted.bam"
        mv ~{bam_index} "reads.sorted.bam.bai"
        export CHROM_SIZES_FILENAME="assembly.chrom.sizes"
        mv ~{chrom_sizes} $CHROM_SIZES_FILENAME
        bash /opt/juicer/CPU/diploidify.sh \
            --from-stage dhs \
            --to-stage dhs \
            --chrom-sizes $CHROM_SIZES_FILENAME \
            --reads-to-homologs ~{reads_to_homologs} \
            --juicer-dir /opt \
            --phaser-dir /opt/3d-dna
    >>>

    output {
        # r = reference haplotype, a = alternate haplotype; the assignment is arbitrary
        File bigwig_raw_r = "diploid_inter_raw_r.bw"
        File bigwig_raw_a = "diploid_inter_raw_a.bw"
        File bigwig_corrected_r = "diploid_inter_corrected_r.bw"
        File bigwig_corrected_a = "diploid_inter_corrected_a.bw"
    }

    runtime {
        cpu: num_cpus
        memory: "~{ram_gb} GB"
        disks: "local-disk ~{disk_size_gb} HDD"
    }
}
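For reference, the new workflow takes a set of Hi-C BAMs, the phased VCF produced by genophase, and a chrom.sizes file. A minimal sketch of assembling an input JSON for it in Python (only the required inputs are shown; the file paths are illustrative placeholders):

import json

# Required inputs for the diploidify workflow; paths are placeholders.
inputs = {
    "diploidify.bams": ["alignments/rep1.bam", "alignments/rep2.bam"],
    # Phased VCF from the genophase workflow (snp.out_HiC.vcf.gz)
    "diploidify.vcf": "genophase/snp.out_HiC.vcf.gz",
    "diploidify.chrom_sizes": "reference/GRCh38.chrom.sizes",
}

with open("diploidify_input.json", "w") as f:
    json.dump(inputs, f, indent=4)

Optional inputs such as diploidify.quality and the per-task resource overrides can be added to the same dictionary using the names declared in the workflow's input block above.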
12 changes: 10 additions & 2 deletions docker/hic-pipeline/Dockerfile
@@ -38,6 +38,12 @@ RUN ln -s /usr/bin/python3 /usr/bin/python
# Need to be sure we have this for stats
RUN locale-gen en_US.UTF-8

+# Fix warning for diploidify
+# https://www.educative.io/edpresso/error-mesg-ttyname-failed-inappropriate-ioctl-for-device
+# Need to escape the &s
+# https://unix.stackexchange.com/questions/32907/what-characters-do-i-need-to-escape-when-using-sed-in-a-sh-script
+RUN sed -i 's/mesg n || true/tty -s \&\& mesg n/' /root/.profile
+
WORKDIR /opt/

# Install BWA
@@ -105,7 +111,7 @@ RUN git clone https://github.com/ENCODE-DCC/kentUtils_bin_v381.git && \
# Install Juicer
RUN git clone --branch encode https://github.com/theaidenlab/juicer.git && \
    cd juicer && \
-    git checkout 7b21fd620ee1f07266206caa2a7992d08d51ba8e && \
+    git checkout 50d557f1d4725a475071fce5975839602bd311e5 && \
    chmod +x CPU/* CPU/common/* misc/* && \
    find -mindepth 1 -maxdepth 1 -type d -not -name "CPU" -not -name ".git" -not -name "misc" | xargs rm -rf

@@ -115,7 +121,9 @@ RUN curl \
    https://github.com/aidenlab/Juicebox/releases/download/v2.13.06/juicer_tools_2.13.06.jar \
    -o /opt/juicer/CPU/common/juicer_tools.jar && \
    chmod 666 /opt/juicer/CPU/common/juicer_tools.jar && \
-    ln -s juicer/CPU scripts
+    ln -s juicer/CPU scripts && \
+    ln -s /opt/juicer/CPU/common/juicer_tools /opt/juicer/CPU/juicer_tools && \
+    ln -s /opt/juicer/CPU/common/juicer_tools.jar /opt/juicer/CPU/juicer_tools.jar

RUN curl \
-LO \
10 changes: 5 additions & 5 deletions genophase.wdl
@@ -4,9 +4,9 @@ import "./hic.wdl"

workflow genophase {
    meta {
-        version: "1.13.0"
-        caper_docker: "encodedcc/hic-pipeline:1.13.0"
-        caper_singularity: "docker://encodedcc/hic-pipeline:1.13.0"
+        version: "1.14.0"
+        caper_docker: "encodedcc/hic-pipeline:1.14.0"
+        caper_singularity: "docker://encodedcc/hic-pipeline:1.14.0"
        croo_out_def: "https://raw.githubusercontent.com/ENCODE-DCC/hic-pipeline/dev/croo_out_def.json"
    }

@@ -25,8 +25,8 @@ workflow genophase {
        Int? run_3d_dna_ram_gb
        Boolean no_phasing = false

-        String docker = "encodedcc/hic-pipeline:1.13.0"
-        String singularity = "docker://encodedcc/hic-pipeline:1.13.0"
+        String docker = "encodedcc/hic-pipeline:1.14.0"
+        String singularity = "docker://encodedcc/hic-pipeline:1.14.0"
    }

    RuntimeEnvironment runtime_environment = {
20 changes: 11 additions & 9 deletions hic.wdl
@@ -19,9 +19,9 @@ struct RuntimeEnvironment {

workflow hic {
    meta {
-        version: "1.13.0"
-        caper_docker: "encodedcc/hic-pipeline:1.13.0"
-        caper_singularity: "docker://encodedcc/hic-pipeline:1.13.0"
+        version: "1.14.0"
+        caper_docker: "encodedcc/hic-pipeline:1.14.0"
+        caper_singularity: "docker://encodedcc/hic-pipeline:1.14.0"
        croo_out_def: "https://raw.githubusercontent.com/ENCODE-DCC/hic-pipeline/dev/croo_out_def.json"
        description: "ENCODE Hi-C pipeline, see https://github.com/ENCODE-DCC/hic-pipeline for details."
    }
@@ -77,10 +77,10 @@ workflow hic {
        Int? create_accessibility_track_disk_size_gb
        String assembly_name = "undefined"

-        String docker = "encodedcc/hic-pipeline:1.13.0"
-        String singularity = "docker://encodedcc/hic-pipeline:1.13.0"
-        String delta_docker = "encodedcc/hic-pipeline:1.13.0_delta"
-        String hiccups_docker = "encodedcc/hic-pipeline:1.13.0_hiccups"
+        String docker = "encodedcc/hic-pipeline:1.14.0"
+        String singularity = "docker://encodedcc/hic-pipeline:1.14.0"
+        String delta_docker = "encodedcc/hic-pipeline:1.14.0_delta"
+        String hiccups_docker = "encodedcc/hic-pipeline:1.14.0_hiccups"
    }

    RuntimeEnvironment runtime_environment = {
@@ -654,6 +654,8 @@ task merge {
    input {
        Array[File] bams
        Int num_cpus = 8
+        Int ram_gb = 16
+        Int disk_size_gb = 6000
        String output_bam_filename = "merged"
        RuntimeEnvironment runtime_environment
    }
@@ -675,8 +677,8 @@

    runtime {
        cpu : "~{num_cpus}"
-        memory: "16 GB"
-        disks: "local-disk 6000 HDD"
+        memory: "~{ram_gb} GB"
+        disks: "local-disk ~{disk_size_gb} HDD"
        docker: runtime_environment.docker
        singularity: runtime_environment.singularity
    }
2 changes: 1 addition & 1 deletion hic_pipeline/__init__.py
@@ -1,5 +1,5 @@
__title__ = "hic-pipeline"
__version__ = "1.13.0"
__version__ = "1.14.0"
__description__ = "ENCODE Hi-C uniform processing pipeline."
__url__ = "https://github.com/ENCODE-DCC/hic-pipeline"
__uri__ = __url__
29 changes: 29 additions & 0 deletions hic_pipeline/filter_chrom_sizes.py
@@ -0,0 +1,29 @@
import argparse


def main():
    parser = get_parser()
    args = parser.parse_args()
    with open(args.chrom_sizes) as chrom_sizes, open(
        args.output_file, "w"
    ) as output_file:
        filter_chrom_sizes(chrom_sizes=chrom_sizes, output_file=output_file)


def filter_chrom_sizes(chrom_sizes, output_file):
    for line in chrom_sizes:
        chrom_name = line.split()[0]
        if "_" in chrom_name or chrom_name == "chrEBV":
            continue
        output_file.write(line)


def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("chrom_sizes")
    parser.add_argument("output_file")
    return parser


if __name__ == "__main__":
    main()
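The filtering drops contigs whose names contain an underscore (alternate and unplaced scaffolds) as well as chrEBV. A small sketch exercising the function in memory, assuming the hic_pipeline package is importable; the chromosome names and sizes are illustrative:

import io

from hic_pipeline.filter_chrom_sizes import filter_chrom_sizes

chrom_sizes = io.StringIO(
    "chr1\t248956422\n"
    "chr1_KI270706v1_random\t175055\n"
    "chrEBV\t171823\n"
)
output = io.StringIO()
filter_chrom_sizes(chrom_sizes=chrom_sizes, output_file=output)
print(output.getvalue())  # only the chr1 line is kept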
10 changes: 5 additions & 5 deletions make_restriction_site_locations.wdl
@@ -7,9 +7,9 @@ struct RuntimeEnvironment {

workflow make_restriction_site_locations {
    meta {
-        version: "1.13.0"
-        caper_docker: "encodedcc/hic-pipeline:1.13.0"
-        caper_singularity: "docker://encodedcc/hic-pipeline:1.13.0"
+        version: "1.14.0"
+        caper_docker: "encodedcc/hic-pipeline:1.14.0"
+        caper_singularity: "docker://encodedcc/hic-pipeline:1.14.0"
    }

    parameter_meta {
@@ -22,8 +22,8 @@ workflow make_restriction_site_locations {
        File reference_fasta
        String assembly_name
        String restriction_enzyme
-        String docker = "encodedcc/hic-pipeline:1.13.0"
-        String singularity = "docker://encodedcc/hic-pipeline:1.13.0"
+        String docker = "encodedcc/hic-pipeline:1.14.0"
+        String singularity = "docker://encodedcc/hic-pipeline:1.14.0"
    }


