From b0b4d78b03ec8accedd026d47bed5c5d72ac13ce Mon Sep 17 00:00:00 2001 From: David Koppstein Date: Mon, 27 Feb 2023 11:51:23 +0100 Subject: [PATCH] include everything from dk repo --- config/config.yaml | 38 +++---- config/phased_vcfs.tsv | 3 +- config/references.tsv | 4 +- environment.yml | 196 ++++++++++++++++++++++++++++++++- envs/cooler.yml | 2 +- envs/pore_c.yml | 3 +- envs/whatshap.yml | 2 +- rules/exports.smk | 14 ++- rules/mapping.smk | 22 +++- rules/methylation.smk | 6 + rules/reads.smk | 1 + rules/refgenome.smk | 14 ++- scripts/reformat_hicpro_vcf.sh | 5 + 13 files changed, 269 insertions(+), 41 deletions(-) create mode 100755 scripts/reformat_hicpro_vcf.sh diff --git a/config/config.yaml b/config/config.yaml index 34ac252..b76df46 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -14,41 +14,42 @@ reads_per_batch: 50000 # reads longer than this are filtered out (they can cause bwa-sw to crash) max_read_length: 100000 +# formerly 1000 matrix_resolutions: - base: 1000 + base: 10000 zoomify: - - 1000 - - 2000 - - 5000 + # - 1000 + # - 2000 + # - 5000 - 10000 - - 25000 - - 50000 + - 20000 + - 40000 - 100000 - 250000 - 500000 - 1000000 - - 2500000 - - 5000000 - - 10000000 + # - 2500000 + # - 5000000 + # - 10000000 software: juicer: tools_url: "https://s3.amazonaws.com/hicfiles.tc4ga.com/public/juicer/juicer_tools_1.19.02.jar" bwa: cli_opts: "bwasw -b 5 -q 2 -r 1 -T 15 -z 10" - threads: 10 + threads: 64 pore_c: create_alignment_table: - threads: 1 + threads: 32 to_cooler: - threads: 10 + threads: 32 to_unsorted_pairs: - threads: 1 + threads: 32 sort_pairs_file: - threads: 10 + threads: 32 to_salsa_bed: - threads: 4 + threads: 32 sort: - threads: 10 + threads: 64 memory_per_thread: "4G" f5c: run_mode: cpu # gpu @@ -57,16 +58,15 @@ software: settings: gpu: binary: "f5c_x86_64_linux_cuda" - threads: 8 + threads: 16 gpus: 1 cli_opts: "--cuda-dev-id 0 --cuda-mem-frac 0.7" cpu: binary: "f5c_x86_64_linux" - threads: 20 + threads: 32 gpus: 0 cli_opts: "" # 
mapping_query: "" mapping_query: "" pore_c_version: 'rel' - diff --git a/config/phased_vcfs.tsv b/config/phased_vcfs.tsv index 0752d09..6ea8bc7 100644 --- a/config/phased_vcfs.tsv +++ b/config/phased_vcfs.tsv @@ -3,5 +3,6 @@ # refgenome_id: The reference genome the phased variants were called on, must match an entry in references.tsv # biospecimen: The sample the phased variants come from, must match corresponding entry in basecalls.tsv # vcf_path: Path to a tabix-indexed VCF file with phased variants. +#GIAB GRCh38 GM12878 .test/resources/GM12878.phased.conf.vcf.gz phase_set_id refgenome_id biospecimen vcf_path -GIAB GRCh38 GM12878 .test/resources/GM12878.phased.conf.vcf.gz +129S1CAST GRCm38 9sCa /data/akhtar/Mouse2019AlleleSpecific2/projects/pore-c/resources/snp_genome/129S1_CAST.snp.hicpro_reformatted.vcf.gz diff --git a/config/references.tsv b/config/references.tsv index 03b9ba4..dc444c7 100644 --- a/config/references.tsv +++ b/config/references.tsv @@ -1,6 +1,6 @@ # references.tsv - One entry per genome assembly (eg reference genome, draft genome assembly or scaffold). # refgenome_id: A unique id to identify the reference/assmembly/scaffold you're mapping against # refgenome_path: The path of the source fasta path. 
If a relative path is given then it is relative to the snakemake workdir +#draft1 .test/resources/GRCh38.fasta.gz refgenome_id refgenome_path -GRCh38 .test/resources/GRCh38.fasta.gz -draft1 .test/resources/GRCh38.fasta.gz +GRCm38 /data/repository/organisms/GRCm38_ensembl/genome_fasta/genome.fa diff --git a/environment.yml b/environment.yml index 00630c4..6a7d650 100644 --- a/environment.yml +++ b/environment.yml @@ -1,9 +1,193 @@ name: pore-c-snakemake channels: -- conda-forge -- bioconda -- defaults + - anaconda + - conda-forge + - bioconda + - defaults dependencies: -- pandas ==1.0.5 -- python-box ==4.2.3 -- snakemake ==5.19.3 + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - abseil-cpp=20211102.0=h27087fc_1 + - aioeasywebdav=2.4.0=pyha770c72_0 + - aiohttp=3.8.1=py310h5764c6d_1 + - aiosignal=1.2.0=pyhd8ed1ab_0 + - amply=0.1.5=pyhd8ed1ab_0 + - appdirs=1.4.4=pyh9f0ad1d_0 + - async-timeout=4.0.2=pyhd8ed1ab_0 + - attmap=0.13.2=pyhd8ed1ab_0 + - attrs=22.1.0=pyh71513ae_1 + - backports=1.0=py_2 + - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 + - bcrypt=3.2.2=py310h5764c6d_0 + - boto3=1.24.46=pyhd8ed1ab_0 + - botocore=1.27.46=pyhd8ed1ab_0 + - brotlipy=0.7.0=py310h5764c6d_1004 + - bzip2=1.0.8=h7f98852_4 + - c-ares=1.18.1=h7f98852_0 + - ca-certificates=2022.6.15=ha878542_0 + - cachetools=5.0.0=pyhd8ed1ab_0 + - certifi=2022.6.15=py310hff52083_0 + - cffi=1.15.1=py310h255011f_0 + - charset-normalizer=2.1.0=pyhd8ed1ab_0 + - cni=1.0.1=ha975731_1 + - cni-plugins=1.0.1=ha8f183a_0 + - coin-or-cbc=2.10.8=h3786ebc_0 + - coin-or-cgl=0.60.6=h6f57e76_1 + - coin-or-clp=1.17.7=hc56784d_1 + - coin-or-osi=0.108.7=h2720bb7_1 + - coin-or-utils=2.11.6=h202d8b1_1 + - coincbc=2.10.8=0_metapackage + - commonmark=0.9.1=py_0 + - configargparse=1.5.3=pyhd8ed1ab_0 + - connection_pool=0.0.3=pyhd3deb0d_0 + - cryptography=37.0.4=py310h597c629_0 + - dataclasses=0.8=pyhc8e2a94_3 + - datrie=0.8.2=py310h6acc77f_3 + - decorator=5.1.1=pyhd8ed1ab_0 + - defusedxml=0.7.1=pyhd8ed1ab_0 + - 
docutils=0.19=py310hff52083_0 + - dpath=2.0.6=py310hff52083_1 + - drmaa=0.7.9=py310h06a4308_0 + - dropbox=11.33.0=pyhd8ed1ab_0 + - filechunkio=1.8=py_2 + - filelock=3.7.1=pyhd8ed1ab_0 + - frozenlist=1.3.1=py310h5764c6d_0 + - ftputil=5.0.4=pyhd8ed1ab_0 + - future=0.18.2=py310hff52083_5 + - gitdb=4.0.9=pyhd8ed1ab_0 + - gitpython=3.1.27=pyhd8ed1ab_0 + - google-api-core=2.8.2=pyhd8ed1ab_0 + - google-api-python-client=2.55.0=pyhd8ed1ab_0 + - google-auth=2.10.0=pyh6c4a22f_0 + - google-auth-httplib2=0.1.0=pyhd8ed1ab_1 + - google-cloud-core=2.3.2=pyhd8ed1ab_0 + - google-cloud-storage=2.4.0=pyh6c4a22f_0 + - google-crc32c=1.1.2=py310he8fe98e_3 + - google-resumable-media=2.3.3=pyhd8ed1ab_0 + - googleapis-common-protos=1.56.4=py310hff52083_0 + - grpc-cpp=1.48.0=hbd84cd8_0 + - grpcio=1.48.0=py310ha0b7d45_0 + - httplib2=0.20.4=pyhd8ed1ab_0 + - icu=70.1=h27087fc_0 + - idna=3.3=pyhd8ed1ab_0 + - importlib-metadata=4.11.4=py310hff52083_0 + - importlib_resources=5.9.0=pyhd8ed1ab_0 + - iniconfig=1.1.1=pyh9f0ad1d_0 + - jinja2=3.1.2=pyhd8ed1ab_1 + - jmespath=1.0.1=pyhd8ed1ab_0 + - jq=1.6=h36c2ea0_1000 + - jsonschema=4.9.1=pyhd8ed1ab_0 + - jupyter_core=4.11.1=py310hff52083_0 + - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - libarchive=3.5.2=hb890918_3 + - libblas=3.9.0=15_linux64_openblas + - libcblas=3.9.0=15_linux64_openblas + - libcrc32c=1.1.2=h9c3ff4c_0 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=12.1.0=h8d9b700_16 + - libgfortran-ng=12.1.0=h69a702a_16 + - libgfortran5=12.1.0=hdcd56e2_16 + - libgomp=12.1.0=h8d9b700_16 + - libiconv=1.16=h516909a_0 + - liblapack=3.9.0=15_linux64_openblas + - liblapacke=3.9.0=15_linux64_openblas + - libnsl=2.0.0=h7f98852_0 + - libopenblas=0.3.20=pthreads_h78a6416_1 + - libprotobuf=3.20.1=h6239696_0 + - libseccomp=2.4.4=h7f98852_1 + - libsodium=1.0.18=h36c2ea0_1 + - libstdcxx-ng=12.1.0=ha89aaad_16 + - libuuid=2.32.1=h7f98852_1000 + - libxml2=2.9.14=h22db469_3 + - libzlib=1.2.12=h166bdaf_2 + - logmuse=0.2.6=pyh8c360ce_0 + - lz4-c=1.9.3=h9c3ff4c_1 + - 
lzo=2.10=h516909a_1000 + - markupsafe=2.1.1=py310h5764c6d_1 + - multidict=6.0.2=py310h5764c6d_1 + - nbformat=5.4.0=pyhd8ed1ab_0 + - ncurses=6.3=h27087fc_1 + - numpy=1.23.1=py310h53a5b5f_0 + - oauth2client=4.1.3=py_0 + - oniguruma=6.9.8=h166bdaf_0 + - openssl=1.1.1q=h166bdaf_0 + - packaging=21.3=pyhd8ed1ab_0 + - pandas=1.4.3=py310h769672d_0 + - paramiko=2.11.0=pyhd8ed1ab_0 + - peppy=0.32.0=pyhd8ed1ab_1 + - pip=22.2.2=pyhd8ed1ab_0 + - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 + - plac=1.3.5=pyhd8ed1ab_0 + - pluggy=1.0.0=py310hff52083_3 + - ply=3.11=py_1 + - prettytable=3.3.0=pyhd8ed1ab_0 + - protobuf=3.20.1=py310hd8f1fbe_0 + - psutil=5.9.1=py310h5764c6d_0 + - pulp=2.6.0=py310hff52083_1 + - py=1.11.0=pyh6c4a22f_0 + - pyasn1=0.4.8=py_0 + - pyasn1-modules=0.2.7=py_0 + - pycparser=2.21=pyhd8ed1ab_0 + - pygments=2.12.0=pyhd8ed1ab_0 + - pynacl=1.5.0=py310h5764c6d_1 + - pyopenssl=22.0.0=pyhd8ed1ab_0 + - pyparsing=3.0.9=pyhd8ed1ab_0 + - pyrsistent=0.18.1=py310h5764c6d_1 + - pysftp=0.2.9=py_1 + - pysocks=1.7.1=py310hff52083_5 + - pytest=7.1.2=py310hff52083_0 + - python=3.10.5=h582c2e5_0_cpython + - python-box=6.0.2=py310h5764c6d_3 + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python-fastjsonschema=2.16.1=pyhd8ed1ab_0 + - python-irodsclient=1.1.4=pyhd8ed1ab_0 + - python_abi=3.10=2_cp310 + - pytz=2022.1=pyhd8ed1ab_0 + - pyu2f=0.1.5=pyhd8ed1ab_0 + - pyyaml=6.0=py310h5764c6d_4 + - ratelimiter=1.2.0=py_1002 + - re2=2022.06.01=h27087fc_0 + - readline=8.1.2=h0f457ee_0 + - requests=2.28.1=pyhd8ed1ab_0 + - retry=0.9.2=py_0 + - rich=12.5.1=pyhd8ed1ab_0 + - rsa=4.9=pyhd8ed1ab_0 + - ruamel.yaml=0.17.21=py310h5764c6d_1 + - ruamel.yaml.clib=0.2.6=py310h5764c6d_1 + - s3transfer=0.6.0=pyhd8ed1ab_0 + - setuptools=63.4.2=py310hff52083_0 + - setuptools-scm=7.0.5=pyhd8ed1ab_0 + - singularity=3.8.6=h9c2343c_0 + - six=1.16.0=pyh6c4a22f_0 + - slacker=0.14.0=py_0 + - smart_open=6.0.0=pyhd8ed1ab_0 + - smmap=3.0.5=pyh44b312d_0 + - snakemake=7.12.0=hdfd78af_0 + - snakemake-minimal=7.12.0=pyhdfd78af_0 + - 
sqlite=3.39.2=h4ff8645_0 + - squashfs-tools=4.4=hd0129a2_3 + - stone=3.3.1=pyhd8ed1ab_0 + - stopit=1.1.2=py_0 + - tabulate=0.8.10=pyhd8ed1ab_0 + - tk=8.6.12=h27826a3_0 + - toml=0.10.2=pyhd8ed1ab_0 + - tomli=2.0.1=pyhd8ed1ab_0 + - toposort=1.7=pyhd8ed1ab_0 + - traitlets=5.3.0=pyhd8ed1ab_0 + - typing-extensions=4.3.0=hd8ed1ab_0 + - typing_extensions=4.3.0=pyha770c72_0 + - tzdata=2022a=h191b570_0 + - ubiquerg=0.6.2=pyhd8ed1ab_0 + - uritemplate=4.1.1=pyhd8ed1ab_0 + - urllib3=1.26.11=pyhd8ed1ab_0 + - veracitools=0.1.3=py_0 + - wcwidth=0.2.5=pyh9f0ad1d_2 + - wheel=0.37.1=pyhd8ed1ab_0 + - wrapt=1.14.1=py310h5764c6d_0 + - xz=5.2.5=h516909a_1 + - yaml=0.2.5=h7f98852_2 + - yarl=1.7.2=py310h5764c6d_2 + - yte=1.5.1=py310hff52083_0 + - zipp=3.8.1=pyhd8ed1ab_0 + - zlib=1.2.12=h166bdaf_2 + - zstd=1.5.2=h8a70e8d_3 diff --git a/envs/cooler.yml b/envs/cooler.yml index aa110af..ead2962 100644 --- a/envs/cooler.yml +++ b/envs/cooler.yml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - cooler==0.8.11 + - cooler diff --git a/envs/pore_c.yml b/envs/pore_c.yml index bd17fad..8669e31 100644 --- a/envs/pore_c.yml +++ b/envs/pore_c.yml @@ -5,5 +5,4 @@ channels: dependencies: - pore-c==0.4.0 - python==3.8 -- aws-sdk-cpp=1.8.186=h9ad65fb_2 - +- pysam==0.19.1 diff --git a/envs/whatshap.yml b/envs/whatshap.yml index b567823..8f5c9e3 100644 --- a/envs/whatshap.yml +++ b/envs/whatshap.yml @@ -3,4 +3,4 @@ channels: - bioconda - defaults dependencies: - - whatshap ==1.0 + - whatshap=1.4 diff --git a/rules/exports.smk b/rules/exports.smk index b6b10ad..9186a48 100644 --- a/rules/exports.smk +++ b/rules/exports.smk @@ -7,16 +7,21 @@ rule to_cooler: fragments=paths.virtual_digest.fragments, params: prefix=to_prefix(paths.matrix.cool, 1), + cooler_resolution=config["matrix_resolutions"]["base"], log: to_log(paths.matrix.cool), benchmark: to_benchmark(paths.matrix.cool) threads: config["software"]["pore_c"]["to_cooler"]["threads"] + resources: + mem_mb=32000 conda: PORE_C_CONDA_FILE 
shell: "pore_c {DASK_SETTINGS} --dask-num-workers {threads} " - " contacts export {input.contacts} cooler {params.prefix} --fragment-table {input.fragments} --chromsizes {input.chromsizes} 2>{log} " + " contacts export {input.contacts} cooler {params.prefix} " + "--cooler-resolution {params.cooler_resolution} " + "--fragment-table {input.fragments} --chromsizes {input.chromsizes} 2>{log} " rule to_haplotyped_cooler: @@ -28,6 +33,7 @@ rule to_haplotyped_cooler: fragments=paths.virtual_digest.fragments, params: prefix=to_prefix(paths.matrix.haplotyped_cools, 2), + cooler_resolution=config["matrix_resolutions"]["base"], log: to_log(paths.matrix.haplotyped_cools), benchmark: @@ -37,7 +43,9 @@ rule to_haplotyped_cooler: PORE_C_CONDA_FILE shell: "pore_c {DASK_SETTINGS} --dask-num-workers {threads} " - " contacts export {input.contacts} cooler {params.prefix} --by-haplotype --fragment-table {input.fragments} --chromsizes {input.chromsizes} 2>{log} " + " contacts export {input.contacts} cooler {params.prefix} " + "--cooler-resolution {params.cooler_resolution} " + "--by-haplotype --fragment-table {input.fragments} --chromsizes {input.chromsizes} 2>{log} " rule create_mcool_file: @@ -53,6 +61,8 @@ rule create_mcool_file: to_log(paths.matrix.mcool), conda: "../envs/cooler.yml" + resources: + mem_mb=16000 threads: 1 shell: "cooler zoomify -n {threads} -r {params.resolutions} -o {output} {input} 2>{log}" diff --git a/rules/mapping.smk b/rules/mapping.smk index 6c6f44d..23e72af 100644 --- a/rules/mapping.smk +++ b/rules/mapping.smk @@ -13,6 +13,8 @@ rule align_bwa: cli_opts=config["software"]["bwa"]["cli_opts"], memory=config["software"]["sort"]["memory_per_thread"], sort_threads=config["software"]["sort"]["threads"], + resources: + mem_mb=360000 threads: config["software"]["bwa"]["threads"] conda: "../envs/bwa.yml" @@ -24,7 +26,7 @@ rule align_bwa: "( bwa {params.cli_opts} -t {threads} " "{input.refgenome} {input.fastq} " " | pore_c alignments reformat-bam - - " - " | samtools 
sort -O bam -m {params.memory} -@ {params.sort_threads} -o {output.bam} -) 2>{log} ;" - " samtools index {output.bam} 2>{log} " + " | samtools sort -O bam -m {params.memory} -@ {params.sort_threads} -T $TMPDIR -o {output.bam} -) 2>{log} ;" + " samtools index {output.bam} 2>>{log} " @@ -45,9 +47,12 @@ rule haplotag: refgenome=paths.refgenome.fasta_unzipped, params: vcf=lookup_value("vcf_path", mapping_df), - is_phased=is_phased, #conda: "../envs/whatshap.yml" + is_phased=is_phased, #conda: "../envs/whatshap.yml", + additional_params="--skip-missing-contigs" log: to_log(paths.mapping.haplotagged_aligns), + resources: + mem_mb=16000 benchmark: to_benchmark(paths.mapping.haplotagged_aligns) wrapper: @@ -65,6 +70,8 @@ rule create_alignment_table: benchmark: to_benchmark(paths.align_table.alignment) threads: config["software"]["pore_c"]["create_alignment_table"]["threads"] + resources: + mem_mb=16000 conda: PORE_C_CONDA_FILE shell: @@ -83,6 +90,8 @@ rule assign_fragments: benchmark: to_benchmark(paths.align_table.pore_c) threads: config["software"]["pore_c"]["create_alignment_table"]["threads"] + resources: + mem_mb=16000 conda: PORE_C_CONDA_FILE shell: @@ -104,7 +113,10 @@ rule to_contacts: conda: PORE_C_CONDA_FILE threads: 1 + resources: + mem_mb=16000 shell: + "hostname; " "pore_c {DASK_SETTINGS} --dask-num-workers {threads} " "alignments to-contacts {input} {output.contacts} 2>{log}" @@ -151,6 +163,8 @@ rule merge_contact_files: threads: 4 conda: PORE_C_CONDA_FILE + resources: + mem_mb=24000 shell: "pore_c {DASK_SETTINGS} --dask-num-workers {threads} " "contacts merge {input} {output} --fofn" @@ -169,9 +183,11 @@ rule summarise_contacts: to_log(paths.merged_contacts.concatemers), benchmark: to_benchmark(paths.merged_contacts.concatemers) - threads: 10 + threads: 16 conda: PORE_C_CONDA_FILE + resources: + mem_mb=48000 shell: "pore_c {DASK_SETTINGS} --dask-num-workers {threads} " "contacts summarize {input.contacts} {input.read_summary} {output.pq} {output.csv} " diff --git a/rules/methylation.smk 
b/rules/methylation.smk index 299df22..6a02e62 100644 --- a/rules/methylation.smk +++ b/rules/methylation.smk @@ -25,10 +25,13 @@ rule filter_bam: filtered_bai=paths.mapping.filtered_bai, benchmark: to_benchmark(paths.mapping.filtered_bam) + threads: 16 log: to_log(paths.mapping.filtered_bam), conda: PORE_C_CONDA_FILE + resources: + mem_mb=16000 shell: "( pore_c {DASK_SETTINGS} --dask-num-workers {threads} " "alignments filter-bam {input.bam} {input.pore_c_table} {output.filtered_bam} " @@ -60,6 +63,8 @@ rule f5c_index: to_benchmark(paths.basecall.f5c_index) log: to_log(paths.basecall.f5c_index), + resources: + mem_mb=24000 shell: """ {input.binary} index -d {input.fast5} -s {input.summary} {input.fastq} 2>{log} @@ -87,6 +92,7 @@ rule f5c_call_methylation: per_read=paths.methylation.per_read_llr, resources: gpu=f5c_config["gpus"], + mem_mb=24000 params: cli_opts=f5c_config["cli_opts"], benchmark: diff --git a/rules/reads.smk b/rules/reads.smk index 017e327..73bff0a 100644 --- a/rules/reads.smk +++ b/rules/reads.smk @@ -14,6 +14,7 @@ checkpoint import_basecalls: PORE_C_CONDA_FILE threads: 1 shell: + "rm -f {params.prefix}*; " "pore_c {DASK_SETTINGS} --dask-num-workers {threads} " "reads prepare {params.fname} {params.prefix} --max-read-length {config[max_read_length]} " " --batch-size {config[reads_per_batch]} 2> {log}" diff --git a/rules/refgenome.smk b/rules/refgenome.smk index bcd03cc..2d67c3c 100644 --- a/rules/refgenome.smk +++ b/rules/refgenome.smk @@ -12,7 +12,9 @@ rule add_refgenome: to_log(paths.refgenome.catalog), benchmark: to_benchmark(paths.refgenome.catalog) - threads: 5 + threads: 16 + resources: + mem_mb=8000 conda: PORE_C_CONDA_FILE shell: @@ -33,10 +35,13 @@ rule virtual_digest: to_benchmark(paths.virtual_digest.catalog) log: to_log(paths.virtual_digest.catalog), - threads: 10 + threads: 16 + resources: + mem_mb=16000 conda: PORE_C_CONDA_FILE shell: + "ulimit -s unlimited && " "pore_c {DASK_SETTINGS} --dask-num-workers {threads} " "refgenome 
virtual-digest {input} {wildcards.enzyme} {params.prefix} -n {threads} 2> {log}" @@ -48,9 +53,10 @@ rule bwa_index_refgenome: paths.refgenome.bwt, conda: "../envs/bwa.yml" + resources: + mem_mb=24000 log: to_log(paths.refgenome.bwt), - benchmark: - to_benchmark(paths.refgenome.bwt) + threads: 1 shell: "bwa index {input} 2>{log}" diff --git a/scripts/reformat_hicpro_vcf.sh b/scripts/reformat_hicpro_vcf.sh new file mode 100755 index 0000000..d111394 --- /dev/null +++ b/scripts/reformat_hicpro_vcf.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +# Reformat a HiC-Pro output VCF (stdin) into a standard tab-delimited VCF (stdout). +# On the #CHROM header line every space becomes a tab; other '#' lines are left as-is. +# On non-header lines each run of two or more spaces becomes one tab; single spaces stay. +sed -e '/^#CHROM/ s/ /\t/g' -e '/^#/!s/ \+ /\t/g' -