From 8665def39864ddaedf961d6cdf46405d0d9da233 Mon Sep 17 00:00:00 2001 From: xuebingjie1990 Date: Fri, 18 Dec 2020 13:58:44 -0500 Subject: [PATCH] generate bigBED, store in `bigbed_files` --- bedmaker.py | 41 ++++++++++++++++++++++++++++++++++++++++- pep_schema.yaml | 10 ++++++---- pipeline_interface.yaml | 9 ++++++--- 3 files changed, 52 insertions(+), 8 deletions(-) diff --git a/bedmaker.py b/bedmaker.py index f833ee2..45b8fb8 100755 --- a/bedmaker.py +++ b/bedmaker.py @@ -4,6 +4,8 @@ import pypiper import os import sys +import tempfile +import pandas as pd # import pyBigWig import gzip import shutil @@ -19,7 +21,8 @@ parser.add_argument("-r", "--rfg-config", help="file path to the genome config file", type=str) parser.add_argument("-o", "--output-file", help="path to the output BED files", type=str) parser.add_argument("-s", "--sample-name", help="name of the sample used to systematically build the output name", type=str) - +parser.add_argument("--output-bigbed", help="path to the output bigBED files", type=str) +parser.add_argument("--chrom-size", help="a full path to the chrom.sizes required for the bedtobigbed conversion", type=str) # add pypiper args to make pipeline looper compatible parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "looper"], @@ -142,6 +145,42 @@ def main(): cmd = [cmd] cmd.append(gzip_cmd) pm.run(cmd, target=args.output_file) + + bedfile_name = os.path.split(args.output_file)[1] + fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0] + # Produce bigBed (bigNarrowPeak) file from peak file + bigNarrowPeak = os.path.join(args.output_bigbed, fileid + ".bigBed") + temp = tempfile.NamedTemporaryFile(dir=args.output_bigbed, delete=False) + print ("test bigbed saving path: ", bigNarrowPeak) + print ("test chrom.sizes path: ", args.chrom_size) + if not os.path.exists(bigNarrowPeak): + df = pd.read_csv(args.output_file, sep='\t', header=None, + names=("V1","V2","V3","V4","V5","V6", + "V7","V8","V9","V10")).sort_values(by=["V1","V2"]) + df.to_csv(temp.name, sep='\t', header=False, index=False) + pm.clean_add(temp.name) + print ("BED: \n", df) + as_file = os.path.join(args.output_bigbed, "bigNarrowPeak.as") + cmd = ("echo 'table bigNarrowPeak\n" + + "\"BED6+4 Peaks of signal enrichment based on pooled, normalized (interpreted) data.\"\n" + + "(\n" + + " string chrom; \"Reference sequence chromosome or scaffold\"\n" + + " uint chromStart; \"Start position in chromosome\"\n" + + " uint chromEnd; \"End position in chromosome\"\n" + + " string name; \"Name given to a region (preferably unique). Use . if no name is assigned\"\n" + + " uint score; \"Indicates how dark the peak will be displayed in the browser (0-1000) \"\n" + + " char[1] strand; \"+ or - or . for unknown\"\n" + + " float signalValue; \"Measurement of average enrichment for the region\"\n" + + " float pValue; \"Statistical significance of signal value (-log10). Set to -1 if not used.\"\n" + + " float qValue; \"Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used.\"\n" + + " int peak; \"Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called.\"\n" + + ")' > " + as_file) + pm.run(cmd, as_file, clean=True) + + cmd = ("bedToBigBed -as=" + as_file + " -type=bed6+4 " + + temp.name + " " + args.chrom_size + " " + bigNarrowPeak) + pm.run(cmd, bigNarrowPeak, nofail=True) + pm.stop_pipeline() diff --git a/pep_schema.yaml b/pep_schema.yaml index a991674..e94f106 100644 --- a/pep_schema.yaml +++ b/pep_schema.yaml @@ -15,15 +15,16 @@ properties: output_file_path: type: string description: "absolute path the file to the output BED file (derived attribute)" + output_bigbed_path: + type: string + description: "absolute path the file to the output bigBED file (derived attribute)" genome: type: string description: "organism genome code" enum: ["hg18", "hg19", "hg38", "mm9", "mm10"] narrowpeak: - type: integer - minimum: 0 - maximum: 1 - description: "binary number indicating whether the regions are narrow (transcription factor implies narrow, histone mark implies broad peaks)" + type: boolean + description: "whether the regions are narrow (transcription factor implies narrow, histone mark implies broad peaks)" format: type: string description: "file format" @@ -49,6 +50,7 @@ properties: required: - input_file_path - output_file_path + - output_bigbed_path - genome - narrowpeak - sample_name diff --git a/pipeline_interface.yaml b/pipeline_interface.yaml index c1cf512..475c0ec 100644 --- a/pipeline_interface.yaml +++ b/pipeline_interface.yaml @@ -1,16 +1,19 @@ pipeline_name: BEDMAKER pipeline_type: sample -path: bedmaker.py -input_schema: http://schema.databio.org/pipelines/bedmaker.yaml +var_templates: + path: "{looper.piface_dir}/bedmaker.py" +input_schema: "/home/bx2ur/Documents/GitHubRepo/bedmaker/pep_schema.yaml" command_template: > - {pipeline.path} + {pipeline.var_templates.path} --input-file {sample.input_file_path} --output-file {sample.output_file_path} + --output-bigbed {sample.output_bigbed_path} --narrowpeak {sample.narrowpeak} --input-type {sample.format} --genome {sample.genome} --sample-name {sample.sample_name} {% if sample.rfg_config is defined %} --rfg-config {sample.rfg_config} {% endif %} + {% if sample.chrom_size is defined %} --chrom-size {sample.chrom_size} {% endif %} compute: size_dependent_variables: resources-sample.tsv