generate bigBED, store in bigbed_files

databio · Dec 18, 2020 · 8665def · stolarczyk · Dec 21, 2020 · xuebingjie1990
1 parent 2daafc0
commit 8665def
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 8 deletions.
diff --git a/bedmaker.py b/bedmaker.py
@@ -4,6 +4,8 @@
 import pypiper
 import os
 import sys
+import tempfile
+import pandas as pd
 # import pyBigWig
 import gzip
 import shutil
@@ -19,7 +21,8 @@
 parser.add_argument("-r", "--rfg-config", help="file path to the genome config file", type=str)
 parser.add_argument("-o", "--output-file", help="path to the output BED files", type=str)
 parser.add_argument("-s", "--sample-name", help="name of the sample used to systematically build the output name", type=str)
-
+parser.add_argument("--output-bigbed", help="path to the output bigBED files", type=str)
+parser.add_argument("--chrom-size", help="a full path to the chrom.sizes required for the bedtobigbed conversion", type=str)
 
 # add pypiper args to make pipeline looper compatible
 parser = pypiper.add_pypiper_args(parser, groups=["pypiper", "looper"],
@@ -142,6 +145,42 @@ def main():
             cmd = [cmd]
         cmd.append(gzip_cmd)
     pm.run(cmd, target=args.output_file)
+
+    bedfile_name = os.path.split(args.output_file)[1]
+    fileid = os.path.splitext(os.path.splitext(bedfile_name)[0])[0]
+     # Produce bigBed (bigNarrowPeak) file from peak file 
+    bigNarrowPeak = os.path.join(args.output_bigbed, fileid + ".bigBed")
+    temp = tempfile.NamedTemporaryFile(dir=args.output_bigbed, delete=False)
+    print ("test bigbed saving path: ", bigNarrowPeak)
+    print ("test chrom.sizes path: ", args.chrom_size)
+    if not os.path.exists(bigNarrowPeak):
+        df = pd.read_csv(args.output_file, sep='\t', header=None,
+                            names=("V1","V2","V3","V4","V5","V6",
+                                    "V7","V8","V9","V10")).sort_values(by=["V1","V2"])
+        df.to_csv(temp.name, sep='\t', header=False, index=False)
+        pm.clean_add(temp.name)
+        print ("BED: \n", df)
+        as_file = os.path.join(args.output_bigbed, "bigNarrowPeak.as")
+        cmd = ("echo 'table bigNarrowPeak\n" + 
+                "\"BED6+4 Peaks of signal enrichment based on pooled, normalized (interpreted) data.\"\n" +
+                "(\n" +
+                "     string chrom;        \"Reference sequence chromosome or scaffold\"\n" +
+                "     uint   chromStart;   \"Start position in chromosome\"\n" +
+                "     uint   chromEnd;     \"End position in chromosome\"\n" +
+                "     string name;         \"Name given to a region (preferably unique). Use . if no name is assigned\"\n" +
+                "     uint   score;        \"Indicates how dark the peak will be displayed in the browser (0-1000) \"\n" +
+                "     char[1]  strand;     \"+ or - or . for unknown\"\n" +
+                "     float  signalValue;  \"Measurement of average enrichment for the region\"\n" +
+                "     float  pValue;       \"Statistical significance of signal value (-log10). Set to -1 if not used.\"\n" +
+                "     float  qValue;       \"Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used.\"\n" +
+                "     int   peak;          \"Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called.\"\n" +
+                ")' > " + as_file)
+        pm.run(cmd, as_file, clean=True)
+
+        cmd = ("bedToBigBed -as=" + as_file + " -type=bed6+4 " +
+                temp.name + " " + args.chrom_size + " " + bigNarrowPeak)
+        pm.run(cmd, bigNarrowPeak, nofail=True)
+
     pm.stop_pipeline()
 
 

diff --git a/pep_schema.yaml b/pep_schema.yaml
@@ -15,15 +15,16 @@ properties:
         output_file_path: 
           type: string
           description: "absolute path the file to the output BED file (derived attribute)"
+        output_bigbed_path: 
+          type: string
+          description: "absolute path the file to the output bigBED file (derived attribute)"
         genome:
           type: string
           description: "organism genome code"
           enum: ["hg18", "hg19", "hg38", "mm9", "mm10"]
         narrowpeak:
-          type: integer
-          minimum: 0
-          maximum: 1
-          description: "binary number indicating whether the regions are narrow (transcription factor implies narrow, histone mark implies broad peaks)"
+          type: boolean
+          description: "whether the regions are narrow (transcription factor implies narrow, histone mark implies broad peaks)"
         format:
           type: string
           description: "file format"
@@ -49,6 +50,7 @@ properties:
       required:
         - input_file_path
         - output_file_path
+        - output_bigbed_path
         - genome
         - narrowpeak
         - sample_name

diff --git a/pipeline_interface.yaml b/pipeline_interface.yaml
@@ -1,16 +1,19 @@
 pipeline_name: BEDMAKER
 pipeline_type: sample
-path: bedmaker.py
-input_schema: http://schema.databio.org/pipelines/bedmaker.yaml
+var_templates:
+  path: "{looper.piface_dir}/bedmaker.py"
+input_schema: "/home/bx2ur/Documents/GitHubRepo/bedmaker/pep_schema.yaml"
 command_template: >
-  {pipeline.path}
+  {pipeline.var_templates.path}
   --input-file {sample.input_file_path}
   --output-file {sample.output_file_path}
+  --output-bigbed {sample.output_bigbed_path}
   --narrowpeak {sample.narrowpeak}
   --input-type {sample.format}
   --genome {sample.genome}
   --sample-name {sample.sample_name}
   {% if sample.rfg_config is defined %} --rfg-config {sample.rfg_config} {% endif %}
+  {% if sample.chrom_size is defined %} --chrom-size {sample.chrom_size} {% endif %}
 compute:
   size_dependent_variables: resources-sample.tsv