#!/usr/bin/env python
# TODO: ENCODE pipeline suggests using bowtie1 + SPP (peak caller) for TF data
# TODO: table 'folder' value not used
import argparse
import os
import subprocess
import sys

import pandas as pd

from pipeline_utils import *
from scripts.util import run_macs2


def cli():
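    """Command-line entry point: parse the descriptor table path and output dir."""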
parser = argparse.ArgumentParser(
description="For given descriptor table download SRA data & call peaks"
)
parser.add_argument("data",
# required=True,
type=file_path_type(ext="tsv"),
help="Data *.tsv file")
parser.add_argument('-o', '--out', default='.',
type=file_path_type(True, False),
help="Output dir (default: .)")
args = parser.parse_args()
    run_pipeline(args.out, args.data)


def run_pipeline(out, data):
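    """Run the TF ChIP-seq pipeline for the samples listed in the `data` TSV:
    download SRA runs, dump FASTQ files, prepare genome and Bowtie2 indexes,
    run QC, align, build visualizations, remove duplicates and call peaks
    with MACS2. Results are written under `out`.
    """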
#################
# Configuration #
#################
genome = "hg19"
    indexes = os.path.join("/scratch/artyomov_lab_aging/Y20O20/chipseq/indexes",
                           genome)
    chrom_sizes = os.path.join(indexes, genome + ".chrom.sizes")
    picard_tools = os.path.join(os.path.expanduser("~"), "picard.jar")
# Data table
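    # Expected columns (see usage below): 'signal', 'signal_srx', 'input',
    # 'input_srx'; SRX accessions are ';'-separated, '-' in 'input' means
    # no control sample.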
data_table = pd.read_csv(data, sep="\t")
print("Data to process:")
print(data_table)
gsm2srxs = {}
for r in data_table.itertuples():
gsm2srxs[r.signal] = r.signal_srx.split(";")
# if input defined
if r.input != "-":
gsm2srxs[r.input] = r.input_srx.split(";")
gsm_to_process = sorted(gsm2srxs.keys())
print(gsm_to_process)
# Make dirs:
data_dirs = [os.path.join(out, gsmid) for gsmid in gsm_to_process]
for data_dir in data_dirs:
os.makedirs(data_dir, exist_ok=True)
##################
# Pipeline start #
##################
# Download SRA data:
    # 'rsync' here skips files that already exist
print("Downloading data...")
srx_to_dir_list = []
for gsmid in gsm_to_process:
sra_dir = os.path.join(out, gsmid, "sra")
srxs = gsm2srxs[gsmid]
for srx in srxs:
srx_to_dir_list.extend([srx, sra_dir])
run_bash("scripts/geo_rsync.sh", *srx_to_dir_list)
# Fastq-dump SRA data:
run_bash("parallel/fastq_dump.sh", *data_dirs)
# Prepare genome *.fa and Bowtie indexes
print("Genomes and indices folder: ", indexes)
run_bash("parallel/index_genome.sh", genome, indexes)
run_bash("parallel/index_bowtie2.sh", genome, indexes)
# run_bash("index_bowtie.sh", genome, indexes)
# Batch QC
run_bash("parallel/fastqc.sh", *data_dirs)
# Total multiqc:
    # Use the -s option, otherwise tons of "SRRnnn" samples are hard to distinguish
# -s, --fullnames Do not clean the sample names (leave as full
# file name)
# -f, --force Overwrite any existing reports
# -o, --outdir TEXT Create report in the specified output directory.
# if len(data_dirs) > 1:
# run("multiqc", "-f", "-o", out, " ".join(data_dirs))
#
    # XXX: the multiqc results show that for several samples it's better to
    # trim the first 5 bp, so trim them in all samples for simplicity:
#
# Alignment step:
def process_sra(sra_dirs):
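        """Align FASTQ files in each dir with Bowtie2 and merge the per-run BAMs."""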
        # Batch Bowtie2 alignment, trimming the first 5 base pairs
        run_bash("parallel/bowtie2.sh", genome, indexes, "5", *sra_dirs)
        # Merge each sample's SRR*.bam files into a single BAM
        run_bash("parallel/samtools_merge.sh", genome, *sra_dirs)
bams_dirs = process_dirs(data_dirs, "_bams", ["*.bam", "*bowtie*.log"],
process_sra)
    for bams_dir in bams_dirs:
        # multiqc is able to process the Bowtie report
        run("multiqc", "-f", "-o", bams_dir, bams_dir)
if len(data_dirs) > 1:
run("multiqc", "-f", "-o", out, " ".join(data_dirs + bams_dirs))
# XXX: doesn't work for some reason, "filter by -f66" returns nothing
    # Insert size visualization for BAM files
# run_bash("fragments.sh", *bams_dirs)
# Batch BigWig visualization
process_dirs(bams_dirs, "_bws", ["*.bw", "*.bdg", "*bw.log"],
lambda dirs: run_bash("parallel/bigwig.sh", chrom_sizes,
*dirs))
# Batch RPKM visualization
process_dirs(bams_dirs, "_rpkms", ["*.bw", "*rpkm.log"],
lambda dirs: run_bash("parallel/rpkm.sh", *dirs))
# Remove duplicates
process_dirs(bams_dirs, "_unique",
["*_unique*", "*_metrics.txt", "*duplicates.log"],
lambda dirs: run_bash("parallel/remove_duplicates.sh",
picard_tools, *dirs))
# Call PEAKS:
files_to_cleanup = []
try:
# let's link signal bams with corresponding input:
bams_dirs_for_peakcalling = []
for r in data_table.itertuples():
gsmid_signal = r.signal
bams_dir_signal = os.path.join(out, gsmid_signal + "_bams")
# if input defined
if r.input != "-":
gsmid_input = r.input
bams_dir_input = os.path.join(out, gsmid_input + "_bams")
# Find all input *.bam and *.bam.bai
input_files = [f for f in os.listdir(bams_dir_input)
if f.endswith(".bam") or f.endswith(".bam.bai")]
                # Symlink input *.bam / *.bam.bai files into the signal
                # bams dir with an '_input' suffix
for f in input_files:
f_link = os.path.join(bams_dir_signal,
f.replace(".bam", "_input.bam"))
run("ln", "-s", os.path.join(bams_dir_input, f), f_link)
files_to_cleanup.append(f_link)
bams_dirs_for_peakcalling.append(bams_dir_signal)
########################
# Peak calling section #
########################
        # Bedtools is necessary for the filter script.
        # NOTE: 'module load' in a subshell does not affect this Python
        # process or later subprocess calls; bedtools2 should already be
        # available on PATH.
        subprocess.run('module load bedtools2', shell=True)
fpeaks_and_bams_dirs = []
# MACS2 Broad peak calling (https://github.com/taoliu/MACS) Q=0.1
# in example
run_macs2(genome, chrom_sizes,
'broad_0.1', '--broad', '--broad-cutoff', 0.1,
work_dirs=bams_dirs_for_peakcalling)
        # MACS2 Regular peak calling (https://github.com/taoliu/MACS)
        # Q=0.01 in example
        run_macs2(genome, chrom_sizes, 'q0.1', '-q', 0.1,
                  work_dirs=bams_dirs_for_peakcalling)
    finally:
        print("Cleanup:")
        for f in files_to_cleanup:
            try:
                os.remove(f)
                print("* deleted: ", f)
            except OSError:
                print("Error while deleting '{}'".format(f),
                      sys.exc_info()[0])


def process_dirs(dirs, suffix, what_to_move, processor_fun):
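    """Run `processor_fun` on the dirs that have no `<dir><suffix>` result dir
    yet, then move files matching `what_to_move` patterns into the result dirs.
    Already existing result dirs are skipped. Returns the list of result dirs.
    """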
# filter already processed dirs:
dirs_to_process = []
result_dirs = []
for data_dir in dirs:
res_dir = data_dir + suffix
result_dirs.append(res_dir)
if os.path.exists(res_dir):
print("[Skipped] Directory already exists:", res_dir)
continue
dirs_to_process.append((data_dir, res_dir))
if dirs_to_process:
# process dirs:
        processor_fun([data_dir for data_dir, _ in dirs_to_process])
# move results:
for data_dir, res_dir in dirs_to_process:
move_forward(data_dir, res_dir, what_to_move)
    return result_dirs


if __name__ == '__main__':
cli()
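# Example invocation (illustrative paths):
#   python pipeline_tf.py /path/to/tf_chipseq.tsv -o /scratch/tf_peaks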