-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_and_rename_NGI_fastq_files.py
executable file
·85 lines (70 loc) · 3.07 KB
/
merge_and_rename_NGI_fastq_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
import re
import os
import sys
import shutil
import argparse
import collections
def merge_files(input_dir, dest_dir):
    """Merge the fastq files of every sample found under ``input_dir``.

    ``input_dir`` is walked recursively. Only ``*.fastq.gz`` files whose
    basename matches ``<sample>_S<digits>_<anything>_R<1|2>_`` are
    considered; files that do not match (for instance the
    ``<sample>_R<n>.fastq.gz`` files this script itself produces) are
    skipped. For each sample, in the order the samples are first met during
    the walk, ``actual_merging`` is called once for read 1 and once for
    read 2 with the matching paths sorted; it is called with an empty list
    when a sample has no file for that read.

    Args:
        input_dir: Directory to search, including its sub-directories.
        dest_dir: Directory passed on to ``actual_merging`` for the output.
    """
    # Group 1 is the sample name, group 2 the read number (1 or 2).
    sample_pattern = re.compile(r"^(.+)_S[0-9]+_.+_R([1-2])_")

    # sample name -> {read number: [paths]}. Files are grouped by what the
    # pattern captured instead of splicing the sample name into a second
    # regex: an unescaped name containing metacharacters could match nothing
    # (leaving the work list untouched forever) or fail to compile.
    # OrderedDict keeps first-seen sample order on every Python version.
    samples = collections.OrderedDict()
    for subdir, _dirs, files in os.walk(input_dir):
        for fastq in files:
            if not fastq.endswith('.fastq.gz'):
                continue
            match = sample_pattern.match(fastq)
            if match is None:
                continue
            # The read number comes from the regex capture, so each file
            # lands in exactly one of the two read lists.
            read_nb = int(match.group(2))
            reads = samples.setdefault(match.group(1), {1: [], 2: []})
            reads[read_nb].append(os.path.join(subdir, fastq))

    for sample_name, reads in samples.items():
        for read_nb in (1, 2):
            actual_merging(sample_name, read_nb, sorted(reads[read_nb]), dest_dir)
def actual_merging(sample_name, read_nb, tomerge, dest_dir):
    """Concatenate ``tomerge`` into ``<dest_dir>/<sample_name>_R<read_nb>.fastq.gz``.

    The input files are copied byte for byte, in the given order, into a
    single output file; nothing is decompressed. Progress is reported on
    standard output. When ``tomerge`` is empty a notice is printed and no
    output file is created.

    Args:
        sample_name: Sample identifier used as the output file prefix.
        read_nb: Read number used in the output file name and messages.
        tomerge: Paths of the files to concatenate, already in merge order.
        dest_dir: Directory the merged file is written to; it must exist.
    """
    outfile = os.path.join(dest_dir, "{}_R{}.fastq.gz".format(sample_name, read_nb))
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement and is also valid Python 3.
    print("Merging the following files:")
    if not tomerge:
        print("No read {} files found".format(read_nb))
        return
    for fq in tomerge:
        print(fq)
    print("as {}".format(outfile))
    # Binary mode on both sides: the gzip data is appended as-is, and
    # back-to-back gzip members still form a readable gzip stream.
    with open(outfile, 'wb') as wfp:
        for fn in tomerge:
            with open(fn, 'rb') as rfp:
                shutil.copyfileobj(rfp, wfp)
if __name__ == "__main__":
    # Command-line entry point. Both positional arguments are optional and
    # default to the current directory.
    cli = argparse.ArgumentParser(description=""" Merges all fastq-files from each samples into one file. Looks through the given dir and subdirs.
Written with a the NGI folder structure in mind.""")

    # (destination attribute, label shown in usage, help text)
    positionals = (
        ("input_dir", 'Input directory',
         "Base directory for the fastq files that should be merged. "),
        ("dest_dir", 'Output directory',
         "Path path to where the merged files should be outputed. "),
    )
    for dest, label, text in positionals:
        cli.add_argument(dest, metavar=label, nargs='?', default='.', help=text)

    options = cli.parse_args()
    merge_files(options.input_dir, options.dest_dir)