-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path16S_pipline.py
executable file
·136 lines (122 loc) · 9.91 KB
/
16S_pipline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
# coding:utf-8
import argparse
import re
import sys
import os
# argument:
p = argparse.ArgumentParser(
description="This script work well when analyze the paired-end sequences derived from MeiJi company. python ~/github/Bayegy/16S_pipline.py -i 郑超君老师-ME201808031003-MJ-M-20180807049-68份样本-原始数据/rawData/eachsample/ --classifier ../../database_16S/GG/338-806/gg_13_8_99_338_806_classifier.qza --classifier-type gg -m pre_map.txt -c Group1,Group2,Group3 -o Results --run-in-parallel")
p.add_argument('-i', '--input', dest='input', metavar='<directory>', default=False,
help='Directory of demuxed fastq files. sub directory is allowed when storing fastq files. This parameter is required if --manifest is not supplied. Supply this parameter together with -s -f -r. This option will be ignored if --manifest is specifed')
p.add_argument('--manifest', dest='manifest', metavar='<path>', default=False,
help='The manifest file to import demuxed fastq files, required if -i is not specified. And -i, -s, -f, -r will be ignored if this parameter is specifed')
p.add_argument('-m', '--map', dest='map', metavar='<path>', default=False,
help='Sample metadata file. The sample ID must in first and last column, and two columns do not have to be same when -i is supplied, but the IDs in first column must be respond to --sample-id-patern')
p.add_argument('-c', '--category', dest='group', metavar='<str>', default='auto',
help='column name of categories in metadata. You can specify more than one category, and the categories must be seprated by commas')
p.add_argument('--run-in-parallel', dest='parallel', action='store_true',
help="When supplied, OTU clustering is operated for each category, otherwise OTU clustering is operated only one time for all categories. This parameter is only used when --manifest is not supplied")
p.add_argument('-s', '--sample-id-patern', dest='sp', default=r'raw\.split\.(.+)\.[12]\.fq$', metavar='<regular expression>',
help="The regular expression of sample ID in file names. You must use () to contain sample ID expression, and to prevent unexpected error, using '' to contain the expression is necessary. Supply this parameter together with -i -f -r. This option will be ignored if --manifest is specifed")
p.add_argument('-f', '--forward-file-pattern', dest='fp', default=r'\.1\.fq$', metavar='<regular expression>',
help='The regular expression representing forward sequence in file names,Supply this parameter together with -i -s -r. This option will be ignored if --manifest is specifed')
p.add_argument('-r', '--reverse-file-pattern', dest='rp', default=r'\.2\.fq$', metavar='<regular expression>',
help='The regular expression representing reverse sequence in file names,Supply this parameter together with -i -s -f. This option will be ignored if --manifest is specifed')
p.add_argument('-d', '--sample-depth', dest='depth', default='auto', metavar='<int or str>',
help='Depth for subsampleing. If "auto" is supplied, the min OTU frequency of samples will be caculated and apply to this parameter')
p.add_argument('-e', '--exclude', dest='exclude', default='none', metavar='<str>',
help='The variables excluded from RDA and correlation heatmap analysis, if more than one variable is expected, you should use comma as delimeters. if more than one run is expected, use semicolon as delimeters. example: "Var1,Var2;Var1,Var3". default is none, meaning no variable will be excluded.')
p.add_argument('-l', '--classifier', dest='classifier', default='/home/admin1/database_16S/GG/338-806/gg_13_8_99_338_806_classifier.qza', metavar='<path>',
help='Path of the classifier for alignment and assigning taxonomy')
p.add_argument('--ref-seqs', dest='ref', default='/home/admin1/database_16S/GG/338-806/gg_13_5_97_338_806_ref_seqs.qza', metavar='<path>',
help='Path of the reference sequences for close reference alignment')
p.add_argument('-t', '--classifier-type', dest='type', default='gg', metavar='<str>',
help='Specify the type of classifier, either silva or gg')
p.add_argument('-u', '--run-picrust', dest='picrust', default='yes', metavar='<str>',
help='yes or no')
p.add_argument('-k', '--filter-taxa', dest='ftaxa', default='exclude:mitochondria,chloroplast,Unassigned', metavar='<str>',
help='Taxa excluded from OTU table, seprated by commas.')
p.add_argument('-n', '--rar-filename', dest='fname', default='Result', metavar='<str>',
help='The name of final results.rar')
p.add_argument('-o', '--outdir', dest='outdir', metavar='<directory>', default='./',
help='specify the output directory')
options = p.parse_args()
if not ((options.input or options.manifest) and options.map and options.group):
print("You must specify either -i or --manifest, also -m and -c")
sys.exit()
if not os.path.exists(options.outdir):
os.makedirs(options.outdir)
if options.manifest:
options.manifest = os.path.abspath(options.manifest)
options.map = os.path.abspath(options.map)
options.classifier = os.path.abspath(options.classifier)
options.ref = os.path.abspath(options.ref)
options.outdir = os.path.abspath(options.outdir)
scriptpath = sys.path[0]
def parse_map(mapping):
os.system('''sed -i -e 's/ //g' %s''' % (mapping))
with open(mapping, 'r') as mapin:
l1 = mapin.readline()
gps = re.findall('Category[^\t ]*', l1)
return ','.join(gps)
def rar(name, target):
os.system('''rar a %s %s''' % (name, target))
# print("analyzing......")
if os.path.isfile(options.map):
cgp = parse_map(options.map)
options.group = cgp if options.group == "auto" else options.group
if not options.manifest:
if options.parallel:
for c in re.split(",", options.group.strip()):
print('##############################################################Start to generate results according to the samples of %s' % (c))
if not os.path.exists(options.outdir + '/' + c + "_Results"):
os.makedirs(options.outdir + '/' + c + "_Results")
os.system("Rscript %s/clean_na_of_inputs.R -m %s --group %s -o %s" %
(scriptpath, options.map, c, options.outdir))
os.system("python3 %s/write_manifest.py -i %s -m %s/cleaned_map.txt -o %s/%s_data -f '%s' -r '%s' -s '%s'" %
(scriptpath, options.input, options.outdir, options.outdir, c, options.fp, options.rp, options.sp))
os.system('''cd %s/%s_Results&&sed -i -e '1s/%s/Group/g' ../%s_data/sample-metadata.tsv&&\
bash %s/16S_pipeline.V9.sh ../%s_data/sample-metadata.tsv %s %s Group %s %s ../%s_data/manifest.txt "%s" %s %s''' %
(options.outdir, c, c, c, scriptpath, c, options.depth, options.ftaxa, options.classifier, options.ref, c, options.exclude, options.type, options.picrust))
if not os.path.exists(options.outdir + '/' + 'All_Results_Summary'):
os.makedirs(options.outdir + '/' + 'All_Results_Summary')
os.system('''rm -r %s/All_Results_Summary/%s_Result; mv %s/%s_Results/Result %s/All_Results_Summary/%s_Result''' %
(options.outdir, c, options.outdir, c, options.outdir, c))
else:
if not os.path.exists(options.outdir + '/Results'):
os.makedirs(options.outdir + '/Results')
os.system("python3 %s/write_manifest.py -i %s -m %s -o %s/data -f '%s' -r '%s' -s '%s'" %
(scriptpath, options.input, options.map, options.outdir, options.fp, options.rp, options.sp))
os.system('''cd %s/Results&&\
bash %s/16S_pipeline.V9.sh ../data/sample-metadata.tsv %s %s %s %s %s ../data/manifest.txt "%s" %s %s&&\
change_suffix.py Result -s '1-Stats-demux,2-Stats-dada2,3-RepresentiveSequence,taxa-bar-plots_Qiime2,1-ANCOM,alpha-rarefaction-Qiime2,2-Kruskal_Wallis,PCoA-Qiime2,5-GroupSignificance,FiguresTablesForReport'&&\
rar a %s.rar Result''' % (options.outdir, scriptpath, options.depth, options.ftaxa, options.group, options.classifier, options.ref, options.exclude, options.type, options.picrust, options.fname))
else:
os.system('''cd %s&&\
bash %s/16S_pipeline.V9.sh %s %s %s %s %s %s %s "%s" %s %s&&\
rar a %s.rar Result''' % (options.outdir, scriptpath, options.map, options.depth, options.ftaxa, options.group, options.classifier, options.ref, options.manifest, options.exclude, options.type, options.picrust, options.fname))
else:
mapdir = os.listdir(options.map)
excs = re.split('::', options.exclude)
gp_set = re.split('::', options.group)
if not os.path.exists(options.outdir + '/Results_Result'):
os.makedirs(options.outdir + '/Results_Result')
for mp in mapdir:
pmp = options.map + '/' + mp
if os.path.isfile(pmp):
od = re.search('\d+', mp).group()
odn = int(od) - 1
exc = excs[odn] if len(excs) > 1 else options.exclude
cgp = parse_map(pmp) if options.group == "auto" else gp_set[odn]
if not os.path.exists(options.outdir + '/Results' + od):
os.makedirs(options.outdir + '/Results' + od)
os.system("python3 %s/write_manifest.py -i %s -m %s -o %s/data%s -f '%s' -r '%s' -s '%s'" %
(scriptpath, options.input, pmp, options.outdir, od, options.fp, options.rp, options.sp))
os.system('''cd %s/Results%s&&\
bash %s/16S_pipeline.V9.sh ../data%s/sample-metadata.tsv %s %s %s %s %s ../data%s/manifest.txt "%s" %s %s&&\
mv Result %sResult&&\
rar a %s%s.rar %sResult&&\
mv %s%s.rar ../Results_Result/''' %
(options.outdir, od, scriptpath, od, options.depth, options.ftaxa, cgp, options.classifier, options.ref, od, exc, options.type, options.picrust,
od, od, options.fname, od, od, options.fname))