-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.nf
234 lines (181 loc) · 7.12 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#!/usr/bin/env nextflow
/*
========================================================================================
PALLOR: Phylogeny from universAL singLe cOpy oRthologs
========================================================================================
PALLOR Analysis Pipeline.
#### Homepage / Documentation
https://gitlab.ifremer.fr/bioinfo/pallor
----------------------------------------------------------------------------------------
*/
def helpMessage() {
// Add to this help message with new command line parameters
log.info SeBiMERHeader()
log.info"""
Usage:
The typical command for running the pipeline after filling the conf/base.config file is as follows :
nextflow run main.nf
Mandatory arguments:
--rawdata_dir [path] Path to input directory with raw data files
Other options:
--outdir [path] The output directory where the results will be saved
-w/--work-dir The temporary directory where intermediate data will be saved
--name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic
--projectName [str] Name of the project being analyzed
Single copy orthologs:
--odb_path [path] Path to all BUSCO ODB databases
--odb_name [str] Specify the name of the BUSCO lineage to be used
--min_species [int] Keep orthologs presents in a least this minimal number of species (min:2; max: total number of species)
""".stripIndent()
}
// Show help message
if (params.help) {
helpMessage()
exit 0
}
/*
* SET UP CONFIGURATION VARIABLES
*/
// Has the run name been specified by the user?
// this has the bonus effect of catching both -name and --name
custom_runName = params.name
if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) {
custom_runName = workflow.runName
}
Channel.fromPath(params.rawdata_dir)
.map { file -> tuple(file.baseName, file) }
.set { genomes }
/*
* PIPELINE INFO
*/
// Header log info
log.info SeBiMERHeader()
def summary = [:]
if (workflow.revision) summary['Pipeline Release'] = workflow.revision
summary['Run Name'] = custom_runName ?: workflow.runName
summary['Project Name'] = params.projectName
summary['Output dir'] = params.outdir
summary['Launch dir'] = workflow.launchDir
summary['Working dir'] = workflow.workDir
summary['Script dir'] = workflow.projectDir
summary['User'] = workflow.userName
// if (params.email || params.email_on_fail) {
// summary['E-mail Address'] = params.email
// summary['E-mail on failure'] = params.email_on_fail
// }
log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n")
log.info "-\033[2m--------------------------------------------------\033[0m-"
// Launch BUSCO and cat all single copy genes into a single file for each specie
process get_single_copy {
label 'busco_env'
publishDir "${params.outdir}/${params.completness_dirname}" , mode: 'copy', pattern : "${genome_name}/short_summary*"
publishDir "${params.outdir}/${params.completness_dirname}" , mode: 'copy', pattern : "${genome_name}/run_*/*.tsv" , saveAs : { full_table -> "${genome_name}/full_table.tsv" }
publishDir "${params.outdir}/${params.completness_dirname}" , mode: 'copy', pattern : "*.faa"
input:
set genome_name, file(fasta) from genomes
output:
file "${genome_name}/short_summary*" //into completness_short_summary
file "${genome_name}/run_*/full_table.tsv" //into completness_full_summary
file "${genome_name}.sg.faa" into single_copy_proteins
shell:
"""
export AUGUSTUS_CONFIG_PATH=${params.augustus_config_path}
busco -c ${task.cpus} --force --offline -m genome -i ${fasta} -o ${genome_name} -l ${params.odb_path}/${params.odb_name} >& busco.log 2>&1
catSingleCopyBySpecie.py -f ${genome_name}/run_${params.odb_name}/busco_sequences/single_copy_busco_sequences/ -s ${genome_name} >& catSingleCopyBySpecie.log 2>&1
"""
}
// Concat all single copy genes of all specie into a single file
process concat_single_copy {
publishDir "${params.outdir}/${params.concatenate_dirname}", mode: 'copy'
input:
file '*.faa' from single_copy_proteins.collect()
output:
file "single_copy_busco_sequences.fasta" into cat_all_single_copy_proteins
shell:
"""
cat *.faa > single_copy_busco_sequences.fasta
"""
}
// Filter in order to keep at least a minimum of X species which shared a single copy gene (give a list of sequence ID)
process filter_single_copy {
label 'biopython_env'
publishDir "${params.outdir}/${params.extract_shared_sg_dirname}", mode: 'copy'
input:
file fasta from cat_all_single_copy_proteins
output:
file "*.faa" into shared_single_copy_proteins
shell:
"""
extractSharedSingleCopy.py -f ${fasta} -s ${params.min_species}
"""
}
// Align each orthogroup
process mafft {
label 'mafft_env'
publishDir "${params.outdir}/${params.alignment_dirname}", mode: 'copy'
input:
file faa from shared_single_copy_proteins.flatten()
output:
file "*.mafft" into aligned_shared_single_copy_proteins
shell:
"""
mafft --auto ${faa} > ${faa}.mafft
"""
}
// Clean the alignment
process gblocks {
// As Gblocks exit status is always 1...
validExitStatus 1
label 'gblocks_env'
publishDir "${params.outdir}/${params.cleaning_dirname}", mode: 'copy'
input:
file aln from aligned_shared_single_copy_proteins
output:
file "*-gb" into cleaned_aligned_shared_single_copy_proteins
script:
"""
Gblocks ${aln} -t=p -p=n -b3=8 -b4=10 -b5=h
"""
}
// Create the matrix
process concatenation {
label 'biopython_env'
publishDir "${params.outdir}/${params.matrix_dirname}", mode: 'copy'
input:
file '*-gb' from cleaned_aligned_shared_single_copy_proteins.collect()
output:
file "*.fas" into concatenated_alignments
script:
"""
ElConcatenero.py -if fasta -of fasta -in *-gb -o concatenated
"""
}
process iqtree {
label 'iqtree_env'
publishDir "${params.outdir}/${params.tree_dirname}", mode: 'copy'
input:
file matrix from concatenated_alignments
output:
file "*.treefile" into iqtree_tree
file "*.iqtree" into iqtree_logs
shell:
"""
iqtree -s ${matrix} -bb 1000 -alrt 1000 -nt ${task.cpus}
"""
}
/* Other functions */
def SeBiMERHeader() {
// Log colors ANSI codes
c_cyan = params.monochrome_logs ? '' : "\033[0;36m";
c_blue = params.monochrome_logs ? '' : "\033[0;34m";
c_reset = params.monochrome_logs ? '' : "\033[0m";
c_yellow = params.monochrome_logs ? '' : "\033[0;33m";
return """ -${c_cyan}--------------------------------------------------${c_reset}-
${c_blue} __ __ __ . __ __ ${c_reset}
${c_blue} \\ |_ |__) | |\\/| |_ |__) ${c_reset}
${c_blue} __\\ |__ |__) | | | |__ | \\ ${c_reset}
${c_reset}
${c_yellow} PALLOR: Phylogeny from universAL singLe cOpy oRthologs${c_reset}
-${c_cyan}--------------------------------------------------${c_reset}-
""".stripIndent()
}