forked from nf-core/proteomicslfq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.nf
1625 lines (1337 loc) · 68.7 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env nextflow
/*
========================================================================================
nf-core/proteomicslfq
========================================================================================
nf-core/proteomicslfq Analysis Pipeline.
#### Homepage / Documentation
https://github.com/nf-core/proteomicslfq
----------------------------------------------------------------------------------------
*/
def helpMessage() {
log.info nfcoreHeader()
log.info"""
Usage:
The typical command for running the pipeline is as follows:
nextflow run nf-core/proteomicslfq --spectra '*.mzML' --database '*.fasta' -profile docker
Main arguments:
--input Path/URI to PRIDE Sample to data relation format file (SDRF) OR path to input spectra as mzML or Thermo Raw
For SDRF:
--root_folder (Optional) If given, looks for the filenames in the SDRF in this folder, locally
--local_input_type (Optional) If given and 'root_folder' was specified, it overwrites the filetype in the SDRF for local lookup and matches only the basename.
For mzML/raw files:
--expdesign (Optional) Path to an experimental design file (if not given, it assumes unfractionated, unrelated samples)
And:
--database Path to input protein database as fasta
Decoy database:
--add_decoys Add decoys to the given fasta
--decoy_affix The decoy prefix or suffix used or to be used (default: DECOY_)
--affix_type Prefix (default) or suffix (WARNING: Percolator only supports prefices)
Database Search:
--search_engines Which search engine: "comet" (default) or "msgf"
--enzyme Enzymatic cleavage (e.g. 'unspecific cleavage' or 'Trypsin' [default], see OpenMS enzymes)
--num_enzyme_termini Specify the termini where the cleavage rule has to match (default:
'fully' valid: 'semi', 'fully')
--num_hits Number of peptide hits per spectrum (PSMs) in output file (default: '1')
--fixed_mods Fixed modifications ('Carbamidomethyl (C)', see OpenMS modifications)
--variable_mods Variable modifications ('Oxidation (M)', see OpenMS modifications)
--enable_mod_localization Enable localization scoring with Luciphor
--mod_localization Specify the var. modifications whose localizations should be rescored with the luciphor algorithm
--precursor_mass_tolerance Mass tolerance of precursor mass (default: 5)
--precursor_mass_tolerance_unit Da or ppm (default: ppm)
--fragment_mass_tolerance Mass tolerance for fragment masses (currently only controls Comets fragment_bin_tol) (default: 0.03)
--fragment_mass_tolerance_unit Da or ppm (default: Da)
--allowed_missed_cleavages Allowed missed cleavages (default: 2)
--min_precursor_charge Minimum precursor ion charge (default: 2)
--max_precursor_charge Maximum precursor ion charge (default: 4)
--min_peptide_length Minimum peptide length to consider (default: 6)
--max_peptide_length Maximum peptide length to consider (default: 40)
--instrument Type of instrument that generated the data (currently only 'high_res' [default] and 'low_res' supported)
--protocol Used labeling or enrichment protocol (if any)
--fragment_method Used fragmentation method (currently unused since we let the search engines consider all MS2 spectra and let them determine from the spectrum metadata)
--max_mods Maximum number of modifications per peptide. If this value is large, the search may take very long
--db_debug Debug level during database search
--open_search Default is not to conduct an open search, if needed set this to "yes"
//TODO probably also still some options missing. Try to consolidate them whenever the two search engines share them
Peak picking:
--openms_peakpicking Use the OpenMS PeakPicker to ADDITIONALLY pick the spectra before the search. This is usually done
during conversion already. Only activate if something goes wrong.
--peakpicking_inmemory Perform OpenMS peakpicking in-memory. Needs at least the size of the mzML file as RAM but is faster. default: false
--peakpicking_ms_levels Which MS levels to pick. default: [] which means auto-convert all non-centroided
Peptide Re-indexing:
--IL_equivalent Should isoleucine and leucine be treated interchangeably? Default: true
--allow_unmatched Ignore unmatched peptides (Default: false; only activate if you double-checked all other settings)
PSM Rescoring:
--posterior_probabilities How to calculate posterior probabilities for PSMs:
"percolator" = Re-score based on PSM-feature-based SVM and transform distance
to hyperplane for posteriors
"fit_distributions" = Fit positive and negative distributions to scores
(similar to PeptideProphet)
--rescoring_debug Debug level during PSM rescoring
--psm_pep_fdr_cutoff FDR cutoff on PSM level (or potential peptide level; see Percolator options) before going into
feature finding, map alignment and inference.
Percolator specific:
--train_FDR False discovery rate threshold to define positive examples in training. Set to testFDR if 0
--test_FDR False discovery rate threshold for evaluating best cross validation result and reported end result
--percolator_fdr_level Level of FDR calculation ('peptide-level-fdrs' or 'psm-level-fdrs')
--description_correct_features Description of correct features for Percolator (0, 1, 2, 4, 8, see Percolator retention time and calibration)
--generic_feature_set Use only generic (i.e. not search engine specific) features. Generating search engine specific
features for common search engines by PSMFeatureExtractor will typically boost the identification rate significantly.
--subset_max_train Only train an SVM on a subset of PSMs, and use the resulting score vector to evaluate the other
PSMs. Recommended when analyzing huge numbers (>1 million) of PSMs. When set to 0, all PSMs are used for training as normal.
--klammer Retention time features are calculated as in Klammer et al. instead of with Elude
Distribution specific:
--outlier_handling How to handle outliers during fitting:
- ignore_iqr_outliers (default): ignore outliers outside of 3*IQR from Q1/Q3 for fitting
- set_iqr_to_closest_valid: set IQR-based outliers to the last valid value for fitting
- ignore_extreme_percentiles: ignore everything outside 99th and 1st percentile (also removes equal values like potential censored max values in XTandem)
- none: do nothing
--top_hits_only Use only the top hits for fitting
//TODO add more options for rescoring part
ConsensusID:
--consensusid_algorithm Choose method to combine probabilities from multiple search engines (if used). Valid: best, worst, average, rank, PEPMatrix, PEPIons (Default: best)
--min_consensus_support Choose ratio of ADDITIONAL evidence for a peptide ID of a spectrum. Varies across methods. See documentation for further info. (Default: 0)
--consensusid_considered_top_hits Number of top hits per spectrum considered for consensus scoring. (Default: 0 = all)
Inference and Quantification:
--inf_quant_debug Debug level during inference and quantification. (WARNING: Higher than 666 may produce a lot
of additional output files)
Inference:
--protein_inference Infer proteins through:
"aggregation" = aggregates all peptide scores across a protein (by calculating the maximum)
"bayesian" = computes a posterior probability for every protein based on a Bayesian network
("percolator" not yet supported)
--protein_level_fdr_cutoff Protein level FDR cutoff (this affects and chooses the peptides used for quantification)
Quantification:
--quantification_method Quantification method supported by proteomicslfq ('feature_intensity' or 'spectral_counting', default: 'feature_intensity')
--transfer_ids Transfer IDs over aligned samples to increase # of quantifiable features (WARNING:
increased memory consumption). (default: false) TODO must specify true or false
--targeted_only Only ID based quantification. (default: true) TODO must specify true or false
--mass_recalibration Recalibrates masses to correct for instrument biases. (default: false) TODO must specify true
or false
//TODO the following need to be passed still
--psm_pep_fdr_for_quant PSM/peptide level FDR used for quantification (if filtering on protein level is not enough)
If Bayesian inference was chosen, this will be a peptide-level FDR and only the best PSMs per
peptide will be reported.
(default: off = 1.0)
--protein_quantification Quantify proteins based on:
"unique_peptides" = use peptides mapping to single proteins or a group of indistinguishable proteins (according to the set of experimentally identified peptides)
"strictly_unique_peptides" = use peptides mapping to a unique single protein only
"shared_peptides" = use shared peptides only for its best group (by inference score)
Statistical post-processing:
--skip_post_msstats Skip MSstats for statistical post-processing?
--ref_condition Instead of all pairwise contrasts, uses the given condition number (corresponding to your experimental design) as a reference and
creates pairwise contrasts against it (TODO fully implement)
--contrasts Specify a set of contrasts in a semicolon seperated list of R-compatible contrasts with the
condition numbers as variables (e.g. "1-2;1-3;2-3"). Overwrites "--reference" (TODO fully implement)
Quality control:
--ptxqc_report_layout Specify a yaml file for the report layout (see PTXQC documentation) (TODO fully implement)
Other options:
--outdir [file] The output directory where the results will be saved
--publish_dir_mode [str] Mode for publishing results in the output directory. Available: symlink, rellink, link, copy, copyNoFollow, move (Default: copy)
--email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
--email_on_fail [email] Same as --email, except only send mail if the workflow is not successful
-name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic
AWSBatch options:
--awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch
--awsregion [str] The AWS Region for your AWS Batch job to run on
--awscli [str] Path to the AWS CLI tool
""".stripIndent()
}
// Show help message
if (params.help) {
helpMessage()
exit 0
}
/*
* SET UP CONFIGURATION VARIABLES
*/
// Has the run name been specified by the user?
// this has the bonus effect of catching both -name and --name
custom_runName = params.name
if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) {
custom_runName = workflow.runName
}
// Check AWS batch settings
if (workflow.profile.contains('awsbatch')) {
// AWSBatch sanity checking
if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!"
// Check outdir paths to be S3 buckets if running on AWSBatch
// related: https://github.com/nextflow-io/nextflow/issues/813
if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!"
// Prevent trace files to be stored on S3 since S3 does not support rolling files.
if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles."
}
// Stage config files
ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true)
ch_output_docs_images = file("$baseDir/docs/images/", checkIfExists: true)
// Validate input
if (isCollectionOrArray(params.input))
{
tocheck = params.input[0]
} else {
tocheck = params.input
}
sdrf_file = null
if (tocheck.toLowerCase().endsWith("sdrf") || tocheck.toLowerCase().endsWith("tsv")) {
sdrf_file = params.input
} else if (tocheck.toLowerCase().endsWith("mzml") || tocheck.toLowerCase().endsWith("raw")) {
spectra_files = params.input
} else {
log.error "EITHER spectra data (mzML/raw) OR an SDRF needs to be provided as input."; exit 1
}
params.database = params.database ?: { log.error "No protein database provided. Make sure you have used the '--database' option."; exit 1 }()
params.outdir = params.outdir ?: { log.warn "No output directory provided. Will put the results into './results'"; return "./results" }()
/*
* Create a channel for input files
*/
//Filename FixedModifications VariableModifications Label PrecursorMassTolerance PrecursorMassToleranceUnit FragmentMassTolerance DissociationMethod Enzyme
if (!sdrf_file)
{
ch_spectra = Channel.fromPath(spectra_files, checkIfExists: true)
ch_spectra
.multiMap{ it -> id = it.toString().md5()
comet_settings: msgf_settings: msfragger_settings: msfragger_open_settings: tuple(id,
params.fixed_mods,
params.variable_mods,
"", //labelling modifications currently not supported
params.precursor_mass_tolerance,
params.precursor_mass_tolerance_unit,
params.fragment_mass_tolerance,
params.fragment_mass_tolerance_unit,
params.fragment_method,
params.enzyme)
idx_settings: tuple(id,
params.enzyme)
luciphor_settings:
tuple(id,
params.fragment_method)
mzmls: tuple(id,it)}
.set{ch_sdrf_config}
}
else
{
ch_sdrf = Channel.fromPath(sdrf_file, checkIfExists: true)
/*
* STEP 0 - SDRF parsing
*/
process sdrf_parsing {
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
input:
file sdrf from ch_sdrf
output:
file "experimental_design.tsv" into ch_expdesign
file "openms.tsv" into ch_sdrf_config_file
when:
sdrf_file
script:
"""
## -t2 since the one-table format parser is broken in OpenMS2.5
## -l for legacy behavior to always add sample columns
parse_sdrf convert-openms -t2 -l -s ${sdrf} > sdrf_parsing.log
"""
}
//TODO use header and reference by col name instead of index
ch_sdrf_config_file
.splitCsv(skip: 1, sep: '\t')
.multiMap{ row -> id = row.toString().md5()
comet_settings: msgf_settings: msfragger_settings: msfragger_open_settings: tuple(id,
row[2],
row[3],
row[4],
row[5],
row[6],
row[7],
row[8],
row[9],
row[10])
idx_settings: tuple(id,
row[10])
luciphor_settings:
tuple(id,
row[9])
mzmls: tuple(id, !params.root_folder ?
row[0] :
params.root_folder + "/" + (params.local_input_type ?
row[1].take(row[1].lastIndexOf('.')) + '.' + params.local_input_type :
row[1]))}
.set{ch_sdrf_config}
}
ch_db_for_decoy_creation = Channel.fromPath(params.database)
// overwrite experimental design if given additionally to SDRF
//TODO think about that
if (params.expdesign)
{
Channel
.fromPath(params.expdesign)
.set { ch_expdesign }
}
ch_sdrf_config.mzmls
.branch {
raw: hasExtension(it[1], 'raw')
mzML: hasExtension(it[1], 'mzML')
}
.set {branched_input}
//TODO we could also check for outdated mzML versions and try to update them
branched_input.mzML
.branch {
nonIndexedMzML: file(it[1]).withReader {
f = it;
1.upto(5) {
if (f.readLine().contains("indexedmzML")) return false;
}
return true;
}
inputIndexedMzML: file(it[1]).withReader {
f = it;
1.upto(5) {
if (f.readLine().contains("indexedmzML")) return true;
}
return false;
}
}
.set {branched_input_mzMLs}
//Push raw files through process that does the conversion, everything else directly to downstream Channel with mzMLs
//This piece only runs on data that is a.) raw and b.) needs conversion
//mzML files will be mixed after this step to provide output for downstream processing - allowing you to even specify mzMLs and RAW files in a mixed mode as input :-)
/*
* STEP 0.1 - Raw file conversion
*/
process raw_file_conversion {
label 'process_low'
label 'process_single_thread'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
input:
tuple mzml_id, path(rawfile) from branched_input.raw
output:
tuple mzml_id, file("*.mzML") into mzmls_converted
script:
"""
ThermoRawFileParser.sh -i=${rawfile} -f=2 -o=./ > ${rawfile}_conversion.log
"""
}
/*
* STEP 0.2 - MzML indexing
*/
process mzml_indexing {
label 'process_low'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
input:
tuple mzml_id, path(mzmlfile) from branched_input_mzMLs.nonIndexedMzML
output:
tuple mzml_id, file("out/*.mzML") into mzmls_indexed
file "*.log"
script:
"""
mkdir out
FileConverter -in ${mzmlfile} -out out/${mzmlfile.baseName}.mzML > ${mzmlfile.baseName}_mzmlindexing.log
"""
}
//Mix the converted raw data with the already supplied mzMLs and push these to the same channels as before
if (params.openms_peakpicking)
{
branched_input_mzMLs.inputIndexedMzML.mix(mzmls_converted).mix(mzmls_indexed).set{mzmls_pp}
(mzmls_comet, mzmls_msgf, mzmls_msfragger, mzmls_msfragger, mzmls_luciphor, mzmls_plfq, mzmls_ptmshepherd, mzmls_deltamass) = [Channel.empty(), Channel.empty(), Channel.empty(), Channel.empty(), Channel.empty(), Channel.empty(), Channel.empty(), Channel.empty()]
}
else
{
branched_input_mzMLs.inputIndexedMzML.mix(mzmls_converted).mix(mzmls_indexed).into{mzmls_comet; mzmls_msgf; mzmls_msfragger; mzmls_msfragger_open; mzmls_luciphor; mzmls_plfq; mzmls_ptmshepherd; mzmls_deltamass}
mzmls_pp = Channel.empty()
}
//Fill the channels with empty Channels in case that we want to add decoys. Otherwise fill with output from database.
(searchengine_in_db_msgf, searchengine_in_db_comet, searchengine_in_db_msfragger, searchengine_in_db_msfragger_open, searchengine_in_db_peptideprophet, pepidx_in_db, plfq_in_db) = ( params.add_decoys
? [ Channel.empty(), Channel.empty(), Channel.empty(), Channel.empty(), Channel.empty(), Channel.empty(), Channel.empty() ]
: [ Channel.fromPath(params.database), Channel.fromPath(params.database), Channel.fromPath(params.database), Channel.fromPath(params.database), Channel.fromPath(params.database), Channel.fromPath(params.database), Channel.fromPath(params.database) ] )
//Add decoys if params.add_decoys is set appropriately
process generate_decoy_database {
label 'process_very_low'
label 'process_single_thread'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
input:
file(mydatabase) from ch_db_for_decoy_creation
output:
file "${mydatabase.baseName}_decoy.fasta" into searchengine_in_db_decoy_msgf, searchengine_in_db_decoy_comet, searchengine_in_db_decoy_msfragger, searchengine_in_db_decoy_msfragger_open, searchengine_in_db_decoy_peptideprophet, pepidx_in_db_decoy, plfq_in_db_decoy
file "*.log"
when:
params.add_decoys
script:
"""
DecoyDatabase -in ${mydatabase} \\
-out ${mydatabase.baseName}_decoy.fasta \\
-decoy_string ${params.decoy_affix} \\
-decoy_string_position ${params.affix_type} \\
> ${mydatabase.baseName}_decoy_database.log
"""
}
// Doesnt work yet. Maybe point the script to the workspace?
// All the files should be there after collecting.
//process generate_simple_exp_design_file {
// publishDir "${params.outdir}", mode: 'copy'
// input:
// val mymzmls from mzmls.collect()
// output:
// file "expdesign.tsv" into expdesign
// when:
// !params.expdesign
// script:
// strng = new File(mymzmls[0].toString()).getParentFile()
// """
// create_trivial_design.py ${strng} 1 > expdesign.tsv
// """
//}
process openms_peakpicker {
label 'process_low'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
input:
tuple mzml_id, path(mzml_file) from mzmls_pp
when:
params.openms_peakpicking
output:
tuple mzml_id, file("out/${mzml_file.baseName}.mzML") into mzmls_comet_picked, mzmls_msgf_picked, mzmls_msfragger_picked, mzmls_msfragger_open_picked, mzmls_plfq_picked, mzmls_ptmshepherd_picked, mzmls_deltamass_picked
script:
in_mem = params.peakpicking_inmemory ? "inmemory" : "lowmemory"
lvls = params.peakpicking_ms_levels ? "-algorithm:ms_levels ${params.peakpicking_ms_levels}" : ""
"""
mkdir out
PeakPickerHiRes -in ${mzml_file} \\
-out out/${mzml_file.baseName}.mzML \\
-threads ${task.cpus} \\
-debug ${params.pp_debug} \\
-processOption ${in_mem} \\
${lvls} \\
> ${mzml_file.baseName}_pp.log
"""
}
if (params.enzyme == "unspecific cleavage")
{
params.num_enzyme_termini == "none"
}
pepidx_num_enzyme_termini = params.num_enzyme_termini
if (params.num_enzyme_termini == "fully")
{
pepidx_num_enzyme_termini = "full"
}
process search_engine_msgf {
label 'process_medium'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
// ---------------------------------------------------------------------------------------------------------------------
// ------------- WARNING: If you experience nextflow running forever after a failure, set the following ----------------
// ---------------------------------------------------------------------------------------------------------------------
// This is probably true for other processes as well. See https://github.com/nextflow-io/nextflow/issues/1457
// errorStrategy 'terminate'
input:
tuple file(database), mzml_id, path(mzml_file), fixed, variable, label, prec_tol, prec_tol_unit, frag_tol, frag_tol_unit, diss_meth, enzyme from searchengine_in_db_msgf.mix(searchengine_in_db_decoy_msgf).combine(mzmls_msgf.mix(mzmls_msgf_picked).join(ch_sdrf_config.msgf_settings))
// This was another way of handling the combination
//file database from searchengine_in_db.mix(searchengine_in_db_decoy)
//each file(mzml_file) from mzmls
when:
params.search_engines.contains("msgf")
output:
tuple mzml_id, file("${mzml_file.baseName}_msgf.idXML") into id_files_msgf
file "*.log"
script:
if (enzyme == 'Trypsin') enzyme = 'Trypsin/P'
else if (enzyme == 'Arg-C') enzyme = 'Arg-C/P'
else if (enzyme == 'Asp-N') enzyme = 'Asp-N/B'
else if (enzyme == 'Chymotrypsin') enzyme = 'Chymotrypsin/P'
else if (enzyme == 'Lys-C') enzyme = 'Lys-C/P'
if ((frag_tol.toDouble() < 50 && frag_tol_unit == "ppm") || (frag_tol.toDouble() < 0.1 && frag_tol_unit == "Da"))
{
inst = params.instrument ?: "high_res"
} else {
inst = params.instrument ?: "low_res"
}
"""
MSGFPlusAdapter -in ${mzml_file} \\
-out ${mzml_file.baseName}_msgf.idXML \\
-threads ${task.cpus} \\
-java_memory ${task.memory.toMega()} \\
-database "${database}" \\
-instrument ${inst} \\
-protocol "${params.protocol}" \\
-matches_per_spec ${params.num_hits} \\
-min_precursor_charge ${params.min_precursor_charge} \\
-max_precursor_charge ${params.max_precursor_charge} \\
-min_peptide_length ${params.min_peptide_length} \\
-max_peptide_length ${params.max_peptide_length} \\
-enzyme "${enzyme}" \\
-tryptic ${params.num_enzyme_termini} \\
-precursor_mass_tolerance ${prec_tol} \\
-precursor_error_units ${prec_tol_unit} \\
-fixed_modifications ${fixed.tokenize(',').collect { "'${it}'" }.join(" ") } \\
-variable_modifications ${variable.tokenize(',').collect { "'${it}'" }.join(" ") } \\
-max_mods ${params.max_mods} \\
-debug ${params.db_debug} \\
> ${mzml_file.baseName}_msgf.log
"""
}
process search_engine_comet {
label 'process_medium'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
// ---------------------------------------------------------------------------------------------------------------------
// ------------- WARNING: If you experience nextflow running forever after a failure, set the following ----------------
// ---------------------------------------------------------------------------------------------------------------------
// This is probably true for other processes as well. See https://github.com/nextflow-io/nextflow/issues/1457
//errorStrategy 'terminate'
input:
tuple file(database), mzml_id, path(mzml_file), fixed, variable, label, prec_tol, prec_tol_unit, frag_tol, frag_tol_unit, diss_meth, enzyme from searchengine_in_db_comet.mix(searchengine_in_db_decoy_comet).combine(mzmls_comet.mix(mzmls_comet_picked).join(ch_sdrf_config.comet_settings))
when:
params.search_engines.contains("comet")
output:
tuple mzml_id, file("${mzml_file.baseName}_comet.idXML") into id_files_comet
file "*.log"
//TODO we currently ignore the activation_method param to leave the default "ALL" for max. compatibility
//Note: OpenMS CometAdapter will double the number that is passed to fragment_mass_tolerance to "convert"
// it to a fragment_bin_tolerance
script:
if (frag_tol_unit == "ppm") {
// Note: This uses an arbitrary rule to decide if it was hi-res or low-res
// and uses Comet's defaults for bin size (i.e. by passing 0.5*default to the Adapter), in case unsupported unit "ppm" was given.
if (frag_tol.toDouble() < 50) {
bin_tol = 0.015
bin_offset = 0.0
inst = params.instrument ?: "high_res"
} else {
bin_tol = 0.50025
bin_offset = 0.4
inst = params.instrument ?: "low_res"
}
log.warn "The chosen search engine Comet does not support ppm fragment tolerances. We guessed a " + inst +
" instrument and set the fragment_bin_tolerance to " + bin_tol
} else {
//TODO expose the fragment_bin_offset parameter of comet
bin_tol = frag_tol.toDouble()
bin_offset = bin_tol <= 0.05 ? 0.0 : 0.4
if (!params.instrument)
{
inst = bin_tol <= 0.05 ? "high_res" : "low_res"
} else {
inst = params.instrument
}
}
// for consensusID the cutting rules need to be the same. So we adapt to the loosest rules from MSGF
// TODO find another solution. In ProteomicsLFQ we re-run PeptideIndexer (remove??) and if we
// e.g. add XTandem, after running ConsensusID it will lose the auto-detection ability for the
// XTandem specific rules.
if (params.search_engines.contains("msgf"))
{
if (enzyme == 'Trypsin') enzyme = 'Trypsin/P'
else if (enzyme == 'Arg-C') enzyme = 'Arg-C/P'
else if (enzyme == 'Asp-N') enzyme = 'Asp-N/B'
else if (enzyme == 'Chymotrypsin') enzyme = 'Chymotrypsin/P'
else if (enzyme == 'Lys-C') enzyme = 'Lys-C/P'
}
"""
CometAdapter -in ${mzml_file} \\
-out ${mzml_file.baseName}_comet.idXML \\
-threads ${task.cpus} \\
-database "${database}" \\
-instrument ${inst} \\
-missed_cleavages ${params.allowed_missed_cleavages} \\
-num_hits ${params.num_hits} \\
-num_enzyme_termini ${params.num_enzyme_termini} \\
-enzyme "${enzyme}" \\
-precursor_charge ${params.min_precursor_charge}:${params.max_precursor_charge} \\
-fixed_modifications ${fixed.tokenize(',').collect { "'${it}'" }.join(" ") } \\
-variable_modifications ${variable.tokenize(',').collect { "'${it}'" }.join(" ") } \\
-max_variable_mods_in_peptide ${params.max_mods} \\
-precursor_mass_tolerance ${prec_tol} \\
-precursor_error_units ${prec_tol_unit} \\
-fragment_mass_tolerance ${bin_tol} \\
-fragment_bin_offset ${bin_offset} \\
-debug ${params.db_debug} \\
-force \\
> ${mzml_file.baseName}_comet.log
"""
}
process search_engine_msfragger {
label 'process_medium'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
// ---------------------------------------------------------------------------------------------------------------------
// ------------- WARNING: If you experience nextflow running forever after a failure, set the following ----------------
// ---------------------------------------------------------------------------------------------------------------------
// This is probably true for other processes as well. See https://github.com/nextflow-io/nextflow/issues/1457
//errorStrategy 'terminate'
input:
tuple file(database), val(mzml_id), path(mzml_file), fixed, variable, label, prec_tol, prec_tol_unit, frag_tol, frag_tol_unit, diss_meth, enzyme from searchengine_in_db_msfragger.mix(searchengine_in_db_decoy_msfragger).combine(mzmls_msfragger.mix(mzmls_msfragger_picked).join(ch_sdrf_config.msfragger_settings))
when:
params.search_engines.contains("msfragger")
output:
tuple mzml_id, file("${mzml_file.baseName}_msfragger.idXML") into id_files_msfragger
file "*.log"
script:
"""
MSFraggerAdapter -in \$PWD/${mzml_file} \\
-out ${mzml_file.baseName}_msfragger.idXML \\
-threads ${task.cpus} \\
-license yes \\
-database \$PWD/${database} \\
-digest:allowed_missed_cleavage ${params.allowed_missed_cleavages} \\
-digest:num_enzyme_termini ${params.num_enzyme_termini} \\
-digest:search_enzyme_name "${enzyme}" \\
-statmod:unimod ${fixed.tokenize(',').collect { "'${it}'" }.join(" ") } \\
-varmod:unimod ${variable.tokenize(',').collect { "'${it}'" }.join(" ") } \\
-tolerance:precursor_mass_tolerance_lower ${prec_tol} \\
-tolerance:precursor_mass_tolerance_upper ${prec_tol} \\
-tolerance:precursor_mass_unit ${prec_tol_unit} \\
-tolerance:fragment_mass_tolerance ${frag_tol} \\
-tolerance:fragment_mass_unit ${frag_tol_unit} \\
-debug ${params.db_debug}\\
> ${mzml_file.baseName}_msfragger.log
"""
}
process search_engine_msfragger_open {
label 'process_medium'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
// ---------------------------------------------------------------------------------------------------------------------
// ------------- WARNING: If you experience nextflow running forever after a failure, set the following ----------------
// ---------------------------------------------------------------------------------------------------------------------
// This is probably true for other processes as well. See https://github.com/nextflow-io/nextflow/issues/1457
//errorStrategy 'terminate'
input:
tuple file(database), val(mzml_id), path(mzml_file), fixed, variable, label, prec_tol, prec_tol_unit, frag_tol, frag_tol_unit, diss_meth, enzyme from searchengine_in_db_msfragger_open.mix(searchengine_in_db_decoy_msfragger_open).combine(mzmls_msfragger_open.mix(mzmls_msfragger_open_picked).join(ch_sdrf_config.msfragger_open_settings))
when:
params.search_engines.contains("msfragger")
params.open_search.contains("yes")
output:
tuple val(mzml_id), file("${mzml_file.baseName}_msfragger.pepXML") into pep_files_msfragger
file "*.log"
script:
"""
MSFraggerAdapter -in \$PWD/${mzml_file} \\
-out ${mzml_file.baseName}_msfragger.idXML \\
-opt_out ${mzml_file.baseName}_msfragger.pepXML \\
-threads ${task.cpus} \\
-license yes \\
-database \$PWD/${database} \\
-digest:allowed_missed_cleavage ${params.allowed_missed_cleavages} \\
-digest:num_enzyme_termini ${params.num_enzyme_termini} \\
-digest:search_enzyme_name "${enzyme}" \\
-statmod:unimod ${fixed.tokenize(',').collect { "'${it}'" }.join(" ") } \\
-varmod:unimod ${variable.tokenize(',').collect { "'${it}'" }.join(" ") } \\
-tolerance:precursor_mass_tolerance_lower 150 \\
-tolerance:precursor_mass_tolerance_upper 500 \\
-tolerance:precursor_mass_unit Da \\
-tolerance:fragment_mass_tolerance ${frag_tol} \\
-tolerance:fragment_mass_unit ${frag_tol_unit} \\
-debug ${params.db_debug} \\
> ${mzml_file.baseName}_msfragger.log
"""
}
process peptideprophet {
label 'process_low'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
input:
tuple file(database), val(mzml_id), file(pepXML) from searchengine_in_db_peptideprophet.mix(searchengine_in_db_decoy_peptideprophet).combine(pep_files_msfragger)
output:
tuple val(mzml_id), file("${pepXML.baseName}_psm.tsv") into psm_ch
file "*.log"
"""
echo "------------Workspace init------------" >> ${pepXML.baseName}_peptideprophet.log
philosopher workspace --clean >> ${pepXML.baseName}_peptideprophet.log
philosopher workspace --init >> ${pepXML.baseName}_peptideprophet.log
echo "------------Read Database-------------" >> ${pepXML.baseName}_peptideprophet.log
philosopher database --custom ${database} >> ${pepXML.baseName}_peptideprophet.log
echo "--------------Read File--------------" >> ${pepXML.baseName}_peptideprophet.log
philosopher peptideprophet --database ${database} --ppm --accmass --expectscore --decoyprobs --nonparam ${pepXML} >> ${pepXML.baseName}_peptideprophet.log
echo "\n------------Postprocess------------" >> ${pepXML.baseName}_peptideprophet.log
philosopher filter --pepxml "interact-${pepXML.baseName}.pep.xml" --tag ${params.decoy_affix} >> ${pepXML.baseName}_peptideprophet.log
philosopher report >> ${pepXML.baseName}_peptideprophet.log
mv psm.tsv ${pepXML.baseName}_psm.tsv
"""
}
process ptmshepherd {
label 'process_low'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
publishDir "${params.outdir}/open_search", mode: 'copy', pattern: '*global.modsummary.tsv'
input:
tuple val(mzml_id), file(mzml_file), file(psm) from mzmls_ptmshepherd.mix(mzmls_ptmshepherd_picked).join(psm_ch)
output:
tuple val(mzml_id), file("${mzml_file.baseName}_global.modsummary.tsv") into globalmod_ch
file "*.log"
"""
echo "
dataset = ${mzml_file.baseName} $psm \$PWD
threads = 8
histo_bindivs = 5000
histo_smoothbins = 2
peakpicking_promRatio = 0.3
peakpicking_width = 0.002
peakpicking_topN = 500
precursor_tol = 0.01
spectra_ppmtol = 20.0
spectra_condPeaks = 100
spectra_condRatio = 0.02
varmod_masses = Failed_Carbamidomethylation:-57.021464
localization_background = 4
output_extended = true" > shepherd_config.txt
java -jar /thirdparty/PTMShepherd/ptmshepherd-0.3.5.jar shepherd_config.txt > ${mzml_file.baseName}_ptmshepherd.log
mv global.modsummary.tsv ${mzml_file.baseName}_global.modsummary.tsv
"""
}
process deltamass {
label 'process_low'
publishDir "${params.outdir}/open_search", mode: 'copy'
input:
tuple val(mzml_id), file(mzml_file), file(globalmod) from mzmls_deltamass.mix(mzmls_deltamass_picked).join(globalmod_ch)
output:
file "${mzml_file.baseName}_delta-mass.html"
"""
python3 /thirdparty/MSFragger/Delta_Mass_Hist.py -i $globalmod -o ${mzml_file.baseName}_delta-mass.html
"""
}
process index_peptides {
label 'process_low'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
input:
tuple mzml_id, file(id_file), val(enzyme), file(database) from id_files_msgf.mix(id_files_comet).mix(id_files_msfragger).combine(ch_sdrf_config.idx_settings, by: 0).combine(pepidx_in_db.mix(pepidx_in_db_decoy))
output:
tuple mzml_id, file("${id_file.baseName}_idx.idXML") into id_files_idx_ForPerc, id_files_idx_ForIDPEP, id_files_idx_ForIDPEP_noFDR
file "*.log"
script:
def il = params.IL_equivalent ? '-IL_equivalent' : ''
def allow_um = params.allow_unmatched ? '-allow_unmatched' : ''
// see comment in CometAdapter. Alternative here in PeptideIndexer is to let it auto-detect the enzyme by not specifying.
if (params.search_engines.contains("msgf"))
{
if (enzyme == 'Trypsin') enzyme = 'Trypsin/P'
else if (enzyme == 'Arg-C') enzyme = 'Arg-C/P'
else if (enzyme == 'Asp-N') enzyme = 'Asp-N/B'
else if (enzyme == 'Chymotrypsin') enzyme = 'Chymotrypsin/P'
else if (enzyme == 'Lys-C') enzyme = 'Lys-C/P'
}
"""
PeptideIndexer -in ${id_file} \\
-out ${id_file.baseName}_idx.idXML \\
-threads ${task.cpus} \\
-fasta ${database} \\
-enzyme:name "${enzyme}" \\
-enzyme:specificity ${pepidx_num_enzyme_termini} \\
${il} \\
${allow_um} \\
> ${id_file.baseName}_index_peptides.log
"""
}
// ---------------------------------------------------------------------
// Branch a) Q-values and PEP from Percolator
process extract_percolator_features {
label 'process_very_low'
label 'process_single_thread'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
input:
tuple mzml_id, file(id_file) from id_files_idx_ForPerc
output:
tuple mzml_id, file("${id_file.baseName}_feat.idXML") into id_files_idx_feat
file "*.log"
when:
params.posterior_probabilities == "percolator"
script:
"""
PSMFeatureExtractor -in ${id_file} \\
-out ${id_file.baseName}_feat.idXML \\
-threads ${task.cpus} \\
> ${id_file.baseName}_extract_percolator_features.log
"""
}
//Note: from here, we do not need any settings anymore. so we can skip adding the mzml_id to the channels
//TODO find a way to run across all runs merged
process percolator {
//TODO Actually it heavily depends on the subset_max_train option and the number of IDs
// would be cool to get an estimate by parsing the number of IDs from previous tools.
label 'process_medium'
//Since percolator 3.5 it allows for 27 parallel tasks
cpus { check_max( 27, 'cpus' ) }
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
publishDir "${params.outdir}/raw_ids", mode: 'copy', pattern: '*.idXML'
input:
tuple mzml_id, file(id_file) from id_files_idx_feat
output:
tuple mzml_id, file("${id_file.baseName}_perc.idXML"), val("MS:1001491") into id_files_perc, id_files_perc_consID
file "*.log"
when:
params.posterior_probabilities == "percolator"
// NICE-TO-HAVE: the decoy-pattern is automatically detected from PeptideIndexer.
// Parse its output and put the correct one here.
script:
if (params.klammer && params.description_correct_features == 0) {
log.warn('Klammer was specified, but description of correct features was still 0. Please provide a description of correct features greater than 0.')
log.warn('Klammer will be implicitly off!')
}
// currently post-processing-tdc is always set since we do not support separate TD databases
"""
## Percolator does not have a threads parameter. Set it via OpenMP env variable,
## to honor threads on clusters
OMP_NUM_THREADS=${task.cpus} PercolatorAdapter \\
-in ${id_file} \\
-out ${id_file.baseName}_perc.idXML \\
-threads ${task.cpus} \\
-subset_max_train ${params.subset_max_train} \\
-decoy_pattern ${params.decoy_affix} \\
-post_processing_tdc \\
-score_type pep \\
> ${id_file.baseName}_percolator.log
"""
}
// ---------------------------------------------------------------------
// Branch b) Q-values and PEP from OpenMS
if(params.posterior_probabilities != "percolator" && params.search_engines.split(",").size() == 1)
{
id_files_idx_ForIDPEP_noFDR = Channel.empty()
}
process fdr_idpep {
label 'process_very_low'
label 'process_single_thread'
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
input:
tuple mzml_id, file(id_file) from id_files_idx_ForIDPEP
output:
tuple mzml_id, file("${id_file.baseName}_fdr.idXML") into id_files_idx_ForIDPEP_FDR
file "*.log"
when:
params.posterior_probabilities != "percolator" && params.search_engines.split(",").size() == 1
script:
"""
FalseDiscoveryRate -in ${id_file} \\
-out ${id_file.baseName}_fdr.idXML \\
-threads ${task.cpus} \\
-protein false \\
-algorithm:add_decoy_peptides \\
-algorithm:add_decoy_proteins \\
> ${id_file.baseName}_fdr.log
"""
}
//idpep picks the best scores for each search engine automatically. No switching needed after FDR.
process idpep {
label 'process_low'
// I think Eigen optimization is multi-threaded, so leave threads open
publishDir "${params.outdir}/logs", mode: 'copy', pattern: '*.log'
publishDir "${params.outdir}/raw_ids", mode: 'copy', pattern: '*.idXML'
input:
tuple mzml_id, file(id_file) from id_files_idx_ForIDPEP_FDR.mix(id_files_idx_ForIDPEP_noFDR)
output:
tuple mzml_id, file("${id_file.baseName}_idpep.idXML"), val("q-value_score") into id_files_idpep, id_files_idpep_consID
file "*.log"
when:
params.posterior_probabilities != "percolator"
script:
"""
IDPosteriorErrorProbability -in ${id_file} \\
-out ${id_file.baseName}_idpep.idXML \\