#!/usr/bin/python
"""
sipros_peptides_assembling.py
sipros_peptides_assembling is the second post-processing program,
run after sipros_peptides_filtering.py, for assembling peptides
into identified proteins.
Created by Tae-Hyuk (Ted) Ahn on 10/10/2012.
Updated by Xuan Guo on 02/20/2017.
Updated by Shengze Wang on 07/30/2022.
Copyright (c) 2012 Tae-Hyuk Ahn (ORNL). All rights reserved.
"""
## Import standard modules
from __future__ import division
import sys, getopt, warnings, os, re
from datetime import datetime, date, time
from collections import defaultdict
import csv
try:
    from sets import Set
except ImportError:
    pass
## Import Sipros package modules
import sipros_post_module
import parseconfig
## increase CSV field size limit
csv.field_size_limit(1000000000)
## Version control
def get_version():
    return "Sipros Ensemble 1.0.1 (Alpha)"
"""
1. for Sipros4.0 (added two columns in sip file)
new program should handle both types of sip files
"""
## Import classes
## Class Usage
Usage = sipros_post_module.Usage
## Class for ignoring comments '#' in sipros file
CommentedFile = sipros_post_module.CommentedFile
## Exit system with error message
die = sipros_post_module.die
## Returns the current time in a nice format
curr_time = sipros_post_module.curr_time
## Format time as a pretty string
format_time = sipros_post_module.format_time
## Check file exist
check_file_exist = sipros_post_module.check_file_exist
## Get file(s) list in working dir with specific file extension
get_file_list_with_ext = sipros_post_module.get_file_list_with_ext
## Get base_out filename
get_base_out = sipros_post_module.get_base_out
## Class for PepOutFields object
PepOutFields = sipros_post_module.PepOutFields
## Class for PsmOutFields object
PsmOutFields = sipros_post_module.PsmOutFields
Psm4OutFields = sipros_post_module.Psm4OutFields
## list_to_string
list_to_string = sipros_post_module.list_to_string
## Find string between two substrings
find_between = sipros_post_module.find_between
## Division error handling
divide = sipros_post_module.divide
## PrettyFloat (%0.5f)
PrettyFloat = sipros_post_module.PrettyFloat
## Check whether one list is a sub-list of another
check_sub_list = sipros_post_module.check_sub_list
## Get item list
get_item_list = sipros_post_module.get_item_list
## set_float_digit
set_float_digit = sipros_post_module.set_float_digit
## Help message
help_message = '''
Usage:
    python sipros_peptides_assembling.py [options]

Inputs:
    sipros_peptides_filtering output: [*.pep.txt] and [*.psm.txt] files
        (multiple peptide filtering results can be processed)
        (searched automatically in the current directory)
    sipros config file (searched automatically in the current directory)

Options:
    -h/--help
    -v/--version
    -w/--working-dir ./path/            # Directory path containing SIPROS output files
                                        # (default = current directory)
    -c/--config-file SiprosConfig.cfg   # SIPROS config file
                                        # (default = SiprosConfig.cfg)

Outputs:
    BaseFilename.pro.txt
    BaseFilename.pro2pep.txt
    BaseFilename.pro2psm.txt
        - where BaseFilename = common prefix of inputs
        - if len(BaseFilename) < 5, then BaseFilename = Sipros_searches
'''
## Global variables
pep_file_ext = '.pep.txt'
psm_file_ext = '.psm.txt'
pep_iden_str = '[Peptide_Identification]'
fasta_database_str = 'FASTA_Database'
pro_iden_str = '[Protein_Identification]'
testing_decoy_prefix_str = 'Testing_Decoy_Prefix'
training_decoy_prefix_str = 'Training_Decoy_Prefix'
reserved_decoy_prefix_str = 'Reserved_Decoy_Prefix'
min_peptide_per_protein_str = 'Min_Peptide_Per_Protein'
min_unique_peptide_per_protein_str = 'Min_Unique_Peptide_Per_Protein'
remove_decoy_identification_str = 'Remove_Decoy_Identification'
search_type_str = 'Search_Type'
sipros4_psmout_column_length = 20
sipros4_input = None
# default values
decoy_prefix = 'Rev_'
min_peptide_per_protein = 2
min_unique_peptide_per_protein = 1
remove_decoy_identification = 'No'
## Parse options
def parse_options(argv):

    try:
        opts, args = getopt.getopt(argv[1:], "hvVw:c:i:",
                                   ["help",
                                    "version",
                                    "working-dir",
                                    "config-file",
                                    "psm-pep-files"])
    # Error handling of options
    except getopt.error as msg:
        raise Usage(msg)

    # Default working dir and config file
    working_dir = "./"
    config_file = "SiprosConfig.cfg"
    psm_txt_file_str = ""
    pep_txt_file_str = ""

    # Basic options
    for option, value in opts:
        if option in ("-h", "--help"):
            raise Usage(help_message)
        if option in ("-v", "-V", "--version"):
            print("sipros_peptides_assembling.py V%s" % (get_version()))
            sys.exit(0)
        if option in ("-w", "--working-dir"):
            working_dir = value
            if working_dir[-1] != '/':
                working_dir = working_dir + '/'
        if option in ("-c", "--config-file"):
            config_file = value
        if option in ("-i", "--psm-pep-files"):
            split_l = value.split("____________")
            psm_txt_file_str = split_l[0]
            pep_txt_file_str = split_l[1]

    # only -w is provided
    if working_dir != "./" and config_file == "SiprosConfig.cfg":
        config_file = working_dir + config_file

    return (working_dir, config_file, psm_txt_file_str, pep_txt_file_str)
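# Usage sketch (hypothetical paths, for illustration only):
#   parse_options(['prog', '-w', '/data/run1/']) returns
#   ('/data/run1/', '/data/run1/SiprosConfig.cfg', '', ''),
#   i.e. when only -w is given, the default config filename is resolved inside
#   the working directory; an explicit -c value is used exactly as given.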
## Parse config file
def parse_config(config_filename):

    # Save all config values to dictionary
    all_config_dict = {}    # initialize dictionary
    # Save config values to dictionary
    config_dict = {}        # initialize dictionary

    # Call Yinfeng's parseconfig.py module
    check_file_exist(config_filename)
    all_config_dict = parseconfig.parseConfigKeyValues(config_filename)

    # only save protein_identification config info to config_dict
    config_dict[testing_decoy_prefix_str] = decoy_prefix
    config_dict[min_peptide_per_protein_str] = min_peptide_per_protein
    config_dict[min_unique_peptide_per_protein_str] = min_unique_peptide_per_protein
    config_dict[remove_decoy_identification_str] = remove_decoy_identification
    for key, value in all_config_dict.items():
        if key == (pep_iden_str + fasta_database_str):
            config_dict[fasta_database_str] = value
        elif key == (pro_iden_str + testing_decoy_prefix_str):
            config_dict[testing_decoy_prefix_str] = value
        elif key == (pro_iden_str + training_decoy_prefix_str):
            config_dict[training_decoy_prefix_str] = value
        elif key == (pro_iden_str + reserved_decoy_prefix_str):
            config_dict[reserved_decoy_prefix_str] = value
        elif key == (pro_iden_str + min_peptide_per_protein_str):
            config_dict[min_peptide_per_protein_str] = value
        elif key == (pro_iden_str + min_unique_peptide_per_protein_str):
            config_dict[min_unique_peptide_per_protein_str] = value
        elif key == (pro_iden_str + remove_decoy_identification_str):
            config_dict[remove_decoy_identification_str] = value
        elif key == (pep_iden_str + search_type_str):
            config_dict[search_type_str] = value
        else:
            continue

    # return config dictionary
    return config_dict
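# Sketch of the expected key layout (an assumption based on how the keys are
# compared above, not on parseconfig.py itself): a config section such as
#   [Protein_Identification]
#   Testing_Decoy_Prefix = Rev_
# is assumed to be flattened by parseconfig.parseConfigKeyValues into the key
# '[Protein_Identification]Testing_Decoy_Prefix', which is why each comparison
# concatenates pro_iden_str (or pep_iden_str) with the option name.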
## Read fasta file and save the description
def read_fasta_file(working_dir, config_dict):

    # get fasta filename from config file
    fasta_filename = config_dict[fasta_database_str].strip()
    fasta_filename_only = fasta_filename.split("/")[-1]
    # get working_dir
    if working_dir[-1] != '/':
        working_dir = working_dir + '/'
    fasta_filename_dir = working_dir + fasta_filename_only

    # check file exist and open the file
    try:
        with open(fasta_filename) as f:
            pass
        fasta_file = open(fasta_filename, 'r')
    except:
        try:
            with open(fasta_filename_only) as f:
                pass
            fasta_file = open(fasta_filename_only, 'r')
        except:
            try:
                with open(fasta_filename_dir) as f:
                    pass
                fasta_file = open(fasta_filename_dir, 'r')
            except:
                print('\nCannot open', fasta_filename, file=sys.stderr)
                print('Check your config file!', file=sys.stderr)
                die("Program exit!")

    # save the fasta ID and description to the fasta_ID_dict
    fasta_ID_dict = {}    # initialize dictionary
    # FASTA IDs are space delimited
    fasta_ID_del = ' '

    # read lines
    for line in fasta_file.readlines():
        line = line.strip()
        # check line start with '>'
        if line.startswith('>'):
            # protein ID is the first word without '>'
            protein_ID = line.split(fasta_ID_del)[0][1:]
            # protein description is the whole line without '>'
            protein_desc = line[1:]
            # replace "#" with "$"
            protein_desc = re.sub('#', '$', protein_desc)
            # replace tab with " "
            protein_desc = re.sub('\t', ' ', protein_desc)
            # save the fasta ID and description to the fasta_ID_dict
            fasta_ID_dict[protein_ID] = protein_desc

    return fasta_ID_dict
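# Example (hypothetical FASTA header): a line such as
#   >sp|P12345 elongation factor Tu
# is stored as fasta_ID_dict['sp|P12345'] = 'sp|P12345 elongation factor Tu',
# with any '#' replaced by '$' and tabs replaced by spaces.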
# the ratio between testing-decoy and forward protein entries in the FASTA database
test_decoy_fwr_ratio = 1
## Read fasta file and only save the description in the filtered PSM
def read_fasta_necessary_file(working_dir, config_dict, pro_greedy_list):

    pro_set = set()
    # if pro_greedy_list length > 0
    if len(pro_greedy_list) > 0:
        # loop pro_greedy_list
        for protein_one in pro_greedy_list:
            # Column1: Protein ID
            ProteinID = protein_one
            if (ProteinID.startswith('{')) and (ProteinID.endswith('}')):
                # multiple proteins exist
                protein_ID_list = get_item_list(ProteinID.strip())
                for protein_ID_one in protein_ID_list:
                    pro_set.add(protein_ID_one)
            # single ProteinID
            else:
                pro_set.add(ProteinID)

    # get fasta filename from config file
    fasta_filename = config_dict[fasta_database_str].strip()
    fasta_filename_only = fasta_filename.split("/")[-1]
    # get working_dir
    if working_dir[-1] != '/':
        working_dir = working_dir + '/'
    fasta_filename_dir = working_dir + fasta_filename_only

    # check file exist and open the file
    try:
        with open(fasta_filename) as f:
            pass
        fasta_file = open(fasta_filename, 'r')
    except:
        try:
            with open(fasta_filename_only) as f:
                pass
            fasta_file = open(fasta_filename_only, 'r')
        except:
            try:
                with open(fasta_filename_dir) as f:
                    pass
                fasta_file = open(fasta_filename_dir, 'r')
            except:
                print('\nCannot open', fasta_filename, file=sys.stderr)
                print('Check your config file!', file=sys.stderr)
                die("Program exit!")

    # save the fasta ID and description to the fasta_ID_dict
    fasta_ID_dict = {}    # initialize dictionary
    # FASTA IDs are space delimited
    fasta_ID_del = ' '

    train_str = config_dict[training_decoy_prefix_str]
    test_str = config_dict[testing_decoy_prefix_str]
    reserved_str = ''
    if reserved_decoy_prefix_str in config_dict:
        reserved_str = config_dict[reserved_decoy_prefix_str]
    if config_dict[search_type_str] == 'SIP':
        train_str = test_str + '_fake'
    num_train_int = 0
    num_test_int = 0
    num_reserved_int = 0
    num_fwd_int = 0
    less_train_str = '>' + train_str
    less_test_str = '>' + test_str
    less_reserved_str = '>' + reserved_str

    # read lines
    for line in fasta_file:
        line = line.strip()
        # check line start with '>'
        if line.startswith('>'):
            if line.startswith(less_train_str):
                num_train_int += 1
            elif line.startswith(less_test_str):
                num_test_int += 1
            elif reserved_str != '' and line.startswith(less_reserved_str):
                num_reserved_int += 1
            else:
                num_fwd_int += 1
            # protein ID is the first word without '>'
            protein_ID = line.split(fasta_ID_del)[0][1:]
            if protein_ID not in pro_set:
                continue
            # protein description is the whole line without '>'
            protein_desc = line[1:]
            # replace "#" with "$"
            protein_desc = re.sub('#', '$', protein_desc)
            # replace tab with " "
            protein_desc = re.sub('\t', ' ', protein_desc)
            # save the fasta ID and description to the fasta_ID_dict
            fasta_ID_dict[protein_ID] = protein_desc

    global test_decoy_fwr_ratio
    test_decoy_fwr_ratio = float(num_test_int) / float(num_fwd_int)
    fasta_file.close()
    return fasta_ID_dict
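# Worked example (hypothetical counts): a database with 10,000 forward entries
# and 10,000 entries whose IDs start with the testing decoy prefix gives
# test_decoy_fwr_ratio = 10000 / 10000 = 1.0; with 5,000 testing decoys it
# would be 0.5. The ratio feeds the (currently commented-out) FDR adjustment
# in report_output.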
## check pep and psm files pair set, and save run#
def get_run_num(pep_file_list, psm_file_list):

    # dictionary of Run# for each pep_file
    run_num_dict = {}
    psm_run_num_dict = {}

    # read multiple pep files
    for pep_file_idx, pep_file in enumerate(pep_file_list):
        # check file exist and open
        check_file_exist(pep_file)
        # If base common prefix ends with '.pep.txt', then remove '.pep.txt'
        base_pep_file = pep_file.replace(pep_file_ext, "")
        # make a psm filename using pep filename
        psm_file = base_pep_file + psm_file_ext
        # check psm file exist
        check_file_exist(psm_file)
        # for Run#
        run_tag = 'Run' + str(pep_file_idx + 1)
        # run_num_dict
        run_num_dict[pep_file] = run_tag
        psm_run_num_dict[psm_file] = run_tag

    return (run_num_dict, psm_run_num_dict)
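# Example (hypothetical filenames): for pep_file_list = ['A.pep.txt', 'B.pep.txt'],
# get_run_num returns run_num_dict = {'A.pep.txt': 'Run1', 'B.pep.txt': 'Run2'}
# and psm_run_num_dict = {'A.psm.txt': 'Run1', 'B.psm.txt': 'Run2'},
# provided the matching .psm.txt files exist.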
# Read and load pep and psm files
def read_run_files(run_num_dict):

    # save the pep file data to the defaultdict
    pep_data_dict = defaultdict(list)
    # save the psm file data to the defaultdict
    psm_data_dict = defaultdict(list)
    # save peptides list (value) to unique protein (key)
    pro_pep_dict = defaultdict(list)
    # save protein list (value) to unique peptide (key)
    pep_pro_dict = defaultdict(list)

    # key = pep_file, val = run_num, sorted by Run# index
    for pep_file, run_num in sorted(run_num_dict.items(), key=lambda x: x[1][-1]):
        # read line with csv
        pep_reader = csv.reader(CommentedFile(open(pep_file, 'r')),
                                delimiter='\t')
        # skip header
        headline = pep_reader.__next__()

        # get data
        for pep_line in pep_reader:
            # adapt class PepOutFields
            pep_obj = PepOutFields._make(pep_line)
            identified_peptide = pep_obj.IdentifiedPeptide
            identified_peptide = identified_peptide.strip()
            # new unique ID for pep_data_dict is identified_peptide + run_num
            pep_run_id = identified_peptide + "_" + run_num
            pep_data_dict[pep_run_id].append(pep_line)
            # get protein item list
            protein_names = pep_obj.ProteinNames
            pro_item_list = get_item_list(protein_names.strip())

            # for loop of pro_item_list
            for pro_item in pro_item_list:
                pro_item = pro_item.strip()
                # save IdentifiedPeptide list to unique protein dict
                if identified_peptide not in pro_pep_dict[pro_item]:
                    pro_pep_dict[pro_item].append(identified_peptide)
                # save protein list to unique peptide dict
                if pro_item not in pep_pro_dict[identified_peptide]:
                    pep_pro_dict[identified_peptide].append(pro_item)

    # try to find indistinguishable set
    indistin_pro_dict = defaultdict(list)
    for pro_key, pep_list in pro_pep_dict.items():
        sorted_pep_list = sorted(set(pep_list))
        sorted_pep_list_join = '_'.join(sorted_pep_list)
        indistin_pro_dict[sorted_pep_list_join].append(pro_key)

    # indistin_key = str(sorted(pep_list)), indistin_value = pro_key list
    for indistin_key, indistin_value in indistin_pro_dict.items():
        # if proteins have the same set of peptides
        if len(indistin_value) > 1:
            # get new protein name
            new_pro_key = list_to_string(sorted(indistin_value))
            new_pro_val = pro_pep_dict[indistin_value[0]]
            # append with new key and value to pro_pep_dict
            # delete previous pep_pro_dict key/value, and append the new one
            for new_pro_val_one in new_pro_val:
                pro_pep_dict[new_pro_key].append(new_pro_val_one)
                pep_pro_dict[new_pro_val_one].append(new_pro_key)
                # delete the indistinguishable protein list of pep_pro_dict for the peptide
                org_pep_pro_dict_list = pep_pro_dict[new_pro_val_one]
                for indistin_value_one in indistin_value:
                    org_pep_pro_dict_list.remove(indistin_value_one)
                if len(org_pep_pro_dict_list) == 0:
                    del pep_pro_dict[new_pro_val_one]
                else:
                    pep_pro_dict[new_pro_val_one] = org_pep_pro_dict_list
            # delete previous indistinguishable proteins in pro_pep_dict
            for indistin_pro in indistin_value:
                del pro_pep_dict[indistin_pro]

    # key = pep_file, val = run_num, sorted by Run# index
    for pep_file, run_num in sorted(run_num_dict.items(), key=lambda x: x[1][-1]):
        # If base common prefix ends with '.pep.txt', then remove '.pep.txt'
        base_pep_file = pep_file.replace(pep_file_ext, "")
        # make a psm filename using pep filename
        psm_file = base_pep_file + psm_file_ext
        # check psm file exist
        check_file_exist(psm_file)
        # read line with csv
        psm_reader = csv.reader(CommentedFile(open(psm_file, 'r')),
                                delimiter='\t')
        # skip header
        headline = psm_reader.__next__()

        # get data
        for psm_line in psm_reader:
            # check sipros3 or sipros4 input
            psm_obj = None
            global sipros4_input
            if len(psm_line) == sipros4_psmout_column_length:
                psm_obj = Psm4OutFields._make(psm_line)
                sipros4_input = True
            else:
                psm_obj = PsmOutFields._make(psm_line)
                sipros4_input = False
            # get protein item list
            protein_names = psm_obj.ProteinNames
            pro_item_list = get_item_list(protein_names.strip())
            # save key=protein_id, val=line
            for pro_item_one in pro_item_list:
                # new unique ID for psm_data_dict is protein_name + run_num
                pro_item_id = pro_item_one + "_" + run_num
                # save to dictionary
                psm_data_dict[pro_item_id].append(psm_line)

    return (pep_data_dict, psm_data_dict, pro_pep_dict, pep_pro_dict)
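# Illustration of the indistinguishable-protein merge above (hypothetical IDs):
# if ProA and ProB are each supported by exactly the peptides {pep1, pep2},
# both keys are removed from pro_pep_dict and replaced by one combined key
# built with list_to_string (e.g. something like '{ProA,ProB}'), and each
# peptide's pep_pro_dict entry then lists the combined key instead of ProA/ProB.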
## Greedy algorithm to extract proteins
def greedy_alg(config_dict, pro_pep_dict, pep_pro_dict):

    # get config value
    min_peptide_per_protein = int(config_dict[min_peptide_per_protein_str])
    min_unique_peptide_per_protein = int(config_dict[min_unique_peptide_per_protein_str])

    # First, extract proteins that have >= min_peptide_per_protein peptides
    # and at least min_unique_peptide_per_protein of those are unique

    # save the extracted proteins to the list
    pro_greedy_list = []

    # copy pro_pep_dict to pro_pep_dict_red for reduction in greedy steps
    pro_pep_dict_red = defaultdict(list)
    for pro_key, pep_list in pro_pep_dict.items():
        for pep_list_one in pep_list:
            pro_pep_dict_red[pro_key].append(pep_list_one)

    # copy pep_pro_dict to pep_pro_dict_red for reduction in greedy steps
    pep_pro_dict_red = defaultdict(list)
    for pep_key, pro_list in pep_pro_dict.items():
        for pro_list_one in pro_list:
            pep_pro_dict_red[pep_key].append(pro_list_one)

    # for loop of pro_pep_dict
    for pro_key, pep_list in pro_pep_dict.items():
        # if the protein has >= min_peptide_per_protein peptides
        if len(pep_list) >= min_peptide_per_protein:
            # count how many of its peptides are unique
            unique_pep_pro_num = 0
            for pep_list_one in pep_list:
                pep_pro_num = len(pep_pro_dict[pep_list_one])
                if pep_pro_num == 1:
                    unique_pep_pro_num += 1
            # unique peptides num should be >= min_unique_peptide_per_protein
            # if min_unique_peptide_per_protein = 0, then set 1
            if unique_pep_pro_num >= max(min_unique_peptide_per_protein, 1):
                # append to the pro_greedy_list
                if pro_key not in pro_greedy_list:
                    pro_greedy_list.append(pro_key)
                # remove the protein
                try:
                    del pro_pep_dict_red[pro_key]
                except:
                    pass
                # remove all peptides that are covered by the protein
                for pep_list_item in pep_list:
                    # delete by key
                    try:
                        del pep_pro_dict_red[pep_list_item]
                    except:
                        pass
                    # get new peptide list for the proteins from pep_pro_dict
                    for pro_list_item in pep_pro_dict[pep_list_item]:
                        new_pep_list = pro_pep_dict[pro_list_item]
                        # if new_pep_list is a sub-list of pep_list, then remove
                        if check_sub_list(new_pep_list, pep_list):
                            try:
                                del pro_pep_dict_red[pro_list_item]
                            except:
                                pass

    # Second, iteratively extract one protein at a time that covers the most peptides
    if len(pro_pep_dict_red.keys()) > 0:

        # Run greedy iterations until it converges
        converge = False

        # Iterate greedy algorithm until it converges
        greedy_step = 0
        while (converge == False):
            greedy_step += 1
            # find a protein that covers the most peptides
            ppdr_idx = 0
            for key_ppdr, val_ppdr in pro_pep_dict_red.items():
                if ppdr_idx == 0:
                    max_key_ppdr = key_ppdr
                    max_len_val_ppdr = len(val_ppdr)
                else:
                    # get current one
                    cur_len_val_ppdr = len(val_ppdr)
                    cur_key_ppdr = key_ppdr
                    # get max one
                    if cur_len_val_ppdr > max_len_val_ppdr:
                        max_len_val_ppdr = cur_len_val_ppdr
                        max_key_ppdr = cur_key_ppdr
                ppdr_idx += 1
            max_pro_one = max_key_ppdr

            # if the protein has >= min_peptide_per_protein peptides
            if len(pro_pep_dict_red[max_pro_one]) >= min_peptide_per_protein:
                # count how many of its peptides are unique
                unique_pep_pro_num = 0
                for pep_list_one in pro_pep_dict_red[max_pro_one]:
                    pep_pro_num = len(pep_pro_dict[pep_list_one])
                    if pep_pro_num == 1:
                        unique_pep_pro_num += 1
                # if at least min_unique_peptide_per_protein is unique
                if unique_pep_pro_num >= min_unique_peptide_per_protein:
                    # append the protein to the pro_greedy_list
                    pro_greedy_list.append(max_pro_one)
                    # loop for pep set
                    for pep_list_sub_one in pro_pep_dict[max_pro_one]:
                        # delete by key
                        try:
                            del pep_pro_dict_red[pep_list_sub_one]
                        except:
                            pass
                        # loop for pro set
                        for pro_list_sub_one in pep_pro_dict[pep_list_sub_one]:
                            try:
                                del pro_pep_dict_red[pro_list_sub_one]
                            except:
                                pass
                    # remove the protein
                    try:
                        del pro_pep_dict_red[max_pro_one]
                    except:
                        pass
                    # remove all peptides that are covered by the protein
                    for pep_list_item in pro_pep_dict[max_pro_one]:
                        # delete by key
                        try:
                            del pep_pro_dict_red[pep_list_item]
                        except:
                            pass
                        # get new peptide list for the proteins from pep_pro_dict
                        for pro_list_item in pep_pro_dict[pep_list_item]:
                            new_pep_list = pro_pep_dict[pro_list_item]
                            # if new_pep_list is a sub-list of pep_list, then remove
                            if check_sub_list(new_pep_list, pep_list):
                                try:
                                    del pro_pep_dict_red[pro_list_item]
                                except:
                                    pass
                else:
                    del pro_pep_dict_red[max_pro_one]
            # the max peptides number for the protein < min_peptide_per_protein
            else:
                converge = True
            # if there is no protein left, then converge
            if len(pro_pep_dict_red.keys()) == 0:
                converge = True

    # greedy algorithm done
    pro_greedy_list = sorted(pro_greedy_list)
    return pro_greedy_list
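# Worked sketch of the two greedy passes (hypothetical data, with the defaults
# Min_Peptide_Per_Protein = 2 and Min_Unique_Peptide_Per_Protein = 1):
#   pro_pep_dict = {'ProA': ['pep1', 'pep2'], 'ProB': ['pep2', 'pep3'], 'ProC': ['pep3']}
#   pep_pro_dict = {'pep1': ['ProA'], 'pep2': ['ProA', 'ProB'], 'pep3': ['ProB', 'ProC']}
# Pass 1 accepts ProA (two peptides, pep1 unique to it) and removes its peptides.
# Pass 2 then picks ProB as the remaining protein covering the most peptides,
# but rejects it because none of its peptides is unique; ProC never reaches the
# two-peptide minimum, so the loop converges and the result is ['ProA'].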
## Get protein description to handle multiple protein IDs
def get_protein_description(protein_ID, fasta_ID_dict):

    # initialize
    protein_description_list = []

    # if multiple IDs
    if (protein_ID.startswith('{')) and (protein_ID.endswith('}')):
        # multiple proteins exist
        protein_ID_list = get_item_list(protein_ID.strip())
        for protein_ID_one in protein_ID_list:
            # check protein ID exist
            if protein_ID_one in fasta_ID_dict:
                protein_description_one = fasta_ID_dict[protein_ID_one]
                protein_description_list.append(protein_description_one)
            else:
                protein_description_one = "N/A"
                protein_description_list.append(protein_description_one)
    # single ProteinID
    else:
        # check protein ID exist
        if protein_ID in fasta_ID_dict:
            protein_description = fasta_ID_dict[protein_ID]
            protein_description_list.append(protein_description)
        else:
            protein_description = "N/A"
            protein_description_list.append(protein_description)

    # convert list to string
    protein_description = list_to_string(protein_description_list)

    return protein_description
## check decoy match
def check_decoy_match(ProteinNames, decoy_prefix):

    # match type (correct or decoy) and strip
    match_type = ProteinNames
    match_type_list = []
    if match_type.startswith("{"):
        # if starts with { }, then delete the braces
        match_type = match_type[1:-1]
        # sometimes multiple proteins
        match_type_list = re.split(r"\s*[,]\s*", match_type.strip())
    else:
        match_type_list.append(match_type)

    # TF -> True or False (decoy match)
    TF = False

    # for loop of proteins
    for match_item in match_type_list:
        # if at least one of the matches is True, then the match is True
        if not match_item.startswith(decoy_prefix):
            TF = True
            break

    return TF
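# Example (hypothetical protein names, decoy_prefix = 'Rev_'):
#   check_decoy_match('{Rev_ProA,ProB}', 'Rev_')  -> True   (ProB is a target)
#   check_decoy_match('Rev_ProA', 'Rev_')         -> False  (decoy-only match)
# True therefore means "target match"; the counters in report_output that
# increment when this returns False are counting decoy-only proteins.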
## Report output files
def report_output(config_dict,
                  run_num_dict,
                  psm_run_num_dict,
                  pep_data_dict,
                  psm_data_dict,
                  pro_pep_dict,
                  pep_pro_dict,
                  pro_greedy_list,
                  fasta_ID_dict):

    # save .pro.txt data
    pro_out_data = []
    # save .pro2pep.txt data
    pro2pep_out_data = []
    # save .pro2psm.txt data
    pro2psm_out_data = []

    # get config value
    min_peptide_per_protein = int(config_dict[min_peptide_per_protein_str])
    min_unique_peptide_per_protein = int(config_dict[min_unique_peptide_per_protein_str])
    remove_decoy_identification = config_dict[remove_decoy_identification_str]
    # to get decoy_prefix
    decoy_prefix = config_dict[testing_decoy_prefix_str]

    # total number of proteins
    total_proteins_before_filtering = len(pro_pep_dict.keys())
    decoy_proteins_before_filtering = 0
    for key, val in pro_pep_dict.items():
        check_decoy_match_val = check_decoy_match(key, decoy_prefix)
        if check_decoy_match_val is False:
            decoy_proteins_before_filtering += 1
    target_proteins_before_filtering = int(total_proteins_before_filtering) - int(decoy_proteins_before_filtering)

    # total number of identified proteins
    total_proteins_after_filtering = len(pro_greedy_list)
    decoy_proteins_after_filtering = 0
    for pro_one in pro_greedy_list:
        check_decoy_match_val = check_decoy_match(pro_one, decoy_prefix)
        if check_decoy_match_val is False:
            decoy_proteins_after_filtering += 1
    target_proteins_after_filtering = int(total_proteins_after_filtering) - int(decoy_proteins_after_filtering)

    # protein FDR
    protein_fdr = 0.0
    if float(total_proteins_after_filtering) != 0:
        protein_fdr = divide(float(decoy_proteins_after_filtering), float(target_proteins_after_filtering))
        # protein_fdr = (1/test_decoy_fwr_ratio) * protein_fdr
        # adjust fdr for the hidden training
    protein_fdr = PrettyFloat(protein_fdr)
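    # Worked example (hypothetical counts): with 5 decoy and 95 target proteins
    # left after filtering, protein_fdr = 5 / 95 ~= 0.0526, reported below as
    # 'Protein_FDR = 5.26%'. divide() is the error-handling division helper
    # imported from sipros_post_module at the top of this file.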
    # output header
    def_para_msg = ""
    def_para_msg += "#\t########################################\n"
    def_para_msg += "#\t##### Peptide Assembling by Sipros #####\n"
    def_para_msg += "#\t########################################\n"
    def_para_msg += "#\t\n"
    def_para_msg += "#\t###############\n"
    def_para_msg += "#\t# Input Files #\n"
    def_para_msg += "#\t###############\n"
    def_para_msg += "#\t\n"
    def_para_msg += "#\t[Input_Files]\n"
    def_para_msg += "#\t\n"
    # key = psm_file, val = run_num, sorted by Run# index
    for psm_file, run_num in sorted(psm_run_num_dict.items(), key=lambda x: x[1][-1]):
        def_para_msg += "#\tpsm{" + run_num + "} = " + str(psm_file) + "\n"
    def_para_msg += "#\t\n"
    # key = pep_file, val = run_num, sorted by Run# index
    for pep_file, run_num in sorted(run_num_dict.items(), key=lambda x: x[1][-1]):
        def_para_msg += "#\tpep{" + run_num + "} = " + str(pep_file) + "\n"
    def_para_msg += "#\t\n"
    def_para_msg += "#\t########################\n"
    def_para_msg += "#\t# Filtering Parameters #\n"
    def_para_msg += "#\t########################\n"
    def_para_msg += "#\t\n"
    def_para_msg += "#\t" + pep_iden_str + "\n"
    def_para_msg += "#\t\n"
    def_para_msg += "#\t# Minimum number of peptides per protein\n"
    def_para_msg += "#\t" + min_peptide_per_protein_str + " = " + str(min_peptide_per_protein) + "\n"
    def_para_msg += "#\t\n"
    def_para_msg += "#\t# Minimum number of unique peptides per protein\n"
    def_para_msg += "#\t" + min_unique_peptide_per_protein_str + " = " + str(min_unique_peptide_per_protein) + "\n"
    def_para_msg += "#\t\n"
    def_para_msg += "#\t#######################\n"
    def_para_msg += "#\t# Statistical Results #\n"
    def_para_msg += "#\t#######################\n"
    def_para_msg += "#\t\n"
    def_para_msg += "#\t[Statistical_Results]\n"
    def_para_msg += "#\t\n"
    def_para_msg += "#\t# Numbers of proteins before filtering\n"
    def_para_msg += "#\tDecoy_Proteins_Before_Filtering = " + str(decoy_proteins_before_filtering) + "\n"
    def_para_msg += "#\tTarget_Proteins_Before_Filtering = " + str(target_proteins_before_filtering) + "\n"
    # def_para_msg += "#\tTotal_Proteins_Before_Filtering = " + str(total_proteins_before_filtering) + "\n"
    def_para_msg += "#\t\n"
    def_para_msg += "#\t# Numbers of proteins after filtering\n"
    def_para_msg += "#\tDecoy_Proteins_After_Filtering = " + str(decoy_proteins_after_filtering) + "\n"
    def_para_msg += "#\tTarget_Proteins_After_Filtering = " + str(target_proteins_after_filtering) + "\n"
    # def_para_msg += "#\tTotal_Proteins_After_Filtering = " + str(total_proteins_after_filtering) + "\n"
    def_para_msg += "#\t\n"
    def_para_msg += "#\t# Protein FDR = Decoy_Proteins_After_Filtering / Target_Proteins_After_Filtering\n"
    def_para_msg += '#\tProtein_FDR = {0:.2f}%\n'.format(100 * protein_fdr)
    # def_para_msg += "#\tProtein_FDR = " + set_float_digit(protein_fdr) + "\n"
    def_para_msg += "#\t\n"

    pro_out_file.write(def_para_msg)
    pro2pep_out_file.write(def_para_msg)
    pro2psm_out_file.write(def_para_msg)

    # pro.txt column message
    pro_column_msg = ""
    pro_column_msg += "#\t################\n"
    pro_column_msg += "#\t# Column Names #\n"
    pro_column_msg += "#\t################\n"
    pro_column_msg += "#\t\n"
    pro_column_msg += "#\t[Column_Names]\n"
    pro_column_msg += "#\tProteinID = Names of the protein\n"
    pro_column_msg += "#\tRun#_UniquePeptideCounts = Number of unique peptides in a run\n"
    pro_column_msg += "#\tRun#_TotalPeptideCounts = Number of all peptides in a run\n"
    pro_column_msg += "#\tRun#_UniqueSpectrumCounts = Number of unique PSM in a run\n"
    pro_column_msg += "#\tRun#_TotalSpectrumCounts = Number of all PSM in a run\n"
    pro_column_msg += "#\tRun#_BalancedSpectrumCounts = Balanced spectrum count in a run\n"
    pro_column_msg += "#\tRun#_NormalizedBalancedSpectrumCounts = Normalized Balanced spectrum count in a run\n"
    pro_column_msg += "#\tProteinDescription = Protein description\n"
    pro_column_msg += "#\tTargetMatch = T for target match and F for decoy match\n"
    pro_column_msg += "#\t\n"

    pro_out_file.write(pro_column_msg)

    # pro2pep.txt column message
    pro2pep_column_msg = ""
    pro2pep_column_msg += "#\t################\n"
    pro2pep_column_msg += "#\t# Column Names #\n"
    pro2pep_column_msg += "#\t################\n"
    pro2pep_column_msg += "#\t\n"
    pro2pep_column_msg += "#\t[Column_Names]\n"
    pro2pep_column_msg += "#\t\n"
    pro2pep_column_msg += "#\t+ = Marker of a protein line\n"
    pro2pep_column_msg += "#\tProteinID = Names of the protein\n"
    pro2pep_column_msg += "#\tRun#_UniquePeptideCounts = Number of unique peptides in a run\n"
    pro2pep_column_msg += "#\tRun#_TotalPeptideCounts = Number of all peptides in a run\n"
    pro2pep_column_msg += "#\tRun#_UniqueSpectrumCounts = Number of unique PSM in a run\n"
    pro2pep_column_msg += "#\tRun#_TotalSpectrumCounts = Number of all PSM in a run\n"
    pro2pep_column_msg += "#\tRun#_BalancedSpectrumCounts = Balanced spectrum count in a run\n"
    pro2pep_column_msg += "#\tRun#_NormalizedBalancedSpectrumCounts = Normalized Balanced spectrum count in a run\n"
    pro2pep_column_msg += "#\tProteinDescription = Protein description\n"
    pro2pep_column_msg += "#\tTargetMatch = T for target match and F for decoy match\n"
    pro2pep_column_msg += "#\t\n"
    pro2pep_column_msg += "#\t* = Marker of a peptide line\n"
    pro2pep_column_msg += "#\tIdentifiedPeptide = Identified peptide sequence with potential PTMs and mutations\n"
    pro2pep_column_msg += "#\tParentCharge = Charge state of identified peptide\n"
    pro2pep_column_msg += "#\tOriginalPeptide = Original peptide sequence in the FASTA file\n"
    pro2pep_column_msg += "#\tProteinNames = Names of proteins of the peptide\n"
    pro2pep_column_msg += "#\tProteinCount = Number of proteins that the peptide can be assigned to\n"
    pro2pep_column_msg += "#\tTargetMatch = T for target match and F for decoy match\n"
    pro2pep_column_msg += "#\tSpectralCount = Number of PSMs in which the peptide is identified\n"
    pro2pep_column_msg += "#\tBestScore = The best score of those PSMs\n"
    pro2pep_column_msg += "#\tPSMs = List of PSMs for the peptide: FT2_Filename[Scan_Number]\n"
    pro2pep_column_msg += "#\tScanType = Scan type of those PSMs\n"
    pro2pep_column_msg += "#\tSearchName = Sipros search name\n"
    pro2pep_column_msg += "#\t\n"

    pro2pep_out_file.write(pro2pep_column_msg)

    # pro2psm.txt column message
    pro2psm_column_msg = ""
    pro2psm_column_msg += "#\t################\n"
    pro2psm_column_msg += "#\t# Column Names #\n"
    pro2psm_column_msg += "#\t################\n"
    pro2psm_column_msg += "#\t\n"
    pro2psm_column_msg += "#\t[Column_Names]\n"
    pro2psm_column_msg += "#\t\n"
    pro2psm_column_msg += "#\t+ = Marker of a protein line\n"
    pro2psm_column_msg += "#\tProteinID = Names of the protein\n"
    pro2psm_column_msg += "#\tRun#_UniquePeptideCounts = Number of unique peptides in a run\n"
    pro2psm_column_msg += "#\tRun#_TotalPeptideCounts = Number of all peptides in a run\n"
    pro2psm_column_msg += "#\tRun#_UniqueSpectrumCounts = Number of unique PSM in a run\n"
    pro2psm_column_msg += "#\tRun#_TotalSpectrumCounts = Number of all PSM in a run\n"
    pro2psm_column_msg += "#\tRun#_BalancedSpectrumCounts = Balanced spectrum count in a run\n"
    pro2psm_column_msg += "#\tRun#_NormalizedBalancedSpectrumCounts = Normalized Balanced spectrum count in a run\n"
    pro2psm_column_msg += "#\tProteinDescription = Protein description\n"
    pro2psm_column_msg += "#\tTargetMatch = T for target match and F for decoy match\n"
    pro2psm_column_msg += "#\t\n"
    pro2psm_column_msg += "#\t* = Marker of a PSM line\n"
    pro2psm_column_msg += "#\tFilename = Filename of input FT2 file\n"
    pro2psm_column_msg += "#\tScanNumber = Scan number of the PSM\n"
    pro2psm_column_msg += "#\tParentCharge = Charge state of the PSM\n"
    pro2psm_column_msg += "#\tMeasuredParentMass = Measured parent mass\n"
    pro2psm_column_msg += "#\tCalculatedParentMass = Calculated parent mass from peptide sequence\n"
    pro2psm_column_msg += "#\tMassErrorDa = Mass error in Da with 1-Da error correction\n"
    pro2psm_column_msg += "#\tMassErrorPPM = Mass error in PPM with 1-Da error correction\n"
    pro2psm_column_msg += "#\tScanType = Scan type of the PSM\n"
    pro2psm_column_msg += "#\tSearchName = Sipros search name\n"
    pro2psm_column_msg += "#\tScoringFunction = Scoring function used in the search\n"