-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathmartinize.py
executable file
·5070 lines (4385 loc) · 265 KB
/
martinize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# EDITABLE SECTIONS ARE MARKED WITH #@#
version="2.5.TAW160730"
authors=["Djurre de Jong", "Jaakko J. Uusitalo", "Tsjerk A. Wassenaar"]
# Parameters are defined for the following (protein) forcefields:
forcefields = ['martini21','martini21p','martini22','martini22p','elnedyn','elnedyn22','elnedyn22p','martini22dna']
notes = [
("DdJ130213","V2.3"),
("DdJ200613","Fixes in cysteine bridge detection and help text."),
("DdJ200820","Fixes in cysteine bridge length and revert elnedyn BB-bonds to bonds."),
("DdJ200826","Inverted 'define NO_RUBBER_BANDS', fixed writing posres when merging and added few comments."),
("DdJ200831","Shortened in-file changelog and fixed some comments."),
("DdJ181013","V2.4"),
]
#
# This program has grown to be pretty complex.
# The routines have been organized in different files.
# For working versions, all files can be incorporated by using the option -cat.
#
# Index of the program files:
#
# 1. Options and documentation @DOC.py
# 2. Description, options and command line parsing @CMD.py
# 3. Helper functions and macros @FUNC.py
# 4. Finegrained to coarsegrained mapping @MAP.py
# 5. Secondary structure determination and interpretation @SS.py
# 6. Force field parameters (MARTINI/ELNEDYN) @FF.py
# 7. Elastic network @ELN.py
# 8. Structure I/O @IO.py
# 9. Topology generation @TOP.py
# 10. Main @MAIN.py
# 11. Web-interface @WEB.py
#
def cat(file_out):
'''Function to 'compile' the martinize script into one file.'''
import re
files_in = 'martinize.py DOC.py CMD.py FUNC.py MAP.py SS.py '+'.py '.join(forcefields)+'.py ELN.py IO.py TOP.py MAIN.py '
pattern1 = re.compile(files_in.replace('.py ','|')[:-1])
pattern2 = re.compile(files_in.replace('.py ','\.|')[:-1])
file_out = open(file_out,'w')
tail = ''; head = True
for f in files_in.split():
for line in open(f).readlines():
# Split the string to avoid the function finding itself
if '__na'+'me__' in line:
head = False
if head:
file_out.write(line)
elif (f == 'martinize.py' and not head) and not ('import' in line and pattern1.search(line)):
tail += pattern2.sub('',line)
elif line[0] == '#':
file_out.write(line)
elif not ('import' in line and pattern1.search(line)):
file_out.write(pattern2.sub('',line))
file_out.write(tail)
###################################
## 1 # OPTIONS AND DOCUMENTATION ## -> @DOC <-
###################################
# This is a simple and versatily option class that allows easy
# definition and parsing of options.
class Option:
def __init__(self,func=str,num=1,default=None,description=""):
self.func = func
self.num = num
self.value = default
self.description = description
def __nonzero__(self):
if self.func == bool:
return self.value != False
return bool(self.value)
def __str__(self):
return self.value and str(self.value) or ""
def setvalue(self,v):
if len(v) == 1:
self.value = self.func(v[0])
else:
self.value = [ self.func(i) for i in v ]
# Lists for gathering arguments to options that can be specified
# multiple times on the command line.
lists = {
'cystines': [],
'merges' : [],
'links' : [],
'multi' : [],
}
# List of Help text and options.
# This way we can simply print this list if the user wants help.
options = [
# NOTE: Options marked with (+) can be given multiple times on the command line
# option type number default description
"""
MARTINIZE.py is a script to create Coarse Grain Martini input files of
proteins, ready for use in the molecular dynamics simulations package
Gromacs. For more information on the Martini forcefield, see:
www.cgmartini.nl
and read our papers:
Monticelli et al., J. Chem. Theory Comput., 2008, 4(5), 819-834
de Jong et al., J. Chem. Theory Comput., 2013, DOI:10.1021/ct300646g
Primary input/output
--------------------
The input file (-f) should be a coordinate file in PDB or GROMOS
format. The format is inferred from the structure of the file. The
input can also be provided through stdin, allowing piping of
structures. The input structure can have multiple frames/models. If an output
structure file (-x) is given, each frame will be coarse grained,
resulting in a multimodel output structure. Having multiple frames may
also affect the topology. If secondary structure is determined
internally, the structure will be averaged over the frames. Likewise,
interatomic distances, as used for backbone bond lengths in Elnedyn
and in elastic networks, are also averaged over the frames available.
If an output file (-o) is indicated for the topology, that file will
be used for the master topology, using #include statements to link the
moleculetype definitions, which are written to separate files. If no
output filename is given, the topology and the moleculetype
definitions are written to stdout.
Secondary structure
-------------------
The secondary structure plays a central role in the assignment of atom
types and bonded interactions in MARTINI. Martinize allows
specification of the secondary structure as a string (-ss), or as a
file containing a specification in GROMACS' ssdump format
(-ss). Alternatively, DSSP can be used for an on-the-fly assignment of
the secondary structure. For this, the option -dssp has to be used
giving the location of the executable as the argument.
The option -collagen will set the whole structure to collagen. If this
is not what you want (eg only part of the structure is collagen, you
can give a secondary structure file/string (-ss) and specifiy collagen
as "F". Parameters for collagen are taken from: Gautieri et al.,
J. Chem. Theory Comput., 2010, 6, 1210-1218.
With multimodel input files, the secondary structure as determined with
DSSP will be averaged over the frames. In this case, a cutoff
can be specified (-ssc) indicating the fraction of frames to match a
certain secondary structure type for designation.
Topology
--------
Several options are available to tune the resulting topology. By
default, termini are charged, and chain breaks are kept neutral. This
behaviour can be changed using -nt and -cb, respectively.
Disulphide bridges can be specified using -cys. This option can be
given multiple times on the command line. The argument is a pair of
cysteine residues, using the format
chain/resn/resi,chain/resn/resi.
It is also possible to let martinize detect cysteine pairs based on a
cut-off distance of 0.22nm, by giving the keyword 'auto' as argument to -cys.
Alternatively, a different cut-off distance can be specified, which
will also trigger a search of pairs satisfying the distance
criterion (eg: -cys 0.32).
In addition to cystine bridges, links between other atoms can be
specified using -link. This requires specification of the atoms, using
the format
chain/resi/resn/atom,chain/resi/resn/atom,bondlength,forceconstant.
If only two atoms are given, a constraint will be added with length
equal to the (average) distance in the coordinate file. If a bond
length is added, but no force constant, then the bondlength will be
used to set a constraint.
Linking atoms requires that the atoms are part of the same
moleculetype. Therefore any link between chains will cause the chains
to be merged. Merges can also be specified explicitly, using the
option -merge with a comma-separated list of chain identifiers to be
joined into one moleculetype. The option -merge can be used several
times. Note that specifying a chain in several merge groups will cause
all chains involved to be merged into a single moleculetype.
The moleculetype definitions are written to topology include (.itp)
files, using a name consisting of the molecule class (e.g. Protein)
and the chain identifier. With -name a name can be specified instead.
By default, martinize only writes a moleculetype for each unique
molecule, inferred from the sequence and the secondary structure
definition. It is possible to force writing a moleculetype definition
for every single molecule, using -sep.
The option -p can be used to write position restraints, using the
force constant specified with -pf, which is set to 1000 kJ/mol
by default.
For stability, elastic bonds are used to retain the structure of
extended strands. The option -ed causes dihedrals to be used
instead.
Different forcefields can be specified with -ff. All the parameters and
options belonging to that forcefield will be set (eg. bonded interactions,
BB-bead positions, Elastic Network, etc.). By default martini 2.1 is
used.
Elastic network
---------------
Martinize can write an elastic network for atom pairs within a cutoff
distance. The force constant (-ef) and the upper distance bound (-eu)
can be speficied. If a force field with an intrinsic Elastic
network is specified (eg. Elnedyn) with -ff, -elastic in implied and
the default values for the force constant and upper cutoff are used.
However, these can be overwritten.
Multiscaling
------------
Martinize can process a structure to yield a multiscale system,
consisting of a coordinate file with atomistic parts and
corresponding, overlaid coarsegrained parts. For chains that are
multiscaled, rather than writing a full moleculetype definition,
additional [atoms] and [virtual_sitesn] sections are written, to
be appended to the atomistic moleculetype definitions.
The option -multi can be specified multiple times, and takes a chain
identifier as argument. Alternatively, the keyword 'all' can be given
as argument, causing all chains to be multiscaled.
========================================================================\n
""",
("-f", Option(str, 1, None, "Input file (PDB|GRO)")),
("-o", Option(str, 1, None, "Output topology (TOP)")),
("-x", Option(str, 1, None, "Output coarse grained structure (PDB)")),
("-n", Option(str, 1, None, "Output index file with CG (and multiscale) beads.")),
("-nmap", Option(str, 1, None, "Output index file containing per bead mapping.")),
("-v", Option(bool, 0, False, "Verbose. Be load and noisy.")),
("-h", Option(bool, 0, False, "Display this help.")),
("-ss", Option(str, 1, None, "Secondary structure (File or string)")),
("-ssc", Option(float, 1, 0.5, "Cutoff fraction for ss in case of ambiguity (default: 0.5).")),
("-dssp", Option(str, 1, None, "DSSP executable for determining structure")),
# ("-pymol", Option(str, 1, None, "PyMOL executable for determining structure")),
("-collagen", Option(bool, 0, False, "Use collagen parameters")),
("-his", Option(bool, 0, False, "Interactively set the charge of each His-residue.")),
("-nt", Option(bool, 0, False, "Set neutral termini (charged is default)")),
("-cb", Option(bool, 0, False, "Set charges at chain breaks (neutral is default)")),
("-cys", Option(lists['cystines'].append, 1, None, "Disulphide bond (+)")),
("-link", Option(lists['links'].append, 1, None, "Link (+)")),
("-merge", Option(lists['merges'].append, 1, None, "Merge chains: e.g. -merge A,B,C (+)")),
# ("-mixed", Option(bool, 0, False, "Allow chains of mixed type (default: False)")),
("-name", Option(str, 1, None, "Moleculetype name")),
("-p", Option(str, 1, 'None', "Output position restraints (None/All/Backbone) (default: None)")),
("-pf", Option(float, 1, 1000, "Position restraints force constant (default: 1000 kJ/mol/nm^2)")),
("-ed", Option(bool, 0, False, "Use dihedrals for extended regions rather than elastic bonds)")),
("-sep", Option(bool, 0, False, "Write separate topologies for identical chains.")),
("-ff", Option(str, 1,'martini21', "Which forcefield to use: "+' ,'.join(n for n in forcefields[:-1]))),
# Fij = Fc exp( -a (rij - lo)**p )
("-elastic", Option(bool, 0, False, "Write elastic bonds")),
("-ef", Option(float, 1, 500, "Elastic bond force constant Fc")),
("-el", Option(float, 1, 0, "Elastic bond lower cutoff: F = Fc if rij < lo")),
("-eu", Option(float, 1, 0.90, "Elastic bond upper cutoff: F = 0 if rij > up")),
("-ea", Option(float, 1, 0, "Elastic bond decay factor a")),
("-ep", Option(float, 1, 1, "Elastic bond decay power p")),
("-em", Option(float, 1, 0, "Remove elastic bonds with force constant lower than this")),
("-eb", Option(str, 1, 'BB', "Comma separated list of bead names for elastic bonds")),
# ("-hetatm", Option(bool, 0, False, "Include HETATM records from PDB file (Use with care!)")),
("-multi", Option(lists['multi'].append, 1, None, "Chain to be set up for multiscaling (+)")),
]
## Martini Quotes
martiniq = [
("Robert Benchley",
"Why don't you get out of that wet coat and into a dry martini?"),
("James Thurber",
"One martini is all right, two is two many, three is not enough"),
("Philip Larkin",
"The chromatic scale is what you use to give the effect of drinking a quinine martini and having an enema simultaneously."),
("William Emerson, Jr.",
"And when that first martini hits the liver like a silver bullet, there is a sigh of contentment that can be heard in Dubuque."),
("Alec Waugh",
"I am prepared to believe that a dry martini slightly impairs the palate, but think what it does for the soul."),
("Gerald R. Ford",
"The three-martini lunch is the epitome of American efficiency. Where else can you get an earful, a bellyful and a snootful at the same time?"),
("P. G. Wodehouse",
"He was white and shaken, like a dry martini."),
]
desc = ""
def help():
"""Print help text and list of options and end the program."""
import sys
for item in options:
if type(item) == str:
print item
for item in options:
if type(item) != str:
print "%10s %s"%(item[0],item[1].description)
print
sys.exit()
##############################
## 2 # COMMAND LINE PARSING ## -> @CMD <-
##############################
import sys,logging
# Helper function to parse atom strings given on the command line:
# resid
# resname/resid
# chain/resname/resid
# resname/resid/atom
# chain/resname/resid/atom
# chain//resid
# chain/resname/atom
def str2atom(a):
a = a.split("/")
if len(a) == 1: # Only a residue number:
return (None,None,int(a[0]),None)
if len(a) == 2: # Residue name and number (CYS/123):
return (None,a[0],int(a[1]),None)
if len(a) == 3:
if a[2].isdigit(): # Chain, residue name, residue number
return (None,a[1],int(a[2]),a[0])
else: # Residue name, residue number, atom name
return (a[2],a[0],int(a[1]),None)
return (a[3],a[1],int(a[2]),a[0])
def option_parser(args,options,lists,version=0):
# Check whether there is a request for help
if '-h' in args or '--help' in args:
help()
# Convert the option list to a dictionary, discarding all comments
options = dict([i for i in options if not type(i) == str])
# This information we would like to print to some files, so let's put it in our information class
options['Version'] = version
options['Arguments'] = args[:]
while args:
ar = args.pop(0)
options[ar].setvalue([args.pop(0) for i in range(options[ar].num)])
## LOGGING ##
# Set the log level and communicate which options are set and what is happening
# If 'Verbose' is set, change the logger level
logLevel = options["-v"] and logging.DEBUG or logging.INFO
logging.basicConfig(format='%(levelname)-7s %(message)s',level=logLevel)
logging.info('MARTINIZE, script version %s'%version)
logging.info('If you use this script please cite:')
logging.info('de Jong et al., J. Chem. Theory Comput., 2013, DOI:10.1021/ct300646g')
# The make the program flexible, the forcefield parameters are defined
# for multiple forcefield. Check if a existing one is defined:
###_tmp = __import__(options['-ff'].value.lower())
###options['ForceField'] = getattr(_tmp,options['-ff'].value.lower())()
try:
try:
# Try to load the forcefield class from a different file
_tmp = __import__(options['-ff'].value.lower())
options['ForceField'] = getattr(_tmp,options['-ff'].value.lower())()
except:
# Try to load the forcefield class from the current file
options['ForceField'] = globals()[options['-ff'].value.lower()]()
except:
logging.error("Forcefield '%s' can not be found."%(options['-ff']))
sys.exit()
# Process the raw options from the command line
# Boolean options are set to more intuitive variables
options['Collagen'] = options['-collagen']
options['chHIS'] = options['-his']
options['ChargesAtBreaks'] = options['-cb']
options['NeutralTermini'] = options['-nt']
options['ExtendedDihedrals'] = options['-ed']
options['RetainHETATM'] = False # options['-hetatm']
options['SeparateTop'] = options['-sep']
options['MixedChains'] = False # options['-mixed']
options['ElasticNetwork'] = options['-elastic']
# Parsing of some other options into variables
options['ElasticMaximumForce'] = options['-ef'].value
options['ElasticMinimumForce'] = options['-em'].value
options['ElasticLowerBound'] = options['-el'].value
options['ElasticUpperBound'] = options['-eu'].value
options['ElasticDecayFactor'] = options['-ea'].value
options['ElasticDecayPower'] = options['-ep'].value
options['ElasticBeads'] = options['-eb'].value.split(',')
options['PosResForce'] = options['-pf'].value
options['PosRes'] = [i.lower() for i in options['-p'].value.split(",")]
if "none" in options['PosRes']: options['PosRes'] = []
if "backbone" in options['PosRes']: options['PosRes'].append("BB")
if options['ForceField'].ElasticNetwork:
# Some forcefields, like elnedyn, always use an elatic network. This is set in the
# forcefield file, with the parameter ElasticNetwork.
options['ElasticNetwork'] = True
# Merges, links and cystines
options['mergeList'] = "all" in lists['merges'] and ["all"] or [i.split(",") for i in lists['merges']]
# Process links
linkList = []
linkListCG = []
for i in lists['links']:
ln = i.split(",")
a, b = str2atom(ln[0]), str2atom(ln[1])
if len(ln) > 3: # Bond with given length and force constant
bl, fc = (ln[2] and float(ln[2]) or None, float(ln[3]))
elif len(a) == 3: # Constraint at given distance
bl, fc = float(a[2]), None
else: # Constraint at distance in structure
bl, fc = None, None
# Store the link, but do not list the atom name in the
# atomistic link list. Otherwise it will not get noticed
# as a valid link when checking for merging chains
linkList.append(((None,a[1],a[2],a[3]),(None,b[1],b[2],b[3])))
linkListCG.append((a,b,bl,fc))
# Cystines -- This should be done for all special bonds listed in the _special_ dictionary
CystineCheckBonds = False # By default, do not detect cystine bridges
CystineMaxDist2 = (10*0.22)**2 # Maximum distance (A) for detection of SS bonds
for i in lists['cystines']:
if i.lower() == "auto":
CystineCheckBonds = True
elif i.replace(".","").isdigit():
CystineCheckBonds = True
CystineMaxDist2 = (10*float(i))**2
else:
# This item should be a pair of cysteines
cysA, cysB = [str2atom(j) for j in i.split(",")]
# Internally we handle the residue number shifted by ord(' ')<<20. We have to add this to the
# cys-residue numbers given here as well.
constant = 32<<20
linkList.append((("SG","CYS",cysA[2]+constant,cysA[3]),("SG","CYS",cysB[2]+constant,cysB[3])))
linkListCG.append((("SC1","CYS",cysA[2]+constant,cysA[3]),("SC1","CYS",cysB[2]+constant,cysB[3]),-1,-1))
# Now we have done everything to it, we can add Link/cystine related stuff to options
# 'multi' is not stored anywhere else, so that we also add
options['linkList'] = linkList
options['linkListCG'] = linkListCG
options['CystineCheckBonds'] = CystineCheckBonds
options['CystineMaxDist2'] = CystineMaxDist2
options['multi'] = lists['multi']
logging.info("Chain termini will%s be charged"%(options['NeutralTermini'] and " not" or ""))
logging.info("Residues at chain brakes will%s be charged"%((not options['ChargesAtBreaks']) and " not" or ""))
if options.has_key('ForceField'):
logging.info("The %s forcefield will be used."%(options['ForceField'].name))
else:
logging.error("Forcefield '%s' has not been implemented."%(options['-ff']))
sys.exit()
if options['ExtendedDihedrals']:
logging.info('Dihedrals will be used for extended regions. (Elastic bonds may be more stable)')
else:
logging.info('Local elastic bonds will be used for extended regions.')
if options['PosRes']:
logging.info("Position restraints will be generated.")
logging.warning("Position restraints are only enabled if -DPOSRES is set in the MDP file")
if options['MixedChains']:
logging.warning("So far no parameters for mixed chains are available. This might crash the program!")
if options['RetainHETATM']:
logging.warning("I don't know how to handle HETATMs. This will probably crash the program.")
return options
#################################################
## 3 # HELPER FUNCTIONS, CLASSES AND SHORTCUTS ## -> @FUNC <-
#################################################
import math
#----+------------------+
## A | STRING FUNCTIONS |
#----+------------------+
# Split a string
def spl(x):
return x.split()
# Split each argument in a list
def nsplit(*x):
return [i.split() for i in x]
# Make a dictionary from two lists
def hash(x,y):
return dict(zip(x,y))
# Function to reformat pattern strings
def pat(x,c="."):
return x.replace(c,"\x00").split()
# Function to generate formatted strings according to the argument type
def formatString(i):
if type(i) == str:
return i
if type(i) == int:
return "%5d"%i
if type(i) == float:
return "%8.5f"%i
else:
return str(i)
#----+----------------+
## B | MATH FUNCTIONS |
#----+----------------+
def cos_angle(a,b):
p = sum([i*j for i,j in zip(a,b)])
q = math.sqrt(sum([i*i for i in a])*sum([j*j for j in b]))
return min(max(-1,p/q),1)
def norm2(a):
return sum([i*i for i in a])
def norm(a):
return math.sqrt(norm2(a))
def distance2(a,b):
return (a[0]-b[0])**2+(a[1]-b[1])**2+(a[2]-b[2])**2
##########################
## 4 # FG -> CG MAPPING ## -> @MAP <-
##########################
dnares3 = " DA DC DG DT"
dnares1 = " dA dC dG dT"
rnares3 = " A C G U"
rnares1 = " rA rC rG rU" #
# Amino acid nucleic acid codes:
# The naming (AA and '3') is not strictly correct when adding DNA/RNA, but we keep it like this for consistincy.
AA3 = spl("TRP TYR PHE HIS HIH ARG LYS CYS ASP GLU ILE LEU MET ASN PRO HYP GLN SER THR VAL ALA GLY"+dnares3+rnares3) #@#
AA1 = spl(" W Y F H H R K C D E I L M N P O Q S T V A G"+dnares1+rnares1) #@#
# Dictionaries for conversion from one letter code to three letter code v.v.
AA123, AA321 = hash(AA1,AA3),hash(AA3,AA1)
# Residue classes:
protein = AA3[:-8] # remove eight to get rid of DNA/RNA here.
water = spl("HOH SOL TIP")
lipids = spl("DPP DHP DLP DMP DSP POP DOP DAP DUP DPP DHP DLP DMP DSP PPC DSM DSD DSS")
nucleic = spl("DAD DCY DGU DTH ADE CYT GUA THY URA DA DC DG DT")
residueTypes = dict(
[(i,"Protein") for i in protein ]+
[(i,"Water") for i in water ]+
[(i,"Lipid") for i in lipids ]+
[(i,"Nucleic") for i in nucleic ]
)
class CoarseGrained:
# Class for mapping an atomistic residue list to a coarsegrained one
# Should get an __init__ function taking a residuelist, atomlist, Pymol selection or ChemPy model
# The result should be stored in a list-type attribute
# The class should have pdbstr and grostr methods
# Standard mapping groups
# Protein backbone
bb = "N CA C O H H1 H2 H3 O1 O2" #@#
# Lipid tails
palmitoyl1 = nsplit("C1B C1C C1D C1E","C1F C1G C1H C1I","C1J C1K C1L C1M","C1N C1O C1P") #@#
palmitoyl2 = nsplit("C2B C2C C2D C2E","C2F C2G C2H C2I","C2J C2K C2L C2M","C2N C2O C2P") #@#
oleyl1 = nsplit("C1B C1C C1D C1E","C1F C1G C1H","C1I C1J","C1K C1L C1M C1N","C1O C1P C1Q C1R") #@#
oleyl2 = nsplit("C2B C2C C2D C2E","C2F C2G C2H","C2I C2J","C2K C2L C2M C2N","C2O C2P C2Q C2R") #@#
#lauroyl1 = []
#stearoyl1 = []
#arachidonoyl1 = []
#linoleyl1 = []
#hexanoyl1 = []
# Lipid head groups
#phoshpatidylcholine =
phosphatydilethanolamine = nsplit("N H1 H2 H3 CA","CB P OA OB OC OD","CC CD OG C2A OH","CE OE C1A OF") #@#
phosphatidylglycerol = nsplit("H1 O1 CA H2 O2 CB","CC P OA OB OC OD","CD CE OG C2A OH","CF OE C1A OF") #@#
#phosphatidylserine =
dna_bb = "P OP1 OP2 O5' O3'","C5' O4' C4'","C3' O3' C2' C1'"
# This is the mapping dictionary
# For each residue it returns a list, each element of which
# lists the atom names to be mapped to the corresponding bead.
# The order should be the standard order of the coarse grained
# beads for the residue. Only atom names matching with those
# present in the list of atoms for the residue will be used
# to determine the bead position. This adds flexibility to the
# approach, as a single definition can be used for different
# states of a residue (e.g., GLU/GLUH).
# For convenience, the list can be specified as a set of strings,
# converted into a list of lists by 'nsplit' defined above.
mapping = {
"ALA": nsplit(bb + " CB"),
"CYS": nsplit(bb,"CB SG"),
"ASP": nsplit(bb,"CB CG OD1 OD2"),
"GLU": nsplit(bb,"CB CG CD OE1 OE2"),
"PHE": nsplit(bb,"CB CG CD1 HD1","CD2 HD2 CE2 HE2","CE1 HE1 CZ HZ"),
"GLY": nsplit(bb),
"HIS": nsplit(bb,"CB CG","CD2 HD2 NE2 HE2","ND1 HD1 CE1 HE1"),
"HIH": nsplit(bb,"CB CG","CD2 HD2 NE2 HE2","ND1 HD1 CE1 HE1"), # Charged Histidine.
"ILE": nsplit(bb,"CB CG1 CG2 CD CD1"),
"LYS": nsplit(bb,"CB CG CD","CE NZ HZ1 HZ2 HZ3"),
"LEU": nsplit(bb,"CB CG CD1 CD2"),
"MET": nsplit(bb,"CB CG SD CE"),
"ASN": nsplit(bb,"CB CG ND1 ND2 OD1 OD2 HD11 HD12 HD21 HD22"),
"PRO": nsplit(bb,"CB CG CD"),
"HYP": nsplit(bb,"CB CG CD OD"),
"GLN": nsplit(bb,"CB CG CD OE1 OE2 NE1 NE2 HE11 HE12 HE21 HE22"),
"ARG": nsplit(bb,"CB CG CD","NE HE CZ NH1 NH2 HH11 HH12 HH21 HH22"),
"SER": nsplit(bb,"CB OG HG"),
"THR": nsplit(bb,"CB OG1 HG1 CG2"),
"VAL": nsplit(bb,"CB CG1 CG2"),
"TRP": nsplit(bb,"CB CG CD2","CD1 HD1 NE1 HE1 CE2","CE3 HE3 CZ3 HZ3","CZ2 HZ2 CH2 HH2"),
"TYR": nsplit(bb,"CB CG CD1 HD1","CD2 HD2 CE2 HE2","CE1 HE1 CZ OH HH"),
"POPE": phosphatydilethanolamine + palmitoyl1 + oleyl2,
"DOPE": phosphatydilethanolamine + oleyl1 + oleyl2,
"DPPE": phosphatydilethanolamine + palmitoyl1 + palmitoyl2,
"POPG": phosphatidylglycerol + palmitoyl1 + oleyl2,
"DOPG": phosphatidylglycerol + oleyl1 + oleyl2,
"DPPG": phosphatidylglycerol + palmitoyl1 + palmitoyl2,
"DA": nsplit("P OP1 OP2 O5' O3' O1P O2P","C5' O4' C4'","C3' C2' C1'","N9 C4","C8 N7 C5","C6 N6 N1","C2 N3"),
"DG": nsplit("P OP1 OP2 O5' O3' O1P O2P","C5' O4' C4'","C3' C2' C1'","N9 C4","C8 N7 C5","C6 O6 N1","C2 N2 N3"),
"DC": nsplit("P OP1 OP2 O5' O3' O1P O2P","C5' O4' C4'","C3' C2' C1'","N1 C6","C5 C4 N4","N3 C2 O2"),
"DT": nsplit("P OP1 OP2 O5' O3' O1P O2P","C5' O4' C4'","C3' C2' C1'","N1 C6","C5 C4 O4 C7 C5M","N3 C2 O2"),
}
# Generic names for side chain beads
residue_bead_names = spl("BB SC1 SC2 SC3 SC4")
# Generic names for DNA beads
residue_bead_names_dna = spl("BB1 BB2 BB3 SC1 SC2 SC3 SC4")
# This dictionary contains the bead names for all residues,
# following the order in 'mapping'
names = {
"POPE": "NH3 PO4 GL1 GL2 C1A C2A C3A C4A C1B C2B D3B C4B C5B".split(),
"POPG": "GLC PO4 GL1 GL2 C1A C2A C3A C4A C1B C2B D3B C4B C5B".split()
}
# Add default bead names for all amino acids
names.update([(i,("BB","SC1","SC2","SC3","SC4")) for i in AA3])
# Add the default bead names for all DNA nucleic acids
names.update([(i,("BB1","BB2","BB3","SC1","SC2","SC3","SC4")) for i in nucleic])
# This dictionary allows determining four letter residue names
# for ones specified with three letters, e.g., resulting from
# truncation to adhere to the PDB format.
# Each entry returns a prototypical test, given as a string,
# and the residue name to be applied if eval(test) is True.
# This is particularly handy to determine lipid types.
# The test assumes there is a local or global array 'atoms'
# containing the atom names of the residue in correct order.
restest = {
"POP": [('atoms[0] == "CA"', "POPG"),
('atoms[0] == "N"', "POPE")]
}
# Crude mass for weighted average. No consideration of united atoms.
# This will probably give only minor deviations, while also giving less headache
mass = {'H': 1,'C': 12,'N': 14,'O': 16,'S': 32,'P': 31,'M': 0}
# Determine average position for a set of weights and coordinates
# This is a rather specific function that requires a list of items
# [(m,(x,y,z),id),..] and returns the weighted average of the
# coordinates and the list of ids mapped to this bead
def aver(b):
mwx,ids = zip(*[((m*x,m*y,m*z),i) for m,(x,y,z),i in b]) # Weighted coordinates
tm = sum(zip(*b)[0]) # Sum of weights
return [sum(i)/tm for i in zip(*mwx)],ids # Centre of mass
# Return the CG beads for an atomistic residue, using the mapping specified above
# The residue 'r' is simply a list of atoms, and each atom is a list:
# [ name, resname, resid, chain, x, y, z ]
def map(r,ca2bb = False):
p = CoarseGrained.mapping[r[0][1]] # Mapping for this residue
if ca2bb: p[0] = ["CA"] # Elnedyn maps BB to CA, ca2bb is False or True
# Get the name, mass and coordinates for all atoms in the residue
a = [(i[0],CoarseGrained.mass.get(i[0][0],0),i[4:]) for i in r]
# Store weight, coordinate and index for atoms that match a bead
q = [[(m,coord,a.index((atom,m,coord))) for atom,m,coord in a if atom in i] for i in p]
# Bead positions
return zip(*[aver(i) for i in q])
# Mapping for index file
def mapIndex(r,ca2bb = False):
p = CoarseGrained.mapping[r[0][1]] # Mapping for this residue
if ca2bb: p[0] = ["CA"] # Elnedyn maps BB to CA, ca2bb is False or True
# Get the name, mass and coordinates for all atoms in the residue
a = [(i[0],CoarseGrained.mass.get(i[0][0],0),i[4:]) for i in r]
# Store weight, coordinate and index for atoms that match a bead
return [[(m,coord,a.index((atom,m,coord))) for atom,m,coord in a if atom in i] for i in p]
#############################
## 5 # SECONDARY STRUCTURE ## -> @SS <-
#############################
import logging,os,sys
import subprocess as subp
#----+--------------------------------------+
## A | SECONDARY STRUCTURE TYPE DEFINITIONS |
#----+--------------------------------------+
# This table lists all coarse grained secondary structure types
# The following are matched lists. Make sure they stay matched.
# The lists do not need to be of the same length. The longer list
# will be truncated when combined with a shorter list, e.g. with
# dihedral definitions, which are not present for coil and termini
#
ss_names = {
"F": "Collagenous Fiber", #@#
"E": "Extended structure (beta sheet)", #@#
"H": "Helix structure", #@#
"1": "Helix start (H-bond donor)", #@#
"2": "Helix end (H-bond acceptor)", #@#
"3": "Ambivalent helix type (short helices)", #@#
"T": "Turn", #@#
"S": "Bend", #@#
"C": "Coil", #@#
}
bbss = ss_names.keys()
bbss = spl(" F E H 1 2 3 T S C") # SS one letter
# The following dictionary contains secondary structure types as assigned by
# different programs. The corresponding Martini secondary structure types are
# listed in cgss
#
# NOTE:
# Each list of letters in the dictionary ss should exactly match the list
# in cgss.
#
ssdefs = {
"dssp": list(".HGIBETSC~"), # DSSP one letter secondary structure code #@#
"pymol": list(".H...S...L"), # Pymol one letter secondary structure code #@#
"gmx": list(".H...ETS.C"), # Gromacs secondary structure dump code #@#
"self": list("FHHHEETSCC") # Internal CG secondary structure codes #@#
}
cgss = list("FHHHEETSCC") # Corresponding CG secondary structure types #@#
#----+-------------------------------------------+
## B | SECONDARY STRUCTURE PATTERN SUBSTITUTIONS |
#----+-------------------------------------------+
# For all structure types specific dihedrals may be used if four or
# more consecutive residues are assigned that type.
# Helix start and end regions are special and require assignment of
# specific types. The following pattern substitutions are applied
# (in the given order). A dot matches any other type.
# Patterns can be added to the dictionaries. This only makes sense
# if for each key in patterns there is a matching key in pattypes.
patterns = {
"H": pat(".H. .HH. .HHH. .HHHH. .HHHHH. .HHHHHH. .HHHHHHH. .HHHH HHHH.") #@#
}
pattypes = {
"H": pat(".3. .33. .333. .3333. .13332. .113322. .1113222. .1111 2222.") #@#
}
#----+----------+
## C | INTERNAL |
#----+----------+
# Pymol Colors
# F E H 1 2 3 T S C
ssnum = ( 13, 4, 2, 2, 2, 2, 6, 22, 0 ) #@#
# Dictionary returning a number for a given type of secondary structure
# This can be used for setting the b-factor field for coloring
ss2num = hash(bbss,ssnum)
# List of programs for which secondary structure definitions can be processed
programs = ssdefs.keys()
# Dictionaries mapping ss types to the CG ss types
ssd = dict([ (i, hash(ssdefs[i],cgss)) for i in programs ])
# From the secondary structure dictionaries we create translation tables
# with which all secondary structure types can be processed. Anything
# not listed above will be mapped to C (coil).
# Note, a translation table is a list of 256 characters to map standard
# ascii characters to.
def tt(program):
return "".join([ssd[program].get(chr(i),"C") for i in range(256)])
# The translation table depends on the program used to obtain the
# secondary structure definitions
sstt = dict([(i,tt(i)) for i in programs])
# The following translation tables are used to identify stretches of
# a certain type of secondary structure. These translation tables have
# every character, except for the indicated secondary structure, set to
# \x00. This allows summing the sequences after processing to obtain
# a single sequence summarizing all the features.
null = "\x00"
sstd = dict([ (i,ord(i)*null+i+(255-ord(i))*null) for i in cgss ])
# Pattern substitutions
def typesub(seq,patterns,types):
seq = null+seq+null
for i,j in zip(patterns,types):
seq = seq.replace(i,j)
return seq[1:-1]
# The following function translates a string encoding the secondary structure
# to a string of corresponding Martini types, taking the origin of the
# secondary structure into account, and replacing termini if requested.
def ssClassification(ss,program="dssp"):
# Translate dssp/pymol/gmx ss to Martini ss
ss = ss.translate(sstt[program])
# Separate the different secondary structure types
sep = dict([(i,ss.translate(sstd[i])) for i in sstd.keys()])
# Do type substitutions based on patterns
# If the ss type is not in the patterns lists, do not substitute
# (use empty lists for substitutions)
typ = [ typesub(sep[i],patterns.get(i,[]),pattypes.get(i,[]))
for i in sstd.keys()]
# Translate all types to numerical values
typ = [ [ord(j) for j in list(i)] for i in typ ]
# Sum characters back to get a full typed sequence
typ = "".join([chr(sum(i)) for i in zip(*typ)])
# Return both the actual as well as the fully typed sequence
return ss, typ
# The following functions are for determination of secondary structure,
# given a list of atoms. The atom format is generic and can be written out
# as PDB or GRO. The coordinates are in Angstrom.
# NOTE: There is the *OLD* DSSP and the *NEW* DSSP, which require
# different calls. The old version uses '--' to indicate reading from stdin
# whereas the new version uses '-i /dev/stdin'
def call_dssp(chain,atomlist,executable='dsspcmbi'):
'''Get the secondary structure, by calling to dssp'''
ssdfile = 'chain_%s.ssd'%chain.id
try:
if os.system(executable+" -V 2>/dev/null"):
logging.debug("New version of DSSP; Executing '%s -i /dev/stdin -o %s'"%(executable,ssdfile))
p = subp.Popen([executable,"-i","/dev/stdin","-o",ssdfile],stderr=subp.PIPE,stdout=subp.PIPE,stdin=subp.PIPE)
else:
logging.debug("Old version of DSSP; Executing '%s -- %s'"%(executable,ssdfile))
p = subp.Popen([executable,"--",ssdfile],stderr=subp.PIPE,stdout=subp.PIPE,stdin=subp.PIPE)
except OSError:
logging.error("A problem occured calling %s."%executable)
sys.exit(1)
for atom in atomlist:
if atom[0][:2] == 'O1': atom=('O',)+atom[1:]
if atom[0][0]!='H' and atom[0][:2]!='O2': p.stdin.write(pdbOut(atom))
p.stdin.write('TER\n')
data = p.communicate()
p.wait()
main,ss = 0,''
for line in open(ssdfile).readlines():
if main and not line[13] == "!": ss+=line[16]
if line[:15] == ' # RESIDUE AA': main=1
return ss
ssDetermination = {
"dssp": call_dssp
}
################################
## 6 # FORCE FIELD PARAMETERS ## -> @FF <-
################################
class martini21:
def __init__(self):
# parameters are defined here for the following (protein) forcefields:
self.name = 'martini21'
# Charged types:
self.charges = {"Qd":1, "Qa":-1, "SQd":1, "SQa":-1, "RQd":1, "AQa":-1} #@#
#----+---------------------+
## A | BACKBONE PARAMETERS |
#----+---------------------+
#
# bbss lists the one letter secondary structure code
# bbdef lists the corresponding default backbone beads
# bbtyp lists the corresponding residue specific backbone beads
#
# bbd lists the structure specific backbone bond lengths
# bbkb lists the corresponding bond force constants
#
# bba lists the structure specific angles
# bbka lists the corresponding angle force constants
#
# bbd lists the structure specific dihedral angles
# bbkd lists the corresponding force constants
#
# -=NOTE=-
# if the secondary structure types differ between bonded atoms
# the bond is assigned the lowest corresponding force constant
#
# -=NOTE=-
# if proline is anywhere in the helix, the BBB angle changes for
# all residues
#
###############################################################################################
## BEADS ## #
# F E H 1 2 3 T S C # SS one letter
self.bbdef = spl(" N0 Nda N0 Nd Na Nda Nda P5 P5") # Default beads #@#
self.bbtyp = { # #@#
"ALA": spl(" C5 N0 C5 N0 N0 N0 N0 P4 P4"),# ALA specific #@#
"PRO": spl(" C5 N0 C5 N0 Na N0 N0 Na Na"),# PRO specific #@#
"HYP": spl(" C5 N0 C5 N0 N0 N0 N0 Na Na") # HYP specific #@#
} # #@#
## BONDS ## #
self.bbldef = (.365, .350, .350, .350, .350, .350, .350, .350, .350) # BB bond lengths #@#
self.bbkb = (1250, 1250, 1250, 1250, 1250, 1250, 500, 400, 400) # BB bond kB #@#
self.bbltyp = {} # #@#
self.bbkbtyp = {} # #@#
## ANGLES ## #
self.bbadef = ( 119.2,134, 96, 96, 96, 96, 100, 130, 127) # BBB angles #@#
self.bbka = ( 150, 25, 700, 700, 700, 700, 25, 25, 25) # BBB angle kB #@#
self.bbatyp = { # #@#
"PRO": ( 119.2,134, 98, 98, 98, 98, 100, 130, 127), # PRO specific #@#
"HYP": ( 119.2,134, 98, 98, 98, 98, 100, 130, 127) # PRO specific #@#
} # #@#
self.bbkatyp = { # #@#
"PRO": ( 150, 25, 100, 100, 100, 100, 25, 25, 25), # PRO specific #@#
"HYP": ( 150, 25, 100, 100, 100, 100, 25, 25, 25) # PRO specific #@#
} # #@#
## DIHEDRALS ## #
self.bbddef = ( 90.7, 0, -120, -120, -120, -120) # BBBB dihedrals #@#
self.bbkd = ( 100, 10, 400, 400, 400, 400) # BBBB kB #@#
self.bbdmul = ( 1, 1, 1, 1, 1, 1) # BBBB mltplcty #@#
self.bbdtyp = {} # #@#
self.bbkdtyp = {} # #@#
#
###############################################################################################
# Some Forcefields use the Ca position to position the BB-bead (me like!)
# martini 2.1 doesn't
self.ca2bb = False
# BBS angle, equal for all ss types
# Connects BB(i-1),BB(i),SC(i), except for first residue: BB(i+1),BB(i),SC(i)
# ANGLE Ka
self.bbsangle = [ 100, 25] #@#
# Bonds for extended structures (more stable than using dihedrals)
# LENGTH FORCE
self.ebonds = { #@#
'short': [ .640, 2500], #@#
'long' : [ .970, 2500] #@#
} #@#
#----+-----------------------+
## B | SIDE CHAIN PARAMETERS |
#----+-----------------------+
# To be compatible with Elnedyn, all parameters are explicitly defined, even if they are double.
self.sidechains = {
#RES# BEADS BONDS ANGLES DIHEDRALS
# BB-SC SC-SC BB-SC-SC SC-SC-SC
"TRP": [spl("SC4 SP1 SC4 SC4"),[(0.300,5000)]+[(0.270,None) for i in range(5)], [(210,50),(90,50),(90,50)], [(0,50),(0,200)]],
"TYR": [spl("SC4 SC4 SP1"), [(0.320,5000), (0.270,None), (0.270,None),(0.270,None)],[(150,50),(150,50)], [(0,50)]],
"PHE": [spl("SC4 SC4 SC4"), [(0.310,7500), (0.270,None), (0.270,None),(0.270,None)],[(150,50),(150,50)], [(0,50)]],
"HIS": [spl("SC4 SP1 SP1"), [(0.320,7500), (0.270,None), (0.270,None),(0.270,None)],[(150,50),(150,50)], [(0,50)]],
"HIH": [spl("SC4 SP1 SQd"), [(0.320,7500), (0.270,None), (0.270,None),(0.270,None)],[(150,50),(150,50)], [(0,50)]],
"ARG": [spl("N0 Qd"), [(0.330,5000), (0.340,5000)], [(180,25)]],
"LYS": [spl("C3 Qd"), [(0.330,5000), (0.280,5000)], [(180,25)]],