-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpolypeptide.py
414 lines (382 loc) · 18.1 KB
/
polypeptide.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
from proteinmath import *
from aminoacids import *
from randomcoil import *
from secondary_structure import *
import os.path
import string, resource
import gc
class Polypeptide(object):
'Represents a series of amino acids and provides for input from a file and further manipulation.'
def __init__(self, preaas=None):
self.hashtable = AAHashTable()
self.aminoacids = []
self.secondary_structures = []
self.mass = 0.0
if preaas:
for i, aa in enumerate(preaas):
aa = aa.withtag(i)
self.hashtable.add(aa)
self.aminoacids.append(aa)
if aa:
self.mass += aa.mass
def randomcoil(self, seq=None, permissions=None, struct_permissions=None, repeat_cutoff=100):
"""Generates a random 3D structure for the sequence you provide. If seq is None, this method tries to use the sequence of amino acids that's already existing in the protein."""
self.hashtable.clear()
self.mass = 0.0
self.hashtable = AAHashTable()
if not seq:
seq = ""
for aa in self.aminoacids:
seq += aacodec(aa.type)
if len(self.secondary_structures) and struct_permissions:
new_aas = generate_randomcoil(seq, permissions=permissions, secondary_structures=self.secondary_structures, struct_permissions=struct_permissions, repeat_cutoff=repeat_cutoff, cache_aas=self.aminoacids)
else:
new_aas = generate_randomcoil(seq, permissions, repeat_cutoff=repeat_cutoff, cache_aas=self.aminoacids)
if new_aas: self.aminoacids = new_aas
for aa in self.aminoacids:
if aa: self.mass += aa.mass
self.hashtable.add(aa)
return new_aas is not None
def add_aa(self, aa):
self.aminoacids.append(aa)
self.hashtable.add(aa)
self.mass = 0.0
for aa in self.aminoacids:
if aa: self.mass += aa.mass
def add_aas(self, aas):
self.mass = 0.0
if not self.hashtable: self.hashtable = AAHashTable()
for aa in aas:
self.aminoacids.append(aa)
self.hashtable.add(aa)
if aa: self.mass += aa.mass
def __str__(self):
descriptions = [self.aminoacids[x].__str__() for x in range(len(self.aminoacids))]
return "Polypeptide, %i amino acids:\n\t" % len(self.aminoacids) + str(descriptions)
def nearby_aa(self, aa, distance, index=-1, consec=2, excluded=[], mindiff=1):
"""Pass in an array of integers for excluded to specify any tags that should not be counted in the search. If you specify a mindiff and consec is False, this method will only return amino acids whose tag difference is greater than mindiff."""
assert aa != None, "Must provide a valid amino acid object"
#if index == -1:
# index = self.aminoacids.index(aa)
ret = self.hashtable.nearby_aa(aa, distance)
ret2 = []
for ret_aa in ret:
if ((consec == False and math.fabs(ret_aa.tag - aa.tag) > mindiff) or (consec == True and math.fabs(ret_aa.tag - aa.tag) == 1) or (consec == 2)) and ret_aa.tag not in excluded:
if consec == True and aa.has_break and ret_aa.tag == aa.tag + 1: continue
ret2.append(ret_aa)
return ret2
def xyz(self, highlight=None, escaped=True):
if escaped is True:
ret = "%d\\nNo comment\\n" % (len(self.aminoacids) * 4)
for aa in self.aminoacids:
name = "C"
if highlight is not None and aa.tag in highlight: name = "Cl"
ret += "%s\t%.4f\t%.4f\t%.4f\\n" % (name, aa.acarbon.x, aa.acarbon.y, aa.acarbon.z)
h = aa.toglobal(Point3D(1.1, 0.0, math.acos(-1.0 / 3.0)).tocartesian())
ret += "H\t%.4f\t%.4f\t%.4f\\n" % (h.x, h.y, h.z)
ret += "C\t%.4f\t%.4f\t%.4f\\n" % (aa.carbon.x, aa.carbon.y, aa.carbon.z)
ret += "N\t%.4f\t%.4f\t%.4f\\n" % (aa.nitrogen.x, aa.nitrogen.y, aa.nitrogen.z)
return ret
else:
ret = "%d\nNo comment\n" % (len(self.aminoacids) * 4)
for aa in self.aminoacids:
name = "C"
if highlight is not None and aa.tag in highlight: name = "Cl"
ret += "%s\t%.4f\t%.4f\t%.4f\n" % (name, aa.acarbon.x, aa.acarbon.y, aa.acarbon.z)
h = aa.toglobal(Point3D(1.1, 0.0, math.acos(-1.0 / 3.0)).tocartesian())
ret += "H\t%.4f\t%.4f\t%.4f\n" % (h.x, h.y, h.z)
ret += "C\t%.4f\t%.4f\t%.4f\n" % (aa.carbon.x, aa.carbon.y, aa.carbon.z)
ret += "N\t%.4f\t%.4f\t%.4f\n" % (aa.nitrogen.x, aa.nitrogen.y, aa.nitrogen.z)
return ret
def pdb(self, modelno=1, chain=0, range=None):
ret = "MODEL %d\n" % modelno
idx = 1
chain_letters = string.ascii_uppercase
enum_array = self.aminoacids
if range:
enum_array = self.aminoacids[range[0] - 1 : range[1]]
for i, aa in enumerate(enum_array):
aa.nitrogen = aa.toglobal(Point3D(NITROGEN_BOND_LENGTH, math.pi + math.acos(-1.0 / 2.0) / 2.0, math.acos(-1.0 / 3.0)).tocartesian())
aa.carbon = aa.toglobal(Point3D(CARBON_BOND_LENGTH, math.pi - math.acos(-1.0 / 2.0) / 2.0, math.acos(-1.0 / 3.0)).tocartesian())
ret += "ATOM {0:>5} N {1} {2}{3:>4} {4:>8}{5:>8}{6:>8}\n".format(idx, aa.type, chain_letters[chain], i + 1, "%.3f" % aa.nitrogen.x, "%.3f" % aa.nitrogen.y, "%.3f" % aa.nitrogen.z)
idx += 1
ret += "ATOM {0:>5} CA {1} {2}{3:>4} {4:>8}{5:>8}{6:>8}\n".format(idx, aa.type, chain_letters[chain], i + 1, "%.3f" % aa.acarbon.x, "%.3f" % aa.acarbon.y, "%.3f" % aa.acarbon.z)
idx += 1
h = aa.toglobal(Point3D(1.1, 0.0, math.acos(-1.0 / 3.0)).tocartesian())
ret += "ATOM {0:>5} H {1} {2}{3:>4} {4:>8}{5:>8}{6:>8}\n".format(idx, aa.type, chain_letters[chain], i + 1, "%.3f" % h.x, "%.3f" % h.y, "%.3f" % h.z)
idx += 1
ret += "ATOM {0:>5} C {1} {2}{3:>4} {4:>8}{5:>8}{6:>8}\n".format(idx, aa.type, chain_letters[chain], i + 1, "%.3f" % aa.carbon.x, "%.3f" % aa.carbon.y, "%.3f" % aa.carbon.z)
idx += 1
for atom, location in aa.otheratoms.iteritems():
location = aa.toglobal(location)
ret += "ATOM {0:>5} {1:<4}{2} {3}{4:>4} {5:>8}{6:>8}{7:>8}\n".format(idx, atom.strip(), aa.type, chain_letters[chain], i + 1, "%.3f" % location.x, "%.3f" % location.y, "%.3f" % location.z)
idx += 1
if aa.has_break: chain += 1
ret += "ENDMDL\n"
return ret
def rg(self):
"""Calculates the radius of gyration of the polypeptide."""
#First compute the center of mass
cm = Point3D.zero()
total_mass = 0.0
for aa in self.aminoacids:
if not aa: continue
cm = cm.add(aa.acarbon.multiply(aa.mass))
total_mass += aa.mass
cm = cm.multiply(1.0 / total_mass)
#Now compute Rg
rg = 0.0
for aa in self.aminoacids:
if not aa: continue
rg += aa.mass * (aa.acarbon.distanceto(cm) ** 2)
rg = math.sqrt(rg / total_mass)
print "Radius of gyration:", rg
return rg
def volume(self):
density = 1.0 / (1.410 + 0.145 * math.exp(-self.mass / 13000.0)) # 0.73
volume = (density * 1e24) / (6.02e23) * self.mass
return volume
def center(self):
"""Center the polypeptide at the origin of the global coordinate system."""
center = Point3D.zero()
for aa in self.aminoacids:
center = center.add(aa.acarbon)
center = center.multiply(1.0 / len(self.aminoacids))
for aa in self.aminoacids:
aa.acarbon = aa.acarbon.subtract(center)
aa.nitrogen = aa.nitrogen.subtract(center)
aa.carbon = aa.carbon.subtract(center)
def read_file(self, f, checkgaps=False, otheratoms=False, secondary_structure=False, fillgaps=False, fillends=False, cache_aas=None, range=None):
"""Pass in a file or file-like object to read. Set checkgaps to True to return a list of missing amino acid indices (first in the tuple if necessary) if there is one or more gaps in the chain. Set otheratoms to True to add all atoms found in the PDB file to the amino acids (under the otheratoms array property of the amino acids)."""
gaps = []
current_aa = None
current_chain = None
current_seq = 0
current_tag = -1
minseq = 100000
maxseq = -10000
if secondary_structure:
self.secondary_structures = []
current_sheet = None
for line in f:
if line.find('ENDMDL') != -1: #Only keep one model for each polypeptide
if current_aa is not None and len(self.aminoacids) > 0 and current_aa.acarbon == Point3D.zero():
self.aminoacids.pop()
assert current_aa is None or current_aa.carbon != Point3D.zero() or current_aa.nitrogen != Point3D.zero(), "Cannot have an amino acid with no carbon or nitrogen location"
if current_aa:
current_aa.compute_coordinate_system_vectors()
if current_aa.i == Point3D.zero() and len(self.aminoacids) > 0:
self.aminoacids.pop()
break
if secondary_structure:
if line.find('HELIX') == 0:
start_seq = int(line[22:25]) - 1
end_seq = int(line[34:37]) - 1
type = line[39:41]
if type[1] in "1234567890":
type = int(type)
else:
type = int(type[0])
if range:
if end_seq < range[0] - 1 or start_seq > range[1] - 1: continue
start_seq = max(0, start_seq - range[0] + 1)
end_seq = min(end_seq - range[0] + 1, range[1] - range[0])
chain_start = line[19]
chain_end = line[31]
if chain_start.upper() != "A" or chain_end.upper() != "A": continue
self.secondary_structures.append(Helix(start_seq, end_seq, type))
elif line.find('SHEET') == 0:
sense = int(line[38:40])
start_seq = int(line[23:26]) - 1
end_seq = int(line[34:37]) - 1
if range:
if end_seq < range[0] - 1 or start_seq > range[1] - 1: continue
start_seq = max(0, start_seq - range[0] + 1)
end_seq = min(end_seq - range[0] + 1, range[1] - range[0])
chain_start = line[21]
chain_end = line[32]
if chain_start.upper() != "A" or chain_end.upper() != "A": continue
if not current_sheet or sense == 0:
if current_sheet:
self.secondary_structures.append(current_sheet)
current_sheet = Sheet(start_seq, end_seq, sense)
else:
current_sheet.add_strand(start_seq, end_seq, sense)
if line.find('ATOM') != 0: continue
if line[17:20] == "SOL": break #There are no more amino acids after the solvent molecules start
seq = int(line[23:26])
chain = line[21:22]
if chain is not current_chain and current_aa:
current_aa.has_break = True
if chain is not current_chain or current_seq != seq:
current_chain = chain
if current_aa is not None and current_aa.acarbon == Point3D.zero() and len(self.aminoacids) > 0:
self.aminoacids.pop()
assert current_aa is None or current_aa.carbon != Point3D.zero() or current_aa.nitrogen != Point3D.zero(), "Cannot have an amino acid with no carbon or nitrogen location: %r" % current_aa
if current_aa:
#mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
current_aa.compute_coordinate_system_vectors()
#print "Mem check 2:", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss - mem
#mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
if current_aa.i == Point3D.zero() and len(self.aminoacids) > 0:
self.aminoacids.pop()
if fillgaps:
self.aminoacids.append(None)
if range and range[0] > seq: continue
if range and range[1] < seq: break
if seq - current_seq > 1 and (current_seq != 0 or fillends):
current_seq += 1
if checkgaps:
print "Gap between", current_seq, "and", seq
for n in xrange(current_seq, seq): gaps.append(n)
if fillgaps:
while seq != current_seq:
self.aminoacids.append(None)
current_seq += 1
current_tag += 1
current_seq = seq
if current_seq < minseq: minseq = current_seq
if current_seq > maxseq: maxseq = current_seq
current_tag += 1
if cache_aas:
current_aa = cache_aas[0]
del cache_aas[0]
current_aa.type = line[17:20]
current_aa.tag = current_tag
else:
current_aa = AminoAcid(line[17:20], current_tag)
self.aminoacids.append(current_aa)
atom_code = line[13:17]
replaced = atom_code.replace(" ", "")
location = Point3D(float(line[30:38]), float(line[38:46]), float(line[46:54]))
if replaced == "CA" or atom_code == "CA A":
current_aa._acarbon = location
elif replaced == "CB" or atom_code == "CB A":
current_aa.sidechain = location
if otheratoms:
current_aa.add_other_atom(atom_code.split()[0], location)
elif replaced == "C" or atom_code == "C A":
current_aa.carbon = location
elif replaced == "N" or atom_code == "N A":
current_aa.nitrogen = location
elif otheratoms and atom_code[-1] != "B" and atom_code[0] != "H" and line[12] != "H" and replaced != "OXT":
current_aa.add_other_atom(atom_code.split()[0], location)
if current_aa is not None and current_aa.acarbon == Point3D.zero() and len(self.aminoacids) > 0:
self.aminoacids.pop()
if self.hashtable:
self.hashtable.clear()
else:
self.hashtable = AAHashTable()
self.mass = 0.0
for aa in self.aminoacids:
if aa:
self.mass += aa.mass
self.hashtable.add(aa)
if secondary_structure:
if current_sheet:
self.secondary_structures.append(current_sheet)
if checkgaps is True:
return (gaps, (minseq, maxseq))
else:
return (minseq, maxseq)
def read(self, filepath, checkgaps=False, otheratoms=False, secondary_structure=False, fillgaps=False, fillends=False, range=None):
"""Set checkgaps to True to return a True value (first in the tuple if necessary) if there is one or more gaps in the chain. Set otheratoms to True to add all atoms found in the PDB file to the amino acids (under the otheratoms array property of the amino acids). Set secondary_structure to True to populate the secondary_structures array of the protein with Helix and Sheet objects. Set fillgaps to True to fill the peptide amino acids array with None objects wherever there is a gap. Set range to a 1-indexed tuple (start, end) to specify the locations to start and end this peptide."""
f = open(filepath, 'r')
ret = self.read_file(f, checkgaps, otheratoms, secondary_structure, fillgaps, range)
f.close()
return ret
def iter_models_file(self, f, checkgaps=False, otheratoms=False, secondary_structure=False, fillgaps=False, fillends=False, range=None):
"""This function yields the model number for every model in the PDB file or list of lines you specify. At each iteration, the peptide assimilates the model into its own amino acids, so to evaluate the structure simply use this Polypeptide object. For information about the other parameters, see read()."""
if hasattr(f, "readlines"):
lines = f.readlines()
else:
lines = f
last_read = 0
peptide = None
cache = None
modelno = 0
i = 0
for i, l in enumerate(lines):
if l[:5] == "MODEL":
modelno = int(l[5:].strip())
elif l[:3] == "END":
if modelno == 0:
modelno = 1
try:
self.read_file(lines[last_read : i + 1], cache_aas=cache, checkgaps=checkgaps, otheratoms=otheratoms, secondary_structure=secondary_structure, fillgaps=fillgaps, fillends=fillends, range=range)
yield modelno
except AssertionError as e:
print "Assertion for model {}: {}".format(i, e)
curr_score = 0.0
last_read = i + 1
del self.aminoacids[:]
self.hashtable = None
continue
last_read = i + 1
cache = self.aminoacids
modelno = 1
def iter_models(self, filepath, checkgaps=False, otheratoms=False, secondary_structure=False, fillgaps=False, fillends=False, range=None):
"""This function iterates over the models found in the PDB file at filepath, reading them into this peptide's amino acid array and yielding the model number for each model. For information about the other parameters, see read()."""
with open(filepath, 'r') as file:
for modelno in self.iter_models_file(file, checkgaps, otheratoms, secondary_structure, fillgaps, fillends, range):
yield modelno
def secondary_structure_aa(self, aa_idx):
"""Returns the (structure, strand) of the protein that contains amino acid at the index provided, and None if there is no secondary structure defined there."""
if isinstance(aa_idx, AminoAcid):
return find_secondary_structure(self.secondary_structures, aa_idx.tag)
return find_secondary_structure(self.secondary_structures, aa_idx)
def add_secondary_structures(self, sec_struct_string, format='csv', range=None):
"""Takes a string of secondary structures in PDB or CSV format, depending on the format option, and adds them to the secondary structure array. CSV format should be 'type', 'subtype', 'start', 'end'. For instance, helix,1,3,6 covers amino acids at indices 2-5 (values are 1-indexed). If you pass a tuple (start, end) for range, then the secondary structures will be shifted to only cover the amino acids within that range."""
current_sheet = None
self.secondary_structures = []
if format == 'csv':
for line in sec_struct_string.split('\n'):
comps = line.split(',')
start = int(comps[2]) - 1
end = int(comps[3]) - 1
if range:
if end < range[0] - 1 or start > range[1] - 1: continue
start = max(0, start - range[0] + 1)
end = min(end - range[0] + 1, range[1] - range[0])
if comps[0].lower() == 'helix':
self.secondary_structures.append(Helix(start, end, int(comps[1])))
elif comps[0].lower() == 'sheet':
if int(comps[1]) == 0 or not current_sheet:
if current_sheet: self.secondary_structures.append(current_sheet)
current_sheet = Sheet(start, end, int(comps[1]))
else:
current_sheet.add_strand(start, end, int(comps[1]))
else: print "Unknown secondary structure"
else:
for line in sec_struct_string.split('\n'):
if line.find('HELIX') == 0:
start_seq = int(line[22:26]) - 1
end_seq = int(line[34:38]) - 1
if range:
if end_seq < range[0] - 1 or start_seq > range[1] - 1: continue
start_seq = max(0, start_seq - range[0] + 1)
end_seq = min(end_seq - range[0] + 1, range[1] - range[0])
type = int(line[39:41])
chain_start = line[19]
chain_end = line[31]
if chain_start.upper() != "A" or chain_end.upper() != "A": continue
self.secondary_structures.append(Helix(start_seq, end_seq, type))
elif line.find('SHEET') == 0:
sense = int(line[38:40])
start_seq = int(line[23:26]) - 1
end_seq = int(line[34:37]) - 1
if range:
if end_seq < range[0] - 1 or start_seq > range[1] - 1: continue
start_seq = max(0, start_seq - range[0] + 1)
end_seq = min(end_seq - range[0] + 1, range[1] - range[0])
chain_start = line[21]
chain_end = line[32]
if chain_start.upper() != "A" or chain_end.upper() != "A": continue
if not current_sheet or sense == 0:
if current_sheet:
self.secondary_structures.append(current_sheet)
current_sheet = Sheet(start_seq, end_seq, sense)
else:
current_sheet.add_strand(start_seq, end_seq, sense)
if current_sheet:
self.secondary_structures.append(current_sheet)