-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_out_pdbs.py
executable file
·705 lines (547 loc) · 32.3 KB
/
clean_out_pdbs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
#!/usr/bin/python
__author__ = "morganlnance"
#####
# path to pymol executable
# must change for each machine!
#pymol_exe_dir = "/Applications/MacPyMOL.app/Contents/MacOS/MacPyMOL"
#####
###########################
#### PROGRAM ARGUMENTS ####
###########################
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Use Python to count contacts.")
parser.add_argument("pdb_name_list", help="a file of the pdbs to be analyzed")
parser.add_argument("--ignore_glycosylated_proteins", "-i", action="store_true", help="do you want to skip PDBs that have a covalently attached HETATM group? This is most likely a glycan")
parser.add_argument("--heavy_atoms", "-ha", type=int, default=10, help="how many heavy atoms does a HETATM residue need to be considered a ligand? default = 10")
parser.add_argument("--download_pdbs", "-d", action="store_true", help="do you need to download the pdbs from the database?")
parser.add_argument("--keep_cifs", action="store_true", help="do you want to keep the cif files you download?")
parser.add_argument("--keep_pdbs", action="store_true", help="do you want to keep the pdbs you download?")
parser.add_argument("--keep_clean_pdbs", action="store_true", help="do you want to keep the cleaned-up version of the pdbs you are working with?")
input_args = parser.parse_args()
#################
#### IMPORTS ####
#################
print "Loading '%s' dependencies...\n" %__name__
import sys
import os
import shutil
import pickle
try:
from colorama import Fore, Style
except:
pass
try:
from Bio.PDB import *
except ImportError:
print "Trouble with imports - do you have Bio.PDB? Exiting"
sys.exit()
try:
import networkx as nx
except ImportError:
print "Trouble with imports - do you have networkx? Exiting"
sys.exit()
# utility functions
sys.path.append( "utility" )
from download_files import download_pdb_file, download_cif_file
from line_definitions import *
from chemical_data import *
###########################
#### PDB CLEANING CODE ####
###########################
class Clean:
def __init__( self ):
self.working_dir = os.getcwd()
# make on/off switch for skipping covalently bound ligand residues
self.glycosylated_proteins = []
# hold the name of PDBs that contain undesirables like multiple models or an unknown HETATM
self.unknown_res_pdb_names = []
self.multiple_models_pdb_names = []
self.deuterium_pdb_names = []
# hold unique ligand residue names for post-analysis
self.uniq_lig_three_letter_codes_kept = []
self.uniq_lig_three_letter_codes_skipped_size = []
self.uniq_lig_three_letter_codes_skipped_link = []
def instantiate_pdb_info_holders( self ):
# instantiate lists that will hold relevant PDB data
self.protein_lines = []
self.hetatm_lines = []
self.ter_lines = []
self.link_records = []
self.covalently_bound_lig_residues = []
self.protein = {}
self.ligand = {}
# ligand data holders for print PDB information to screen
self.num_ligand_residues = 0
self.num_ligand_atoms = 0
return True
def download_pdb( self, pdb_name ):
"""
Uses a utility function to download a PDB from the internet based on the four letter accession code
:param pdb_name: str( four-letter PDB code )
:return: str( full path name to the PDB file )
"""
# relevant paths
cur_dir = os.getcwd() + '/'
pdb_dir = cur_dir + 'pdbs/'
# check to make sure the 'pdbs' directory exists one down from the current directory
if not os.path.isdir( pdb_dir ):
os.mkdir( pdb_dir )
# if the PDB exists in the 'pdbs' directory, return the path to it
if os.path.isfile( pdb_dir + pdb_name ):
return pdb_dir + pdb_name
# otherwise download the PDB
else:
# download PDB and get its filename
try:
pdb = download_pdb_file( pdb_name[:4], pdb_dir )
except:
pass
# return the full path name to the PDB file
return pdb_dir + pdb
return True
def download_cif( self, pdb_name ):
"""
Uses a utility function to download the cif file of a PDB from the internet based on the four letter accession code
:param pdb_name: str( four-letter PDB code )
:return: str( full path name to the .cif file )
"""
# relevant paths
cur_dir = os.getcwd() + '/'
pdb_dir = cur_dir + 'pdbs/'
# check to make sure the 'pdbs' directory exists one down from the current directory
if not os.path.isdir( pdb_dir ):
os.mkdir( pdb_dir )
# if the PDB exists in the 'pdbs' directory, return the path to it
if os.path.isfile( pdb_dir + pdb_name ):
return pdb_dir + pdb_name
# otherwise download the PDB
else:
# download PDB and get its filename
try:
pdb = download_cif_file( pdb_name[:4], pdb_dir )
except:
pass
# return the full path name to the PDB file
return pdb_dir + pdb
return True
def pymol_clean( self, pdb_filename ):
"""
Uses PyMOL to "clean" a PDB by removing waters and hydrogens.
Opens up PyMOL via a command-line argument. Closes PyMOL upon completion
:param pdb_filename:
"""
# get the right names of the PDB files
split_pdb_name = pdb_filename.split( '/' )[-1]
pdb_name = split_pdb_name[:-4]
# use pymol to remove water residues and remove hydrogens - ie. clean the file
pymol_command = "%s -cqd 'load %s; remove resn hoh; remove resn dod; remove hydrogens %s; save %s'" %( pymol_exe_dir, pdb_filename, pdb_name, pdb_name )
# run the PyMol command through the terminal
os.popen( pymol_command )
return True
def get_uniq_connection_names_from_LINK_records( self, LINK_records ):
unique_connection_names = []
for link_line in LINK_records:
res1_unique_name = link_line.res1_name + '_' + link_line.res1_chain + '_' + str( link_line.res1_seq )
res2_unique_name = link_line.res2_name + '_' + link_line.res2_chain + '_' + str( link_line.res2_seq )
uniq_connection_name = res1_unique_name + '+' + res2_unique_name
if uniq_connection_name not in unique_connection_names:
unique_connection_names.append( uniq_connection_name )
return unique_connection_names
def graph_out_residue_connections( self, unique_partner_names, unique_protein_names, unique_hetatm_names ):
# make a tree using networkx
graph = nx.Graph()
# add nodes 2 at a time from unique_connect_partner_names
for partners in unique_partner_names:
# get the unique names for partner1 and partner2
ptnr1 = partners.split( '+' )[0]
ptnr2 = partners.split( '+' )[1]
# get the residue name, chain, and sequence id from the unique names
ptnr1_resname = ptnr1.split( '_' )[0]
ptnr1_reschain = ptnr1.split( '_' )[1]
ptnr1_resseq = ptnr1.split( '_' )[2]
ptnr2_resname = ptnr2.split( '_' )[0]
ptnr2_reschain = ptnr2.split( '_' )[1]
ptnr2_resseq = ptnr2.split( '_' )[2]
# add each node according to whether its a HETATM or not
# if residue not already a node
if not ptnr1 in graph.nodes():
# add residue as node based on whether it is a protein or hetatm
if ptnr1 in unique_protein_names:
graph.add_node( ptnr1, { "Chain" : ptnr1_reschain, "Seq" : ptnr1_resseq, "HETATM" : False } )
else:
graph.add_node( ptnr1, { "Chain" : ptnr1_reschain, "Seq" : ptnr1_resseq, "HETATM" : True } )
# if residue not already a node
if not ptnr2 in graph.nodes():
# add residue as node based on whether it is a protein or hetatm
if ptnr2 in unique_protein_names:
graph.add_node( ptnr2, { "Chain" : ptnr2_reschain, "Seq" : ptnr2_resseq, "HETATM" : False } )
else:
graph.add_node( ptnr2, { "Chain" : ptnr2_reschain, "Seq" : ptnr2_resseq, "HETATM" : True } )
# add a bond (an edge) between the two residues
graph.add_edge( ptnr1, ptnr2 )
# iter through each subgraph (the group of nodes that have edges connecting them together)
C = nx.connected_component_subgraphs( graph )
# for holding unique residue names
remove_these_ligs = []
# for each subgraph
for g in C:
# remove = False. Default is we would keep these ligand residues
remove = False
for n1, n2 in g.edges_iter():
# if one node is a HETATM and one is not
if ( g.node[n1]["HETATM"] == True and g.node[n2]["HETATM"] == False ) or ( g.node[n1]["HETATM"] == False and g.node[n2]["HETATM"] == True ):
# remove this subgraph as there is a covalent connection bewteen protein and ligand
remove = True
# if you need to remove the ligs from this subgraph
if remove:
for mynode in g.nodes_iter():
# if this node is a ligand
if g.node[ mynode ][ "HETATM" ]:
# if you haven't already added it
if mynode not in remove_these_ligs:
remove_these_ligs.append( mynode )
# return the unique names of the residues that are covalently linked to the protein
return remove_these_ligs
def determine_covalently_bound_ligands( self, pdb_filename, unique_protein_names, unique_hetatm_names, link_records ):
# download the mmcif file
cif_filename = self.download_cif( pdb_filename[:4] )
# get _struct_conn lines to determine HETATM connections
_struct_conn = cif_struct_conn_lines( cif_filename )
# check to see if check_if_has_mmcif_dict() by using return value
response = _struct_conn.check_if_has_mmcif_dict()
# if this PDB has an mmcif file
if response is True:
# unique name = res1name_res1chain_res1num+res2name_res2chain_res2num
unique_partner_names = _struct_conn.get_uniq_connection_names()
print unique_partner_names
# get list of ligand residues to remove
remove_these_ligs = self.graph_out_residue_connections( unique_partner_names, unique_protein_names, unique_hetatm_names )
# otherwise this PDB didn't have an mmcif file, use LINK records instead
else:
# unique name = res1name_res1chain_res1num+res2name_res2chain_res2num
unique_partner_names = self.get_uniq_connection_names_from_LINK_records( link_records )
# get list of ligand residues to remove
remove_these_ligs = self.graph_out_residue_connections( unique_partner_names, unique_protein_names, unique_hetatm_names )
return remove_these_ligs
def split_pdb_file( self, pdb_filename, ignore_glycosylated_proteins ):
"""
Splits up the PDB file into ATOM and HETATM lines.
Doesn't keep HETATM lines that are 1) lone metal atoms, 2) amino acids, or 3) unknown
Is able to handle modified amino acid residues - potential bug is if the modified residue is a ligand
Will write out an "error" message if the PDB 1) doesn't have a ligand, 2) has unknown residues, or 3) has multiple models in the file
:param pdb_filename: str( /path/to/pdb/file )
:return: 1 if PDB file passes all specifications; 0 otherwise
"""
# glycosylated protein boolean tracker
glycosylated_protein = False
# get the PDB name from the end of the full path given in pdb_filename
split_pdb_name = pdb_filename.split( '/' )[-1]
pdb_name = split_pdb_name[:-4].lower()
# instantiate lists that will hold relevant PDB data that is NOT wanted
AA_lig = []
nuc_acid_lig = []
water = []
models = []
deuterium = []
unknown = []
# list of MODRES names to treat as ATOM lines
modres_protein_res_names = []
# open up the PDB file
try:
with open( pdb_filename, 'r' ) as pdb_fh:
pdb = pdb_fh.readlines()
except IOError:
text = pdb_filename + " doesn't exist in this directory. Did you mean to download it? Exiting."
print(Fore.BLUE + text + Style.RESET_ALL)
sys.exit()
# parse through the PDB file
for line in pdb:
# if there is a residue that has a modification, it is likely classified as a HETATM, so treat it like an ATOM line
if line[0:6] == "MODRES":
modres_line = MODRES_line( line )
# unique residue name
modres_name = modres_line.res_name
modres_chain = modres_line.res_chain
modres_num = str( modres_line.res_num )
uniq_modres_name = modres_name + '_' + modres_chain + '_' + modres_num
# the standard res name if this residue wasn't modified
std_res_name = modres_line.std_res_name
# if the standard residue name is a standard amino acid, add its modified res name to a list to be added later
if std_res_name in AA_list:
modres_protein_res_names.append( uniq_modres_name )
# if there are multiple models in this PDB file, add the line so this PDB will be skipped
if line[0:5] == "MODEL":
models.append( line )
self.multiple_models_pdb_names.append( pdb_name )
break
if line[0:4] == "LINK":
# store LINK records used for determining any covalently bound ligands
link_line = LINK_line( line )
self.link_records.append( link_line )
if line[0:3] == "TER":
# store TER lines to write to a clean PDB file
ter_line = PDB_line( line )
self.ter_lines.append( ter_line )
if line[0:4] == "ATOM":
# store the protein lines
pdb_line = PDB_line( line )
# unknown amino acid - skip the PDB
if pdb_line.res_name == "UNK" or pdb_line.res_name == "UNL":
unknown.append( line )
self.unknown_res_pdb_names.append( pdb_name )
break
# skip PDBs that have deuterium as an element
if pdb_line.element == 'D':
deuterium.append( line )
self.deuterium_pdb_names.append( pdb_name )
break
# skip hydrogen atoms
if pdb_line.element != 'H':
# collect the residue's unique name
pro_res_name = pdb_line.res_name
pro_res_chain = pdb_line.res_chain
pro_res_num = str( pdb_line.res_num )
uniq_pro_name = pro_res_name + '_' + pro_res_chain + '_' + pro_res_num
# check occupancy level
if pdb_line.occupancy != 1.00:
# keep those without an identifier or with the code 'A'
if pdb_line.alt_loc == '' or pdb_line.alt_loc == 'A':
# instantiate a dictionary key for this specific protein residue using its unique name
# will be filled with the non-hydrogen ATOM lines later
if uniq_pro_name not in self.protein.keys():
self.protein[ uniq_pro_name ] = []
# store the protein lines
self.protein_lines.append( pdb_line )
# otherwise this protein residue is at full occupancy so store the line
else:
# instantiate a dictionary key for this specific protein residue using its unique name
# will be filled with the non-hydrogen ATOM lines later
if uniq_pro_name not in self.protein.keys():
self.protein[ uniq_pro_name ] = []
# store the protein lines
self.protein_lines.append( pdb_line )
if line[0:6] == "HETATM":
# get each ligand residue name to see what it is and store it in the appropriate list
pdb_line = PDB_line( line )
lig_res_name = pdb_line.res_name
# check what each residue is by its name ( water, unknown, or a ligand etc )
if lig_res_name == "HOH" or lig_res_name == "DOD":
water.append( line )
elif lig_res_name in AA_list:
AA_lig.append( line )
elif lig_res_name in nucleic_acid_list:
nuc_acid_lig.append( line )
# unknown ligand - skip the PDB
elif lig_res_name == "UNK" or lig_res_name == "UNL":
unknown.append( line )
self.unknown_res_pdb_names.append( pdb_name )
break
# unknown nucleic acid - skip the PDB
elif lig_res_name == 'N':
unknown.append( line )
self.unknown_res_pdb_names.append( pdb_name )
break
else:
# skip PDBs that have deuterium as an element
if pdb_line.element == 'D':
deuterium.append( line )
self.deuterium_pdb_names.append( pdb_name )
break
# otherwise, keep going
if pdb_line.element != 'H':
# get the ligand residue's unique name
lig_res_chain = pdb_line.res_chain
lig_res_num = str( pdb_line.res_num )
uniq_lig_name = lig_res_name + '_' + lig_res_chain + '_' + lig_res_num
# if this is a modified amino acid residue
if uniq_lig_name in modres_protein_res_names:
# change the HETATM to ATOM in the line
line = pdb_line.line
line = line.replace( "HETATM", "ATOM " )
pdb_line = PDB_line( line )
# now this is a unique protein name, not ligand
uniq_pro_name = uniq_lig_name
# check occupancy level
if pdb_line.occupancy != 1.00:
# keep those without an identifier or with the code 'A'
if pdb_line.alt_loc == '' or pdb_line.alt_loc == 'A':
# instantiate a dictionary key for this specific protein residue using its unique name
# will be filled with the non-hydrogen ATOM lines later
if uniq_lig_name not in self.ligand.keys():
self.ligand[ uniq_lig_name ] = []
# append the altered PDB line to the protein lines
self.hetatm_lines.append( pdb_line )
# otherwise this modified residue is at full occupancy
else:
# instantiate a dictionary key for this specific modified protein residue
# will be filled with the non-hydrogen HETATM lines later
if uniq_lig_name not in self.ligand.keys():
self.ligand[ uniq_lig_name ] = []
# append the altered PDB line to the protein lines
self.hetatm_lines.append( pdb_line )
# otherwise, this is a ligand residue
else:
# check occupancy level
if pdb_line.occupancy != 1.00:
# keep those without an identifier or with the code 'A'
if pdb_line.alt_loc == '' or pdb_line.alt_loc == 'A':
# instantiate a dictionary key for this specific protein residue using its unique name
# will be filled with the non-hydrogen ATOM lines later
if uniq_lig_name not in self.ligand.keys():
self.ligand[ uniq_lig_name ] = []
# store the HETATM lines
self.hetatm_lines.append( pdb_line )
# otherwise residue is at full occupancy so store the line
else:
# instantiate a dictionary key for this specific ligand residue
# will be filled with the non-hydrogen HETATM lines later
if uniq_lig_name not in self.ligand.keys():
self.ligand[ uniq_lig_name ] = []
# store the HETATM lines
self.hetatm_lines.append( pdb_line )
# if there were unknown residues, skip
if len( unknown ) != 0:
print "## Skipping", pdb_name, "because it has unknown residues ##"
return False
# if there were PDBs with more than one model, skip
if len( models ) != 0:
print "## Skipping", pdb_name, "because it has more than one model ##"
return False
# if there were PDBs with more than deuterium, skip
if len( deuterium ) != 0:
print "## Skipping", pdb_name, "because it contains deuterium ##"
return False
# if there is no ligand, skip
if len( self.hetatm_lines ) == 0:
print "## Skipping", pdb_name, "because it does not have a ligand of interest ##"
return False
# move all ATOM and HETATM lines to their appropriate place in the dictionaries based on their unique names
for pro_pdb_line in self.protein_lines:
# get the residue's unique name again
pro_res_name = pro_pdb_line.res_name
pro_res_chain = pro_pdb_line.res_chain
pro_res_num = str( pro_pdb_line.res_num )
uniq_pro_name = pro_res_name + '_' + pro_res_chain + '_' + pro_res_num
# otherwise just a normal residue
self.protein[ uniq_pro_name ].append( pro_pdb_line )
for lig_pdb_line in self.hetatm_lines:
lig_res_name = lig_pdb_line.res_name
lig_res_chain = lig_pdb_line.res_chain
lig_res_num = str( lig_pdb_line.res_num )
uniq_lig_name = lig_res_name + '_' + lig_res_chain + '_' + lig_res_num
# otherwise just a normal residue
self.ligand[ uniq_lig_name ].append( lig_pdb_line )
# if user wants to ignore glycosylated proteins (proteins with a HETATM attached to them)
if ignore_glycosylated_proteins:
# see if there are residues covalently bound to the protein
self.covalently_bound_lig_residues = self.determine_covalently_bound_ligands( pdb_name, self.protein, self.ligand, self.link_records )
# add PDB to list if it had a glycan
if len( self.covalently_bound_lig_residues ) != 0:
# add name of PDB to glycosylated_proteins list (to be dumped later)
self.glycosylated_proteins.append( pdb_name )
# remove covalently bound ligands from the list of unique ligand residue names
for remove_this_lig in self.covalently_bound_lig_residues:
# using an if statement because a covalently bound ligand residue could be an amino acid
if remove_this_lig in self.ligand.keys():
self.ligand.pop( remove_this_lig )
# store the unique three letter code of the ligand residues that were skipped
three_letter_code = remove_this_lig.split( '_' )[0]
if three_letter_code not in self.uniq_lig_three_letter_codes_skipped_link:
self.uniq_lig_three_letter_codes_skipped_link.append( three_letter_code )
# if there is no ligand after removing glycans, skip
if len( self.ligand.keys() ) == 0:
print "## Skipping", pdb_name, "because it did not have a ligand of interest after removing glycans ##"
return False
return True
def get_ligand_residues( self, heavy_atoms, pdb_filename ):
# get the PDB name from the end of the full path given in pdb_filename
split_pdb_name = pdb_filename.split( '/' )[-1]
pdb_name = split_pdb_name[:-4].lower()
# this makes a new dictionary of ligands only if there are more than the specified number of heavy atoms in the lig residue
for uniq_lig_name in self.ligand.keys():
# remove ligands that don't pass the passed heavy atom cutoff
if len( self.ligand[ uniq_lig_name ] ) < heavy_atoms:
# remove the ligand from the self.ligand dictionary
self.ligand.pop( uniq_lig_name )
# so keep unique three letter codes for ligand residues skipped
three_letter_code = uniq_lig_name.split( '_' )[0]
if three_letter_code not in self.uniq_lig_three_letter_codes_skipped_size:
self.uniq_lig_three_letter_codes_skipped_size.append( three_letter_code )
else:
# otherwise the ligand passed the heavy atom cutoff
# keep unique three letter codes for ligand residues kept
three_letter_code = uniq_lig_name.split( '_' )[0]
if three_letter_code not in self.uniq_lig_three_letter_codes_kept:
self.uniq_lig_three_letter_codes_kept.append( three_letter_code )
# get number of ligand residues
self.num_ligand_residues = len( self.ligand.keys() )
# count the number of heavy ligand atoms by looping through the dictionary
# hydrogens were skipped so just count the number of values for each key
for uniq_lig_name in self.ligand.keys():
self.num_ligand_atoms += len( self.ligand[ uniq_lig_name ] )
# stop if there were no ligand residues with the given heavy atom cutoff
if self.num_ligand_residues == 0:
print "## Skipping", pdb_name, "because it had no ligand residues left given the", heavy_atoms, "heavy atom cutoff ##"
return False
# otherwise this ligand passed user requirements
else:
print " ", pdb_name,
print "has", self.num_ligand_residues,
print "ligand residues that each have equal to or more than", heavy_atoms, "heavy atoms",
print "resulting in", self.num_ligand_atoms, "total heavy ligand atoms"
# replace hetatm_lines with only the hetatm lines of the remaining ligand residues with the appropriate number of heavy atoms
# hetatm_lines will be used elsewhere - so it's easier to keep this accurate with only the hetatm lines that will be used
self.hetatm_lines = []
for uniq_lig_name in self.ligand.keys():
for pdb_line in self.ligand[ uniq_lig_name ]:
self.hetatm_lines.append( pdb_line.line )
return True
return True
def write_clean_pdb_file( self, pdb_filename ):
# make a clean PDB file that can be kept at the user's request to see if the program is doing what they think it is
# only need to make the clean file if the user wants to keep it
# create and open a XXXX.clean.pdb file name in the pdb directory
cur_dir = os.getcwd() + '/'
pdb_dir = cur_dir + 'pdbs/'
# get the four letter code in case a file path was given
pdb_name = pdb_filename.split( '/' )[-1][:4].lower()
clean_pdb_filename = pdb_dir + pdb_name + ".clean.pdb"
# put the ATOM, HETATM, and TER lines into a list to be sorted on atom number
atom_num_pdb_lines = {}
for atom_res_key in self.protein.keys():
for atom_line in self.protein[ atom_res_key ]:
atom_num_pdb_lines[ atom_line.atom_num ] = atom_line.line
for hetatm_res_key in self.ligand.keys():
for hetatm_line in self.ligand[ hetatm_res_key ]:
atom_num_pdb_lines[ hetatm_line.atom_num ] = hetatm_line.line
for ter_line in self.ter_lines:
atom_num_pdb_lines[ ter_line.atom_num ] = ter_line.line
# get the atom numbers in order from the dictionary
ordered_atom_numbers = atom_num_pdb_lines.keys()
ordered_atom_numbers.sort()
# order the PDB lines based on their atom number
ordered_pdb_lines = []
for atom_num_key in ordered_atom_numbers:
ordered_pdb_lines.append( atom_num_pdb_lines[ atom_num_key ] )
# write the protein and ligand lines that are kept after this round of splitting
with open( clean_pdb_filename, 'wb' ) as pdb_fh:
pdb_fh.writelines( ordered_pdb_lines )
return True
def write_pro_lig_pickle( self, pdb_filename ):
# get the protein and ligand pickle directory path
cur_dir = os.getcwd() + '/'
pro_lig_pickle_dir = cur_dir + 'pdbs/pro_lig_pickles/'
# create the directory if they do not already exist
if not os.path.isdir( pro_lig_pickle_dir ):
os.mkdir( pro_lig_pickle_dir )
# get the four letter code in case a file path was given
pdb_name = pdb_filename.split( '/' )[-1][:4].lower()
# create the pickle filename from the four letter PDB code
protein_pickle = pro_lig_pickle_dir + pdb_name + "_pro.p"
ligand_pickle = pro_lig_pickle_dir + pdb_name + "_lig.p"
# write the protein and ligand dictionaries to the pickle file
# doing this so that in contact and activesite code there is no need to split PDB file again
pickle.dump( self.protein, open( protein_pickle, "wb" ) )
pickle.dump( self.ligand, open( ligand_pickle, "wb" ) )
return True