#!/usr/bin/env python3
__author__ = "morganlnance"
###########################
#### PROGRAM ARGUMENTS ####
###########################
import argparse
parser = argparse.ArgumentParser(description="Use Python to clean out a list of PDB files.")
parser.add_argument("pdb_name_list", help="a file of the pdbs to be analyzed")
parser.add_argument("--ignore_glycosylated_proteins", "-i", action="store_true", help="do you want to skip PDBs that have a covalently attached HETATM group? This is most likely a glycan")
parser.add_argument("--heavy_atoms", "-ha", type=int, default=10, help="how many heavy atoms does a HETATM residue need to be considered a ligand? default = 10")
parser.add_argument("--download_pdbs", "-d", action="store_true", help="do you need to download the pdbs from the database?")
parser.add_argument("--keep_cifs", "-kc", action="store_true", help="do you want to keep the cif files you download?")
parser.add_argument("--keep_pdbs", "-kp", action="store_true", help="do you want to keep the pdbs you download?")
parser.add_argument("--keep_clean_pdbs", "-kcp", action="store_true", help="do you want to keep the cleaned-up version of the pdbs you are working with?")
input_args = parser.parse_args()
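# A minimal example invocation, assuming a (hypothetical) list file "pdb_list.txt"
# with one PDB code per line:
#   python run_clean_out_pdbs.py pdb_list.txt --download_pdbs --keep_clean_pdbs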
#################
#### IMPORTS ####
#################
import os
import sys

# colorama is optional; if it is not installed, fall back to plain-text printing
try:
    from colorama import Fore, Style
    HAVE_COLORAMA = True
except ImportError:
    HAVE_COLORAMA = False

import clean_out_pdbs
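# clean_out_pdbs is a companion module from this repository. Based only on the
# call sites in this script (not verified against the module itself), its Clean
# class is expected to provide:
#   download_pdb( pdb )                       -> path to the fetched PDB file
#   instantiate_pdb_info_holders()            -> resets the per-PDB data holders
#   split_pdb_file( pdb, ignore_glyco )       -> truthy on success
#   get_ligand_residues( heavy_atoms, pdb )   -> truthy if an acceptable ligand exists
#   write_clean_pdb_file( pdb ), write_pro_lig_pickle( pdb )
# plus the list attributes uniq_lig_three_letter_codes_kept,
# uniq_lig_three_letter_codes_skipped_size, and uniq_lig_three_letter_codes_skipped_link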
######################
#### MAIN PROGRAM ####
######################
def go( pdb_name_list, ignore_glycosylated_proteins, heavy_atoms, download_pdbs, keep_cifs, keep_pdbs, keep_clean_pdbs ):
    # record any filenames already in the 'pdbs' directory so as not to delete them later
    # (this assumes a 'pdbs' subdirectory exists in the current directory)
    keep_these_files = os.listdir( "pdbs" )

    # relevant variable instantiations
    all_clean_pdb_names = []

    # load up the PDB name list from the input argument
    pdb_names_from_list = []
    with open( pdb_name_list, 'r' ) as list_fh:
        pdb_file_lines = list_fh.readlines()
    for name in pdb_file_lines:
        name = name.rstrip( '\n' )
        # add the .pdb extension if it is not already there - unnecessary if downloading the PDB
        if not download_pdbs:
            if not name.endswith( ".pdb" ):
                name = name + ".pdb"
        pdb_names_from_list.append( name )
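    # NOTE: each entry is treated as a four-letter PDB code (see pdb[ 0:4 ] below),
    # optionally with a .pdb extension, e.g. (hypothetical codes) "1abc" or "2xyz.pdb"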
    # record the current files so as not to delete them later
    working_dir = os.getcwd() + '/'
    cur_files_working = os.listdir( working_dir )

    # holds the names of PDBs that could not be downloaded
    unable_to_download_pdb_names = []

    # instantiate an instance of the Clean object
    # this also instantiates the data holders for PDBs that do not pass the qualifications
    clean = clean_out_pdbs.Clean()
    # for each PDB in the list, run the cleaner
    for pdb in pdb_names_from_list:
        if pdb != '':
            # print in color if the user has the colorama library
            text = "Working on %s" % pdb
            if HAVE_COLORAMA:
                print( Fore.RED + text + Style.RESET_ALL )
            else:
                print( text )

            # store the four-letter PDB code
            pdb_name = pdb[ 0:4 ]

            # check to see if this PDB has already been looked at in this round
            if pdb_name not in all_clean_pdb_names:
                # also check lower case
                if pdb_name.lower() not in all_clean_pdb_names:
                    # download the pdb if needed
                    if download_pdbs:
                        pdb = clean.download_pdb( pdb )

                    # check to see if the PDB path exists; otherwise it could not be downloaded
                    if not os.path.isfile( pdb ):
                        print( "## Skipping", pdb.split( '/' )[-1][0:4], "because it doesn't seem to exist" )
                        unable_to_download_pdb_names.append( pdb_name )

                    # otherwise the PDB exists and the program continues
                    else:
                        # instantiate data holders for PDB information
                        clean.instantiate_pdb_info_holders()

                        # split ATOM, HETATM, and LINK lines
                        response = clean.split_pdb_file( pdb, ignore_glycosylated_proteins )

                        # if splitting the pdb was successful (there is a ligand, no amino acid
                        # as the ligand, no metal as the ligand, no UNK residues, etc.)
                        if response:
                            # get ligand residue numbers from the pose
                            response = clean.get_ligand_residues( heavy_atoms, pdb )

                            # if it got this far, then the PDB has an acceptable ligand
                            if response:
                                # write out a clean PDB file
                                if keep_clean_pdbs:
                                    clean.write_clean_pdb_file( pdb )

                                # write out the protein and ligand dictionary pickle files
                                clean.write_pro_lig_pickle( pdb )

                                # add the PDB name to the list of clean PDBs
                                all_clean_pdb_names.append( pdb.split( '/' )[-1][0:4] )
    # get the file names in the 'pdbs' directory
    os.chdir( working_dir + 'pdbs' )
    pdb_files = os.listdir( os.getcwd() )

    # delete leftover .tar.gz files from incomplete downloads
    for f in pdb_files:
        if f.endswith( ".gz" ):
            os.remove( f )

    # delete unwanted .cif and .pdb files
    if not keep_cifs:
        for f in pdb_files:
            if f not in keep_these_files:
                if f.endswith( ".cif" ):
                    os.remove( f )
    if not keep_pdbs:
        for f in pdb_files:
            if f not in keep_these_files:
                # a four-letter code plus ".pdb" is 8 characters, i.e. a downloaded file
                if f.endswith( ".pdb" ) and len( f ) == 8:
                    os.remove( f )

    # return to the working directory
    os.chdir( working_dir )

    # write out all_clean_pdb_names to a file, as those were the PDBs actually analyzed
    with open( "clean_PSMDB_90_non_red_pro_70_non_red_lig_13_ha_cutoff_list", 'w' ) as fh:
        for line in all_clean_pdb_names:
            fh.write( line + '\n' )
    ###########################
    ##### DATA COLLECTION #####
    ###########################
    # write out the unique three-letter codes of the ligands that were kept
    filename = "ligand_residues_that_were_kept.txt"
    with open( filename, 'w' ) as fh:
        for line in clean.uniq_lig_three_letter_codes_kept:
            fh.write( line + '\n' )

    # write out the unique three-letter codes of the ligands that were skipped due to size or linkage
    filename = "ligand_residues_that_were_skipped.txt"
    with open( filename, 'w' ) as fh:
        fh.write( "## ligand residues that were smaller than %s heavy atoms\n" % heavy_atoms )
        for line in clean.uniq_lig_three_letter_codes_skipped_size:
            fh.write( line + '\n' )
        fh.write( '\n' )
        fh.write( "## ligand residues that were skipped because they were linked to the protein\n" )
        for line in clean.uniq_lig_three_letter_codes_skipped_link:
            fh.write( line + '\n' )

    print( "\nDone! :D\n" )
######################
#### RUNS PROGRAM ####
######################
if __name__ == "__main__":
    go( input_args.pdb_name_list, input_args.ignore_glycosylated_proteins, input_args.heavy_atoms, input_args.download_pdbs, input_args.keep_cifs, input_args.keep_pdbs, input_args.keep_clean_pdbs )