Commit 8b97cb4 — peevo committed Mar 20, 2017 (1 parent: 9ef2b8d).
Showing 19 changed files with 26,120 additions and 0 deletions.
@@ -1,2 +1,8 @@
# biovec
Protein classification over the sum of protein n-gram vector representations.
An updated implementation of https://github.com/ehsanasgari/Deep-Proteomics,
with code to reproduce the results of http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0141287

# install deps:
pip install -r requirements.txt
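For context, a minimal sketch (not part of this commit) of the embedding step the README builds on: in the referenced paper's ProtVec approach, a protein sequence is split into overlapping 3-grams and represented as the sum of their learned vectors. Here `ngram_vectors` is a stand-in for a trained word2vec model (e.g. the Deep-Proteomics vectors); the function name and dimensions are assumptions for illustration.

import numpy as np

def protein_vector(sequence, ngram_vectors, n=3, dim=100):
    # Sum the vectors of all overlapping n-grams in the sequence;
    # n-grams missing from the model are skipped.
    total = np.zeros(dim, dtype=np.float32)
    for i in range(len(sequence) - n + 1):
        ngram = sequence[i:i + n]
        if ngram in ngram_vectors:
            total += ngram_vectors[ngram]
    return total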
@@ -0,0 +1,108 @@
import argparse
import os
from collections import Counter

# Must be set before keras is imported
os.environ['KERAS_BACKEND'] = 'theano'

import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split


def get_model(vector_length):
    model = Sequential([
        Dense(vector_length, activation='relu', name='hidden_1', input_dim=vector_length),
        Dropout(0.25),
        Dense(int(vector_length / 2), activation='relu', name='hidden_2'),
        Dropout(0.25),
        Dense(int(vector_length / 2), activation='relu', name='outer'),
        Dropout(0.25),
        Dense(1, activation='sigmoid', name='protein_family'),
    ])
    model.compile(
        optimizer='adam',
        loss={'protein_family': 'binary_crossentropy'},
        metrics=['accuracy'],
    )
    return model


def fit_model(file_path, epochs):
    targets = []
    vectors = []
    with open(file_path) as infile:
        for line in infile:
            # Each line: "<0/1>\t<space-separated floats>"
            is_in_family, vector_string = line.rstrip().split('\t', 1)
            targets.append(int(is_in_family))
            vectors.append(np.array([float(x) for x in vector_string.split()], dtype=np.float32))

    vectors_array = np.array(vectors)
    targets_array = np.array(targets).reshape(-1, 1)
    vectors, targets = None, None  # free memory
    model = get_model(vector_length=vectors_array.shape[1])

    vectors_train, vectors_test, targets_train, targets_test = train_test_split(
        vectors_array, targets_array, test_size=0.1)
    vectors_array, targets_array = None, None

    model.fit(vectors_train,
              targets_train,
              epochs=epochs,
              validation_data=(vectors_test, targets_test),
              batch_size=256,
              callbacks=[EarlyStopping(patience=3)])
    score = model.evaluate(vectors_test, targets_test, verbose=1)
    print('Test loss: {}, Test accuracy: {}'.format(score[0], score[1]))
    test_predictions = model.predict(vectors_test, verbose=2, batch_size=256)
    prediction_counter = Counter()

    for index, predicted_float in enumerate(test_predictions):
        predicted = 0 if predicted_float < 0.5 else 1
        correct = targets_test[index][0]
        prediction_counter[predicted == correct] += 1

    sample_file_base_name = os.path.basename(file_path)
    with open('pfam_nn_results/{}.txt'.format(sample_file_base_name), 'w') as outfile:
        result = '{}: t_rate(accuracy)={}'.format(
            sample_file_base_name,
            float(prediction_counter[True]) / sum(prediction_counter.values()))
        print(result)
        outfile.write('{}\n'.format(result))

    with open('pfam_nn_results/{}_model.json'.format(sample_file_base_name), 'w') as json_file:
        json_file.write(model.to_json())


def main():
    parser = argparse.ArgumentParser('Trains NN model over protein vectors')
    parser.add_argument('--sample', type=str, default='training_sample_100.txt')
    parser.add_argument('--epochs', type=int, default=30)
    args = parser.parse_args()

    if os.path.isdir(args.sample):
        # Train one model per sample file in the directory
        sample_directory = args.sample
        for file_name in os.listdir(sample_directory):
            file_path = os.path.join(sample_directory, file_name)
            fit_model(file_path, args.epochs)
    else:
        fit_model(args.sample, args.epochs)


if __name__ == '__main__':
    main()
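The script reads one example per line: a 0/1 family label, a tab, then the space-separated components of the protein vector. A toy generator for a smoke test (illustration only, not part of the commit; the file name matches the script's --sample default):

import numpy as np

# Two linearly shifted Gaussian clouds, so the classifier has signal to find.
with open('training_sample_100.txt', 'w') as outfile:
    for _ in range(1000):
        label = np.random.randint(0, 2)
        vector = np.random.normal(loc=label, scale=1.0, size=100)
        outfile.write('{}\t{}\n'.format(label, ' '.join(str(x) for x in vector)))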
@@ -0,0 +1,113 @@
import argparse
import os
import pickle
from collections import Counter

import numpy as np

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score


def get_sample(file_path):
    targets = []
    vectors = []
    with open(file_path) as infile:
        for line in infile:
            # Each line: "<0/1>\t<space-separated floats>"
            is_in_family, vector_string = line.rstrip().split('\t', 1)
            targets.append(int(is_in_family))
            vectors.append(np.array([float(x) for x in vector_string.split()], dtype=np.float32))

    vectors_array = np.array(vectors)
    targets_array = np.array(targets)
    vectors, targets = None, None  # free memory

    vectors_train, vectors_test, targets_train, targets_test = train_test_split(
        vectors_array, targets_array, test_size=0.1)
    return vectors_train, vectors_test, targets_train, targets_test


def save_model_metrics(model_params_string, model, vectors_test, targets_test):
    with open('svm_results/{}_results.txt'.format(model_params_string), 'w') as outfile:
        predicted_targets = model.predict(vectors_test)
        outfile.write('score: {}\n'.format(model.score(vectors_test, targets_test)))
        outfile.write('f1_macro: {}\n'.format(f1_score(targets_test, predicted_targets, average='macro')))
        outfile.write('f1_micro: {}\n'.format(f1_score(targets_test, predicted_targets, average='micro')))
        outfile.write('f1_weighted: {}\n'.format(f1_score(targets_test, predicted_targets, average='weighted')))
        outfile.write('accuracy_score: {}\n'.format(accuracy_score(targets_test, predicted_targets)))

        prediction_counter = Counter()
        for index, predicted_target in enumerate(predicted_targets):
            correct_target = targets_test[index]
            if predicted_target and not correct_target:
                prediction_counter['fp'] += 1
            elif predicted_target and correct_target:
                prediction_counter['tp'] += 1
            elif not predicted_target and correct_target:
                prediction_counter['fn'] += 1
            else:
                prediction_counter['tn'] += 1

            outfile.write('predicted={} correct={} is_correct={}\n'.format(
                predicted_target, correct_target, predicted_target == correct_target))

        # tp_rate is sensitivity (recall on positives); tn_rate is specificity
        tp_rate = float(prediction_counter['tp']) / (prediction_counter['tp'] + prediction_counter['fn'])
        tn_rate = float(prediction_counter['tn']) / (prediction_counter['tn'] + prediction_counter['fp'])
        t_rate = float(prediction_counter['tp'] + prediction_counter['tn']) / sum(prediction_counter.values())
        result = '{}: tp_rate(sensitivity) = {} tn_rate(specificity) = {} t_rate(accuracy) = {}'.format(
            model_params_string, tp_rate, tn_rate, t_rate)
        outfile.write('{}\n'.format(result))
    return result


def fit_model(file_path, model_type):
    vectors_train, vectors_test, targets_train, targets_test = get_sample(file_path)
    if model_type == 'svc_linear':
        model = svm.SVC(kernel='linear')
    elif model_type == 'svc_rbf':
        model = svm.SVC(kernel='rbf')
    elif model_type == 'linear_svc':
        model = svm.LinearSVC()
    else:
        raise ValueError('Unknown model type: {}'.format(model_type))

    model.fit(vectors_train, targets_train)
    model_params_string = '{}_{}'.format(model_type, os.path.basename(file_path))
    with open('pfam_svm_results/{}.pkl'.format(model_params_string), 'wb') as outfile:
        pickle.dump(model, outfile)

    return save_model_metrics(model_params_string, model, vectors_test, targets_test)


#def get_model(file_path):
#    with open(file_path, 'rb') as infile:
#        return pickle.load(infile)


def main():
    parser = argparse.ArgumentParser('Trains SVM model over protein vectors')
    parser.add_argument('--sample', type=str, default='training_sample_100.txt')
    parser.add_argument('--type', type=str, default='svc_linear')
    args = parser.parse_args()

    if os.path.isdir(args.sample):
        sample_directory = args.sample
        for file_name in os.listdir(sample_directory):
            file_path = os.path.join(sample_directory, file_name)
            print(fit_model(file_path, args.type))
    else:
        fit_model(args.sample, args.type)


if __name__ == '__main__':
    main()
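The commented-out get_model above hints at how a saved model was meant to be restored; a minimal sketch, assuming a pickle written by fit_model (the example path below is hypothetical):

import pickle

def load_model(file_path):
    # Restore an SVM previously saved with pickle.dump in fit_model
    with open(file_path, 'rb') as infile:
        return pickle.load(infile)

# model = load_model('pfam_svm_results/svc_linear_training_sample_100.txt.pkl')
# predictions = model.predict(vectors_test)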
@@ -0,0 +1,33 @@
import sys
from collections import Counter

import numpy as np


def get_family_distribution(file_path):
    family_distribution = Counter()
    with open(file_path) as infile:
        for line in infile:
            uniprot_id, family, vector_string = line.rstrip().split('\t', 2)
            family_distribution[family] += 1
    return family_distribution


if __name__ == '__main__':
    super_sample_file_path = sys.argv[1]
    family_distribution = get_family_distribution(super_sample_file_path)
    # Build one balanced binary sample per family for the 10 largest families
    for target_family, target_family_proteins_count in family_distribution.most_common(10):
        negative_samples = []
        binary_sample_file_path = 'binary_samples_pfam_100/{}_binary_sample.txt'.format(target_family)
        with open(binary_sample_file_path, 'w') as outfile:
            with open(super_sample_file_path) as infile:
                for line in infile:
                    uniprot_id, family, vector_string = line.rstrip().split('\t', 2)
                    if family == target_family:
                        outfile.write('1\t{}\n'.format(vector_string))
                    else:
                        negative_samples.append(vector_string)

            # Draw as many negatives as there are positives; note that
            # np.random.choice samples with replacement by default
            for negative_sample in np.random.choice(negative_samples, target_family_proteins_count):
                outfile.write('0\t{}\n'.format(negative_sample))
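The sampler expects a "super sample" file with three tab-separated fields per line (UniProt id, Pfam family, space-separated vector) and writes into an existing binary_samples_pfam_100/ directory. A toy super-sample writer for testing (illustration only; the file name, family ids, and dimensions are made up):

import numpy as np

families = ['PF00001', 'PF00002', 'PF00069']
with open('super_sample.txt', 'w') as outfile:
    for i in range(300):
        family = families[i % len(families)]
        vector = ' '.join(str(x) for x in np.random.rand(100))
        outfile.write('P{:05d}\t{}\t{}\n'.format(i, family, vector))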
Binary file not shown.
@@ -0,0 +1,95 @@
import sys
import os
from collections import Counter

# Must be set before keras is imported
os.environ['KERAS_BACKEND'] = 'theano'

import numpy as np

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


def get_model(vector_length, number_of_families):
    model = Sequential([
        Dense(vector_length, activation='relu', name='hidden_1', input_dim=vector_length),
        Dropout(0.25),
        Dense(int(vector_length / 2), activation='relu', name='hidden_2'),
        Dropout(0.25),
        Dense(int(vector_length / 2), activation='relu', name='outer'),
        Dropout(0.25),
        Dense(number_of_families, activation='softmax', name='protein_family'),
    ])
    model.compile(
        optimizer='adam',
        loss={'protein_family': 'categorical_crossentropy'},
        metrics=['accuracy'],
    )
    return model


def main():
    families = []
    vectors = []
    with open(sys.argv[1]) as infile:
        for line in infile:
            # Each line: "<uniprot_id>\t<family>\t<space-separated floats>"
            uniprot_id, family, vector_string = line.rstrip().split('\t', 2)
            families.append(family)
            vectors.append(np.array([float(x) for x in vector_string.split()], dtype=np.float32))

    vectors_array = np.array(vectors)
    vectors = None  # free memory

    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(families)
    number_of_classes = len(label_encoder.classes_)
    model = get_model(vector_length=vectors_array.shape[1], number_of_families=number_of_classes)

    families_encoded = np.array(label_encoder.transform(families), dtype=np.int32)
    families = None
    families_encoded = families_encoded.reshape((-1, 1))
    families_binary_labels = keras.utils.to_categorical(families_encoded, num_classes=number_of_classes)
    families_encoded = None

    vectors_train, vectors_test, families_train, families_test = train_test_split(
        vectors_array, families_binary_labels, test_size=0.1)
    vectors_array, families_binary_labels = None, None

    model.fit(vectors_train,
              families_train,
              epochs=25,
              validation_data=(vectors_test, families_test),
              batch_size=256,
              callbacks=[EarlyStopping(patience=3)])
    score = model.evaluate(vectors_test, families_test, verbose=1)

    test_predictions = model.predict(vectors_test, verbose=2, batch_size=4096)
    prediction_counter = Counter()
    for index, test_prediction in enumerate(test_predictions):
        predicted_family = label_encoder.inverse_transform([np.argmax(test_prediction)])[0]
        actual_family = label_encoder.inverse_transform([np.argmax(families_test[index])])[0]
        prediction_counter[actual_family == predicted_family] += 1
        print('{}\t{}\t{}'.format(actual_family, predicted_family, actual_family == predicted_family))
    print(prediction_counter, float(prediction_counter[True]) / sum(prediction_counter.values()))
    print('Test loss: {}, Test accuracy: {}'.format(score[0], score[1]))

    with open('nn_256_model.json', 'w') as json_file:
        json_file.write(model.to_json())


if __name__ == '__main__':
    main()
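Note that model.to_json() stores only the architecture, not the trained weights, so reloading the saved file requires recompiling and either retraining or a separate save_weights/load_weights pair. A minimal reload sketch, assuming the nn_256_model.json written above:

from keras.models import model_from_json

with open('nn_256_model.json') as json_file:
    model = model_from_json(json_file.read())
# Architecture only: compile again (and load weights, if they were saved)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])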