Skip to content

Commit

Permalink
Init
Browse files Browse the repository at this point in the history
  • Loading branch information
peevo committed Mar 20, 2017
1 parent 9ef2b8d commit 8b97cb4
Show file tree
Hide file tree
Showing 19 changed files with 26,120 additions and 0 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
# biovec
Protein classification using the sum of protein n-gram vector representations.
An updated version of https://github.com/ehsanasgari/Deep-Proteomics,
with code to reproduce the results of http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0141287

# install deps:
pip install -r requirements.txt

108 changes: 108 additions & 0 deletions family_classification_binary /learn_binary_nn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import argparse
import sys
import os
import gzip
from collections import Counter

os.environ['KERAS_BACKEND'] = 'theano'


import numpy as np
from scipy.spatial.distance import cosine
from Bio import SeqIO

import keras
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score


def get_model(vector_length):
    """Build and compile the binary protein-family classifier.

    The network stacks three ReLU dense layers (``vector_length``,
    ``vector_length / 2`` and ``vector_length / 2`` units), each followed by
    25% dropout, and ends in a single sigmoid unit named ``protein_family``.

    :param vector_length: number of input features per protein vector.
    :return: a ``Sequential`` model compiled with Adam and binary
        cross-entropy, tracking accuracy.
    """
    half_length = int(vector_length / 2)

    model = Sequential()
    model.add(Dense(vector_length, activation='relu', name='hidden_1', input_dim=vector_length))
    model.add(Dropout(0.25))
    model.add(Dense(half_length, activation='relu', name='hidden_2'))
    model.add(Dropout(0.25))
    model.add(Dense(half_length, activation='relu', name='outer'))
    model.add(Dropout(0.25))
    model.add(Dense(1, activation='sigmoid', name='protein_family'))

    # The loss is keyed by the output layer's name.
    model.compile(
        optimizer='adam',
        loss={'protein_family': 'binary_crossentropy'},
        metrics=['accuracy'],
    )
    return model


def _read_binary_sample(file_path):
    """Parse a binary sample file into ``(vectors, targets)`` numpy arrays.

    Each non-blank line holds an integer label, a tab, and a whitespace
    separated list of floats.

    :param file_path: path of the sample file.
    :return: ``(vectors_array, targets_array)``; targets are reshaped to a
        single column.
    :raises ValueError: if the file contains no samples or a malformed line.
    """
    targets = []
    vectors = []
    with open(file_path) as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                continue  # tolerate blank / trailing empty lines
            # Exactly two fields are expected, so split on the first tab only.
            is_in_family, vector_string = line.split('\t', 1)
            targets.append(int(is_in_family))
            # A real list is required: on Python 3 ``map`` returns an iterator
            # that numpy cannot convert to a float array.
            vectors.append(np.array([float(value) for value in vector_string.split()],
                                    dtype=np.float32))
    if not vectors:
        raise ValueError('no samples found in {}'.format(file_path))
    return np.array(vectors), np.array(targets).reshape(-1, 1)


def fit_model(file_path, epochs):
    """Train and evaluate the binary family classifier on one sample file.

    The sample is split 90/10 into train and test parts, a model from
    ``get_model`` is fitted with early stopping, and the accuracy summary and
    the model architecture (JSON) are written under ``pfam_nn_results/``.

    :param file_path: path of the binary sample file.
    :param epochs: maximum number of training epochs.
    :raises ValueError: if the sample file is empty or malformed.
    """
    vectors_array, targets_array = _read_binary_sample(file_path)
    model = get_model(vector_length=vectors_array.shape[1])

    vectors_train, vectors_test, targets_train, targets_test = train_test_split(
        vectors_array, targets_array, test_size=0.1)
    # Drop the references to the unsplit arrays so they can be reclaimed.
    vectors_array, targets_array = None, None

    # NOTE(review): the held-out split is also used as the early-stopping
    # validation set, so the reported test figures are optimistic.
    model.fit(vectors_train,
              targets_train,
              epochs=epochs,
              validation_data=(vectors_test, targets_test),
              batch_size=256,
              callbacks=[EarlyStopping(patience=3)])
    score = model.evaluate(vectors_test, targets_test, verbose=1)
    print('Test loss: {}, Test accuracy: {}'.format(score[0], score[1]))

    test_predictions = model.predict(vectors_test, verbose=2, batch_size=256)
    prediction_counter = Counter()
    for probability, correct in zip(np.ravel(test_predictions), np.ravel(targets_test)):
        predicted = 0 if probability < 0.5 else 1
        prediction_counter[bool(predicted == correct)] += 1

    results_directory = 'pfam_nn_results'
    if not os.path.isdir(results_directory):
        os.makedirs(results_directory)

    sample_file_base_name = os.path.basename(file_path)
    with open('pfam_nn_results/{}.txt'.format(sample_file_base_name), 'w') as outfile:
        accuracy = float(prediction_counter[True]) / sum(prediction_counter.values())
        result = '{}: t_rate(accuracy)={}'.format(sample_file_base_name, accuracy)
        print(result)
        outfile.write('{}\n'.format(result))

    with open('pfam_nn_results/{}_model.json'.format(sample_file_base_name), "w") as json_file:
        json_file.write(model.to_json())


def main():
    """Command-line entry point: train the binary NN on a file or a directory.

    ``--sample`` may name a single sample file or a directory; for a
    directory every regular file inside it is trained on, in sorted order.
    """
    # The text must be passed as ``description``: the first positional
    # argument of ArgumentParser is ``prog`` (the program name in usage).
    parser = argparse.ArgumentParser(description='Trains NN model over protein vectors')
    parser.add_argument('--sample', type=str, default='training_sample_100.txt',
                        help='sample file, or a directory of sample files')
    parser.add_argument('--epochs', type=int, default=30,
                        help='maximum number of training epochs')
    args = parser.parse_args()

    if os.path.isdir(args.sample):
        sample_directory = args.sample
        for file_name in sorted(os.listdir(sample_directory)):
            file_path = os.path.join(sample_directory, file_name)
            if not os.path.isfile(file_path):
                continue  # sub-directories cannot be opened as samples
            fit_model(file_path, args.epochs)
    else:
        fit_model(args.sample, args.epochs)


if __name__ == '__main__':
    main()
113 changes: 113 additions & 0 deletions family_classification_binary /learn_binary_svm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import argparse
import sys
import os
import gzip
from collections import Counter
import cPickle as pickle

import numpy as np
from scipy.spatial.distance import cosine
from Bio import SeqIO

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, average_precision_score, coverage_error
from sklearn.model_selection import cross_val_score


def get_sample(file_path):
    """Load a binary sample file and split it 90/10 into train and test sets.

    Each non-blank line holds an integer label, a tab, and a whitespace
    separated list of floats.

    :param file_path: path of the sample file.
    :return: ``(vectors_train, vectors_test, targets_train, targets_test)``.
    :raises ValueError: if the file contains no samples or a malformed line.
    """
    targets = []
    vectors = []
    with open(file_path) as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                continue  # tolerate blank / trailing empty lines
            # Exactly two fields are expected, so split on the first tab only.
            is_in_family, vector_string = line.split('\t', 1)
            targets.append(int(is_in_family))
            # A real list is required: on Python 3 ``map`` returns an iterator
            # that numpy cannot convert to a float array.
            vectors.append(np.array([float(value) for value in vector_string.split()],
                                    dtype=np.float32))
    if not vectors:
        raise ValueError('no samples found in {}'.format(file_path))

    vectors_array = np.array(vectors)
    targets_array = np.array(targets)
    # Drop the Python lists so only the numpy copies stay alive.
    vectors, targets = None, None

    return tuple(train_test_split(vectors_array, targets_array, test_size=0.1))


def _rate(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or NaN when undefined."""
    return float(numerator) / denominator if denominator else float('nan')


def save_model_metrics(model_params_string, model, vectors_test, targets_test):
    """Evaluate a fitted model on the test split and write a metrics report.

    The report is written to ``svm_results/<model_params_string>_results.txt``
    and holds the model score, F1 (macro/micro/weighted), accuracy, one line
    per test sample, and a final summary line.

    :param model_params_string: identifier used in the file name and summary.
    :param model: fitted estimator exposing ``predict`` and ``score``.
    :param vectors_test: test feature vectors.
    :param targets_test: true labels; truthy values are treated as positive.
    :return: the summary line (without trailing newline).
    """
    # NOTE(review): the pickled models are written to 'pfam_svm_results/' by
    # fit_model while reports go to 'svm_results/' -- confirm this is intended.
    results_directory = 'svm_results'
    if not os.path.isdir(results_directory):
        os.makedirs(results_directory)

    with open('svm_results/{}_results.txt'.format(model_params_string), 'w') as outfile:
        predicted_targets = model.predict(vectors_test)
        outfile.write('score: {}\n'.format(model.score(vectors_test, targets_test)))
        outfile.write('f1_macro: {}\n'.format(f1_score(targets_test, predicted_targets, average='macro')))
        outfile.write('f1_micro: {}\n'.format(f1_score(targets_test, predicted_targets, average='micro')))
        outfile.write('f1_weighted: {}\n'.format(f1_score(targets_test, predicted_targets, average='weighted')))
        outfile.write('accuracy_score: {}\n'.format(accuracy_score(targets_test, predicted_targets)))

        prediction_counter = Counter()
        for predicted_target, correct_target in zip(predicted_targets, targets_test):
            if predicted_target:
                prediction_counter['tp' if correct_target else 'fp'] += 1
            else:
                prediction_counter['fn' if correct_target else 'tn'] += 1

            # Arguments follow the order of the labels in the format string
            # (they were previously swapped).
            outfile.write('predicted={} correct={} is_correct={}\n'.format(predicted_target,
                                                                           correct_target,
                                                                           predicted_target == correct_target))

        # A test split may lack positives or negatives; report NaN instead of
        # raising ZeroDivisionError.
        tp_rate = _rate(prediction_counter['tp'], prediction_counter['tp'] + prediction_counter['fn'])
        tn_rate = _rate(prediction_counter['tn'], prediction_counter['tn'] + prediction_counter['fp'])
        t_rate = _rate(prediction_counter['tn'] + prediction_counter['tp'], sum(prediction_counter.values()))
        # True-positive rate is sensitivity; true-negative rate is specificity.
        result = '{}: tp_rate(sensitivity) = {} tn_rate(specificity) = {} t_rate(accuracy) = {}'.format(
            model_params_string, tp_rate, tn_rate, t_rate)
        outfile.write('{}\n'.format(result))
    return result


def fit_model(file_path, model_type):
    """Fit an SVM of the requested type on one sample file and report metrics.

    The fitted model is pickled to
    ``pfam_svm_results/<model_type>_<sample basename>.pkl`` and then evaluated
    with ``save_model_metrics``.

    :param file_path: path of the binary sample file.
    :param model_type: one of ``'svc_linear'``, ``'svc_rbf'``, ``'linear_svc'``.
    :return: the summary line produced by ``save_model_metrics``.
    :raises ValueError: if ``model_type`` is not a supported value.
    """
    model_factories = {
        'svc_linear': lambda: svm.SVC(kernel='linear'),
        'svc_rbf': lambda: svm.SVC(kernel='rbf'),
        'linear_svc': lambda: svm.LinearSVC(),
    }
    # Fail fast with a clear message instead of calling .fit() on None.
    if model_type not in model_factories:
        raise ValueError('unknown model type {!r}; expected one of {}'.format(
            model_type, ', '.join(sorted(model_factories))))

    vectors_train, vectors_test, targets_train, targets_test = get_sample(file_path)
    model = model_factories[model_type]()
    model.fit(vectors_train, targets_train)

    models_directory = 'pfam_svm_results'
    if not os.path.isdir(models_directory):
        os.makedirs(models_directory)

    model_params_string = '{}_{}'.format(model_type, os.path.basename(file_path))
    with open('pfam_svm_results/{}.pkl'.format(model_params_string), 'wb') as outfile:
        pickle.dump(model, outfile)

    return save_model_metrics(model_params_string, model, vectors_test, targets_test)

#def get_model(file_path):
# with open(file_path, 'rb') as infile:
# return pickle.load(infile)

def main():
    """Command-line entry point: train an SVM on a file or a directory.

    ``--sample`` may name a single sample file or a directory; for a
    directory every regular file inside it is trained on, in sorted order.
    The summary line of each run is printed.
    """
    # The text must be passed as ``description``: the first positional
    # argument of ArgumentParser is ``prog`` (the program name in usage).
    parser = argparse.ArgumentParser(description='Trains SVM model over protein vectors')
    parser.add_argument('--sample', type=str, default='training_sample_100.txt',
                        help='sample file, or a directory of sample files')
    parser.add_argument('--type', type=str, default='svc_linear',
                        choices=('svc_linear', 'svc_rbf', 'linear_svc'),
                        help='SVM variant to train')
    args = parser.parse_args()

    if os.path.isdir(args.sample):
        sample_directory = args.sample
        for file_name in sorted(os.listdir(sample_directory)):
            file_path = os.path.join(sample_directory, file_name)
            if not os.path.isfile(file_path):
                continue  # sub-directories cannot be opened as samples
            print(fit_model(file_path, args.type))
    else:
        # Print the summary here too, matching the directory branch.
        print(fit_model(args.sample, args.type))


if __name__ == '__main__':
    main()
33 changes: 33 additions & 0 deletions family_classification_binary /make_binary_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import sys
from collections import Counter

import numpy as np


def get_family_distribution(file_path):
    """Count how many proteins belong to each family in a sample file.

    Every non-blank line must contain three tab-separated fields: UniProt id,
    family name and the vector string.

    :param file_path: path of the tab-separated sample file.
    :return: ``Counter`` mapping family name to its number of lines.
    :raises ValueError: if a non-blank line has fewer than three fields.
    """
    family_distribution = Counter()
    with open(file_path) as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                continue  # blank / trailing empty lines carry no record
            _uniprot_id, family, _vector_string = line.split('\t', 2)
            family_distribution[family] += 1
    return family_distribution

if __name__ == '__main__':
    # Builds one balanced binary sample per family for the 10 most frequent
    # families: every member is written with label 1, followed by the same
    # number of randomly chosen non-members with label 0.
    import os  # needed only by the script entry point

    if len(sys.argv) < 2:
        sys.exit('usage: make_binary_sample.py <super_sample_file>')
    super_sample_file_path = sys.argv[1]

    output_directory = 'binary_samples_pfam_100'
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)

    family_distribution = get_family_distribution(super_sample_file_path)
    for target_family, target_family_proteins_count in family_distribution.most_common(10):
        negative_samples = []
        binary_sample_file_path = 'binary_samples_pfam_100/{}_binary_sample.txt'.format(target_family)
        with open(binary_sample_file_path, 'w') as outfile:
            with open(super_sample_file_path) as infile:
                for line in infile:
                    line = line.rstrip()
                    if not line:
                        continue  # skip blank / trailing empty lines
                    uniprot_id, family, vector_string = line.split('\t', 2)
                    if family == target_family:
                        outfile.write('1\t{}\n'.format(vector_string))
                    else:
                        negative_samples.append(vector_string)

            if not negative_samples:
                continue  # a single-family input has no negatives to draw

            # Sample indices rather than the strings themselves, so numpy does
            # not build a fixed-width string array of every candidate.
            # np.random.choice draws with replacement by default, which would
            # duplicate negatives; only fall back to that when there are
            # fewer candidates than positives.
            chosen_indices = np.random.choice(
                len(negative_samples),
                target_family_proteins_count,
                replace=len(negative_samples) < target_family_proteins_count)
            for negative_index in chosen_indices:
                outfile.write('0\t{}\n'.format(negative_samples[negative_index]))
Binary file not shown.
95 changes: 95 additions & 0 deletions family_classification_multiclass/learn_multiclass_nn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import sys
import os
import gzip
from collections import Counter

os.environ['KERAS_BACKEND'] = 'theano'


import numpy as np
from scipy.spatial.distance import cosine
from Bio import SeqIO

import keras
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score


def get_model(vector_length, number_of_families):
    """Build and compile the multiclass protein-family classifier.

    The network stacks three ReLU dense layers (``vector_length``,
    ``vector_length / 2`` and ``vector_length / 2`` units), each followed by
    25% dropout, and ends in a softmax layer named ``protein_family`` with one
    unit per family.

    :param vector_length: number of input features per protein vector.
    :param number_of_families: number of output classes.
    :return: a ``Sequential`` model compiled with Adam and categorical
        cross-entropy, tracking accuracy.
    """
    half_length = int(vector_length / 2)

    model = Sequential()
    model.add(Dense(vector_length, activation='relu', name='hidden_1', input_dim=vector_length))
    model.add(Dropout(0.25))
    model.add(Dense(half_length, activation='relu', name='hidden_2'))
    model.add(Dropout(0.25))
    model.add(Dense(half_length, activation='relu', name='outer'))
    model.add(Dropout(0.25))
    model.add(Dense(number_of_families, activation='softmax', name='protein_family'))

    # The loss is keyed by the output layer's name.
    model.compile(
        optimizer='adam',
        loss={'protein_family': 'categorical_crossentropy'},
        metrics=['accuracy'],
    )
    return model


def main():
    """Train and evaluate the multiclass family classifier.

    Reads the sample file named by the first command-line argument (lines of
    UniProt id, family and a whitespace separated float vector, separated by
    tabs), trains the model from ``get_model`` on a 90/10 split, prints the
    per-sample predictions and overall accuracy, and writes the model
    architecture to ``nn_256_model.json``.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: learn_multiclass_nn.py <sample_file>')

    families = []
    vectors = []
    with open(sys.argv[1]) as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                continue  # tolerate blank / trailing empty lines
            uniprot_id, family, vector_string = line.split('\t', 2)
            families.append(family)
            # A real list is required: on Python 3 ``map`` returns an iterator
            # that numpy cannot convert to a float array.
            vectors.append(np.array([float(value) for value in vector_string.split()],
                                    dtype=np.float32))

    vectors_array = np.array(vectors)
    vectors = None

    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(families)
    number_of_classes = len(label_encoder.classes_)
    model = get_model(vector_length=vectors_array.shape[1], number_of_families=number_of_classes)

    families_encoded = np.array(label_encoder.transform(families), dtype=np.int32)
    families = None
    families_binary_labels = keras.utils.to_categorical(families_encoded, num_classes=number_of_classes)
    families_encoded = None

    vectors_train, vectors_test, families_train, families_test = train_test_split(
        vectors_array, families_binary_labels, test_size=0.1)
    # Drop the references to the unsplit arrays so they can be reclaimed.
    vectors_array, families_binary_labels = None, None

    # NOTE(review): the held-out split is also used as the early-stopping
    # validation set, so the reported test figures are optimistic.
    model.fit(vectors_train,
              families_train,
              epochs=25,
              validation_data=(vectors_test, families_test),
              batch_size=256,
              callbacks=[EarlyStopping(patience=3)])
    score = model.evaluate(vectors_test, families_test, verbose=1)

    test_predictions = model.predict(vectors_test, verbose=2, batch_size=4096)
    # Decode all labels in one call: inverse_transform expects a 1-D array,
    # and newer scikit-learn rejects the scalar passed per sample before.
    predicted_families = label_encoder.inverse_transform(np.argmax(test_predictions, axis=1))
    actual_families = label_encoder.inverse_transform(np.argmax(families_test, axis=1))

    prediction_counter = Counter()
    for actual_family, predicted_family in zip(actual_families, predicted_families):
        is_correct = bool(actual_family == predicted_family)
        prediction_counter[is_correct] += 1
        print('{}\t{}\t{}'.format(actual_family, predicted_family, is_correct))
    print('{} {}'.format(prediction_counter,
                         float(prediction_counter[True]) / sum(prediction_counter.values())))
    print(test_predictions)
    print(families_test)
    print('Test loss: {}, Test accuracy: {}'.format(score[0], score[1]))

    with open("nn_256_model.json", "w") as json_file:
        json_file.write(model.to_json())


if __name__ == '__main__':
    main()
Loading

0 comments on commit 8b97cb4

Please sign in to comment.