Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cognoml refactor part 1 #3

Merged
merged 9 commits into from
Nov 10, 2016
235 changes: 131 additions & 104 deletions cognoml/analysis.py
Original file line number Diff line number Diff line change
@@ -1,121 +1,148 @@
import collections
import os
import warnings

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split

from cognoml import utils
from cognoml.classifiers.logistic_regression import grid_search
from cognoml.figshare import download_files

# expression_path = os.path.join('download', 'mutation-matrix.tsv.bz2')
# Directory name passed as `directory=` to download_files().
data_directory = "download"

def read_data(version=None):
class CognomlClassifier:
"""
Read data.
Class to handle all operations related to the Cognoml Classifier
"""
v_dir = download_files(directory=data_directory, article_id=3487685, version=version)
# Read expression data
path = os.path.join(v_dir, 'expression-matrix.tsv.bz2')
X = pd.read_table(path, index_col=0)
return X

def classify(sample_id, mutation_status, data_version, json_sanitize=False, **kwargs):
"""
Perform an analysis.

Parameters
----------
sample_id : list
Sample IDs of the observations.
mutation_status : list
Mutation status (0 or 1) of each sample.
data_version : int
Integer with the figshare data version.
json_sanitize : bool
Whether to make results JSON-serializable. If `True` DataFrames are
converted to DataTables format.
def __init__(self, X, y, pipeline=grid_search, test_size=0.1, json_sanitize=True):
    """
    Store the inputs, filter the expression data and build the train/test split.

    Parameters
    ----------
    X: Pandas data frame
        Expressions data frame, [sample_id, feature set]. Kept untouched as
        `X_whole`; the output of `utils.filter_data_by_mutation(X, y)` is
        kept as `X`.
    y: Pandas data frame
        Mutations data [sample_id, boolean (1 or 0) on mutation]. Its
        `.index` is stored as `sample_id` and its `.values` as `y`.
    pipeline: object
        ML model fitting pipeline; defaults to the imported `grid_search`.
    test_size: float
        Share of the samples held out for testing; read by
        `test_train_split`. Training size = 1-test_size
    json_sanitize: bool
        Whether to make results JSON-serializable. If `True` DataFrames are
        converted to DataTables format.
    """
    # Plain configuration first; none of it depends on the data.
    self.pipeline = pipeline
    self.test_size = test_size
    self.json_sanitize = json_sanitize

    # Keep both the full expression matrix and its filtered counterpart.
    self.X_whole = X
    self.X = utils.filter_data_by_mutation(X, y)

    # Observed mutation data, plus the pieces derived from it.
    self.obs_df = y
    self.sample_id = y.index
    self.y = y.values

    # The split reads self.X, self.y and self.test_size, so it must come last.
    split = self.test_train_split()
    self.x_train, self.x_test, self.y_train, self.y_test = split

def test_train_split(self):
    """
    Split the instance's data into training and testing parts.

    Thin wrapper around scikit-learn's `train_test_split`, called with
    `self.X`, `self.y` and `self.test_size`. The split is stratified on
    `self.y` and uses a fixed `random_state` of 0.

    Returns
    -------
    x_train:
        Training part of `self.X`
    x_test:
        Testing part of `self.X`
    y_train:
        Training part of `self.y`
    y_test:
        Testing part of `self.y`
    """
    features = self.X
    labels = self.y
    parts = train_test_split(
        features,
        labels,
        test_size=self.test_size,
        random_state=0,
        stratify=labels,
    )
    train_x, test_x, train_y, test_y = parts
    return train_x, test_x, train_y, test_y

Returns
-------
results : dict
An object of results. See `data/api/hippo-output-schema.json`
for JSON schema.
"""
results = collections.OrderedDict()

obs_df = pd.DataFrame.from_items([
('sample_id', sample_id),
('status', mutation_status),
])

X_whole = read_data(version=data_version)
X = X_whole.loc[obs_df.sample_id, :]
y = obs_df.status

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.1, random_state=0, stratify=y)
obs_df['testing'] = obs_df.sample_id.isin(X_test.index).astype(int)

grid_search.fit(X=X_train, y=y_train)

predict_df = pd.DataFrame.from_items([
('sample_id', X_whole.index),
('predicted_status', grid_search.predict(X_whole)),
])
if hasattr(grid_search, 'decision_function'):
predict_df['predicted_score'] = grid_search.decision_function(X_whole)
if hasattr(grid_search, 'predict_proba'):
predict_df['predicted_prob'] = grid_search.predict_proba(X_whole)[:, 1]

# obs_df switches to containing non-selected samples
obs_df = obs_df.merge(predict_df, how='right', sort=True)
obs_df['selected'] = obs_df.sample_id.isin(sample_id).astype(int)
for column in 'status', 'testing', 'selected':
obs_df[column] = obs_df[column].fillna(-1).astype(int)
obs_train_df = obs_df.query("testing == 0")
obs_test_df = obs_df.query("testing == 1")

#y_pred_train = obs_df.query("testing == 0").predicted_score
#y_pred_test = obs_df.query("testing == 1").predicted_score
def fit(self):
    """
    Fit the pipeline on the training split.

    Calls `self.pipeline.fit` with `self.x_train` and `self.y_train`,
    the training parts stored on the instance.

    Raises
    ------
    AttributeError
        If the pipeline has no `fit` method. Raised, rather than printed,
        so a pipeline that was never fitted cannot be used silently, and so
        the behaviour matches `predict`.
    """
    pipeline = self.pipeline
    # Check for the method up front instead of wrapping the call in
    # try/except: an AttributeError raised *inside* a real fit() must not be
    # mistaken for "there is no fit method".
    if not hasattr(pipeline, 'fit'):
        raise AttributeError("Pipeline {} does not have a fit method".format(pipeline))
    pipeline.fit(X=self.x_train, y=self.y_train)

performance = collections.OrderedDict()
for part, df in ('training', obs_train_df), ('testing', obs_test_df):
y_true = df.status
y_pred = df.predicted_status
metrics = utils.class_metrics(y_true, y_pred)
metrics.update(utils.threshold_metrics(y_true, y_pred))
performance[part] = metrics
performance['cv'] = {'auroc': grid_search.best_score_}
results['performance'] = performance

gs = collections.OrderedDict()
gs['cv_scores'] = utils.cv_results_to_df(grid_search.cv_results_)
results['grid_search'] = gs

results['model'] = utils.model_info(grid_search.best_estimator_.steps[-1][1])
def predict(self):
    """
    Run the pipeline's predictions over the whole expression data frame.

    Predictions are made for `self.X_whole`, not only for the filtered or
    training samples.

    Returns
    -------
    predict_df: Pandas data frame
        One row per row of `self.X_whole`, with columns `sample_id`
        (the index of `self.X_whole`) and `predicted_status`. Also has
        `predicted_score` when the pipeline has `decision_function`, and
        `predicted_prob` (second column of `predict_proba`) when the
        pipeline has `predict_proba`.

    Raises
    ------
    AttributeError
        If the pipeline has no `predict` method.
    """
    pipeline = self.pipeline
    x = self.X_whole
    # Check for the method explicitly rather than catching AttributeError
    # around the whole construction: an AttributeError raised inside
    # predict() itself should surface unchanged, not be re-labelled.
    if not hasattr(pipeline, 'predict'):
        raise AttributeError("Pipeline {} does not have a predict method".format(pipeline))
    predict_df = pd.DataFrame(collections.OrderedDict((
        ('sample_id', x.index),
        ('predicted_status', pipeline.predict(x)),
    )))
    if hasattr(pipeline, 'decision_function'):
        predict_df['predicted_score'] = pipeline.decision_function(x)
    if hasattr(pipeline, 'predict_proba'):
        # Column 1 of predict_proba is taken as the probability of interest.
        predict_df['predicted_prob'] = pipeline.predict_proba(x)[:, 1]
    return predict_df

results['observations'] = obs_df

if json_sanitize:
results = utils.make_json_serializable(results)

return results
def get_results(self):
"""
Assemble the results of the analysis into one ordered dictionary.

Calls `self.predict()` and combines the predictions with the observed
mutation data. The pipeline must expose `best_score_`, `cv_results_` and
`best_estimator_.steps`, all of which are read here.

Returns
-------
results : dict
OrderedDict with keys `dimensions`, `performance`, `grid_search`,
`model` and `observations`. Passed through
`utils.make_json_serializable` first when `self.json_sanitize` is true.
"""
pipeline = self.pipeline
results = collections.OrderedDict()
x_test = self.x_test
x = self.X
obs_df = self.obs_df
# Rebuild the observations as a two-column frame: sample_id and status.
obs_df = pd.DataFrame(collections.OrderedDict((('sample_id', obs_df.index), ('status', obs_df.values))))
predict_df = self.predict()
# 1 for samples whose id appears in the test split's index, 0 otherwise.
obs_df['testing'] = obs_df['sample_id'].isin(x_test.index).astype(int)
# Right merge: every row of predict_df is kept, including samples with no observed status.
obs_df = obs_df.merge(predict_df, how='right', sort=True)
obs_df['selected'] = obs_df['sample_id'].isin(self.sample_id).astype(int)
# Missing values in these columns are filled with -1.
for column in 'status', 'testing', 'selected':
obs_df[column] = obs_df[column].fillna(-1).astype(int)
obs_train_df = obs_df.query("testing == 0")
obs_test_df = obs_df.query("testing == 1")
dimensions = collections.OrderedDict()
dimensions['observations_selected'] = sum(obs_df.selected == 1)
dimensions['observations_unselected'] = sum(obs_df.selected == 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The resulting JSON has:

    "observations_unselected": 0,

Rather than.

    "observations_unselected": 2264,

See this comment in original code:

# obs_df switches to containing non-selected samples

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has been implemented in latest pull request b3427c7

dimensions['features'] = len(x.columns)
dimensions['positives'] = sum(obs_df.status == 1)
dimensions['negatives'] = sum(obs_df.status == 0)
# Prevalence is computed over the selected samples only.
dimensions['positive_prevalence'] = obs_df.query("selected == 1").status.mean()
dimensions['training_observations'] = len(obs_train_df)
dimensions['testing_observations'] = len(obs_test_df)
results['dimensions'] = dimensions
performance = collections.OrderedDict()
# The same metrics are computed for the training and the testing rows.
for part, df in ('training', obs_train_df), ('testing', obs_test_df):
y_true = df.status
y_pred = df.predicted_status
metrics = utils.class_metrics(y_true, y_pred)
metrics.update(utils.threshold_metrics(y_true, y_pred))
performance[part] = metrics
performance['cv'] = {'auroc': pipeline.best_score_}
results['performance'] = performance
gs = collections.OrderedDict()
gs['cv_scores'] = utils.cv_results_to_df(pipeline.cv_results_)
results['grid_search'] = gs
# CHECK BELOW VERY THOROUGHLY
# Model info is taken from the last step of the best estimator.
results['model'] = utils.model_info(pipeline.best_estimator_.steps[-1][1])
feature_df = utils.get_feature_df(pipeline, x.columns)
results['model']['features'] = feature_df
results['observations'] = obs_df
if self.json_sanitize:
results = utils.make_json_serializable(results)
return results
Loading