Standardize predict interface using SAR standard #1039

Merged: 10 commits, Jan 24, 2020
14 changes: 8 additions & 6 deletions benchmarks/benchmark_utils.py
@@ -26,11 +26,14 @@
from reco_utils.recommender.ncf.ncf_singlenode import NCF
from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset
from reco_utils.recommender.surprise.surprise_utils import (
compute_rating_predictions,
predict,
compute_ranking_predictions,
)
from reco_utils.recommender.fastai.fastai_utils import (cartesian_product, score,
hide_fastai_progress_bar)
from reco_utils.recommender.fastai.fastai_utils import (
cartesian_product,
score,
hide_fastai_progress_bar,
)
from reco_utils.recommender.cornac.cornac_utils import predict_ranking
from reco_utils.evaluation.spark_evaluation import (
SparkRatingEvaluation,
@@ -125,7 +128,7 @@ def train_svd(params, data):

def predict_svd(model, test):
with Timer() as t:
preds = compute_rating_predictions(
preds = predict(
model,
test,
usercol=DEFAULT_USER_COL,
@@ -266,8 +269,7 @@ def recommend_k_ncf(model, test, train):

def prepare_training_bpr(train):
return cornac.data.Dataset.from_uir(
train.drop(DEFAULT_TIMESTAMP_COL, axis=1).itertuples(index=False),
seed=SEED
train.drop(DEFAULT_TIMESTAMP_COL, axis=1).itertuples(index=False), seed=SEED
)


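For orientation, the change in this file is just the rename of the Surprise helper: compute_rating_predictions becomes predict, with the same call shape. A minimal sketch of the updated benchmark call, assuming the column constants live in reco_utils.common.constants (e.g. "userID"/"itemID"):

    from reco_utils.common.constants import DEFAULT_USER_COL, DEFAULT_ITEM_COL  # assumed location
    from reco_utils.common.timer import Timer
    from reco_utils.recommender.surprise.surprise_utils import predict

    def predict_svd(model, test):
        # Score the (user, item) pairs in the test frame with a trained Surprise SVD model
        with Timer() as t:
            preds = predict(model, test, usercol=DEFAULT_USER_COL, itemcol=DEFAULT_ITEM_COL)
        return preds, t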
4 changes: 2 additions & 2 deletions notebooks/02_model/cornac_bpr_deep_dive.ipynb
@@ -408,7 +408,7 @@
"source": [
"### 3.4 Prediction and Evaluation\n",
"\n",
"Now that our model is trained, we can produce the ranked lists for recommendation. Every recommender models in Cornac provide `rate()` and `rank()` methods for predicting item rated value as well as item ranked list for a given user. To make use of the current evaluation schemes, we will through `predict_rating()` and `predict_ranking()` functions inside `cornac_utils` to produce the predictions.\n",
"Now that our model is trained, we can produce the ranked lists for recommendation. Every recommender models in Cornac provide `rate()` and `rank()` methods for predicting item rated value as well as item ranked list for a given user. To make use of the current evaluation schemes, we will through `predict()` and `predict_ranking()` functions inside `cornac_utils` to produce the predictions.\n",
"\n",
"Note that BPR model is effectively designed for item ranking. Hence, we only measure the performance using ranking metrics."
]
@@ -592,4 +592,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
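The markdown cell above refers to the renamed helpers; a minimal sketch of how the two cornac_utils functions would be called on a trained BPR model `bpr` and the notebook's `train`/`test` frames (the column names and the remove_seen flag are assumptions that mirror the Surprise calls elsewhere in this PR):

    from reco_utils.recommender.cornac.cornac_utils import predict, predict_ranking

    # Rating-style predictions for the (user, item) pairs present in the test frame
    predictions = predict(bpr, test, usercol="userID", itemcol="itemID")

    # Ranking-style predictions over all items, excluding those seen during training
    all_predictions = predict_ranking(
        bpr, train, usercol="userID", itemcol="itemID", remove_seen=True
    )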
29 changes: 13 additions & 16 deletions notebooks/02_model/surprise_svd_deep_dive.ipynb
@@ -99,16 +99,17 @@
"source": [
"import sys\n",
"sys.path.append(\"../../\")\n",
"import time\n",
"import os\n",
"import surprise\n",
"import papermill as pm\n",
"import pandas as pd\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_random_split\n",
"from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, \n",
" recall_at_k, get_top_k_items)\n",
"from reco_utils.recommender.surprise.surprise_utils import compute_rating_predictions, compute_ranking_predictions\n",
"from reco_utils.recommender.surprise.surprise_utils import predict, compute_ranking_predictions\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"print(\"Surprise version: {}\".format(surprise.__version__))"
@@ -337,12 +338,10 @@
"source": [
"svd = surprise.SVD(random_state=0, n_factors=200, n_epochs=30, verbose=True)\n",
"\n",
"start_time = time.time()\n",
"\n",
"svd.fit(train_set)\n",
"with Timer() as train_time:\n",
" svd.fit(train_set)\n",
"\n",
"train_time = time.time() - start_time\n",
"print(\"Took {} seconds for training.\".format(train_time))"
"print(\"Took {} seconds for training.\".format(train_time.interval))"
]
},
{
@@ -435,7 +434,7 @@
}
],
"source": [
"predictions = compute_rating_predictions(svd, test, usercol='userID', itemcol='itemID')\n",
"predictions = predict(svd, test, usercol='userID', itemcol='itemID')\n",
"predictions.head()"
]
},
@@ -462,12 +461,10 @@
}
],
"source": [
"start_time = time.time()\n",
"\n",
"all_predictions = compute_ranking_predictions(svd, train, usercol='userID', itemcol='itemID', remove_seen=True)\n",
"with Timer() as test_time:\n",
" all_predictions = compute_ranking_predictions(svd, train, usercol='userID', itemcol='itemID', remove_seen=True)\n",
" \n",
"test_time = time.time() - start_time\n",
"print(\"Took {} seconds for prediction.\".format(test_time))"
"print(\"Took {} seconds for prediction.\".format(test_time.interval))"
]
},
{
@@ -682,8 +679,8 @@
"pm.record(\"ndcg\", eval_ndcg)\n",
"pm.record(\"precision\", eval_precision)\n",
"pm.record(\"recall\", eval_recall)\n",
"pm.record(\"train_time\", train_time)\n",
"pm.record(\"test_time\", test_time)"
"pm.record(\"train_time\", train_time.interval)\n",
"pm.record(\"test_time\", test_time.interval)"
]
},
{
@@ -720,4 +717,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
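This notebook also swaps manual time.time() arithmetic for the repo's Timer context manager; the pattern it now uses looks like the sketch below, with Timer exposing the elapsed seconds as .interval, as in the cells above (svd and train_set come from earlier cells):

    from reco_utils.common.timer import Timer

    with Timer() as train_time:
        svd.fit(train_set)  # any work placed inside the block is timed

    # .interval holds the elapsed wall-clock time in seconds
    print("Took {} seconds for training.".format(train_time.interval))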
Changes to an additional file (name not shown):
@@ -62,7 +62,7 @@
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_random_split\n",
"from reco_utils.evaluation.python_evaluation import rmse, precision_at_k, ndcg_at_k\n",
"from reco_utils.recommender.surprise.surprise_utils import compute_rating_predictions, compute_ranking_predictions\n",
"from reco_utils.recommender.surprise.surprise_utils import predict, compute_ranking_predictions\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"print(\"Surprise version: {}\".format(surprise.__version__))\n",
@@ -165,9 +165,6 @@
}
],
"source": [
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"\n",
"# Remote compute (cluster) configuration. If you want to save costs decrease these.\n",
"# Each standard_D2_V2 VM has 2 vCPUs, 7GB memory, 100GB SSD storage\n",
"\n",
@@ -708,7 +705,7 @@
],
"source": [
"test_results = {}\n",
"predictions = compute_rating_predictions(svd, test, usercol=USERCOL, itemcol=ITEMCOL)\n",
"predictions = predict(svd, test, usercol=USERCOL, itemcol=ITEMCOL)\n",
"for metric in RATING_METRICS:\n",
" test_results[metric] = eval(metric)(test, predictions, col_user=USERCOL, col_item=ITEMCOL)\n",
"\n",
@@ -776,4 +773,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
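This notebook dispatches metric functions by name via eval(metric)(...); a small eval-free sketch of the same idea using an explicit lookup table (the metric set is illustrative, and test, predictions, USERCOL, ITEMCOL, and RATING_METRICS are assumed to come from earlier cells):

    from reco_utils.evaluation.python_evaluation import rmse, mae

    # Explicit name-to-function mapping instead of eval(metric)
    RATING_METRIC_FUNCS = {"rmse": rmse, "mae": mae}

    test_results = {}
    for name in RATING_METRICS:  # e.g. ["rmse"], as configured earlier in the notebook
        fn = RATING_METRIC_FUNCS[name]
        test_results[name] = fn(test, predictions, col_user=USERCOL, col_item=ITEMCOL)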
Changes to an additional file (name not shown):
@@ -67,7 +67,7 @@
"from reco_utils.evaluation.python_evaluation import rmse, precision_at_k, ndcg_at_k\n",
"from reco_utils.tuning.nni.nni_utils import (check_experiment_status, check_stopped, check_metrics_written, get_trials,\n",
" stop_nni, start_nni)\n",
"from reco_utils.recommender.surprise.surprise_utils import compute_rating_predictions, compute_ranking_predictions\n",
"from reco_utils.recommender.surprise.surprise_utils import predict, compute_ranking_predictions\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"print(\"Surprise version: {}\".format(surprise.__version__))\n",
@@ -561,7 +561,7 @@
"source": [
"def compute_test_results(svd):\n",
" test_results = {}\n",
" predictions = compute_rating_predictions(svd, test, usercol=\"userID\", itemcol=\"itemID\")\n",
" predictions = predict(svd, test, usercol=\"userID\", itemcol=\"itemID\")\n",
" for metric in RATING_METRICS:\n",
" test_results[metric] = eval(metric)(test, predictions)\n",
"\n",
@@ -1080,4 +1080,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
127 changes: 81 additions & 46 deletions reco_utils/azureml/svd_training.py
@@ -18,7 +18,10 @@
HAS_AML = False

from reco_utils.evaluation.python_evaluation import *
from reco_utils.recommender.surprise.surprise_utils import compute_rating_predictions, compute_ranking_predictions
from reco_utils.recommender.surprise.surprise_utils import (
predict,
compute_ranking_predictions,
)


def svd_training(args):
@@ -27,23 +30,42 @@ def svd_training(args):
"""
print("Start training...")
train_data = pd.read_pickle(path=os.path.join(args.datastore, args.train_datapath))
validation_data = pd.read_pickle(path=os.path.join(args.datastore, args.validation_datapath))

svd = surprise.SVD(random_state=args.random_state, n_epochs=args.epochs, verbose=args.verbose, biased=args.biased,
n_factors=args.n_factors, init_mean=args.init_mean, init_std_dev=args.init_std_dev,
lr_all=args.lr_all, reg_all=args.reg_all, lr_bu=args.lr_bu, lr_bi=args.lr_bi, lr_pu=args.lr_pu,
lr_qi=args.lr_qi, reg_bu=args.reg_bu, reg_bi=args.reg_bi, reg_pu=args.reg_pu,
reg_qi=args.reg_qi)

train_set = surprise.Dataset.load_from_df(train_data, reader=surprise.Reader(args.surprise_reader)) \
.build_full_trainset()
validation_data = pd.read_pickle(
path=os.path.join(args.datastore, args.validation_datapath)
)

svd = surprise.SVD(
random_state=args.random_state,
n_epochs=args.epochs,
verbose=args.verbose,
biased=args.biased,
n_factors=args.n_factors,
init_mean=args.init_mean,
init_std_dev=args.init_std_dev,
lr_all=args.lr_all,
reg_all=args.reg_all,
lr_bu=args.lr_bu,
lr_bi=args.lr_bi,
lr_pu=args.lr_pu,
lr_qi=args.lr_qi,
reg_bu=args.reg_bu,
reg_bi=args.reg_bi,
reg_pu=args.reg_pu,
reg_qi=args.reg_qi,
)

train_set = surprise.Dataset.load_from_df(
train_data, reader=surprise.Reader(args.surprise_reader)
).build_full_trainset()
svd.fit(train_set)

print("Evaluating...")

rating_metrics = args.rating_metrics
if len(rating_metrics) > 0:
predictions = compute_rating_predictions(svd, validation_data, usercol=args.usercol, itemcol=args.itemcol)
predictions = predict(
svd, validation_data, usercol=args.usercol, itemcol=args.itemcol
)
for metric in rating_metrics:
result = eval(metric)(validation_data, predictions)
print(metric, result)
Expand All @@ -52,11 +74,18 @@ def svd_training(args):

ranking_metrics = args.ranking_metrics
if len(ranking_metrics) > 0:
all_predictions = compute_ranking_predictions(svd, train_data, usercol=args.usercol, itemcol=args.itemcol,
remove_seen=args.remove_seen)
all_predictions = compute_ranking_predictions(
svd,
train_data,
usercol=args.usercol,
itemcol=args.itemcol,
remove_seen=args.remove_seen,
)
k = args.k
for metric in ranking_metrics:
result = eval(metric)(validation_data, all_predictions, col_prediction='prediction', k=k)
result = eval(metric)(
validation_data, all_predictions, col_prediction="prediction", k=k
)
print("{}@{}".format(metric, k), result)
if HAS_AML:
run.log(metric, result)
@@ -70,49 +99,55 @@ def main():
def main():
parser = argparse.ArgumentParser()
# Data path
parser.add_argument('--datastore', type=str, dest='datastore', help="Datastore path")
parser.add_argument('--train-datapath', type=str, dest='train_datapath')
parser.add_argument('--validation-datapath', type=str, dest='validation_datapath')
parser.add_argument('--output_dir', type=str, help='output directory')
parser.add_argument('--surprise-reader', type=str, dest='surprise_reader')
parser.add_argument('--usercol', type=str, dest='usercol', default='userID')
parser.add_argument('--itemcol', type=str, dest='itemcol', default='itemID')
parser.add_argument(
"--datastore", type=str, dest="datastore", help="Datastore path"
)
parser.add_argument("--train-datapath", type=str, dest="train_datapath")
parser.add_argument("--validation-datapath", type=str, dest="validation_datapath")
parser.add_argument("--output_dir", type=str, help="output directory")
parser.add_argument("--surprise-reader", type=str, dest="surprise_reader")
parser.add_argument("--usercol", type=str, dest="usercol", default="userID")
parser.add_argument("--itemcol", type=str, dest="itemcol", default="itemID")
# Metrics
parser.add_argument('--rating-metrics', type=str, nargs='*', dest='rating_metrics', default=[])
parser.add_argument('--ranking-metrics', type=str, nargs='*', dest='ranking_metrics', default=[])
parser.add_argument('--k', type=int, dest='k', default=None)
parser.add_argument('--remove-seen', dest='remove_seen', action='store_true')
parser.add_argument(
"--rating-metrics", type=str, nargs="*", dest="rating_metrics", default=[]
)
parser.add_argument(
"--ranking-metrics", type=str, nargs="*", dest="ranking_metrics", default=[]
)
parser.add_argument("--k", type=int, dest="k", default=None)
parser.add_argument("--remove-seen", dest="remove_seen", action="store_true")
# Training parameters
parser.add_argument('--random-state', type=int, dest='random_state', default=0)
parser.add_argument('--verbose', dest='verbose', action='store_true')
parser.add_argument('--epochs', type=int, dest='epochs', default=30)
parser.add_argument('--biased', dest='biased', action='store_true')
parser.add_argument("--random-state", type=int, dest="random_state", default=0)
parser.add_argument("--verbose", dest="verbose", action="store_true")
parser.add_argument("--epochs", type=int, dest="epochs", default=30)
parser.add_argument("--biased", dest="biased", action="store_true")
# Hyperparameters to be tuned
parser.add_argument('--n_factors', type=int, dest='n_factors', default=100)
parser.add_argument('--init_mean', type=float, dest='init_mean', default=0.0)
parser.add_argument('--init_std_dev', type=float, dest='init_std_dev', default=0.1)
parser.add_argument('--lr_all', type=float, dest='lr_all', default=0.005)
parser.add_argument('--reg_all', type=float, dest='reg_all', default=0.02)
parser.add_argument('--lr_bu', type=float, dest='lr_bu', default=None)
parser.add_argument('--lr_bi', type=float, dest='lr_bi', default=None)
parser.add_argument('--lr_pu', type=float, dest='lr_pu', default=None)
parser.add_argument('--lr_qi', type=float, dest='lr_qi', default=None)
parser.add_argument('--reg_bu', type=float, dest='reg_bu', default=None)
parser.add_argument('--reg_bi', type=float, dest='reg_bi', default=None)
parser.add_argument('--reg_pu', type=float, dest='reg_pu', default=None)
parser.add_argument('--reg_qi', type=float, dest='reg_qi', default=None)
parser.add_argument("--n_factors", type=int, dest="n_factors", default=100)
parser.add_argument("--init_mean", type=float, dest="init_mean", default=0.0)
parser.add_argument("--init_std_dev", type=float, dest="init_std_dev", default=0.1)
parser.add_argument("--lr_all", type=float, dest="lr_all", default=0.005)
parser.add_argument("--reg_all", type=float, dest="reg_all", default=0.02)
parser.add_argument("--lr_bu", type=float, dest="lr_bu", default=None)
parser.add_argument("--lr_bi", type=float, dest="lr_bi", default=None)
parser.add_argument("--lr_pu", type=float, dest="lr_pu", default=None)
parser.add_argument("--lr_qi", type=float, dest="lr_qi", default=None)
parser.add_argument("--reg_bu", type=float, dest="reg_bu", default=None)
parser.add_argument("--reg_bi", type=float, dest="reg_bi", default=None)
parser.add_argument("--reg_pu", type=float, dest="reg_pu", default=None)
parser.add_argument("--reg_qi", type=float, dest="reg_qi", default=None)

args = parser.parse_args()

print("Args:", str(vars(args)), sep='\n')
print("Args:", str(vars(args)), sep="\n")

if HAS_AML:
run.log('Number of epochs', args.epochs)
run.log("Number of epochs", args.epochs)

svd = svd_training(args)
# Save SVD model to the output directory for later use
os.makedirs(args.output_dir, exist_ok=True)
surprise.dump.dump(os.path.join(args.output_dir, 'model.dump'), algo=svd)
surprise.dump.dump(os.path.join(args.output_dir, "model.dump"), algo=svd)


if __name__ == "__main__":
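main() ends by dumping the trained model with surprise.dump.dump; for completeness, a hypothetical sketch of reloading that dump and scoring new data with the renamed helper (the output path and validation_data frame are placeholders, not values from this PR):

    import os

    import surprise
    from reco_utils.recommender.surprise.surprise_utils import predict

    # surprise.dump.load returns a (predictions, algo) tuple; only the algorithm is needed here
    _, svd = surprise.dump.load(os.path.join("outputs", "model.dump"))

    # Score a held-out frame with the standardized predict interface
    predictions = predict(svd, validation_data, usercol="userID", itemcol="itemID")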