From 4d5ca54873a09d327e21323283295e9549f8f95a Mon Sep 17 00:00:00 2001 From: Evgeny Frolov Date: Thu, 7 Jun 2018 13:57:45 +0300 Subject: [PATCH 01/13] simplify holdout index extraction --- polara/recommender/data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/polara/recommender/data.py b/polara/recommender/data.py index c026498..40748f0 100644 --- a/polara/recommender/data.py +++ b/polara/recommender/data.py @@ -685,7 +685,7 @@ def _sample_holdout(self, test_split, group_id=None): selector = selector.sample(frac=1, random_state=random_state) group_id = group_id or self.fields.userid - grouper = selector.groupby(self._data[group_id], sort=False) + grouper = selector.groupby(self._data[group_id], sort=False, group_keys=False) if self._random_holdout: # randomly sample data for evaluation random_state = np.random.RandomState(self.seed) @@ -708,8 +708,7 @@ def sample_largest(x): return x.iloc[np.argpartition(x, -size)[-size:]] holdout = grouper.apply(sample_largest) - holdout_index = holdout.index.get_level_values(1) - return self._data.loc[holdout_index] + return self._data.loc[holdout.index] def _sample_testset(self, test_split, holdout_index): From 0b19dd0a7871f866bfacca86134b1716af630a7b Mon Sep 17 00:00:00 2001 From: Evgeny Frolov Date: Thu, 7 Jun 2018 13:58:56 +0300 Subject: [PATCH 02/13] improve random holdout sampling efficiency --- polara/recommender/data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/polara/recommender/data.py b/polara/recommender/data.py index 40748f0..1af7a6b 100644 --- a/polara/recommender/data.py +++ b/polara/recommender/data.py @@ -12,8 +12,10 @@ def random_choice(df, num, random_state): n = df.shape[0] - k = min(num, n) - return df.iloc[random_state.choice(n, k, replace=False)] + if n > num: + return df.take(random_state.choice(n, num, replace=False), is_copy=False) + else: + return df def random_sample(df, frac, random_state): From e314a129543c6f5b459e3ae29b98b99a9f7fb82d Mon Sep 
17 00:00:00 2001 From: Evgeny Frolov Date: Thu, 7 Jun 2018 13:59:32 +0300 Subject: [PATCH 03/13] fix empty feedback indexing bug --- polara/recommender/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polara/recommender/data.py b/polara/recommender/data.py index 1af7a6b..9941d9e 100644 --- a/polara/recommender/data.py +++ b/polara/recommender/data.py @@ -677,7 +677,7 @@ def reindex(data, col, sort=True, inplace=True): def _sample_holdout(self, test_split, group_id=None): # TODO order_field may also change - need to check it as well - order_field = self._custom_order or self.fields.feedback + order_field = self._custom_order or self.fields.feedback or [] selector = self._data.loc[test_split, order_field] # data may have many items with the same top ratings From 82353ed993fe6b3697b2e4eb75f782d8092c57b2 Mon Sep 17 00:00:00 2001 From: Evgeny Frolov Date: Thu, 7 Jun 2018 14:48:08 +0300 Subject: [PATCH 04/13] assign developer version tag --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index eb47339..3525a33 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ opts = dict(name="polara", description="Fast and flexible recommender system framework", keywords = "recommender system", - version = "0.6.0", + version = "0.6.0.dev", license="MIT", author="Evgeny Frolov", platforms=["any"], From 8d62e3ed2b91ca93067992da8003a94958e3bf9b Mon Sep 17 00:00:00 2001 From: Evgeny Frolov Date: Thu, 7 Jun 2018 17:55:12 +0300 Subject: [PATCH 05/13] fix bug with data preparation completion message --- polara/recommender/data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/polara/recommender/data.py b/polara/recommender/data.py index 9941d9e..ba161c1 100644 --- a/polara/recommender/data.py +++ b/polara/recommender/data.py @@ -221,8 +221,10 @@ def prepare(self): self._try_sort_test_data() if self.verbose: + num_train_events = self.training.shape[0] if self.training is not None else 0 + 
num_holdout_events = self.test.holdout.shape[0] if self.test.holdout is not None else 0 stats_msg = 'Done.\nThere are {} events in the training and {} events in the holdout.' - print(stats_msg.format(self.training.shape[0], self.test.holdout.shape[0])) + print(stats_msg.format(num_train_events, num_holdout_events)) def prepare_training_only(self): self.holdout_size = 0 # do not form holdout From 8d0a46f727c35d0cd844ac5846f8037815583c66 Mon Sep 17 00:00:00 2001 From: Evgeny Frolov Date: Thu, 7 Jun 2018 20:33:39 +0300 Subject: [PATCH 06/13] implement feedback value thresholding for all models --- polara/recommender/data.py | 26 ++++++++-- polara/recommender/defaults.py | 1 + .../external/mymedialite/mmlwrapper.py | 6 +-- polara/recommender/models.py | 49 ++++++++++++++----- 4 files changed, 63 insertions(+), 19 deletions(-) diff --git a/polara/recommender/data.py b/polara/recommender/data.py index ba161c1..e4a8234 100644 --- a/polara/recommender/data.py +++ b/polara/recommender/data.py @@ -735,7 +735,24 @@ def _sample_testset(self, test_split, holdout_index): return sampled - def to_coo(self, tensor_mode=False): + @staticmethod + def threshold_data(idx, val, threshold, filter_values=True): + if threshold is None: + return idx, val + + value_filter = val >= threshold + if filter_values: + val = val[value_filter] + if isinstance(idx, tuple): + idx = tuple([x[value_filter] for x in idx]) + else: + idx = idx[value_filter, :] + else: + val[~value_filter] = 0 + return idx, val + + + def to_coo(self, tensor_mode=False, feedback_threshold=None): userid, itemid, feedback = self.fields user_item_data = self.training[[userid, itemid]].values @@ -755,6 +772,7 @@ def to_coo(self, tensor_mode=False): val = self.training[feedback].values shp = tuple(idx.max(axis=0) + 1) + idx, val = self.threshold_data(idx, val, feedback_threshold) idx = idx.astype(np.intp) val = np.ascontiguousarray(val) return idx, val, shp @@ -775,7 +793,7 @@ def _recover_testset(self, update_data=False): return 
testset - def test_to_coo(self, tensor_mode=False): + def test_to_coo(self, tensor_mode=False, feedback_threshold=None): userid, itemid, feedback = self.fields testset = self.test.testset @@ -801,8 +819,8 @@ def test_to_coo(self, tensor_mode=False): else: fdbk_val = testset[feedback].values test_coo = (user_idx, item_idx, fdbk_val) - - return test_coo + test_coo, val = self.threshold_data(test_coo[:-1], test_coo[-1], feedback_threshold, filter_values=False) + return test_coo + (val,) def get_test_shape(self, tensor_mode=False): diff --git a/polara/recommender/defaults.py b/polara/recommender/defaults.py index 161d728..6065d05 100644 --- a/polara/recommender/defaults.py +++ b/polara/recommender/defaults.py @@ -15,6 +15,7 @@ #MODELS +feedback_threshold = None switch_positive = None #feedback values below are treated as negative feedback verify_integrity = True #svd diff --git a/polara/recommender/external/mymedialite/mmlwrapper.py b/polara/recommender/external/mymedialite/mmlwrapper.py index cf4cf7c..96f84b0 100644 --- a/polara/recommender/external/mymedialite/mmlwrapper.py +++ b/polara/recommender/external/mymedialite/mmlwrapper.py @@ -69,14 +69,14 @@ def command(self): ' --save-item-mapping={{item_mapping}}').format(command_template) else: command = ('{} --no-id-mapping' - ' --rating-threshold={{switch_positive}}').format(command_template) + ' --rating-threshold={{feedback_threshold}}').format(command_template) return command def _save_to_disk(self): if self.positive_only: feedback = self.data.fields.feedback - pos_ind = self.data.training[feedback] >= self.switch_positive + pos_ind = self.data.training[feedback] >= self.feedback_threshold pos_data = self.data.training.loc[pos_ind] pos_data.to_csv(self.train_data_path, index=False, header=False) else: @@ -99,7 +99,7 @@ def _run_external(self, debug=False): program=self.program, train_path=self.train_data_path, saved_model_path=self.saved_model_path, - switch_positive=self.switch_positive, + 
feedback_threshold=self.feedback_threshold, topk=self.topk, algo=method_name, options=self.options, diff --git a/polara/recommender/models.py b/polara/recommender/models.py index c2e47ab..52eecc1 100644 --- a/polara/recommender/models.py +++ b/polara/recommender/models.py @@ -70,22 +70,19 @@ def __new__(mcs, name, bases, clsdict): @with_metaclass(MetaModel) class RecommenderModel(object): - _config = ('topk', 'filter_seen', 'switch_positive', 'verify_integrity') + _config = ('topk', 'filter_seen', 'switch_positive', 'feedback_threshold', 'verify_integrity') _pad_const = -1 # used for sparse data - def __init__(self, recommender_data, switch_positive=None): + def __init__(self, recommender_data, feedback_threshold=None): self.data = recommender_data self._recommendations = None self.method = 'ABC' self._topk = get_default('topk') - self.filter_seen = get_default('filter_seen') - # `switch_positive` can be used by other models during construction process - # (e.g. mymedialite wrapper or any other implicit model); hence, it's - # better to make it a model attribute, not a simple evaluation argument - # (in contrast to `on_feedback_level` argument of self.evaluate) - self.switch_positive = switch_positive or get_default('switch_positive') + self._filter_seen = get_default('filter_seen') + self._feedback_threshold = feedback_threshold or get_default('feedback_threshold') + self.switch_positive = get_default('switch_positive') self.verify_integrity = get_default('verify_integrity') self.max_test_workers = get_default('max_test_workers') @@ -130,13 +127,34 @@ def topk(self, new_value): self._recommendations = None # if topk is too high - recalculate recommendations self._topk = new_value + @property + def feedback_threshold(self): + return self._feedback_threshold + + @feedback_threshold.setter + def feedback_threshold(self, new_value): + if self._feedback_threshold != new_value: + self._feedback_threshold = new_value + self._renew_model() + + @property + def 
filter_seen(self): + return self._filter_seen + + @filter_seen.setter + def filter_seen(self, new_value): + if self._filter_seen != new_value: + self._filter_seen = new_value + self._refresh_model() + def build(self): raise NotImplementedError('This must be implemented in subclasses') - def get_training_matrix(self, dtype=None): - idx, val, shp = self.data.to_coo(tensor_mode=False) + def get_training_matrix(self, feedback_threshold=None, dtype=None): + threshold = feedback_threshold or self.feedback_threshold + idx, val, shp = self.data.to_coo(tensor_mode=False, feedback_threshold=threshold) dtype = dtype or val.dtype matrix = csr_matrix((val, (idx[:, 0], idx[:, 1])), shape=shp, dtype=dtype) @@ -160,6 +178,12 @@ def get_test_matrix(self, test_data=None, shape=None, user_slice=None): coo_data = test_data user_coo, item_coo, fdbk_coo = coo_data + valid_fdbk = fdbk_coo != 0 + if not valid_fdbk.all(): + user_coo = user_coo[valid_fdbk] + item_coo = item_coo[valid_fdbk] + fdbk_coo = fdbk_coo[valid_fdbk] + num_items = shape[1] test_matrix = csr_matrix((fdbk_coo, (user_coo, item_coo)), shape=(num_users, num_items), @@ -180,14 +204,15 @@ def _get_slices_idx(self, shape, result_width=None, scores_multiplier=None, dtyp return slices_idx - def _get_test_data(self): + def _get_test_data(self, feedback_threshold=None): try: tensor_mode = self.factors.get(self.data.fields.feedback, None) is not None except AttributeError: tensor_mode = False - user_idx, item_idx, feedback = self.data.test_to_coo(tensor_mode=tensor_mode) test_shape = self.data.get_test_shape(tensor_mode=tensor_mode) + threshold = feedback_threshold or self.feedback_threshold + user_idx, item_idx, feedback = self.data.test_to_coo(tensor_mode=tensor_mode, feedback_threshold=threshold) idx_diff = np.diff(user_idx) # TODO sorting by self._predition_key From df731f25d0695804e8468a52347b5304041c09eb Mon Sep 17 00:00:00 2001 From: Evgeny Frolov Date: Thu, 7 Jun 2018 20:38:18 +0300 Subject: [PATCH 07/13] allow return some 
common model config values --- polara/recommender/models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/polara/recommender/models.py b/polara/recommender/models.py index 52eecc1..1a0a9d5 100644 --- a/polara/recommender/models.py +++ b/polara/recommender/models.py @@ -148,6 +148,11 @@ def filter_seen(self, new_value): self._refresh_model() + def get_base_configuration(self): + config = {attr: getattr(self, attr) for attr in self._config} + return config + + def build(self): raise NotImplementedError('This must be implemented in subclasses') From 74cda790a11d6a1a8f81adb34eaeb7b909facd97 Mon Sep 17 00:00:00 2001 From: Evgeny Frolov Date: Fri, 3 Aug 2018 13:52:10 +0300 Subject: [PATCH 08/13] fix attribute naming bug in mp model --- polara/recommender/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polara/recommender/models.py b/polara/recommender/models.py index 1a0a9d5..370ec98 100644 --- a/polara/recommender/models.py +++ b/polara/recommender/models.py @@ -607,7 +607,7 @@ def build(self): item_groups = self.data.training.groupby(itemid, sort=True) if self.by_feedback_value: feedback = self.data.fields.feedback - self.items_scores = item_groups[feedback].sum().values + self.item_scores = item_groups[feedback].sum().values else: self.item_scores = item_groups.size().values From b7ad6b0042ced73ca60c1fde6a55be81cb88e6fa Mon Sep 17 00:00:00 2001 From: Evgeny Frolov Date: Sat, 11 Aug 2018 08:00:37 +0300 Subject: [PATCH 09/13] fix _prediction_key attribute naming --- polara/recommender/models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/polara/recommender/models.py b/polara/recommender/models.py index 370ec98..fde8c35 100644 --- a/polara/recommender/models.py +++ b/polara/recommender/models.py @@ -86,8 +86,8 @@ def __init__(self, recommender_data, feedback_threshold=None): self.verify_integrity = get_default('verify_integrity') self.max_test_workers = get_default('max_test_workers') - # TODO sorting in 
data must be by self._predition_key, also need to change get_test_data - self._predition_key = self.data.fields.userid + # TODO sorting in data must be by self._prediction_key, also need to change get_test_data + self._prediction_key = self.data.fields.userid self._prediction_target = self.data.fields.itemid self._is_ready = False @@ -220,7 +220,7 @@ def _get_test_data(self, feedback_threshold=None): user_idx, item_idx, feedback = self.data.test_to_coo(tensor_mode=tensor_mode, feedback_threshold=threshold) idx_diff = np.diff(user_idx) - # TODO sorting by self._predition_key + # TODO sorting by self._prediction_key assert (idx_diff >= 0).all() # calculations assume testset is sorted by users! # TODO only required when testset consists of known users @@ -411,7 +411,7 @@ def evaluate(self, method='hits', topk=None, not_rated_penalty=None, on_feedback feedback = None if ignore_feedback else feedback scoring_data = assemble_scoring_matrices(recommendations, eval_data, - self._predition_key, self._prediction_target, + self._prediction_key, self._prediction_target, is_positive, feedback=feedback) if method == 'relevance': # no need for feedback From 84fcdd5844bb2d138ca61133990a3bc44d2439da Mon Sep 17 00:00:00 2001 From: Evgeny Frolov Date: Fri, 17 Aug 2018 20:09:29 +0300 Subject: [PATCH 10/13] allow to define which index values to use for test data alignment critical for reindexing test data in custom scenarios --- polara/recommender/data.py | 61 +++++++------------------------------- 1 file changed, 11 insertions(+), 50 deletions(-) diff --git a/polara/recommender/data.py b/polara/recommender/data.py index e4a8234..cc00330 100644 --- a/polara/recommender/data.py +++ b/polara/recommender/data.py @@ -495,17 +495,17 @@ def _try_reindex_training_data(self): self._reindex_train_items() self._reindex_feedback() - def _try_drop_unseen_test_items(self): + def _try_drop_unseen_test_items(self, mapping='old'): if self.ensure_consistency: itemid = self.fields.itemid - 
self._filter_unseen_entity(itemid, self._test.testset, 'testset') - self._filter_unseen_entity(itemid, self._test.holdout, 'holdout') + self._filter_unseen_entity(itemid, self._test.testset, 'testset', mapping) + self._filter_unseen_entity(itemid, self._test.holdout, 'holdout', mapping) - def _try_drop_unseen_test_users(self): + def _try_drop_unseen_test_users(self, mapping='old'): if self.ensure_consistency and not self._warm_start: # even in state 3 there could be unseen users userid = self.fields.userid - self._filter_unseen_entity(userid, self._test.holdout, 'holdout') + self._filter_unseen_entity(userid, self._test.holdout, 'holdout', mapping) def _try_drop_invalid_test_users(self): if self.holdout_size >= 1: @@ -613,7 +613,7 @@ def _map_entity(self, entity, dataset): entity_index_map = seen_entities_index.set_index('old').new dataset.loc[:, entity] = dataset.loc[:, entity].map(entity_index_map) - def _filter_unseen_entity(self, entity, dataset, label): + def _filter_unseen_entity(self, entity, dataset, label, mapping): if dataset is None: return @@ -625,9 +625,9 @@ def _filter_unseen_entity(self, entity, dataset, label): raise NotImplementedError try: - seen_entities = index_data.training['old'] + seen_entities = index_data.training[mapping] except AttributeError: - seen_entities = index_data['old'] + seen_entities = index_data[mapping] seen_data = dataset[entity].isin(seen_entities) if not seen_data.all(): @@ -883,54 +883,15 @@ def set_test_data(self, testset=None, holdout=None, warm_start=False, test_users if (testset is None) and (holdout is None): return # allows to cleanup data if ensure_consistency: # allows to disable self.ensure_consistency without actually changing it - self._try_drop_unseen_test_items() # unseen = not present in training data - self._try_drop_unseen_test_users() # unseen = not present in training data + index_mapping = 'old' if reindex else 'new' + self._try_drop_unseen_test_items(mapping=index_mapping) # unseen = not present in 
training data + self._try_drop_unseen_test_users(mapping=index_mapping) # unseen = not present in training data self._try_drop_invalid_test_users() # inconsistent between testset and holdout if reindex: self._try_reindex_test_data() # either assign known index, or reindex (if warm_start) self._try_sort_test_data() - -class BinaryDataMixin(object): - def __init__(self, *args, **kwargs): - raise NotImplementedError - self.binary_threshold = kwargs.pop('binary_threshold', None) - super(BinaryDataMixin, self).__init__(*args, **kwargs) - - def _binarize(self, data, return_filtered_users=False): - feedback = self.fields.feedback - data = data[data[feedback] >= self.binary_threshold].copy() - data[feedback] = np.ones_like(data[feedback]) - return data - - def _split_test_data(self): - super(BinaryDataMixin, self)._split_test_data() - if self.binary_threshold is not None: - self._training = self._binarize(self._training) - - def _split_eval_data(self): - super(BinaryDataMixin, self)._split_eval_data() - if self.binary_threshold is not None: - userid = self.fields.userid - testset = self._binarize(self.test.testset) - test_users = testset[userid].unique() - user_sel = self.test.holdout[userid].isin(test_users) - holdout = self.test.holdout[user_sel].copy() - self._test = namedtuple('TestData', 'testset holdout')._make([testset, holdout]) - if len(test_users) != (testset[userid].max()+1): - # remove gaps in test user indices - self._update_test_user_index() - - def _update_test_user_index(self): - testset, holdout = self._test - userid = self.fields.userid - new_test_idx = self.reindex(testset, userid, sort=False, inplace=True) - holdout.loc[:, userid] = holdout[userid].map(new_test_idx.set_index('old').new) - new_test_idx.old = new_test_idx.old.map(self.index.userid.test.set_index('new').old) - self.index = self.index._replace(userid=self.index.userid._replace(test=new_test_idx)) - - class LongTailMixin(object): def __init__(self, *args, **kwargs): raise NotImplementedError 
From e112f76270609e066fd6668735427f9771ecf878 Mon Sep 17 00:00:00 2001 From: Evgeny Frolov Date: Fri, 17 Aug 2018 20:10:54 +0300 Subject: [PATCH 11/13] improve status message for holdout filtering --- polara/recommender/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polara/recommender/data.py b/polara/recommender/data.py index cc00330..98791f2 100644 --- a/polara/recommender/data.py +++ b/polara/recommender/data.py @@ -549,7 +549,7 @@ def _filter_short_sessions(self, group_id=None): invalid_session_index = invalid_sessions.index[invalid_sessions] holdout.query('{} not in @invalid_session_index'.format(group_id), inplace=True) if self.verbose: - msg = '{} of {} {}\'s were filtered out from holdout. Reason: not enough items.' + msg = '{} of {} {}\'s were filtered out from holdout. Reason: incompatible number of items.' print(msg.format(n_invalid_sessions, len(invalid_sessions), group_id)) def _align_test_users(self): From d760d595cdfe9a0ade9c46d1233a11865c75bfde Mon Sep 17 00:00:00 2001 From: Evgeny Frolov Date: Tue, 21 Aug 2018 14:18:35 +0300 Subject: [PATCH 12/13] new example of Polara usage based on the EigenRec paper --- examples/Reproducing EIGENREC results.ipynb | 1118 +++++++++++++++++++ 1 file changed, 1118 insertions(+) create mode 100644 examples/Reproducing EIGENREC results.ipynb diff --git a/examples/Reproducing EIGENREC results.ipynb b/examples/Reproducing EIGENREC results.ipynb new file mode 100644 index 0000000..e0a96fa --- /dev/null +++ b/examples/Reproducing EIGENREC results.ipynb @@ -0,0 +1,1118 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The main goal of this tutorial is to demonstrate how one can use Polara to conduct custom experiments with very specific requirements. Needless to say, it can be useful for reproducing someone's research as well. 
Below you'll find a particular example based on one of my favorite papers called *\"[EIGENREC: generalizing PureSVD for effective and efficient top-N recommendations](https://arxiv.org/abs/1511.06033)\"*, which helped me to see standard SVD-based models in a different light and even led me to [my own discoveries](https://arxiv.org/abs/1807.10634). Even though it's not necessary for understanding the material below, I strongly recommend reading the original paper as it builds on top of clear ideas and contains a very thorough analysis.\n", + "\n", + "The key take-home message from this paper for me personally is that **SVD can be viewed as a particular case of a more general eigendecomposition problem of the scaled similarity matrix**. Based on that insight, the authors of the *EigenRec* model propose several modifications, which involve tuning the scaling factor as well as the similarity measure in order to improve the model's performance.\n", + "\n", + "
In this tutorial we are not going to reproduce the full work and will focus only on some of its easy-to-implement parts. Basically, we will alter only the scaling factor to see how it affects the quality of recommendations.
\n", + "\n", + "Nevertheless, the tutorial allows to verify validity of the proposed ideas and creates a convenient playground for further exploration." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Getting Movielens-1M data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As in the previous tutorials, let's download the Movielens dataset. The task should be already familiar to you. This is one of the datasets used in the paper as well. One could use some other datasets, it wouldn't change anything in later parts of the tutorial. The main requirement is to have it in the form of a Pandas dataframe, similarly to what is returned by the `get_movielens_data` function." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from polara import RecommenderData\n", + "from polara import get_movielens_data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
useridmovieidrating
0111935
116613
219143
3134084
4123555
\n", + "
" + ], + "text/plain": [ + " userid movieid rating\n", + "0 1 1193 5\n", + "1 1 661 3\n", + "2 1 914 3\n", + "3 1 3408 4\n", + "4 1 2355 5" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = get_movielens_data() # will automatically download it\n", + " # alternatively you can specify a path to the local copy as an argument to the function\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "data_model = RecommenderData(data, 'userid', 'movieid', 'rating', seed=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Custom experimental setup with item sampling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The *EigenRec* paper follows a specific experimentation setup, mainly based on the settings proposed in another favorite paper of mine, [Performance of recommender algorithms on top-n recommendation tasks](https://dl.acm.org/citation.cfm?id=1864708.1864721), devoted to the *PureSVD* model itself. For evaluation purposes, the authors sample 1.4% of all available ratings and additionally shrink the resulting sample by leaving 5-star ratings only. Quote from the paper (Section 4.2.1): \n", + "
\"...we form a probeset $\\mathcal{P}$ by randomly sampling 1.4% of the ratings of the dataset, and we use each item $v_j$,rated with 5-star by user $u_i$ in $\\mathcal{P}$ to create the test set $\\mathcal{T}$...\"
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This setup can be easily implemented in Polara with the help of `test_ratio` and `holdout_size` parameters of the `RecommenderData` instance. **It requires a two-step preparation procedure.**\n", + "\n", + "**The first step** is to sample data without filtering top-rated items. The following configuration does the thing:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preparing data...\n", + "2 unique movieid's within 2 holdout interactions were filtered. Reason: not in the training data.\n", + "Done.\n", + "There are 986206 events in the training and 14001 events in the holdout.\n" + ] + } + ], + "source": [ + "data_model.test_ratio = 0 # do not split dataset into folds, use entire dataset for sampling\n", + "data_model.holdout_size = 0.014 # sample this fraction of ratings from data\n", + "data_model.random_holdout = True # sample ratings randomly (not just 5-star)\n", + "data_model.warm_start = False # allow test users to be part of the training (excluding holdout items)\n", + "\n", + "data_model.prepare() # perform sampling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Mind the `test_ratio` parameter setting. Together with the `test_fold` parameter it controls which fraction of the dataset to sample from; 0 means the whole dataset and turns off the data splitting mechanism used by Polara for cross-validation. The value of `test_fold` has no effect in that case. Also note that by default Polara performs some additional manipulations with data like cleaning and reindexing to transform it into a uniform internal representation for further use. Key actions and their results are reported in an output text, which can be turned off by setting `data_model.verbose = False`. 
Here's how to see the final result of sampling:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
useridmovieidrating
19025904
1409695
1331922
61110303
231210585
\n", + "
" + ], + "text/plain": [ + " userid movieid rating\n", + "19 0 2590 4\n", + "14 0 969 5\n", + "133 1 92 2\n", + "61 1 1030 3\n", + "231 2 1058 5" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_model.test.holdout.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**The second step** is to leave only items with rating 5, as it was done in the original paper. The easiest way in our case would be to simply run:\n", + "```python\n", + "data_model.test.holdout.query('rating==5', inplace=True)\n", + "```\n", + "However, in general, you shouldn't manually change the data after it was processed by Polara, as it may break some internal logic. A more appropriate and safer way to achieve the same is to use the `set_test_data` method, specifically designed to cover custom configurations: " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "data_model.set_test_data(holdout=data_model.test.holdout.query('rating==5'), # select only 5-star ratings\n", + " warm_start=data_model.warm_start, \n", + " reindex=False, # avoid reindexing users and items second time\n", + " ensure_consistency=False # do not try to filter out unseen entities (they are already excluded)\n", + " # leaving it as True wouldn't change the result but would lead to extra checks\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that we reuse the previously sampled holdout dataset (the $\\mathcal{P}$ dataset in the authors' notation), which is already reindexed by Polara's built-in data pre-processing procedure. In order not to lose the index mapping between internal and external representation of movies and users (stored in the `data_model.index` attribute) it's very important to set the `reindex` argument of the `set_test_data` method to `False`. 
Now the `data_model.test.holdout` dataframe stores the final result, namely the $\\mathcal{T}$ dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
useridmovieidrating
1409695
231210585
326423725
488533015
931911985
\n", + "
" + ], + "text/plain": [ + " userid movieid rating\n", + "14 0 969 5\n", + "231 2 1058 5\n", + "326 4 2372 5\n", + "488 5 3301 5\n", + "931 9 1198 5" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_model.test.holdout.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scaled SVD-based model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the simplest case of the EigenRec model, when only the scaling factor is changed, we can go with a very straightforward approach. Instead of computing similarity matrices and solving an eigendecomposition problem, it is sufficient to apply standard SVD to a scaled rating matrix $\\tilde R$: \n", + "\n", + "$$\n", + "\\tilde R = R \\, S^{d-1} \\approx U\\Sigma V^T,\n", + "$$ \n", + "\n", + "where $R$ is an $M \\times N$ rating matrix, $S = \\text{diag}\\{\\|r_1\\|_2, \\dots, \\|r_N\\|_2\\}$ is a diagonal matrix of column norms, $d$ is a scaling parameter and $r_i$ denotes the $i$-th column of $R$. Note that due to the orthogonality of columns in the SVD factors the approximation of $\\tilde R$ can be written in an equivalent and more convenient form $\\tilde RVV^T$, which can be used to generate recommendations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Scaling input data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to calculate the scaled version of the *PureSVD* approach we can reuse the `SVDModel` class implemented in Polara. One of the ways to do that is to redefine the `build` method in an `SVDModel`'s subclass. A simpler solution, however, is to directly modify an output of the `get_training_matrix` method, which is generally available for all models in Polara and is used internally in the `SVDModel` in particular. 
This method returns the rating matrix in a sparse format, which is then fed into `scipy`'s truncated SVD implementation within the `build` method (you can run the `SVDModel.build??` command with a double question mark to see it). Assuming we already have a sparse rating matrix, the following function will help to scale it:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.sparse import diags\n", + "from scipy.sparse.linalg import norm as spnorm\n", + "\n", + "def sparse_normalize(matrix, scaling, axis):\n", + " '''Function to scale either rows or columns of the sparse rating matrix'''\n", + " if scaling == 1: # no scaling (standard SVD case)\n", + " return matrix\n", + " \n", + " norm = spnorm(matrix, axis=axis, ord=2) # compute Euclidean norm of rows or columns\n", + " scaling_matrix = diags(np.power(norm, scaling-1, where=norm!=0))\n", + " \n", + " if axis == 0: # scale columns\n", + " return matrix.dot(scaling_matrix)\n", + " if axis == 1: # scale rows\n", + " return scaling_matrix.dot(matrix)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sampling random items for evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Somewhat more involved modifications are required to generate model predictions, as the evaluation is based on an additional sampling of items not previously seen by the test users. Quote from the paper (Section 4.2.1): \n", + "
\"For each item in $\\mathcal{T}$, we randomly select another 1000 unrated items of the same user...\"
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This means that we need to generate prediction scores for 1000 randomly selected unseen items in addition to every item from the holdout. Moreover, **every set of 1001 items is treated independently of the user it belongs to**. Normally, Polara performs evaluation on a *per user basis*; however, in this case the logic is different and we have to take care of users with multiple items in the holdout. From the line below it can be clearly seen that some test users can have up to 8 items:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_model.test.holdout.userid.value_counts().max()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to \"flatten\" the holdout dataset and to independently generate prediction scores for every holdout item (and 1000 additionally sampled items) we will customize the `get_recommendations` method of the `SVDModel` class. Below is a support function that helps to achieve the necessary result. 
It iterates over all holdout items, randomly samples a predefined amount of previously unrated items and generates prediction scores for them:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def sample_scores_flat(useridx, itemidx, seen_data, all_items, user_factors, item_factors, sample_size=1000, random_state=None):\n", + " '''Function to randomly sample unrated items and generate prediction scores for them.'''\n", + " scores = []\n", + " for user, items in itemidx.groupby(useridx): # iterate over every test user and get all user items\n", + " seen_items = seen_data[1][seen_data[0]==user].tolist() # list of the previously rated items of the user\n", + " seen_items.extend(items.tolist()) # take holdout items into account as well\n", + " item_pool = all_items[~all_items.isin(seen_items)] # exclude seen items from all available items\n", + " for item in items:\n", + " sampled_items = item_pool.sample(n=sample_size, random_state=random_state) \n", + " scores.append(item_factors[sampled_items.values, :].dot(user_factors[user, :]))\n", + " return scores" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "
Prediction scores are generated similarly to the standard *PureSVD* model by an orthogonal projection of a vector $r$ of user ratings onto the latent feature space, defined by the formula $VV^Tr$. Note that unlike the model computation phase, no scaling is used in the prediction.
\n", + "\n", + "The code above complies with this definition by expecting `user_factors` to be the product $V^Tr$ for a set of test users and `item_factors` to be $V$ itself. Below you can find a full implementation of our new model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Defining the model" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from polara import SVDModel\n", + "\n", + "class ScaledSVD(SVDModel):\n", + " '''Class that adds scaling functionality to the PureSVD model'''\n", + " \n", + " def __init__(self, *args, **kwargs):\n", + " super().__init__(*args, **kwargs)\n", + " self.col_scaling = 1 # scaling parameted d, initially corresponds to PureSVD\n", + " self.n_rnd_items = 1000 # number of randomly sampled items\n", + " self.seed = 0 # to control randomization\n", + " self.method = 'ScaledSVD'\n", + " \n", + " def get_training_matrix(self, *args, **kwargs):\n", + " svd_matrix = super().get_training_matrix(*args, **kwargs) # get sparse rating matrix\n", + " return sparse_normalize(svd_matrix, self.col_scaling, 0)\n", + " \n", + " def get_recommendations(self):\n", + " holdout = self.data.test.holdout\n", + " itemid = self.data.fields.itemid # \"movieid\" in the case of Movielense dataset\n", + " userid = self.data.fields.userid # \"userid\" in the case of Movielense dataset\n", + " \n", + " itemidx = holdout[itemid] # holdout items of the test users\n", + " useridx = pd.factorize(holdout[userid])[0] # have to \"rebase\" user index;\n", + " # necessary for indexing rows of the matrix with test user ratings\n", + " \n", + " # prediction scores for holdout items\n", + " test_matrix, seen_data = self.get_test_matrix() \n", + " item_factors = self.factors[itemid] # right singular vectors, matrix V\n", + " user_factors = test_matrix.dot(item_factors) # according to product V^T r for every test user\n", + " 
holdout_scores = (user_factors[useridx, :] * item_factors[itemidx.values, :]).sum(axis=1).squeeze()\n", + " \n", + " # scores for randomly sampled unseen items\n", + " all_items = self.data.index.itemid.new # all unique (reindexed) items\n", + " rs = np.random.RandomState(self.seed) # fixing random state to control random output\n", + " sampled_scores = sample_scores_flat(useridx, itemidx,\n", + " seen_data, all_items,\n", + " user_factors, item_factors,\n", + " self.n_rnd_items, random_state=rs)\n", + " \n", + " # combine all scores and rank selected items \n", + " scores = np.concatenate((holdout_scores[:, None], sampled_scores), axis=1) # stack into array with 1001 columns\n", + " rankings = np.apply_along_axis(np.argsort, 1, -scores)\n", + " return rankings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The model is ready and can be used in a standard way:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ScaledSVD training time: 0.5471807377243749s\n" + ] + } + ], + "source": [ + "svd = ScaledSVD(data_model) # create model\n", + "svd.rank = 50\n", + "svd.col_scaling = 0.5\n", + "svd.build() # fit model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, when we have our model computed, its time to evaluate it. However, **we cannot use the built-in evaluation routine**. Normally, the number of test users is equal to the number of rows in recommendations array and that's the logic Polara relies on. 
In our case the number of test users is lower than the number of rows in recommendations array and actually corresponds to the total number of ratings in the holdout:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# of test users: 2009\n", + "# of rows and columns in recommendations array: (3140, 1001)\n", + "# of ratinhgs in the holdout: 3140\n" + ] + } + ], + "source": [ + "# if you run the cell for the first time you'll notice a short delay before print output due to calculation of recommendations\n", + "print('# of test users:', data_model.test.holdout.userid.nunique())\n", + "print('# of rows and columns in recommendations array:', svd.recommendations.shape)\n", + "print('# of ratinhgs in the holdout:', data_model.test.holdout.shape[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will fix this inconsistency in the next section." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Worth noting here that Polara implements a unified system of callbacks, which reset the `svd.recommendations` property whenever either the `data_model` or the model itself are changed in a way that affects the models' output (try, for example, call `svd.recommendations`, then set the rank of the model to some higher value and call `svd.recommendations` again). This mechanism helps to ensure predictable and consistent state and to prevent accidental reuse of the cached results during experiments. \n", + "It can also be extended with user-defined triggers, which is probably the topic for another tutorial." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Simple approach" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When you try to evaluate your model, it calls for the `model.recommendations` property which is automatically filled with the result of the `get_recommendations` method. The simplest way to evaluate the result in accordance with the new structure of the recommendations array is to define a small function as shown below:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate_mrr(model):\n", + " '''Function to calculate MRR score.'''\n", + " is_holdout = model.recommendations==0 # holdout items are always in the first column before sorting\n", + " pos = np.where(is_holdout)[1] + 1.0 # position of holdout items (indexing starts from 0, so adding 1) \n", + " mrr = np.reciprocal(pos).mean() # mean reciprocal rank\n", + " return mrr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, to compute the MRR score, as it is done in the original paper, simply run:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.3130822345824591" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluate_mrr(svd)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### More functional approach" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "While the previously described approach is fully working and easy, in some cases you may want to use the built-in `model.evaluate` method, as it provides additional functionality. It is also useful to see how Polara can be customized to serve specific needs. 
The key ingredient here is the control of the type of entities that are recommended. By default, Polara expects items to be recommended to users and looks for the corresponding fields in the test data. These fields are defined via the `data_model.fields.userid` and `data_model.fields.itemid` attributes, respectively. The default behavior, however, can be redefined at the model level by setting the `model._prediction_key` (users by default) and `model._prediction_target` (items by default) attributes to custom values. This scheme, for example, can be utilized in cold start experiments, where the task is to find users potentially interested in a \"cold\" item instead of recommending items to users (see `polara.recommender.coldstart` for implementation details). The following lines show how to change the default settings for our needs:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "svd._prediction_key = 'xuser'\n", + "svd._prediction_target = 'xitem'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we need to specify the corresponding fields in the holdout data. Recall that our goal is to treat every item in the holdout independently of the user or, in other words, to assign every item to a unique \"virtual\" user (`'xuser'`). Furthermore, by construction, prediction scores for holdout items are located in the first column of the recommendations array. This means that every holdout item (`'xitem'`) should have index 0. 
Here's the necessary modification:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "data_model.test.holdout['xuser'] = np.arange(data_model.test.holdout.shape[0]) # number of rated items defines the range\n", + "data_model.test.holdout['xitem'] = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check that the result is the same (up to a small rounding error due to different calculation schemes):" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ranking(mrr=0.31308223458245904)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "svd.evaluate('ranking', simple_rates=True) # `simple_rates` is used to enforce calculation of MRR" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
If you do the math, you'll see that the whole experiment took under 100 lines of code to program, and most of it was pretty standard (i.e., declaring variables and methods).
\n", + "\n", + "Fewer lines of code typically mean a lower risk of bugs or inconsistencies. By following a certain protocol, Polara provides a high-level interface that abstracts away many technical aspects, allowing you to focus on the most important parts of research." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reproducing the results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next task is to repeat experiments from the EigenRec paper, where the authors compute\n", + "
\"...MRR scores as a function of the parameter $d$ for every case, using the number of latent factors that produces the best possible performance for each matrix.\"
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Grid search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The beauty of SVD-based models is that it is much easier to perform grid-search for finding optimal values of hyper-parameters. Once you have computed a model for a certain set of hyper parameters with some rank value $k$, you can quickly find all other models of rank \"k' < k\" without recomputing SVD.\n", + "
Going from larger values of rank to smaller ones is performed by a simple truncation of the latent factor matrix.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This not only allows to perform experiments faster, but also simplifies the code for it. Moreover, `SVDModel` already has the necessary rank-check procedures, which allow to avoid rebuilding the model when user sets a smaller value of rank. No special actions are required here. Below is the code that implements the grid search experiment, taking that feature into account (note that on a moderate hardware the code will run for approximately half an hour):" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm_notebook\n", + "%matplotlib inline\n", + "\n", + "svd_mrr_flat = {} # will stor results here\n", + "svd.verbose = False\n", + "\n", + "max_rank = 150\n", + "scaling_params = np.arange(-20, 21, 2) / 10 # values of d from -2 to 2 with step 0.2\n", + "svd_ranks = range(10, max_rank+1, 10) # ranks from 10 to max_ranks with step 10\n", + "\n", + "for scaling in tqdm_notebook(scaling_params):\n", + " svd.col_scaling = scaling\n", + " svd.rank = max_rank\n", + " svd.build()\n", + " \n", + " for rank in list(reversed(svd_ranks)): # iterating over rank values in a descending order\n", + " svd.rank = rank # allows to truncate factor matrices without recomputing SVD\n", + " svd_mrr_flat[(scaling, rank)] = svd.evaluate('ranking', simple_rates=True).mrr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we have the results of the grid search stored in the `svd_mrr_flat` dictionary. There's one catch that wasn't clear for me at first:\n", + "
in order to show the effect of the parameter $d$, the authors have fixed the value of rank corresponding to the best result achieved with EigenRec.
\n", + "\n", + "This means that the curve in Figure 1 of the original paper is obtained with a fixed value of rank, corresponding to the optimal point at the top of the curve, and all other points are obtained by only changing the scaling factor. Here's one way to draw it:" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [], + "source": [ + "result_flat = pd.Series(svd_mrr_flat)\n", + "best_d, best_rank = result_flat.idxmax()\n", + "best_d, best_rank" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "result_flat.xs(best_rank, axis=0, level=1).plot(label='fixed rank', legend=True, title='MRR',\n", + " figsize=(4.3, 2), ylim=(0, None), xlim=(-2, 2), grid=True);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Comparing this picture to the bottom left graph of Figure 1 in the original paper leads to a satisfactory conclusion that the curves on the graphs are very close. Of course, there are slight differences; however, there are many factors that may affect the result, like data sampling and the randomization of unrated items. It would be a good idea to repeat the experiment with different `seed` values and draw a confidence region around the curve. However, there are no dramatic differences in the general behavior of the curves, which is a very nice result that didn't take too much effort. Here are some top-score configurations from the experiment:" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.4 120 0.323893\n", + " 110 0.322767\n", + " 140 0.321977\n", + "0.6 60 0.321719\n", + "0.4 130 0.321421\n", + "dtype: float64" + ] + }, + "execution_count": 170, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_flat.sort_values(ascending=False).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A bit of exploration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The difference between the best result achieved with the EigenRec approach and the standard PureSVD result (that corresponds to the point with scaling parameter equal to 1) is quite large. However, such a comparison is a bit unfair as the restriction on having a fixed value of rank is artificial. 
We can draw another curve that corresponds to optimal values of both the scaling parameter and the rank of the decomposition:" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "result_flat.groupby(level=0).max().plot(label='optimal rank', legend=True)\n", + "result_flat.xs(best_rank, axis=0, level=1).plot(label='fixed rank', legend=True, title='MRR',\n", + " figsize=(4.3, 2), ylim=(0, None), xlim=(-2, 2), grid=True);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now the difference is less pronounced. Anyway, the EigenRec approach still performs better. Moreover, the difference varies significantly from dataset to dataset and in some cases that difference can be much more noticeable. Another degree of freedom here, which may increase the top score, is the maximum value of rank used in the grid search. We have manually set it to be 150. Let's look at which values of rank were used at each point of the curve:" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ax = result_flat.groupby(level=0).idxmax().str[1].plot(label='optimal rank value', ls=\":\", legend=True,\n", + " secondary_y=True, c='g')\n", + "result_flat.groupby(level=0).max().plot(label='optimal rank experiment', legend=True)\n", + "result_flat.xs(best_rank, axis=0, level=1).plot(label='fixed rank experiment', legend=True, title='MRR',\n", + " figsize=(4.3, 2), ylim=(0, None), xlim=(-2, 2), grid=True);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Clearly, the values are capped on the left half of the graph, which leaves the room for further improvement. I have performed experiments with a higher threshold and was able to achieve a higher top score with a bit lower value of the scaling parameter. If you want to see this, simply rerun the grid search experiment with a higher value of the `max_rank` variable. Be prepared that it will take a longer time (hint: reduce the search space for the scaling parameter). Anyway, the key conclusion doesn't change - **even a simple scaling factor can be advantageous and allows to outperform the standard model**. This conclusion is supported by many other experiments in the original paper, which we haven't run here." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bonus: Double-scaled SVD" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " The main change in the model is basically enclosed in the following line\n", + "```python\n", + "scaled_matrix = sparse_normalize(svd_matrix, self.col_scaling, 0)\n", + "```\n", + "which scales columns of the rating matrix. However, there's nothing really special about this particular type of scaling and we could also scale rows instead of or in addition to that. Scaling rows would help to control the contribution of users with either too high or too low number of rated items. 
The entire code for defining the double-scaled model is listed below:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "class DScaledSVD(ScaledSVD):\n", + " def __init__(self, *args, **kwargs):\n", + " super().__init__(*args, **kwargs)\n", + " self.row_scaling = 1 # PureSVD config\n", + " self.method = 'DScaledSVD'\n", + " \n", + " def get_training_matrix(self, *args, **kwargs):\n", + " svd_matrix = super().get_training_matrix(*args, **kwargs)\n", + " return sparse_normalize(svd_matrix, self.row_scaling, 1)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that running the grid search experiment with two scaling parameters, `row_scaling` and `col_scaling`, will take more time. In my experiments with MovieLens data and the value of rank capped at 150, there was a very weak improvement, which wasn't worth the effort. This, however, may change with higher values of rank or with other data. I'll leave verifying this to the reader." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Instead of conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Being a researcher myself, I'm often involved in some sort of \"redoing\" the work that was already done by someone else. Leaving the reproducibility aspect aside, there are many other reasons why it can be useful, e.g., it may help to understand presented ideas better or to see if there are any subtle points in an original work that are not evident at first glance. It helps to create a playground ready for further exploration and may even lead to new ideas. I hope that Polara will help you with this as it helps me in my own research." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + }, + "toc": { + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From f305e37d3bf327ac5064f9b3a9fc36ebd4002d27 Mon Sep 17 00:00:00 2001 From: Evgeny Frolov Date: Tue, 21 Aug 2018 14:31:24 +0300 Subject: [PATCH 13/13] update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index bc2b0f8..27d4438 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ svd = SVDModel(data_model) svd.build() svd.evaluate() ``` +Several different scenarios and use cases, which cover many practical aspects, can also be found in the [examples directory](/examples). ## Creating new recommender models Basic models can be extended by subclassing `RecommenderModel` class and defining two required methods: `self.build()` and `self.get_recommendations()`. Here's an example of a simple item-to-item recommender model: