From 43893171b819f99d9044b05c301ad404ce4fb642 Mon Sep 17 00:00:00 2001 From: Chuyang Ke Date: Thu, 18 Aug 2022 00:14:08 -0700 Subject: [PATCH 1/7] Rewrote ndcg_at_k --- recommenders/evaluation/python_evaluation.py | 59 ++++++++++++++++---- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/recommenders/evaluation/python_evaluation.py b/recommenders/evaluation/python_evaluation.py index 0f51580a3f..305b5cd7e2 100644 --- a/recommenders/evaluation/python_evaluation.py +++ b/recommenders/evaluation/python_evaluation.py @@ -527,6 +527,8 @@ def ndcg_at_k( relevancy_method="top_k", k=DEFAULT_K, threshold=DEFAULT_THRESHOLD, + score_type="binary", + discfun_type="loge", ): """Normalized Discounted Cumulative Gain (nDCG). @@ -543,6 +545,10 @@ def ndcg_at_k( top k items are directly provided, so there is no need to compute the relevancy operation. k (int): number of top k items per user threshold (float): threshold of top items per user (optional) + score_type (str): type of relevance scores ['binary', 'raw', 'exp']. With the default option 'binary', the + relevance score is reduced to either 1 (hit) or 0 (miss). Option 'raw' uses the raw relevance score. + Option 'exp' uses (2 ** RAW_RELEVANCE - 1) as the relevance score + discfun_type (str): type of discount function ['loge', 'log2'] used to calculate DCG. Returns: float: nDCG at k (min=0, max=1). @@ -563,20 +569,51 @@ def ndcg_at_k( if df_hit.shape[0] == 0: return 0.0 - # calculate discounted gain for hit items - df_dcg = df_hit.copy() - # relevance in this case is always 1 - df_dcg["dcg"] = 1 / np.log1p(df_dcg["rank"]) - # sum up discount gained to get discount cumulative gain - df_dcg = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"}) - # calculate ideal discounted cumulative gain - df_ndcg = pd.merge(df_dcg, df_hit_count, on=[col_user]) - df_ndcg["idcg"] = df_ndcg["actual"].apply( - lambda x: sum(1 / np.log1p(range(1, min(x, k) + 1))) + df_dcg = df_hit.merge(rating_pred, on=["user_id", "item_id"]).merge( + rating_true, on=["user_id", "item_id"], how="outer" + ) + + if score_type == "binary": + df_dcg["rel"] = 1 + elif score_type == "raw": + df_dcg["rel"] = df_dcg["rating"] + elif score_type == "exp": + df_dcg["rel"] = 2 ** df_dcg["rating"] - 1 + else: + raise ValueError("score_type must be one of 'binary', 'raw', 'exp'") + + if discfun_type == "loge": + discfun = np.log + elif discfun_type == "log2": + discfun = np.log2 + else: + raise ValueError("discfun_type must be one of 'loge', 'log2'") + + # calculate actual discounted gain for each record + df_dcg["dcg"] = df_dcg["rel"] / discfun(1 + df_dcg["rank"]) + + # calculate ideal discounted gain for each record + df_idcg = df_dcg.sort_values(["user_id", "rating"], ascending=False) + df_idcg["irank"] = df_idcg.groupby("user_id", as_index=False, sort=False)[ + "rating" + ].rank("first", ascending=False) + df_idcg["idcg"] = df_idcg["rel"] / discfun(1 + df_idcg["irank"]) + + # calculate actual DCG for each user + df_user = df_dcg.groupby("user_id", as_index=False, sort=False).agg({"dcg": "sum"}) + + # calculate ideal DCG for each user + df_user = df_user.merge( + df_idcg.groupby("user_id", as_index=False, sort=False) + .head(k) + .groupby("user_id", as_index=False, sort=False) + .agg({"idcg": "sum"}), + on="user_id", ) # DCG over IDCG is the normalized DCG - return (df_ndcg["dcg"] / df_ndcg["idcg"]).sum() / n_users + df_user["ndcg"] = df_user["dcg"] / df_user["idcg"] + return df_user["ndcg"].mean() def map_at_k( From 16485252d028c588f78b80047abe99cbb97c1ab1 Mon Sep 17 00:00:00 2001 From: Chuyang Ke Date: Thu, 18 Aug 2022 00:53:41 -0700 Subject: [PATCH 2/7] fixed column names --- recommenders/evaluation/python_evaluation.py | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/recommenders/evaluation/python_evaluation.py b/recommenders/evaluation/python_evaluation.py index 305b5cd7e2..bc42a98f5b 100644 --- a/recommenders/evaluation/python_evaluation.py +++ b/recommenders/evaluation/python_evaluation.py @@ -569,16 +569,16 @@ def ndcg_at_k( if df_hit.shape[0] == 0: return 0.0 - df_dcg = df_hit.merge(rating_pred, on=["user_id", "item_id"]).merge( - rating_true, on=["user_id", "item_id"], how="outer" + df_dcg = df_hit.merge(rating_pred, on=[col_user, col_item]).merge( + rating_true, on=[col_user, col_item], how="outer" ) if score_type == "binary": df_dcg["rel"] = 1 elif score_type == "raw": - df_dcg["rel"] = df_dcg["rating"] + df_dcg["rel"] = df_dcg[col_rating] elif score_type == "exp": - df_dcg["rel"] = 2 ** df_dcg["rating"] - 1 + df_dcg["rel"] = 2 ** df_dcg[col_rating] - 1 else: raise ValueError("score_type must be one of 'binary', 'raw', 'exp'") @@ -593,22 +593,22 @@ def ndcg_at_k( df_dcg["dcg"] = df_dcg["rel"] / discfun(1 + df_dcg["rank"]) # calculate ideal discounted gain for each record - df_idcg = df_dcg.sort_values(["user_id", "rating"], ascending=False) - df_idcg["irank"] = df_idcg.groupby("user_id", as_index=False, sort=False)[ - "rating" + df_idcg = df_dcg.sort_values([col_user, col_rating], ascending=False) + df_idcg["irank"] = df_idcg.groupby(col_user, as_index=False, sort=False)[ + col_rating ].rank("first", ascending=False) df_idcg["idcg"] = df_idcg["rel"] / discfun(1 + df_idcg["irank"]) # calculate actual DCG for each user - df_user = df_dcg.groupby("user_id", as_index=False, sort=False).agg({"dcg": "sum"}) + df_user = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"}) # calculate ideal DCG for each user df_user = df_user.merge( - df_idcg.groupby("user_id", as_index=False, sort=False) + df_idcg.groupby(col_user, as_index=False, sort=False) .head(k) - .groupby("user_id", as_index=False, sort=False) + .groupby(col_user, as_index=False, sort=False) .agg({"idcg": "sum"}), - on="user_id", + on=col_user, ) # DCG over IDCG is the normalized DCG From 6a44409693e84064c23f0a2e570096b9e159a05b Mon Sep 17 00:00:00 2001 From: Chuyang Ke Date: Thu, 18 Aug 2022 15:45:07 -0700 Subject: [PATCH 3/7] fixed column names to avoid duplicates --- recommenders/evaluation/python_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recommenders/evaluation/python_evaluation.py b/recommenders/evaluation/python_evaluation.py index bc42a98f5b..b0d8982708 100644 --- a/recommenders/evaluation/python_evaluation.py +++ b/recommenders/evaluation/python_evaluation.py @@ -570,7 +570,7 @@ def ndcg_at_k( return 0.0 df_dcg = df_hit.merge(rating_pred, on=[col_user, col_item]).merge( - rating_true, on=[col_user, col_item], how="outer" + rating_true, on=[col_user, col_item], how="outer", suffixes=("_left", None) ) if score_type == "binary": From 96261f2c2856ac436345ecbfffbf67bf186bf7c9 Mon Sep 17 00:00:00 2001 From: Chuyang Ke Date: Thu, 18 Aug 2022 15:58:51 -0700 Subject: [PATCH 4/7] added wiki test case for ndcg using raw score --- .../evaluation/test_python_evaluation.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/unit/recommenders/evaluation/test_python_evaluation.py b/tests/unit/recommenders/evaluation/test_python_evaluation.py index a1a29902ca..154295b2c5 100644 --- a/tests/unit/recommenders/evaluation/test_python_evaluation.py +++ b/tests/unit/recommenders/evaluation/test_python_evaluation.py @@ -264,6 +264,24 @@ def test_python_ndcg_at_k(rating_true, rating_pred, rating_nohit): assert ndcg_at_k(rating_true, rating_nohit, k=10) == 0.0 assert ndcg_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.38172, TOL) + # Test raw relevance score and log2 discounting factor using wiki example + # See https://en.wikipedia.org/wiki/Discounted_cumulative_gain + df_true = pd.DataFrame( + { + DEFAULT_USER_COL: np.full(8, 0, dtype=int), + DEFAULT_ITEM_COL: np.arange(8), + DEFAULT_RATING_COL: np.asarray([3, 2, 3, 0, 1, 2, 3, 2]), + } + ) + df_pred = pd.DataFrame( + { + DEFAULT_USER_COL: np.full(6, 0, dtype=int), + DEFAULT_ITEM_COL: np.arange(6), + DEFAULT_PREDICTION_COL: np.asarray([6, 5, 4, 3, 2, 1]), + } + ) + assert ndcg_at_k(df_true, df_pred, k=6, score_type="raw", discfun_type="log2") == pytest.approx(0.785, TOL) + def test_python_map_at_k(rating_true, rating_pred, rating_nohit): assert ( From 621fb9c39cc2e7b0a33a4897147e7197eaf7a8f9 Mon Sep 17 00:00:00 2001 From: Chuyang Ke Date: Thu, 18 Aug 2022 15:59:38 -0700 Subject: [PATCH 5/7] black --- .../evaluation/test_python_evaluation.py | 154 ++++++++---------- 1 file changed, 66 insertions(+), 88 deletions(-) diff --git a/tests/unit/recommenders/evaluation/test_python_evaluation.py b/tests/unit/recommenders/evaluation/test_python_evaluation.py index 154295b2c5..c4e091d556 100644 --- a/tests/unit/recommenders/evaluation/test_python_evaluation.py +++ b/tests/unit/recommenders/evaluation/test_python_evaluation.py @@ -178,26 +178,20 @@ def test_python_mae(rating_true, rating_pred): def test_python_rsquared(rating_true, rating_pred): - assert ( - rsquared( - rating_true=rating_true, - rating_pred=rating_true, - col_prediction=DEFAULT_RATING_COL, - ) - == pytest.approx(1.0, TOL) - ) + assert rsquared( + rating_true=rating_true, + rating_pred=rating_true, + col_prediction=DEFAULT_RATING_COL, + ) == pytest.approx(1.0, TOL) assert rsquared(rating_true, rating_pred) == pytest.approx(-31.699029, TOL) def test_python_exp_var(rating_true, rating_pred): - assert ( - exp_var( - rating_true=rating_true, - rating_pred=rating_true, - col_prediction=DEFAULT_RATING_COL, - ) - == pytest.approx(1.0, TOL) - ) + assert exp_var( + rating_true=rating_true, + rating_pred=rating_true, + col_prediction=DEFAULT_RATING_COL, + ) == pytest.approx(1.0, TOL) assert exp_var(rating_true, rating_pred) == pytest.approx(-6.4466, TOL) @@ -211,16 +205,16 @@ def test_get_top_k_items(rating_true): top_3_user_true = pd.Series([1, 1, 1, 2, 2, 2, 3, 3, 3]) top_3_rating_true = pd.Series([5, 4, 3, 5, 5, 3, 5, 5, 5]) top_3_rank_true = pd.Series([1, 2, 3, 1, 2, 3, 1, 2, 3]) - assert(top_3_items_df[DEFAULT_USER_COL].equals(top_3_user_true)) - assert(top_3_items_df[DEFAULT_RATING_COL].equals(top_3_rating_true)) - assert(top_3_items_df['rank'].equals(top_3_rank_true)) - assert(top_3_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3]))) + assert top_3_items_df[DEFAULT_USER_COL].equals(top_3_user_true) + assert top_3_items_df[DEFAULT_RATING_COL].equals(top_3_rating_true) + assert top_3_items_df["rank"].equals(top_3_rank_true) + assert top_3_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3])) # First two itemIDs of user 2. The scores are both 5, so any order is OK. - assert(set(top_3_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4])) + assert set(top_3_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4]) # Third itemID of user 2. Both item 5 and 6 have a score of 3, so either one is OK. - assert(top_3_items_df[DEFAULT_ITEM_COL][5] in [5, 6]) + assert top_3_items_df[DEFAULT_ITEM_COL][5] in [5, 6] # All itemIDs of user 3. All three items have a score of 5, so any order is OK. - assert(set(top_3_items_df[DEFAULT_ITEM_COL][6:]) == set([2, 5, 6])) + assert set(top_3_items_df[DEFAULT_ITEM_COL][6:]) == set([2, 5, 6]) # Test get_top_k_items() when k is larger than the number of available items @@ -234,33 +228,30 @@ def test_get_top_k_items_largek(rating_true): top_6_user_true = pd.Series([1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3]) top_6_rating_true = pd.Series([5, 4, 3, 5, 5, 3, 3, 1, 5, 5, 5, 4, 4, 3]) top_6_rank_true = pd.Series([1, 2, 3, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6]) - assert(top_6_items_df[DEFAULT_USER_COL].equals(top_6_user_true)) - assert(top_6_items_df[DEFAULT_RATING_COL].equals(top_6_rating_true)) - assert(top_6_items_df['rank'].equals(top_6_rank_true)) - assert(top_6_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3]))) + assert top_6_items_df[DEFAULT_USER_COL].equals(top_6_user_true) + assert top_6_items_df[DEFAULT_RATING_COL].equals(top_6_rating_true) + assert top_6_items_df["rank"].equals(top_6_rank_true) + assert top_6_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3])) # First two itemIDs of user 2. The scores are both 5, so any order is OK. - assert(set(top_6_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4])) + assert set(top_6_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4]) # Third and fourth itemID of user 2. The scores are both 3, so any order is OK. - assert(set(top_6_items_df[DEFAULT_ITEM_COL][5:7]) == set([5, 6])) - assert(top_6_items_df[DEFAULT_ITEM_COL][7] == 7) + assert set(top_6_items_df[DEFAULT_ITEM_COL][5:7]) == set([5, 6]) + assert top_6_items_df[DEFAULT_ITEM_COL][7] == 7 # First three itemIDs of user 3. The scores are both 5, so any order is OK. - assert(set(top_6_items_df[DEFAULT_ITEM_COL][8:11]) == set([2, 5, 6])) + assert set(top_6_items_df[DEFAULT_ITEM_COL][8:11]) == set([2, 5, 6]) # Fourth and fifth itemID of user 3. The scores are both 4, so any order is OK. - assert(set(top_6_items_df[DEFAULT_ITEM_COL][11:13]) == set([8, 9])) + assert set(top_6_items_df[DEFAULT_ITEM_COL][11:13]) == set([8, 9]) # Sixth itemID of user 3. Item 10,11,12 have a score of 3, so either one is OK. - assert(top_6_items_df[DEFAULT_ITEM_COL][13] in [10, 11, 12]) + assert top_6_items_df[DEFAULT_ITEM_COL][13] in [10, 11, 12] def test_python_ndcg_at_k(rating_true, rating_pred, rating_nohit): - assert ( - ndcg_at_k( - rating_true=rating_true, - rating_pred=rating_true, - col_prediction=DEFAULT_RATING_COL, - k=10, - ) - == pytest.approx(1.0, TOL) - ) + assert ndcg_at_k( + rating_true=rating_true, + rating_pred=rating_true, + col_prediction=DEFAULT_RATING_COL, + k=10, + ) == pytest.approx(1.0, TOL) assert ndcg_at_k(rating_true, rating_nohit, k=10) == 0.0 assert ndcg_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.38172, TOL) @@ -280,7 +271,9 @@ def test_python_ndcg_at_k(rating_true, rating_pred, rating_nohit): DEFAULT_PREDICTION_COL: np.asarray([6, 5, 4, 3, 2, 1]), } ) - assert ndcg_at_k(df_true, df_pred, k=6, score_type="raw", discfun_type="log2") == pytest.approx(0.785, TOL) + assert ndcg_at_k( + df_true, df_pred, k=6, score_type="raw", discfun_type="log2" + ) == pytest.approx(0.785, TOL) def test_python_map_at_k(rating_true, rating_pred, rating_nohit): @@ -360,59 +353,44 @@ def test_python_precision(rating_true, rating_pred, rating_nohit): def test_python_recall(rating_true, rating_pred, rating_nohit): - assert ( - recall_at_k( - rating_true=rating_true, - rating_pred=rating_true, - col_prediction=DEFAULT_RATING_COL, - k=10, - ) - == pytest.approx(1, TOL) - ) + assert recall_at_k( + rating_true=rating_true, + rating_pred=rating_true, + col_prediction=DEFAULT_RATING_COL, + k=10, + ) == pytest.approx(1, TOL) assert recall_at_k(rating_true, rating_nohit, k=10) == 0.0 assert recall_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.37777, TOL) def test_python_auc(rating_true_binary, rating_pred_binary): - assert ( - auc( - rating_true=rating_true_binary, - rating_pred=rating_true_binary, - col_prediction=DEFAULT_RATING_COL, - ) - == pytest.approx(1.0, TOL) - ) - - assert ( - auc( - rating_true=rating_true_binary, - rating_pred=rating_pred_binary, - col_rating=DEFAULT_RATING_COL, - col_prediction=DEFAULT_PREDICTION_COL, - ) - == pytest.approx(0.75, TOL) - ) + assert auc( + rating_true=rating_true_binary, + rating_pred=rating_true_binary, + col_prediction=DEFAULT_RATING_COL, + ) == pytest.approx(1.0, TOL) + + assert auc( + rating_true=rating_true_binary, + rating_pred=rating_pred_binary, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, + ) == pytest.approx(0.75, TOL) def test_python_logloss(rating_true_binary, rating_pred_binary): - assert ( - logloss( - rating_true=rating_true_binary, - rating_pred=rating_true_binary, - col_prediction=DEFAULT_RATING_COL, - ) - == pytest.approx(0, TOL) - ) - - assert ( - logloss( - rating_true=rating_true_binary, - rating_pred=rating_pred_binary, - col_rating=DEFAULT_RATING_COL, - col_prediction=DEFAULT_PREDICTION_COL, - ) - == pytest.approx(0.7835, TOL) - ) + assert logloss( + rating_true=rating_true_binary, + rating_pred=rating_true_binary, + col_prediction=DEFAULT_RATING_COL, + ) == pytest.approx(0, TOL) + + assert logloss( + rating_true=rating_true_binary, + rating_pred=rating_pred_binary, + col_rating=DEFAULT_RATING_COL, + col_prediction=DEFAULT_PREDICTION_COL, + ) == pytest.approx(0.7835, TOL) def test_python_errors(rating_true, rating_pred): From 8c43c1e014f0c408c9e8c7e2f7ed61c8d9a1c388 Mon Sep 17 00:00:00 2001 From: Chuyang Ke Date: Thu, 18 Aug 2022 16:30:23 -0700 Subject: [PATCH 6/7] update ndcg time in time performance tests --- .../evaluation/test_python_evaluation_time_performance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/recommenders/evaluation/test_python_evaluation_time_performance.py b/tests/unit/recommenders/evaluation/test_python_evaluation_time_performance.py index 63893d864e..cbc5ce29c9 100644 --- a/tests/unit/recommenders/evaluation/test_python_evaluation_time_performance.py +++ b/tests/unit/recommenders/evaluation/test_python_evaluation_time_performance.py @@ -185,7 +185,7 @@ def test_python_ndcg_at_k(rating_true, rating_pred): col_prediction=DEFAULT_PREDICTION_COL, k=10, ) - assert t.interval < 21.55627936 * (1 + TOL) + assert t.interval < 39.03877957 * (1 + TOL) def test_python_map_at_k(rating_true, rating_pred): From 3107d3b77e7ba15a368ce7b19351cb1a895da266 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 20 Sep 2022 11:51:11 +0200 Subject: [PATCH 7/7] :memo: --- recommenders/evaluation/python_evaluation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/recommenders/evaluation/python_evaluation.py b/recommenders/evaluation/python_evaluation.py index b0d8982708..a762fa10bd 100644 --- a/recommenders/evaluation/python_evaluation.py +++ b/recommenders/evaluation/python_evaluation.py @@ -554,7 +554,7 @@ def ndcg_at_k( float: nDCG at k (min=0, max=1). """ - df_hit, df_hit_count, n_users = merge_ranking_true_pred( + df_hit, _, _ = merge_ranking_true_pred( rating_true=rating_true, rating_pred=rating_pred, col_user=col_user, @@ -589,20 +589,20 @@ def ndcg_at_k( else: raise ValueError("discfun_type must be one of 'loge', 'log2'") - # calculate actual discounted gain for each record + # Calculate the actual discounted gain for each record df_dcg["dcg"] = df_dcg["rel"] / discfun(1 + df_dcg["rank"]) - # calculate ideal discounted gain for each record + # Calculate the ideal discounted gain for each record df_idcg = df_dcg.sort_values([col_user, col_rating], ascending=False) df_idcg["irank"] = df_idcg.groupby(col_user, as_index=False, sort=False)[ col_rating ].rank("first", ascending=False) df_idcg["idcg"] = df_idcg["rel"] / discfun(1 + df_idcg["irank"]) - # calculate actual DCG for each user + # Calculate the actual DCG for each user df_user = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"}) - # calculate ideal DCG for each user + # Calculate the ideal DCG for each user df_user = df_user.merge( df_idcg.groupby(col_user, as_index=False, sort=False) .head(k)