diff --git a/recommenders/evaluation/python_evaluation.py b/recommenders/evaluation/python_evaluation.py
index 0f51580a3f..a762fa10bd 100644
--- a/recommenders/evaluation/python_evaluation.py
+++ b/recommenders/evaluation/python_evaluation.py
@@ -527,6 +527,8 @@ def ndcg_at_k(
     relevancy_method="top_k",
     k=DEFAULT_K,
     threshold=DEFAULT_THRESHOLD,
+    score_type="binary",
+    discfun_type="loge",
 ):
     """Normalized Discounted Cumulative Gain (nDCG).
 
@@ -543,12 +545,16 @@ def ndcg_at_k(
             top k items are directly provided, so there is no need to compute the relevancy operation.
         k (int): number of top k items per user
         threshold (float): threshold of top items per user (optional)
+        score_type (str): type of relevance scores ['binary', 'raw', 'exp']. With the default option 'binary', the
+            relevance score is reduced to either 1 (hit) or 0 (miss). Option 'raw' uses the raw relevance score.
+            Option 'exp' uses (2 ** RAW_RELEVANCE - 1) as the relevance score
+        discfun_type (str): type of discount function ['loge', 'log2'] used to calculate DCG.
 
     Returns:
         float: nDCG at k (min=0, max=1).
     """
 
-    df_hit, df_hit_count, n_users = merge_ranking_true_pred(
+    df_hit, _, _ = merge_ranking_true_pred(
         rating_true=rating_true,
         rating_pred=rating_pred,
         col_user=col_user,
@@ -563,20 +569,51 @@ def ndcg_at_k(
     if df_hit.shape[0] == 0:
         return 0.0
 
-    # calculate discounted gain for hit items
-    df_dcg = df_hit.copy()
-    # relevance in this case is always 1
-    df_dcg["dcg"] = 1 / np.log1p(df_dcg["rank"])
-    # sum up discount gained to get discount cumulative gain
-    df_dcg = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"})
-    # calculate ideal discounted cumulative gain
-    df_ndcg = pd.merge(df_dcg, df_hit_count, on=[col_user])
-    df_ndcg["idcg"] = df_ndcg["actual"].apply(
-        lambda x: sum(1 / np.log1p(range(1, min(x, k) + 1)))
+    df_dcg = df_hit.merge(rating_pred, on=[col_user, col_item]).merge(
+        rating_true, on=[col_user, col_item], how="outer", suffixes=("_left", None)
+    )
+
+    if score_type == "binary":
+        df_dcg["rel"] = 1
+    elif score_type == "raw":
+        df_dcg["rel"] = df_dcg[col_rating]
+    elif score_type == "exp":
+        df_dcg["rel"] = 2 ** df_dcg[col_rating] - 1
+    else:
+        raise ValueError("score_type must be one of 'binary', 'raw', 'exp'")
+
+    if discfun_type == "loge":
+        discfun = np.log
+    elif discfun_type == "log2":
+        discfun = np.log2
+    else:
+        raise ValueError("discfun_type must be one of 'loge', 'log2'")
+
+    # Calculate the actual discounted gain for each record
+    df_dcg["dcg"] = df_dcg["rel"] / discfun(1 + df_dcg["rank"])
+
+    # Calculate the ideal discounted gain for each record
+    df_idcg = df_dcg.sort_values([col_user, col_rating], ascending=False)
+    df_idcg["irank"] = df_idcg.groupby(col_user, as_index=False, sort=False)[
+        col_rating
+    ].rank("first", ascending=False)
+    df_idcg["idcg"] = df_idcg["rel"] / discfun(1 + df_idcg["irank"])
+
+    # Calculate the actual DCG for each user
+    df_user = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"})
+
+    # Calculate the ideal DCG for each user
+    df_user = df_user.merge(
+        df_idcg.groupby(col_user, as_index=False, sort=False)
+        .head(k)
+        .groupby(col_user, as_index=False, sort=False)
+        .agg({"idcg": "sum"}),
+        on=col_user,
     )
 
     # DCG over IDCG is the normalized DCG
-    return (df_ndcg["dcg"] / df_ndcg["idcg"]).sum() / n_users
+    df_user["ndcg"] = df_user["dcg"] / df_user["idcg"]
+    return df_user["ndcg"].mean()
 
 
 def map_at_k(
diff --git a/tests/unit/recommenders/evaluation/test_python_evaluation.py b/tests/unit/recommenders/evaluation/test_python_evaluation.py
index a1a29902ca..c4e091d556 100644
--- a/tests/unit/recommenders/evaluation/test_python_evaluation.py
+++ b/tests/unit/recommenders/evaluation/test_python_evaluation.py
@@ -178,26 +178,20 @@ def test_python_mae(rating_true, rating_pred):
 
 
 def test_python_rsquared(rating_true, rating_pred):
-    assert (
-        rsquared(
-            rating_true=rating_true,
-            rating_pred=rating_true,
-            col_prediction=DEFAULT_RATING_COL,
-        )
-        == pytest.approx(1.0, TOL)
-    )
+    assert rsquared(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+    ) == pytest.approx(1.0, TOL)
     assert rsquared(rating_true, rating_pred) == pytest.approx(-31.699029, TOL)
 
 
 def test_python_exp_var(rating_true, rating_pred):
-    assert (
-        exp_var(
-            rating_true=rating_true,
-            rating_pred=rating_true,
-            col_prediction=DEFAULT_RATING_COL,
-        )
-        == pytest.approx(1.0, TOL)
-    )
+    assert exp_var(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+    ) == pytest.approx(1.0, TOL)
    assert exp_var(rating_true, rating_pred) == pytest.approx(-6.4466, TOL)
 
 
@@ -211,16 +205,16 @@ def test_get_top_k_items(rating_true):
     top_3_user_true = pd.Series([1, 1, 1, 2, 2, 2, 3, 3, 3])
     top_3_rating_true = pd.Series([5, 4, 3, 5, 5, 3, 5, 5, 5])
     top_3_rank_true = pd.Series([1, 2, 3, 1, 2, 3, 1, 2, 3])
-    assert(top_3_items_df[DEFAULT_USER_COL].equals(top_3_user_true))
-    assert(top_3_items_df[DEFAULT_RATING_COL].equals(top_3_rating_true))
-    assert(top_3_items_df['rank'].equals(top_3_rank_true))
-    assert(top_3_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3])))
+    assert top_3_items_df[DEFAULT_USER_COL].equals(top_3_user_true)
+    assert top_3_items_df[DEFAULT_RATING_COL].equals(top_3_rating_true)
+    assert top_3_items_df["rank"].equals(top_3_rank_true)
+    assert top_3_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3]))
     # First two itemIDs of user 2. The scores are both 5, so any order is OK.
-    assert(set(top_3_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4]))
+    assert set(top_3_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4])
     # Third itemID of user 2. Both item 5 and 6 have a score of 3, so either one is OK.
-    assert(top_3_items_df[DEFAULT_ITEM_COL][5] in [5, 6])
+    assert top_3_items_df[DEFAULT_ITEM_COL][5] in [5, 6]
     # All itemIDs of user 3. All three items have a score of 5, so any order is OK.
-    assert(set(top_3_items_df[DEFAULT_ITEM_COL][6:]) == set([2, 5, 6]))
+    assert set(top_3_items_df[DEFAULT_ITEM_COL][6:]) == set([2, 5, 6])
 
 
 # Test get_top_k_items() when k is larger than the number of available items
@@ -234,36 +228,53 @@ def test_get_top_k_items_largek(rating_true):
     top_6_user_true = pd.Series([1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3])
     top_6_rating_true = pd.Series([5, 4, 3, 5, 5, 3, 3, 1, 5, 5, 5, 4, 4, 3])
     top_6_rank_true = pd.Series([1, 2, 3, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6])
-    assert(top_6_items_df[DEFAULT_USER_COL].equals(top_6_user_true))
-    assert(top_6_items_df[DEFAULT_RATING_COL].equals(top_6_rating_true))
-    assert(top_6_items_df['rank'].equals(top_6_rank_true))
-    assert(top_6_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3])))
+    assert top_6_items_df[DEFAULT_USER_COL].equals(top_6_user_true)
+    assert top_6_items_df[DEFAULT_RATING_COL].equals(top_6_rating_true)
+    assert top_6_items_df["rank"].equals(top_6_rank_true)
+    assert top_6_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3]))
     # First two itemIDs of user 2. The scores are both 5, so any order is OK.
-    assert(set(top_6_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4]))
+    assert set(top_6_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4])
     # Third and fourth itemID of user 2. The scores are both 3, so any order is OK.
-    assert(set(top_6_items_df[DEFAULT_ITEM_COL][5:7]) == set([5, 6]))
-    assert(top_6_items_df[DEFAULT_ITEM_COL][7] == 7)
+    assert set(top_6_items_df[DEFAULT_ITEM_COL][5:7]) == set([5, 6])
+    assert top_6_items_df[DEFAULT_ITEM_COL][7] == 7
     # First three itemIDs of user 3. The scores are both 5, so any order is OK.
-    assert(set(top_6_items_df[DEFAULT_ITEM_COL][8:11]) == set([2, 5, 6]))
+    assert set(top_6_items_df[DEFAULT_ITEM_COL][8:11]) == set([2, 5, 6])
     # Fourth and fifth itemID of user 3. The scores are both 4, so any order is OK.
-    assert(set(top_6_items_df[DEFAULT_ITEM_COL][11:13]) == set([8, 9]))
+    assert set(top_6_items_df[DEFAULT_ITEM_COL][11:13]) == set([8, 9])
     # Sixth itemID of user 3. Item 10,11,12 have a score of 3, so either one is OK.
-    assert(top_6_items_df[DEFAULT_ITEM_COL][13] in [10, 11, 12])
+    assert top_6_items_df[DEFAULT_ITEM_COL][13] in [10, 11, 12]
 
 
 def test_python_ndcg_at_k(rating_true, rating_pred, rating_nohit):
-    assert (
-        ndcg_at_k(
-            rating_true=rating_true,
-            rating_pred=rating_true,
-            col_prediction=DEFAULT_RATING_COL,
-            k=10,
-        )
-        == pytest.approx(1.0, TOL)
-    )
+    assert ndcg_at_k(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+        k=10,
+    ) == pytest.approx(1.0, TOL)
     assert ndcg_at_k(rating_true, rating_nohit, k=10) == 0.0
     assert ndcg_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.38172, TOL)
 
+    # Test raw relevance score and log2 discounting factor using wiki example
+    # See https://en.wikipedia.org/wiki/Discounted_cumulative_gain
+    df_true = pd.DataFrame(
+        {
+            DEFAULT_USER_COL: np.full(8, 0, dtype=int),
+            DEFAULT_ITEM_COL: np.arange(8),
+            DEFAULT_RATING_COL: np.asarray([3, 2, 3, 0, 1, 2, 3, 2]),
+        }
+    )
+    df_pred = pd.DataFrame(
+        {
+            DEFAULT_USER_COL: np.full(6, 0, dtype=int),
+            DEFAULT_ITEM_COL: np.arange(6),
+            DEFAULT_PREDICTION_COL: np.asarray([6, 5, 4, 3, 2, 1]),
+        }
+    )
+    assert ndcg_at_k(
+        df_true, df_pred, k=6, score_type="raw", discfun_type="log2"
+    ) == pytest.approx(0.785, TOL)
+
 
 def test_python_map_at_k(rating_true, rating_pred, rating_nohit):
     assert (
@@ -342,59 +353,44 @@ def test_python_precision(rating_true, rating_pred, rating_nohit):
 
 
 def test_python_recall(rating_true, rating_pred, rating_nohit):
-    assert (
-        recall_at_k(
-            rating_true=rating_true,
-            rating_pred=rating_true,
-            col_prediction=DEFAULT_RATING_COL,
-            k=10,
-        )
-        == pytest.approx(1, TOL)
-    )
+    assert recall_at_k(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+        k=10,
+    ) == pytest.approx(1, TOL)
     assert recall_at_k(rating_true, rating_nohit, k=10) == 0.0
     assert recall_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.37777, TOL)
 
 
 def test_python_auc(rating_true_binary, rating_pred_binary):
-    assert (
-        auc(
-            rating_true=rating_true_binary,
-            rating_pred=rating_true_binary,
-            col_prediction=DEFAULT_RATING_COL,
-        )
-        == pytest.approx(1.0, TOL)
-    )
-
-    assert (
-        auc(
-            rating_true=rating_true_binary,
-            rating_pred=rating_pred_binary,
-            col_rating=DEFAULT_RATING_COL,
-            col_prediction=DEFAULT_PREDICTION_COL,
-        )
-        == pytest.approx(0.75, TOL)
-    )
+    assert auc(
+        rating_true=rating_true_binary,
+        rating_pred=rating_true_binary,
+        col_prediction=DEFAULT_RATING_COL,
+    ) == pytest.approx(1.0, TOL)
+
+    assert auc(
+        rating_true=rating_true_binary,
+        rating_pred=rating_pred_binary,
+        col_rating=DEFAULT_RATING_COL,
+        col_prediction=DEFAULT_PREDICTION_COL,
+    ) == pytest.approx(0.75, TOL)
 
 
 def test_python_logloss(rating_true_binary, rating_pred_binary):
-    assert (
-        logloss(
-            rating_true=rating_true_binary,
-            rating_pred=rating_true_binary,
-            col_prediction=DEFAULT_RATING_COL,
-        )
-        == pytest.approx(0, TOL)
-    )
-
-    assert (
-        logloss(
-            rating_true=rating_true_binary,
-            rating_pred=rating_pred_binary,
-            col_rating=DEFAULT_RATING_COL,
-            col_prediction=DEFAULT_PREDICTION_COL,
-        )
-        == pytest.approx(0.7835, TOL)
-    )
+    assert logloss(
+        rating_true=rating_true_binary,
+        rating_pred=rating_true_binary,
+        col_prediction=DEFAULT_RATING_COL,
+    ) == pytest.approx(0, TOL)
+
+    assert logloss(
+        rating_true=rating_true_binary,
+        rating_pred=rating_pred_binary,
+        col_rating=DEFAULT_RATING_COL,
+        col_prediction=DEFAULT_PREDICTION_COL,
+    ) == pytest.approx(0.7835, TOL)
 
 
 def test_python_errors(rating_true, rating_pred):
diff --git a/tests/unit/recommenders/evaluation/test_python_evaluation_time_performance.py b/tests/unit/recommenders/evaluation/test_python_evaluation_time_performance.py
index 63893d864e..cbc5ce29c9 100644
--- a/tests/unit/recommenders/evaluation/test_python_evaluation_time_performance.py
+++ b/tests/unit/recommenders/evaluation/test_python_evaluation_time_performance.py
@@ -185,7 +185,7 @@ def test_python_ndcg_at_k(rating_true, rating_pred):
             col_prediction=DEFAULT_PREDICTION_COL,
             k=10,
         )
-    assert t.interval < 21.55627936 * (1 + TOL)
+    assert t.interval < 39.03877957 * (1 + TOL)
 
 
 def test_python_map_at_k(rating_true, rating_pred):
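
Usage sketch (not part of the patch): a minimal, illustrative call of ndcg_at_k with the new score_type and discfun_type arguments, mirroring the Wikipedia DCG example exercised by the new unit test above. It assumes this patch is applied and that the DataFrames use the library's default column names ("userID", "itemID", "rating", "prediction"); the input values and the roughly 0.785 result come from that test, everything else here is illustrative.

    import numpy as np
    import pandas as pd

    from recommenders.evaluation.python_evaluation import ndcg_at_k

    # One user; eight judged items with graded relevance, of which the model ranks six.
    rating_true = pd.DataFrame(
        {
            "userID": np.zeros(8, dtype=int),
            "itemID": np.arange(8),
            "rating": [3, 2, 3, 0, 1, 2, 3, 2],
        }
    )
    rating_pred = pd.DataFrame(
        {
            "userID": np.zeros(6, dtype=int),
            "itemID": np.arange(6),
            "prediction": [6, 5, 4, 3, 2, 1],
        }
    )

    # Default behaviour is unchanged: binary gain with a natural-log discount.
    print(ndcg_at_k(rating_true, rating_pred, k=6))

    # New options: raw graded relevance with a log2 discount gives the textbook
    # DCG formulation; this example evaluates to roughly 0.785.
    print(ndcg_at_k(rating_true, rating_pred, k=6, score_type="raw", discfun_type="log2"))

DataFrames with other column names can be evaluated the same way by passing the col_user, col_item, col_rating, and col_prediction keyword arguments explicitly.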