Python generalized ndcg #1812

Merged · 7 commits · Sep 20, 2022
61 changes: 49 additions & 12 deletions recommenders/evaluation/python_evaluation.py
@@ -527,6 +527,8 @@ def ndcg_at_k(
     relevancy_method="top_k",
     k=DEFAULT_K,
     threshold=DEFAULT_THRESHOLD,
+    score_type="binary",
+    discfun_type="loge",
 ):
     """Normalized Discounted Cumulative Gain (nDCG).

@@ -543,12 +545,16 @@
             top k items are directly provided, so there is no need to compute the relevancy operation.
         k (int): number of top k items per user
         threshold (float): threshold of top items per user (optional)
+        score_type (str): type of relevance scores ['binary', 'raw', 'exp']. With the default option 'binary', the
+            relevance score is reduced to either 1 (hit) or 0 (miss). Option 'raw' uses the raw relevance score.
+            Option 'exp' uses (2 ** RAW_RELEVANCE - 1) as the relevance score.
+        discfun_type (str): type of discount function ['loge', 'log2'] used to calculate DCG.

     Returns:
         float: nDCG at k (min=0, max=1).
     """

-    df_hit, df_hit_count, n_users = merge_ranking_true_pred(
+    df_hit, _, _ = merge_ranking_true_pred(
         rating_true=rating_true,
         rating_pred=rating_pred,
         col_user=col_user,

@@ -563,20 +569,51 @@
     if df_hit.shape[0] == 0:
         return 0.0

-    # calculate discounted gain for hit items
-    df_dcg = df_hit.copy()
-    # relevance in this case is always 1
-    df_dcg["dcg"] = 1 / np.log1p(df_dcg["rank"])
-    # sum up discount gained to get discount cumulative gain
-    df_dcg = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"})
-    # calculate ideal discounted cumulative gain
-    df_ndcg = pd.merge(df_dcg, df_hit_count, on=[col_user])
-    df_ndcg["idcg"] = df_ndcg["actual"].apply(
-        lambda x: sum(1 / np.log1p(range(1, min(x, k) + 1)))
+    df_dcg = df_hit.merge(rating_pred, on=[col_user, col_item]).merge(
+        rating_true, on=[col_user, col_item], how="outer", suffixes=("_left", None)
     )

+    if score_type == "binary":
+        df_dcg["rel"] = 1
+    elif score_type == "raw":
+        df_dcg["rel"] = df_dcg[col_rating]
+    elif score_type == "exp":
+        df_dcg["rel"] = 2 ** df_dcg[col_rating] - 1
+    else:
+        raise ValueError("score_type must be one of 'binary', 'raw', 'exp'")
+
+    if discfun_type == "loge":
+        discfun = np.log
+    elif discfun_type == "log2":
+        discfun = np.log2
+    else:
+        raise ValueError("discfun_type must be one of 'loge', 'log2'")
+
+    # Calculate the actual discounted gain for each record
+    df_dcg["dcg"] = df_dcg["rel"] / discfun(1 + df_dcg["rank"])
+
+    # Calculate the ideal discounted gain for each record
+    df_idcg = df_dcg.sort_values([col_user, col_rating], ascending=False)
+    df_idcg["irank"] = df_idcg.groupby(col_user, as_index=False, sort=False)[
+        col_rating
+    ].rank("first", ascending=False)
+    df_idcg["idcg"] = df_idcg["rel"] / discfun(1 + df_idcg["irank"])
+
+    # Calculate the actual DCG for each user
+    df_user = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"})
+
+    # Calculate the ideal DCG for each user
+    df_user = df_user.merge(
+        df_idcg.groupby(col_user, as_index=False, sort=False)
+        .head(k)
+        .groupby(col_user, as_index=False, sort=False)
+        .agg({"idcg": "sum"}),
+        on=col_user,
+    )

     # DCG over IDCG is the normalized DCG
-    return (df_ndcg["dcg"] / df_ndcg["idcg"]).sum() / n_users
+    df_user["ndcg"] = df_user["dcg"] / df_user["idcg"]
+    return df_user["ndcg"].mean()


 def map_at_k(
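For context, a minimal usage sketch of the generalized metric added by this diff. The toy data below is illustrative, not taken from the PR; "userID", "itemID", "rating", and "prediction" are assumed to match the library's default column names.

import pandas as pd

from recommenders.evaluation.python_evaluation import ndcg_at_k

# Toy data: one user, three recommended items with graded true ratings.
rating_true = pd.DataFrame(
    {"userID": [1, 1, 1], "itemID": [1, 2, 3], "rating": [3, 1, 2]}
)
rating_pred = pd.DataFrame(
    {"userID": [1, 1, 1], "itemID": [1, 2, 3], "prediction": [0.9, 0.8, 0.7]}
)

# Default behavior is unchanged: binary relevance with a natural-log discount.
print(ndcg_at_k(rating_true, rating_pred, k=3))

# New options from this PR: graded relevance and a log2 discount, matching
# the textbook nDCG formulation.
print(
    ndcg_at_k(rating_true, rating_pred, k=3, score_type="raw", discfun_type="log2")
)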
170 changes: 83 additions & 87 deletions tests/unit/recommenders/evaluation/test_python_evaluation.py
@@ -178,26 +178,20 @@ def test_python_mae(rating_true, rating_pred):


 def test_python_rsquared(rating_true, rating_pred):
-    assert (
-        rsquared(
-            rating_true=rating_true,
-            rating_pred=rating_true,
-            col_prediction=DEFAULT_RATING_COL,
-        )
-        == pytest.approx(1.0, TOL)
-    )
+    assert rsquared(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+    ) == pytest.approx(1.0, TOL)
     assert rsquared(rating_true, rating_pred) == pytest.approx(-31.699029, TOL)


 def test_python_exp_var(rating_true, rating_pred):
-    assert (
-        exp_var(
-            rating_true=rating_true,
-            rating_pred=rating_true,
-            col_prediction=DEFAULT_RATING_COL,
-        )
-        == pytest.approx(1.0, TOL)
-    )
+    assert exp_var(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+    ) == pytest.approx(1.0, TOL)
     assert exp_var(rating_true, rating_pred) == pytest.approx(-6.4466, TOL)


@@ -211,16 +205,16 @@ def test_get_top_k_items(rating_true):
     top_3_user_true = pd.Series([1, 1, 1, 2, 2, 2, 3, 3, 3])
     top_3_rating_true = pd.Series([5, 4, 3, 5, 5, 3, 5, 5, 5])
     top_3_rank_true = pd.Series([1, 2, 3, 1, 2, 3, 1, 2, 3])
-    assert(top_3_items_df[DEFAULT_USER_COL].equals(top_3_user_true))
-    assert(top_3_items_df[DEFAULT_RATING_COL].equals(top_3_rating_true))
-    assert(top_3_items_df['rank'].equals(top_3_rank_true))
-    assert(top_3_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3])))
+    assert top_3_items_df[DEFAULT_USER_COL].equals(top_3_user_true)
+    assert top_3_items_df[DEFAULT_RATING_COL].equals(top_3_rating_true)
+    assert top_3_items_df["rank"].equals(top_3_rank_true)
+    assert top_3_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3]))
     # First two itemIDs of user 2. The scores are both 5, so any order is OK.
-    assert(set(top_3_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4]))
+    assert set(top_3_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4])
     # Third itemID of user 2. Both item 5 and 6 have a score of 3, so either one is OK.
-    assert(top_3_items_df[DEFAULT_ITEM_COL][5] in [5, 6])
+    assert top_3_items_df[DEFAULT_ITEM_COL][5] in [5, 6]
     # All itemIDs of user 3. All three items have a score of 5, so any order is OK.
-    assert(set(top_3_items_df[DEFAULT_ITEM_COL][6:]) == set([2, 5, 6]))
+    assert set(top_3_items_df[DEFAULT_ITEM_COL][6:]) == set([2, 5, 6])


 # Test get_top_k_items() when k is larger than the number of available items
@@ -234,36 +228,53 @@ def test_get_top_k_items_largek(rating_true):
     top_6_user_true = pd.Series([1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3])
     top_6_rating_true = pd.Series([5, 4, 3, 5, 5, 3, 3, 1, 5, 5, 5, 4, 4, 3])
     top_6_rank_true = pd.Series([1, 2, 3, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6])
-    assert(top_6_items_df[DEFAULT_USER_COL].equals(top_6_user_true))
-    assert(top_6_items_df[DEFAULT_RATING_COL].equals(top_6_rating_true))
-    assert(top_6_items_df['rank'].equals(top_6_rank_true))
-    assert(top_6_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3])))
+    assert top_6_items_df[DEFAULT_USER_COL].equals(top_6_user_true)
+    assert top_6_items_df[DEFAULT_RATING_COL].equals(top_6_rating_true)
+    assert top_6_items_df["rank"].equals(top_6_rank_true)
+    assert top_6_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3]))
     # First two itemIDs of user 2. The scores are both 5, so any order is OK.
-    assert(set(top_6_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4]))
+    assert set(top_6_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4])
     # Third and fourth itemID of user 2. The scores are both 3, so any order is OK.
-    assert(set(top_6_items_df[DEFAULT_ITEM_COL][5:7]) == set([5, 6]))
-    assert(top_6_items_df[DEFAULT_ITEM_COL][7] == 7)
+    assert set(top_6_items_df[DEFAULT_ITEM_COL][5:7]) == set([5, 6])
+    assert top_6_items_df[DEFAULT_ITEM_COL][7] == 7
     # First three itemIDs of user 3. The scores are both 5, so any order is OK.
-    assert(set(top_6_items_df[DEFAULT_ITEM_COL][8:11]) == set([2, 5, 6]))
+    assert set(top_6_items_df[DEFAULT_ITEM_COL][8:11]) == set([2, 5, 6])
     # Fourth and fifth itemID of user 3. The scores are both 4, so any order is OK.
-    assert(set(top_6_items_df[DEFAULT_ITEM_COL][11:13]) == set([8, 9]))
+    assert set(top_6_items_df[DEFAULT_ITEM_COL][11:13]) == set([8, 9])
     # Sixth itemID of user 3. Item 10,11,12 have a score of 3, so either one is OK.
-    assert(top_6_items_df[DEFAULT_ITEM_COL][13] in [10, 11, 12])
+    assert top_6_items_df[DEFAULT_ITEM_COL][13] in [10, 11, 12]


 def test_python_ndcg_at_k(rating_true, rating_pred, rating_nohit):
-    assert (
-        ndcg_at_k(
-            rating_true=rating_true,
-            rating_pred=rating_true,
-            col_prediction=DEFAULT_RATING_COL,
-            k=10,
-        )
-        == pytest.approx(1.0, TOL)
-    )
+    assert ndcg_at_k(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+        k=10,
+    ) == pytest.approx(1.0, TOL)
     assert ndcg_at_k(rating_true, rating_nohit, k=10) == 0.0
     assert ndcg_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.38172, TOL)
+
+    # Test raw relevance score and log2 discounting factor using wiki example
+    # See https://en.wikipedia.org/wiki/Discounted_cumulative_gain
+    df_true = pd.DataFrame(
+        {
+            DEFAULT_USER_COL: np.full(8, 0, dtype=int),
+            DEFAULT_ITEM_COL: np.arange(8),
+            DEFAULT_RATING_COL: np.asarray([3, 2, 3, 0, 1, 2, 3, 2]),
+        }
+    )
+    df_pred = pd.DataFrame(
+        {
+            DEFAULT_USER_COL: np.full(6, 0, dtype=int),
+            DEFAULT_ITEM_COL: np.arange(6),
+            DEFAULT_PREDICTION_COL: np.asarray([6, 5, 4, 3, 2, 1]),
+        }
+    )
+    assert ndcg_at_k(
+        df_true, df_pred, k=6, score_type="raw", discfun_type="log2"
+    ) == pytest.approx(0.785, TOL)


 def test_python_map_at_k(rating_true, rating_pred, rating_nohit):
     assert (

@@ -342,59 +353,44 @@ def test_python_precision(rating_true, rating_pred, rating_nohit):


 def test_python_recall(rating_true, rating_pred, rating_nohit):
-    assert (
-        recall_at_k(
-            rating_true=rating_true,
-            rating_pred=rating_true,
-            col_prediction=DEFAULT_RATING_COL,
-            k=10,
-        )
-        == pytest.approx(1, TOL)
-    )
+    assert recall_at_k(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+        k=10,
+    ) == pytest.approx(1, TOL)
     assert recall_at_k(rating_true, rating_nohit, k=10) == 0.0
     assert recall_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.37777, TOL)


 def test_python_auc(rating_true_binary, rating_pred_binary):
-    assert (
-        auc(
-            rating_true=rating_true_binary,
-            rating_pred=rating_true_binary,
-            col_prediction=DEFAULT_RATING_COL,
-        )
-        == pytest.approx(1.0, TOL)
-    )
-
-    assert (
-        auc(
-            rating_true=rating_true_binary,
-            rating_pred=rating_pred_binary,
-            col_rating=DEFAULT_RATING_COL,
-            col_prediction=DEFAULT_PREDICTION_COL,
-        )
-        == pytest.approx(0.75, TOL)
-    )
+    assert auc(
+        rating_true=rating_true_binary,
+        rating_pred=rating_true_binary,
+        col_prediction=DEFAULT_RATING_COL,
+    ) == pytest.approx(1.0, TOL)
+
+    assert auc(
+        rating_true=rating_true_binary,
+        rating_pred=rating_pred_binary,
+        col_rating=DEFAULT_RATING_COL,
+        col_prediction=DEFAULT_PREDICTION_COL,
+    ) == pytest.approx(0.75, TOL)


 def test_python_logloss(rating_true_binary, rating_pred_binary):
-    assert (
-        logloss(
-            rating_true=rating_true_binary,
-            rating_pred=rating_true_binary,
-            col_prediction=DEFAULT_RATING_COL,
-        )
-        == pytest.approx(0, TOL)
-    )
-
-    assert (
-        logloss(
-            rating_true=rating_true_binary,
-            rating_pred=rating_pred_binary,
-            col_rating=DEFAULT_RATING_COL,
-            col_prediction=DEFAULT_PREDICTION_COL,
-        )
-        == pytest.approx(0.7835, TOL)
-    )
+    assert logloss(
+        rating_true=rating_true_binary,
+        rating_pred=rating_true_binary,
+        col_prediction=DEFAULT_RATING_COL,
+    ) == pytest.approx(0, TOL)
+
+    assert logloss(
+        rating_true=rating_true_binary,
+        rating_pred=rating_pred_binary,
+        col_rating=DEFAULT_RATING_COL,
+        col_prediction=DEFAULT_PREDICTION_COL,
+    ) == pytest.approx(0.7835, TOL)


 def test_python_errors(rating_true, rating_pred):
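As a sanity check on the 0.785 expected by the new wiki-example test in this file, the numbers can be recomputed directly with numpy. This is a standalone sketch, independent of the library code.

import numpy as np

# Relevances of the 6 retrieved results in predicted order (Wikipedia's
# DCG example); two more judged items (relevance 3 and 2) were not retrieved.
retrieved = np.array([3, 2, 3, 0, 1, 2])
all_judged = np.array([3, 2, 3, 0, 1, 2, 3, 2])

ranks = np.arange(1, 7)
dcg = np.sum(retrieved / np.log2(ranks + 1))                       # ~6.861
idcg = np.sum(np.sort(all_judged)[::-1][:6] / np.log2(ranks + 1))  # ~8.740
print(round(dcg / idcg, 3))  # 0.785, the value asserted in the test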
@@ -185,7 +185,7 @@ def test_python_ndcg_at_k(rating_true, rating_pred):
         col_prediction=DEFAULT_PREDICTION_COL,
         k=10,
     )
-    assert t.interval < 21.55627936 * (1 + TOL)

Contributor Author: The new version takes more time because, for raw and exp scores, we need to sort the dataframe based on the true ratings. We can probably use the old version for binary and the new version for raw and exp.

+    assert t.interval < 39.03877957 * (1 + TOL)


 def test_python_map_at_k(rating_true, rating_pred):
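The review comment above suggests keeping the cheap pre-PR path for binary scores and paying for the sort only when raw or exp scores are requested. A rough, self-contained sketch of that split on a single user's ranking; ndcg_single_user is a hypothetical helper written for illustration, not code from the PR.

import numpy as np

def ndcg_single_user(rel_in_pred_order, n_relevant, k, score_type="binary"):
    # Per-user nDCG illustrating the proposed dispatch: a closed-form
    # fast path for binary relevance, and a sort-based path otherwise.
    rel = np.asarray(rel_in_pred_order, dtype=float)[:k]
    ranks = np.arange(1, len(rel) + 1)
    if score_type == "binary":
        # Fast path (pre-PR logic): every hit contributes 1, and the ideal
        # ranking packs the user's relevant items into the top positions,
        # so IDCG is a closed-form sum over min(n_relevant, k) ranks.
        dcg = np.sum((rel > 0) / np.log1p(ranks))
        idcg = np.sum(1 / np.log1p(np.arange(1, min(n_relevant, k) + 1)))
    else:
        # General path (this PR): building the ideal ranking requires a
        # sort by true relevance -- the extra cost the reviewer measured.
        dcg = np.sum(rel / np.log1p(ranks))
        idcg = np.sum(np.sort(rel)[::-1] / np.log1p(ranks))
    return dcg / idcg if idcg > 0 else 0.0

print(ndcg_single_user([1, 0, 1, 0], n_relevant=2, k=4))                     # binary fast path
print(ndcg_single_user([3, 2, 3, 0], n_relevant=3, k=4, score_type="raw"))   # graded path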