diff --git a/recommenders/evaluation/python_evaluation.py b/recommenders/evaluation/python_evaluation.py
index 0f51580a3f..a762fa10bd 100644
--- a/recommenders/evaluation/python_evaluation.py
+++ b/recommenders/evaluation/python_evaluation.py
@@ -527,6 +527,8 @@ def ndcg_at_k(
     relevancy_method="top_k",
     k=DEFAULT_K,
     threshold=DEFAULT_THRESHOLD,
+    score_type="binary",
+    discfun_type="loge",
 ):
     """Normalized Discounted Cumulative Gain (nDCG).
 
@@ -543,12 +545,16 @@ def ndcg_at_k(
             top k items are directly provided, so there is no need to compute the relevancy operation.
         k (int): number of top k items per user
         threshold (float): threshold of top items per user (optional)
+        score_type (str): type of relevance scores ['binary', 'raw', 'exp']. With the default option 'binary', the
+            relevance score is reduced to either 1 (hit) or 0 (miss). Option 'raw' uses the raw relevance score.
+            Option 'exp' uses (2 ** RAW_RELEVANCE - 1) as the relevance score
+        discfun_type (str): type of discount function ['loge', 'log2'] used to calculate DCG.
 
     Returns:
         float: nDCG at k (min=0, max=1).
     """
 
-    df_hit, df_hit_count, n_users = merge_ranking_true_pred(
+    df_hit, _, _ = merge_ranking_true_pred(
         rating_true=rating_true,
         rating_pred=rating_pred,
         col_user=col_user,
@@ -563,20 +569,51 @@ def ndcg_at_k(
     if df_hit.shape[0] == 0:
         return 0.0
 
-    # calculate discounted gain for hit items
-    df_dcg = df_hit.copy()
-    # relevance in this case is always 1
-    df_dcg["dcg"] = 1 / np.log1p(df_dcg["rank"])
-    # sum up discount gained to get discount cumulative gain
-    df_dcg = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"})
-    # calculate ideal discounted cumulative gain
-    df_ndcg = pd.merge(df_dcg, df_hit_count, on=[col_user])
-    df_ndcg["idcg"] = df_ndcg["actual"].apply(
-        lambda x: sum(1 / np.log1p(range(1, min(x, k) + 1)))
+    df_dcg = df_hit.merge(rating_pred, on=[col_user, col_item]).merge(
+        rating_true, on=[col_user, col_item], how="outer", suffixes=("_left", None)
+    )
+
+    if score_type == "binary":
+        df_dcg["rel"] = 1
+    elif score_type == "raw":
+        df_dcg["rel"] = df_dcg[col_rating]
+    elif score_type == "exp":
+        df_dcg["rel"] = 2 ** df_dcg[col_rating] - 1
+    else:
+        raise ValueError("score_type must be one of 'binary', 'raw', 'exp'")
+
+    if discfun_type == "loge":
+        discfun = np.log
+    elif discfun_type == "log2":
+        discfun = np.log2
+    else:
+        raise ValueError("discfun_type must be one of 'loge', 'log2'")
+
+    # Calculate the actual discounted gain for each record
+    df_dcg["dcg"] = df_dcg["rel"] / discfun(1 + df_dcg["rank"])
+
+    # Calculate the ideal discounted gain for each record
+    df_idcg = df_dcg.sort_values([col_user, col_rating], ascending=False)
+    df_idcg["irank"] = df_idcg.groupby(col_user, as_index=False, sort=False)[
+        col_rating
+    ].rank("first", ascending=False)
+    df_idcg["idcg"] = df_idcg["rel"] / discfun(1 + df_idcg["irank"])
+
+    # Calculate the actual DCG for each user
+    df_user = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"})
+
+    # Calculate the ideal DCG for each user
+    df_user = df_user.merge(
+        df_idcg.groupby(col_user, as_index=False, sort=False)
+        .head(k)
+        .groupby(col_user, as_index=False, sort=False)
+        .agg({"idcg": "sum"}),
+        on=col_user,
     )
 
     # DCG over IDCG is the normalized DCG
-    return (df_ndcg["dcg"] / df_ndcg["idcg"]).sum() / n_users
+    df_user["ndcg"] = df_user["dcg"] / df_user["idcg"]
+    return df_user["ndcg"].mean()
 
 
 def map_at_k(
diff --git a/tests/unit/recommenders/evaluation/test_python_evaluation.py b/tests/unit/recommenders/evaluation/test_python_evaluation.py
index a1a29902ca..c4e091d556 100644
--- a/tests/unit/recommenders/evaluation/test_python_evaluation.py
+++ b/tests/unit/recommenders/evaluation/test_python_evaluation.py
@@ -178,26 +178,20 @@ def test_python_mae(rating_true, rating_pred):
 
 
 def test_python_rsquared(rating_true, rating_pred):
-    assert (
-        rsquared(
-            rating_true=rating_true,
-            rating_pred=rating_true,
-            col_prediction=DEFAULT_RATING_COL,
-        )
-        == pytest.approx(1.0, TOL)
-    )
+    assert rsquared(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+    ) == pytest.approx(1.0, TOL)
     assert rsquared(rating_true, rating_pred) == pytest.approx(-31.699029, TOL)
 
 
 def test_python_exp_var(rating_true, rating_pred):
-    assert (
-        exp_var(
-            rating_true=rating_true,
-            rating_pred=rating_true,
-            col_prediction=DEFAULT_RATING_COL,
-        )
-        == pytest.approx(1.0, TOL)
-    )
+    assert exp_var(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+    ) == pytest.approx(1.0, TOL)
    assert exp_var(rating_true, rating_pred) == pytest.approx(-6.4466, TOL)
 
 
@@ -211,16 +205,16 @@ def test_get_top_k_items(rating_true):
     top_3_user_true = pd.Series([1, 1, 1, 2, 2, 2, 3, 3, 3])
     top_3_rating_true = pd.Series([5, 4, 3, 5, 5, 3, 5, 5, 5])
     top_3_rank_true = pd.Series([1, 2, 3, 1, 2, 3, 1, 2, 3])
-    assert(top_3_items_df[DEFAULT_USER_COL].equals(top_3_user_true))
-    assert(top_3_items_df[DEFAULT_RATING_COL].equals(top_3_rating_true))
-    assert(top_3_items_df['rank'].equals(top_3_rank_true))
-    assert(top_3_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3])))
+    assert top_3_items_df[DEFAULT_USER_COL].equals(top_3_user_true)
+    assert top_3_items_df[DEFAULT_RATING_COL].equals(top_3_rating_true)
+    assert top_3_items_df["rank"].equals(top_3_rank_true)
+    assert top_3_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3]))
     # First two itemIDs of user 2. The scores are both 5, so any order is OK.
-    assert(set(top_3_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4]))
+    assert set(top_3_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4])
     # Third itemID of user 2. Both item 5 and 6 have a score of 3, so either one is OK.
-    assert(top_3_items_df[DEFAULT_ITEM_COL][5] in [5, 6])
+    assert top_3_items_df[DEFAULT_ITEM_COL][5] in [5, 6]
     # All itemIDs of user 3. All three items have a score of 5, so any order is OK.
-    assert(set(top_3_items_df[DEFAULT_ITEM_COL][6:]) == set([2, 5, 6]))
+    assert set(top_3_items_df[DEFAULT_ITEM_COL][6:]) == set([2, 5, 6])
 
 
 # Test get_top_k_items() when k is larger than the number of available items
@@ -234,36 +228,53 @@ def test_get_top_k_items_largek(rating_true):
     top_6_user_true = pd.Series([1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3])
     top_6_rating_true = pd.Series([5, 4, 3, 5, 5, 3, 3, 1, 5, 5, 5, 4, 4, 3])
     top_6_rank_true = pd.Series([1, 2, 3, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6])
-    assert(top_6_items_df[DEFAULT_USER_COL].equals(top_6_user_true))
-    assert(top_6_items_df[DEFAULT_RATING_COL].equals(top_6_rating_true))
-    assert(top_6_items_df['rank'].equals(top_6_rank_true))
-    assert(top_6_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3])))
+    assert top_6_items_df[DEFAULT_USER_COL].equals(top_6_user_true)
+    assert top_6_items_df[DEFAULT_RATING_COL].equals(top_6_rating_true)
+    assert top_6_items_df["rank"].equals(top_6_rank_true)
+    assert top_6_items_df[DEFAULT_ITEM_COL][:3].equals(pd.Series([1, 2, 3]))
     # First two itemIDs of user 2. The scores are both 5, so any order is OK.
-    assert(set(top_6_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4]))
+    assert set(top_6_items_df[DEFAULT_ITEM_COL][3:5]) == set([1, 4])
     # Third and fourth itemID of user 2. The scores are both 3, so any order is OK.
-    assert(set(top_6_items_df[DEFAULT_ITEM_COL][5:7]) == set([5, 6]))
-    assert(top_6_items_df[DEFAULT_ITEM_COL][7] == 7)
+    assert set(top_6_items_df[DEFAULT_ITEM_COL][5:7]) == set([5, 6])
+    assert top_6_items_df[DEFAULT_ITEM_COL][7] == 7
     # First three itemIDs of user 3. The scores are both 5, so any order is OK.
-    assert(set(top_6_items_df[DEFAULT_ITEM_COL][8:11]) == set([2, 5, 6]))
+    assert set(top_6_items_df[DEFAULT_ITEM_COL][8:11]) == set([2, 5, 6])
     # Fourth and fifth itemID of user 3. The scores are both 4, so any order is OK.
-    assert(set(top_6_items_df[DEFAULT_ITEM_COL][11:13]) == set([8, 9]))
+    assert set(top_6_items_df[DEFAULT_ITEM_COL][11:13]) == set([8, 9])
     # Sixth itemID of user 3. Item 10,11,12 have a score of 3, so either one is OK.
-    assert(top_6_items_df[DEFAULT_ITEM_COL][13] in [10, 11, 12])
+    assert top_6_items_df[DEFAULT_ITEM_COL][13] in [10, 11, 12]
 
 
 def test_python_ndcg_at_k(rating_true, rating_pred, rating_nohit):
-    assert (
-        ndcg_at_k(
-            rating_true=rating_true,
-            rating_pred=rating_true,
-            col_prediction=DEFAULT_RATING_COL,
-            k=10,
-        )
-        == pytest.approx(1.0, TOL)
-    )
+    assert ndcg_at_k(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+        k=10,
+    ) == pytest.approx(1.0, TOL)
     assert ndcg_at_k(rating_true, rating_nohit, k=10) == 0.0
     assert ndcg_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.38172, TOL)
 
+    # Test raw relevance score and log2 discounting factor using wiki example
+    # See https://en.wikipedia.org/wiki/Discounted_cumulative_gain
+    df_true = pd.DataFrame(
+        {
+            DEFAULT_USER_COL: np.full(8, 0, dtype=int),
+            DEFAULT_ITEM_COL: np.arange(8),
+            DEFAULT_RATING_COL: np.asarray([3, 2, 3, 0, 1, 2, 3, 2]),
+        }
+    )
+    df_pred = pd.DataFrame(
+        {
+            DEFAULT_USER_COL: np.full(6, 0, dtype=int),
+            DEFAULT_ITEM_COL: np.arange(6),
+            DEFAULT_PREDICTION_COL: np.asarray([6, 5, 4, 3, 2, 1]),
+        }
+    )
+    assert ndcg_at_k(
+        df_true, df_pred, k=6, score_type="raw", discfun_type="log2"
+    ) == pytest.approx(0.785, TOL)
+
 
 def test_python_map_at_k(rating_true, rating_pred, rating_nohit):
     assert (
@@ -342,59 +353,44 @@ def test_python_precision(rating_true, rating_pred, rating_nohit):
 
 
 def test_python_recall(rating_true, rating_pred, rating_nohit):
-    assert (
-        recall_at_k(
-            rating_true=rating_true,
-            rating_pred=rating_true,
-            col_prediction=DEFAULT_RATING_COL,
-            k=10,
-        )
-        == pytest.approx(1, TOL)
-    )
+    assert recall_at_k(
+        rating_true=rating_true,
+        rating_pred=rating_true,
+        col_prediction=DEFAULT_RATING_COL,
+        k=10,
+    ) == pytest.approx(1, TOL)
     assert recall_at_k(rating_true, rating_nohit, k=10) == 0.0
     assert recall_at_k(rating_true, rating_pred, k=10) == pytest.approx(0.37777, TOL)
 
 
 def test_python_auc(rating_true_binary, rating_pred_binary):
-    assert (
-        auc(
-            rating_true=rating_true_binary,
-            rating_pred=rating_true_binary,
-            col_prediction=DEFAULT_RATING_COL,
-        )
-        == pytest.approx(1.0, TOL)
-    )
-
-    assert (
-        auc(
-            rating_true=rating_true_binary,
-            rating_pred=rating_pred_binary,
-            col_rating=DEFAULT_RATING_COL,
-            col_prediction=DEFAULT_PREDICTION_COL,
-        )
-        == pytest.approx(0.75, TOL)
-    )
+    assert auc(
+        rating_true=rating_true_binary,
+        rating_pred=rating_true_binary,
+        col_prediction=DEFAULT_RATING_COL,
+    ) == pytest.approx(1.0, TOL)
+
+    assert auc(
+        rating_true=rating_true_binary,
+        rating_pred=rating_pred_binary,
+        col_rating=DEFAULT_RATING_COL,
+        col_prediction=DEFAULT_PREDICTION_COL,
+    ) == pytest.approx(0.75, TOL)
 
 
 def test_python_logloss(rating_true_binary, rating_pred_binary):
-    assert (
-        logloss(
-            rating_true=rating_true_binary,
-            rating_pred=rating_true_binary,
-            col_prediction=DEFAULT_RATING_COL,
-        )
-        == pytest.approx(0, TOL)
-    )
-
-    assert (
-        logloss(
-            rating_true=rating_true_binary,
-            rating_pred=rating_pred_binary,
-            col_rating=DEFAULT_RATING_COL,
-            col_prediction=DEFAULT_PREDICTION_COL,
-        )
-        == pytest.approx(0.7835, TOL)
-    )
+    assert logloss(
+        rating_true=rating_true_binary,
+        rating_pred=rating_true_binary,
+        col_prediction=DEFAULT_RATING_COL,
+    ) == pytest.approx(0, TOL)
+
+    assert logloss(
+        rating_true=rating_true_binary,
+        rating_pred=rating_pred_binary,
+        col_rating=DEFAULT_RATING_COL,
+        col_prediction=DEFAULT_PREDICTION_COL,
+    ) == pytest.approx(0.7835, TOL)
 
 
 def test_python_errors(rating_true, rating_pred):
diff --git a/tests/unit/recommenders/evaluation/test_python_evaluation_time_performance.py b/tests/unit/recommenders/evaluation/test_python_evaluation_time_performance.py
index 63893d864e..cbc5ce29c9 100644
--- a/tests/unit/recommenders/evaluation/test_python_evaluation_time_performance.py
+++ b/tests/unit/recommenders/evaluation/test_python_evaluation_time_performance.py
@@ -185,7 +185,7 @@ def test_python_ndcg_at_k(rating_true, rating_pred):
             col_prediction=DEFAULT_PREDICTION_COL,
             k=10,
         )
-    assert t.interval < 21.55627936 * (1 + TOL)
+    assert t.interval < 39.03877957 * (1 + TOL)
 
 
 def test_python_map_at_k(rating_true, rating_pred):
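
Usage sketch (not part of the patch): a minimal, illustrative call of ndcg_at_k with the new score_type and discfun_type arguments, mirroring the Wikipedia DCG example exercised by the new unit test above. It assumes this patch is applied and that the DataFrames use the library's default column names ("userID", "itemID", "rating", "prediction"); the input values and the roughly 0.785 result come from that test, everything else here is illustrative.

    import numpy as np
    import pandas as pd

    from recommenders.evaluation.python_evaluation import ndcg_at_k

    # One user; eight judged items with graded relevance, of which the model ranks six.
    rating_true = pd.DataFrame(
        {
            "userID": np.zeros(8, dtype=int),
            "itemID": np.arange(8),
            "rating": [3, 2, 3, 0, 1, 2, 3, 2],
        }
    )
    rating_pred = pd.DataFrame(
        {
            "userID": np.zeros(6, dtype=int),
            "itemID": np.arange(6),
            "prediction": [6, 5, 4, 3, 2, 1],
        }
    )

    # Default behaviour is unchanged: binary gain with a natural-log discount.
    print(ndcg_at_k(rating_true, rating_pred, k=6))

    # New options: raw graded relevance with a log2 discount gives the textbook
    # DCG formulation; this example evaluates to roughly 0.785.
    print(ndcg_at_k(rating_true, rating_pred, k=6, score_type="raw", discfun_type="log2"))

DataFrames with other column names can be evaluated the same way by passing the col_user, col_item, col_rating, and col_prediction keyword arguments explicitly.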