recommenders-team · anargyri · Dec 14, 2021 · Nov 22, 2021 · Nov 22, 2021 · Nov 23, 2021
@@ -39,3 +39,33 @@ sbt test
 ```
 
 (use ~test and it will automatically check for changes in source files, but not build.sbt)
+
+
+## Notes for Spark 3.x  ##
+
+The code now has been modified to support Spark 3.x, and has been
+tested under different versions of Databricks Runtime (including 6.4
+Extended Support, 7.3 LTS, 9.1 LTS, 10.0 and 10.1) on Azure Databricks
+Service.  But now manual packaging is needed:
+
+
+```bash
+export VERSION=0.5.0
+cd python
+python setup.py bdist_wheel  # => dist/pysarplus-0.5.0-cp38-cp38-linux_x86_64.whl
+
+export SPARK_VERSION=3.2.0
+export HADOOP_VERSION=3.3.1
+export SCALA_VERSION=2.12.14
+cd scala
+sbt ++${SCALA_VERSION} package  # => target/scala-2.12/sarplus_2.12.14_s3.2.0_h3.3.1-0.5.0.jar
+```
+
+where `VERSION`, `SPARK_VERSION`, `HADOOP_VERSION`, `SCALA_VERSION`
+should be customized as needed.  When running on Spark 3.x, extra
+configurations are also required:
+
+```
+spark.sql.sources.default parquet
+spark.sql.legacy.createHiveTableByDefault true
+```
@@ -17,10 +17,7 @@ def __init__(self, path):
         def find_or_raise(extension):
             files = [f for f in all_files if f.endswith(extension)]
             if len(files) != 1:
-                raise ValueError(
-                    "Directory '%s' must contain exactly 1 file ending in '%s'"
-                    % (path, extension)
-                )
+                raise ValueError("Directory '%s' must contain exactly 1 file ending in '%s'" % (path, extension))
             return path + "/" + files[0]
 
         # instantiate C++ backend

@@ -1,13 +1,8 @@
-"""
-This is the one and only (to rule them all) implementation of SAR.
-"""
+"""This is the implementation of SAR."""
 
 import logging
-import pyspark.sql.functions as F
 import pandas as pd
 from pyspark.sql.types import (
-    StringType,
-    DoubleType,
     StructType,
     StructField,
     IntegerType,
@@ -16,6 +11,7 @@
 from pyspark.sql.functions import pandas_udf, PandasUDFType
 from pysarplus import SARModel
 
+
 SIM_COOCCUR = "cooccurrence"
 SIM_JACCARD = "jaccard"
 SIM_LIFT = "lift"
@@ -25,7 +21,7 @@
 
 
 class SARPlus:
-    """SAR implementation for PySpark"""
+    """SAR implementation for PySpark."""
 
     def __init__(
         self,
@@ -66,13 +62,15 @@ def f(self, str, **kwargs):
     # current time for time decay calculation
     # cooccurrence matrix threshold
     def fit(self, df):
-        """Main fit method for SAR. Expects the dataframes to have row_id, col_id columns which are indexes,
+        """Main fit method for SAR. 
+
+        Expects the dataframes to have row_id, col_id columns which are indexes,
         i.e. contain the sequential integer index of the original alphanumeric user and item IDs.
         Dataframe also contains rating and timestamp as floats; timestamp is in seconds since Epoch by default.
 
         Arguments:
-            df (pySpark.DataFrame): input dataframe which contains the index of users and items. """
-
+            df (pySpark.DataFrame): input dataframe which contains the index of users and items.
+        """
         # threshold - items below this number get set to zero in coocurrence counts
 
         df.createOrReplaceTempView(self.f("{prefix}df_train_input"))
@@ -93,22 +91,20 @@ def fit(self, df):
             query = self.f(
                 """
             SELECT
-                 {col_user}, {col_item},
+                 {col_user}, {col_item}, 
                  SUM({col_rating} * EXP(-log(2) * (latest_timestamp - CAST({col_timestamp} AS long)) / ({time_decay_coefficient} * 3600 * 24))) as {col_rating}
             FROM {prefix}df_train_input,
                  (SELECT CAST(MAX({col_timestamp}) AS long) latest_timestamp FROM {prefix}df_train_input)
-            GROUP BY {col_user}, {col_item}
-            CLUSTER BY {col_user}
+            GROUP BY {col_user}, {col_item} 
+            CLUSTER BY {col_user} 
             """
             )
 
             # replace with timedecayed version
             df = self.spark.sql(query)
         else:
             # since SQL is case insensitive, this check needs to be performed similar
-            if self.header["col_timestamp"].lower() in [
-                s.name.lower() for s in df.schema
-            ]:
+            if self.header["col_timestamp"].lower() in [s.name.lower() for s in df.schema]:
                 # we need to de-duplicate items by using the latest item
                 query = self.f(
                     """
@@ -143,16 +139,12 @@ def fit(self, df):
         )
 
         item_cooccurrence = self.spark.sql(query)
-        item_cooccurrence.write.mode("overwrite").saveAsTable(
-            self.f("{prefix}item_cooccurrence")
-        )
+        item_cooccurrence.write.mode("overwrite").saveAsTable(self.f("{prefix}item_cooccurrence"))
 
         # compute the diagonal used later for Jaccard and Lift
         if self.similarity_type == SIM_LIFT or self.similarity_type == SIM_JACCARD:
             item_marginal = self.spark.sql(
-                self.f(
-                    "SELECT i1 i, value AS margin FROM {prefix}item_cooccurrence WHERE i1 = i2"
-                )
+                self.f("SELECT i1 i, value AS margin FROM {prefix}item_cooccurrence WHERE i1 = i2")
             )
             item_marginal.createOrReplaceTempView(self.f("{prefix}item_marginal"))
 
@@ -181,17 +173,11 @@ def fit(self, df):
             )
             self.item_similarity = self.spark.sql(query)
         else:
-            raise ValueError(
-                "Unknown similarity type: {0}".format(self.similarity_type)
-            )
+            raise ValueError("Unknown similarity type: {0}".format(self.similarity_type))
 
         # store upper triangular
-        log.info(
-            "sarplus.fit 2/2: compute similiarity metric %s..." % self.similarity_type
-        )
-        self.item_similarity.write.mode("overwrite").saveAsTable(
-            self.f("{prefix}item_similarity_upper")
-        )
+        log.info("sarplus.fit 2/2: compute similiarity metric %s..." % self.similarity_type)
+        self.item_similarity.write.mode("overwrite").saveAsTable(self.f("{prefix}item_similarity_upper"))
 
         # expand upper triangular to full matrix
 
@@ -209,9 +195,7 @@ def fit(self, df):
         )
 
         self.item_similarity = self.spark.sql(query)
-        self.item_similarity.write.mode("overwrite").saveAsTable(
-            self.f("{prefix}item_similarity")
-        )
+        self.item_similarity.write.mode("overwrite").saveAsTable(self.f("{prefix}item_similarity"))
 
         # free space
         self.spark.sql(self.f("DROP TABLE {prefix}item_cooccurrence"))
@@ -228,14 +212,10 @@ def get_user_affinity(self, test):
         """
         test.createOrReplaceTempView(self.f("{prefix}df_test"))
 
-        query = self.f(
-            "SELECT DISTINCT {col_user} FROM {prefix}df_test CLUSTER BY {col_user}"
-        )
+        query = self.f("SELECT DISTINCT {col_user} FROM {prefix}df_test CLUSTER BY {col_user}")
 
         df_test_users = self.spark.sql(query)
-        df_test_users.write.mode("overwrite").saveAsTable(
-            self.f("{prefix}df_test_users")
-        )
+        df_test_users.write.mode("overwrite").saveAsTable(self.f("{prefix}df_test_users"))
 
         query = self.f(
             """
@@ -249,12 +229,7 @@ def get_user_affinity(self, test):
         return self.spark.sql(query)
 
     def recommend_k_items(
-        self,
-        test,
-        cache_path,
-        top_k=10,
-        remove_seen=True,
-        n_user_prediction_partitions=200,
+        self, test, cache_path, top_k=10, remove_seen=True, n_user_prediction_partitions=200,
     ):
 
         # create item id to continuous index mapping
@@ -286,16 +261,10 @@ def recommend_k_items(
         log.info("sarplus.recommend_k_items 2/3: prepare similarity matrix")
 
         self.spark.sql(
-            self.f(
-                "SELECT i1, i2, CAST(value AS DOUBLE) value FROM {prefix}item_similarity_mapped ORDER BY i1, i2"
-            )
-        ).coalesce(1).write.format("com.microsoft.sarplus").mode("overwrite").save(
-            cache_path_output
-        )
+            self.f("SELECT i1, i2, CAST(value AS DOUBLE) value FROM {prefix}item_similarity_mapped ORDER BY i1, i2")
+        ).coalesce(1).write.format("com.microsoft.sarplus").mode("overwrite").save(cache_path_output)
 
-        self.get_user_affinity(test).createOrReplaceTempView(
-            self.f("{prefix}user_affinity")
-        )
+        self.get_user_affinity(test).createOrReplaceTempView(self.f("{prefix}user_affinity"))
 
         # map item ids to index space
         pred_input = self.spark.sql(
@@ -314,9 +283,7 @@ def recommend_k_items(
 
         schema = StructType(
             [
-                StructField(
-                    "userID", pred_input.schema[self.header["col_user"]].dataType, True
-                ),
+                StructField("userID", pred_input.schema[self.header["col_user"]].dataType, True),
                 StructField("itemID", IntegerType(), True),
                 StructField("score", FloatType(), True),
             ]
@@ -334,24 +301,18 @@ def sar_predict_udf(df):
             # memory mapped, the memory consumption only happens ones per worker
             # for all python processes
             model = SARModel(cache_path_input)
-            preds = model.predict(
-                df["idx"].values, df["rating"].values, top_k, remove_seen
-            )
+            preds = model.predict(df["idx"].values, df["rating"].values, top_k, remove_seen)
 
             user = df[local_header["col_user"]].iloc[0]
 
-            preds_ret = pd.DataFrame(
-                [(user, x.id, x.score) for x in preds], columns=range(3)
-            )
+            preds_ret = pd.DataFrame([(user, x.id, x.score) for x in preds], columns=range(3))
 
             return preds_ret
 
         log.info("sarplus.recommend_k_items 3/3: compute recommendations")
 
         df_preds = (
-            pred_input.repartition(
-                n_user_prediction_partitions, self.header["col_user"]
-            )
+            pred_input.repartition(n_user_prediction_partitions, self.header["col_user"])
             .groupby(self.header["col_user"])
             .apply(sar_predict_udf)
         )
@@ -381,9 +342,7 @@ def recommend_k_items_slow(self, test, top_k=10, remove_seen=True):
         if remove_seen:
             raise ValueError("Not implemented")
 
-        self.get_user_affinity(test).write.mode("overwrite").saveAsTable(
-            self.f("{prefix}user_affinity")
-        )
+        self.get_user_affinity(test).write.mode("overwrite").saveAsTable(self.f("{prefix}user_affinity"))
 
         # user_affinity * item_similarity
         # filter top-k

@@ -1,3 +1,4 @@
+import os
 import sysconfig
 
 from setuptools import setup
@@ -13,10 +14,17 @@ def __str__(self):
 
         return pybind11.get_include(self.user)
 
+DEPENDENCIES = [
+    "numpy",
+    "pandas",
+    # "pyarrow==0.13.0",
+    "pybind11>=2.2",
+    "pyspark>=3.0.0"
+]
 
 setup(
     name="pysarplus",
-    version="0.2.6",
+    version=os.environ["VERSION"],
     description="SAR prediction for use with PySpark",
     url="https://github.com/Microsoft/Recommenders/contrib/sarplus",
     author="Markus Cozowicz",
@@ -33,16 +41,15 @@ def __str__(self):
         "Topic :: Scientific/Engineering :: Mathematics",
     ],
     setup_requires=["pytest-runner"],
-    install_requires=["pybind11>=2.2"],
+    install_requires=DEPENDENCIES,
     tests_require=["pytest"],
     packages=["pysarplus"],
     ext_modules=[
         Extension(
             "pysarplus_cpp",
             ["src/pysarplus.cpp"],
             include_dirs=[get_pybind_include(), get_pybind_include(user=True)],
-            extra_compile_args=sysconfig.get_config_var("CFLAGS").split()
-            + ["-std=c++11", "-Wall", "-Wextra"],
+            extra_compile_args=sysconfig.get_config_var("CFLAGS").split() + ["-std=c++11", "-Wall", "-Wextra"],
             libraries=["stdc++"],
             language="c++11",
         )