From a4a782e59a85d34f9c58f50f1c29689145bde4c5 Mon Sep 17 00:00:00 2001 From: Blair Chen Date: Mon, 31 Oct 2022 13:12:01 +0800 Subject: [PATCH 1/2] Revert "Refine example notebooks (#756)" This reverts commit 2f7e1fd066319bf80baa10335b603257c832b13c. --- docs/dev_guide/new_contributor_guide.md | 6 +- docs/quickstart_databricks.md | 6 +- docs/quickstart_synapse.md | 2 +- ...atabricks_quickstart_nyc_taxi_driver.ipynb | 1442 +++++++++++++++++ feathr_project/feathr/client.py | 2 +- feathr_project/feathr/datasets/__init__.py | 9 - feathr_project/feathr/datasets/constants.py | 3 - feathr_project/feathr/datasets/nyc_taxi.py | 87 - feathr_project/feathr/datasets/utils.py | 64 - .../spark_provider/_databricks_submission.py | 181 +-- feathr_project/feathr/utils/config.py | 61 - feathr_project/feathr/utils/job_utils.py | 218 +-- feathr_project/feathr/utils/platform.py | 45 - .../demo_data/green_tripdata_2020-04.csv | 14 + .../product_detail_mock_data.csv | 11 + .../user_observation_mock_data.csv | 35 + .../user_profile_mock_data.csv | 11 + .../user_purchase_history_mock_data.csv | 31 + .../nyc_driver_demo.ipynb | 720 ++++++++ feathr_project/setup.py | 3 +- feathr_project/test/samples/test_notebooks.py | 56 - .../test/unit/datasets/test_dataset_utils.py | 53 - .../test/unit/datasets/test_datasets.py | 106 -- feathr_project/test/unit/utils/test_config.py | 31 - 24 files changed, 2400 insertions(+), 797 deletions(-) create mode 100644 docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb delete mode 100644 feathr_project/feathr/datasets/__init__.py delete mode 100644 feathr_project/feathr/datasets/constants.py delete mode 100644 feathr_project/feathr/datasets/nyc_taxi.py delete mode 100644 feathr_project/feathr/datasets/utils.py delete mode 100644 feathr_project/feathr/utils/config.py delete mode 100644 feathr_project/feathr/utils/platform.py create mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/green_tripdata_2020-04.csv create mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv create mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv create mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_profile_mock_data.csv create mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_purchase_history_mock_data.csv create mode 100644 feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb delete mode 100644 feathr_project/test/samples/test_notebooks.py delete mode 100644 feathr_project/test/unit/datasets/test_dataset_utils.py delete mode 100644 feathr_project/test/unit/datasets/test_datasets.py delete mode 100644 feathr_project/test/unit/utils/test_config.py diff --git a/docs/dev_guide/new_contributor_guide.md b/docs/dev_guide/new_contributor_guide.md index 223b7d91b..1856ffd84 100644 --- a/docs/dev_guide/new_contributor_guide.md +++ b/docs/dev_guide/new_contributor_guide.md @@ -6,11 +6,11 @@ parent: Feathr Developer Guides # What can I contribute? 
All forms of contributions are welcome, including and not limited to: -* Improve or contribute new [notebook samples](https://github.com/feathr-ai/feathr/tree/main/docs/samples) +* Improve or contribute new [notebook samples](https://github.com/feathr-ai/feathr/tree/main/feathr_project/feathrcli/data/feathr_user_workspace) * Add tutorial, blog posts, tech talks etc * Increase media coverage and exposure * Improve user-facing documentation or developer-facing documentation -* Add testing code +* Add testing code * Add new features * Refactor and improve architecture * For any other forms of contribution and collaboration, don't hesitate to reach out to us. @@ -18,7 +18,7 @@ All forms of contributions are welcome, including and not limited to: # I am interested, how can I start? If you are new to this project, we recommend start with [`good-first-issue`](https://github.com/feathr-ai/feathr/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). -The issues are also labled with what types of programming language the task need. +The issues are also labled with what types of programming language the task need. * [`good-first-issue` and `Python`](https://github.com/feathr-ai/feathr/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3Apython) * [`good-first-issue` and `Scala`](https://github.com/feathr-ai/feathr/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3Ascala) * [`good-first-issue` and `Java`](https://github.com/feathr-ai/feathr/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3Ajava) diff --git a/docs/quickstart_databricks.md b/docs/quickstart_databricks.md index 30eaaa835..dff5b5f0f 100644 --- a/docs/quickstart_databricks.md +++ b/docs/quickstart_databricks.md @@ -5,13 +5,13 @@ title: Quick Start Guide with Databricks # Feathr Quick Start Guide with Databricks -For Databricks, you can simply upload [this notebook](./samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb) to your Databricks cluster and just run it in the Databricks cluster. It has been pre-configured to use the current Databricks cluster to submit jobs. +For Databricks, you can simply upload [this notebook](./samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb) to your Databricks cluster and just run it in the Databricks cluster. It has been pre-configured to use the current Databricks cluster to submit jobs. 1. Import Notebooks in your Databricks cluster: ![Import Notebooks](./images/databricks_quickstart1.png) -2. Paste the [link to Databricks getting started notebook](./samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb): +2. Paste the [link to Databricks getting started notebook](./samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb): ![Import Notebooks](./images/databricks_quickstart2.png) @@ -21,7 +21,7 @@ For Databricks, you can simply upload [this notebook](./samples/databricks/datab Although Databricks Notebooks are great tools, there are also large developer communities that prefer the usage of Visual Studio Code, where [it has native support for Python and Jupyter Notebooks](https://code.visualstudio.com/docs/datascience/jupyter-notebooks) with many great features such as syntax highlight and IntelliSense. 
-In [this notebook](./samples/databricks/databricks_quickstart_nyc_taxi_demo.ipynb), there are a few lines of code like this: +In [this notebook](./samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb), there are a few lines of code like this: ```python # Get current databricks notebook context diff --git a/docs/quickstart_synapse.md b/docs/quickstart_synapse.md index c310dd789..d07198d92 100644 --- a/docs/quickstart_synapse.md +++ b/docs/quickstart_synapse.md @@ -24,7 +24,7 @@ Feathr has native cloud integration. Here are the steps to use Feathr on Azure: 1. Follow the [Feathr ARM deployment guide](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html) to run Feathr on Azure. This allows you to quickly get started with automated deployment using Azure Resource Manager template. Alternatively, if you want to set up everything manually, you can checkout the [Feathr CLI deployment guide](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html) to run Feathr on Azure. This allows you to understand what is going on and set up one resource at a time. -2. Once the deployment is complete,run the Feathr Jupyter Notebook by clicking this button: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/feathr-ai/feathr/main?labpath=docs%2Fsamples%2Fnyc_taxi_demo.ipynb). +2. Once the deployment is complete,run the Feathr Jupyter Notebook by clicking this button: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/feathr-ai/feathr/main?labpath=feathr_project%2Ffeathrcli%2Fdata%2Ffeathr_user_workspace%2Fnyc_driver_demo.ipynb). 3. You only need to change the specified `Resource Prefix`. ## Step 2: Install Feathr diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb new file mode 100644 index 000000000..52790f884 --- /dev/null +++ b/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb @@ -0,0 +1,1442 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "384e5e16-7213-4186-9d04-09d03b155534", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Feathr Feature Store on Databricks Demo Notebook\n", + "\n", + "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. This is a notebook that's specially designed for databricks clusters and is relying on some of the databricks packages such as dbutils.\n", + "\n", + "The intent of this notebook is like \"one click run\" without configuring anything, so it has relatively limited capability. \n", + "\n", + "- For example, in this notebook there's no feature registry available since that requires running Azure Purview. \n", + "- Also for online store (Redis), you need to configure the Redis endpoint, otherwise that part will not work. \n", + "\n", + "However, the core part of Feathr, especially defining features, get offline features, point-in-time joins etc., should \"just work\". The full-fledged notebook is [located here](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Notebook Steps\n", + "\n", + "This tutorial demonstrates the key capabilities of Feathr, including:\n", + "\n", + "1. Install and set up Feathr with Azure\n", + "2. 
Create shareable features with Feathr feature definition configs.\n", + "3. Create a training dataset via point-in-time feature join.\n", + "4. Compute and write features.\n", + "5. Train a model using these features to predict fares.\n", + "6. Materialize feature value to online store.\n", + "7. Fetch feature value in real-time from online store for online scoring.\n", + "\n", + "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The feature flow is as below:\n", + "\n", + "![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/feature_flow.png?raw=true)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "f00b9d0b-94d1-418f-89b9-25bbacb8b068", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "! pip install feathr pandavro scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "80223a02-631c-40c8-91b3-a037249ffff9", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "import glob\n", + "import os\n", + "import tempfile\n", + "from datetime import datetime, timedelta\n", + "from math import sqrt\n", + "\n", + "import pandas as pd\n", + "import pandavro as pdx\n", + "from feathr import FeathrClient\n", + "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", + "from feathr import Feature, DerivedFeature, FeatureAnchor\n", + "from feathr import BackfillTime, MaterializationSettings\n", + "from feathr import FeatureQuery, ObservationSettings\n", + "from feathr import RedisSink\n", + "from feathr import INPUT_CONTEXT, HdfsSource\n", + "from feathr import WindowAggTransformation\n", + "from feathr import TypedKey\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.model_selection import train_test_split\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.keyvault.secrets import SecretClient\n", + "import json\n", + "import requests" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "41d3648a-9bc9-40dc-90da-bc82b21ef9b3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Get the required databricks credentials automatically:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "331753d6-1850-47b5-ad97-84b7c01d79d1", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": 
"ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# Get current databricks notebook context\n", + "ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n", + "host_name = ctx.tags().get(\"browserHostName\").get()\n", + "host_token = ctx.apiToken().get()\n", + "cluster_id = ctx.tags().get(\"clusterId\").get()\n", + "\n", + "\n", + "\n", + "# databricks_config = {'run_name':'FEATHR_FILL_IN','existing_cluster_id':cluster_id,'libraries':[{'jar':'FEATHR_FILL_IN'}],'spark_jar_task':{'main_class_name':'FEATHR_FILL_IN','parameters':['FEATHR_FILL_IN']}}\n", + "os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + host_name\n", + "os.environ['spark_config__databricks__config_template']='{\"run_name\":\"FEATHR_FILL_IN\",\"new_cluster\":{\"spark_version\":\"10.4.x-scala2.12\",\"node_type_id\":\"Standard_D3_v2\",\"num_workers\":2,\"spark_conf\":{\"FEATHR_FILL_IN\":\"FEATHR_FILL_IN\"}},\"libraries\":[{\"jar\":\"FEATHR_FILL_IN\"}],\"spark_jar_task\":{\"main_class_name\":\"FEATHR_FILL_IN\",\"parameters\":[\"FEATHR_FILL_IN\"]}}'\n", + "# os.environ['spark_config__databricks__config_template']=json.dumps(databricks_config)\n", + "os.environ['spark_config__databricks__work_dir']='dbfs:/feathr_getting_started'\n", + "os.environ['project_config__project_name']='feathr_getting_started'\n", + "os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = host_token" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You need to setup the Redis credentials below in order to push features to online store. You can skip this part if you don't have Redis, but there will be failures for `client.materialize_features(settings)` API." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get redis credentials; This is to parse Redis connection string.\n", + "redis_port=\"\"\n", + "redis_host=\"\"\n", + "redis_password=\"\"\n", + "redis_ssl=\"\"\n", + "\n", + "# Set the resource link\n", + "os.environ['online_store__redis__host'] = redis_host\n", + "os.environ['online_store__redis__port'] = redis_port\n", + "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", + "os.environ['REDIS_PASSWORD']=redis_password" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Configure required credentials (skip if you don't use those):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "8cd64e3a-376c-48e6-ba41-5197f3591d48", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "import tempfile\n", + "yaml_config = \"\"\"\n", + "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", + "api_version: 1\n", + "project_config:\n", + " project_name: 'feathr_getting_started2'\n", + " required_environment_variables:\n", + " - 'REDIS_PASSWORD'\n", + "offline_store:\n", + " adls:\n", + " 
adls_enabled: true\n", + " wasb:\n", + " wasb_enabled: true\n", + " s3:\n", + " s3_enabled: false\n", + " s3_endpoint: ''\n", + " jdbc:\n", + " jdbc_enabled: false\n", + " jdbc_database: ''\n", + " jdbc_table: ''\n", + " snowflake:\n", + " snowflake_enabled: false\n", + " url: \".snowflakecomputing.com\"\n", + " user: \"\"\n", + " role: \"\"\n", + "spark_config:\n", + " # choice for spark runtime. Currently support: azure_synapse, databricks\n", + " # The `databricks` configs will be ignored if `azure_synapse` is set and vice versa.\n", + " spark_cluster: \"databricks\"\n", + " spark_result_output_parts: \"1\"\n", + "\n", + "online_store:\n", + " redis:\n", + " host: '.redis.cache.windows.net'\n", + " port: 6380\n", + " ssl_enabled: True\n", + "feature_registry:\n", + " api_endpoint: \"https://.azurewebsites.net/api/v1\"\n", + "\"\"\"\n", + "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", + "with open(tmp.name, \"w\") as text_file:\n", + " text_file.write(yaml_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3fef7f2f-df19-4f53-90a5-ff7999ed983d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Initialize Feathr Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "9713a2df-c7b2-4562-88b0-b7acce3cc43a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "client = FeathrClient(config_path=tmp.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c3b64bda-d42c-4a64-b976-0fb604cf38c5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## View the data\n", + "\n", + "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The data is as below" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "c4ccd7b3-298a-4e5a-8eec-b7e309db393e", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "7430c942-64e5-4b70-b823-16ce1d1b3cee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Defining Features with Feathr\n", + "\n", + "In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. 
For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n", + "\n", + "\n", + "1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n", + "2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", + "3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "16420730-582e-4e11-a343-efc0ddd35108", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n", + "It is merely a function/transformation executing against request data at runtime.\n", + "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "728d2d5f-c11f-4941-bdc5-48507f5749f1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Define Sources Section with UDFs\n", + "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. See the python documentation to get the details on each input column." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3cc59a0e-a41b-480e-a84e-ca5443d63143", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "batch_source = HdfsSource(name=\"nycTaxiBatchSource\",\n", + " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", + " event_timestamp_column=\"lpep_dropoff_datetime\",\n", + " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "46f863c4-bb81-434a-a448-6b585031a221", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Define Anchors and Features\n", + "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "a373ecbe-a040-4cd3-9d87-0d5f4c5ba553", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "f_trip_distance = Feature(name=\"f_trip_distance\",\n", + " feature_type=FLOAT, transform=\"trip_distance\")\n", + "\n", + "features = [\n", + " f_trip_distance,\n", + " Feature(name=\"f_is_long_trip_distance\",\n", + " feature_type=BOOLEAN,\n", + " transform=\"cast_float(trip_distance)>30\"),\n", + " Feature(name=\"f_day_of_week\",\n", + " feature_type=INT32,\n", + " transform=\"dayofweek(lpep_dropoff_datetime)\"),\n", + "]\n", + "\n", + "request_anchor = FeatureAnchor(name=\"request_features\",\n", + " source=INPUT_CONTEXT,\n", + " features=features)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "149f85e2-fa3c-4895-b0c5-de5543ca9b6d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Window aggregation features\n", + "\n", + "For window aggregation features, see the supported fields below:\n", + "\n", + "Note that the `agg_func` should be any of these:\n", + "\n", + "| Aggregation Type | Input Type | Description |\n", + "| --- | --- | --- |\n", + "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", + "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", + "|LATEST| Any |Returns the latest not-null values from within the defined time window |\n", + "\n", + "\n", + "After you have defined features and sources, bring them together to build an anchor:\n", + "\n", + "\n", + "Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." 
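+ "\n",
+ "As an illustration only, and not part of the original walkthrough: a `LATEST` aggregation over the same source and key could be declared with the exact `WindowAggTransformation` pattern used in the next cell. The feature name below is hypothetical and is not used anywhere else in this notebook:\n",
+ "\n",
+ "```python\n",
+ "# Hypothetical feature, for illustration only. It reuses `location_id`, `Feature`, `FLOAT`\n",
+ "# and `WindowAggTransformation` as they are defined/imported elsewhere in this notebook.\n",
+ "f_location_latest_fare = Feature(name=\"f_location_latest_fare\",\n",
+ "                                 key=location_id,\n",
+ "                                 feature_type=FLOAT,\n",
+ "                                 transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n",
+ "                                                                   agg_func=\"LATEST\",\n",
+ "                                                                   window=\"90d\"))\n",
+ "```"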
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "05633bc3-9118-449b-9562-45fc437576c2", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "location_id = TypedKey(key_column=\"DOLocationID\",\n", + " key_column_type=ValueType.INT32,\n", + " description=\"location id in NYC\",\n", + " full_name=\"nyc_taxi.location_id\")\n", + "agg_features = [Feature(name=\"f_location_avg_fare\",\n", + " key=location_id,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", + " agg_func=\"AVG\",\n", + " window=\"90d\")),\n", + " Feature(name=\"f_location_max_fare\",\n", + " key=location_id,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", + " agg_func=\"MAX\",\n", + " window=\"90d\")),\n", + " ]\n", + "\n", + "agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", + " source=batch_source,\n", + " features=agg_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "d2ecaca9-057e-4b36-811f-320f66f753ed", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Derived Features Section\n", + "Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "270fb11e-8a71-404f-9639-ad29d8e6a2c1", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "f_trip_distance_rounded = DerivedFeature(name=\"f_trip_distance_rounded\",\n", + " feature_type=INT32,\n", + " input_features=[f_trip_distance],\n", + " transform=\"f_trip_distance * 10\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "ad102c45-586d-468c-85f0-9454401ef10b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)." 
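+ "\n",
+ "As a side note that is not part of the original walkthrough: the Derived Features section above mentions that a derived feature may also be computed from other derived features. A minimal, hypothetical sketch of such a second-level derived feature, reusing `DerivedFeature`, `INT32` and `f_trip_distance_rounded` from the cell above, could look like this:\n",
+ "\n",
+ "```python\n",
+ "# Hypothetical feature, for illustration only: a derived feature built on top of another derived feature.\n",
+ "f_trip_distance_rounded_x2 = DerivedFeature(name=\"f_trip_distance_rounded_x2\",\n",
+ "                                            feature_type=INT32,\n",
+ "                                            input_features=[f_trip_distance_rounded],\n",
+ "                                            transform=\"f_trip_distance_rounded * 2\")\n",
+ "```\n",
+ "\n",
+ "If you define a feature like this, include it in the `derived_feature_list` of the `build_features` call in the next cell."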
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "91bb5ebb-87e4-470b-b8eb-1c89b351740e", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n", + " f_trip_distance_rounded])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "632d5f46-f9e2-41a8-aab7-34f75206e2aa", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Create training data using point-in-time correct feature join\n", + "\n", + "A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n", + "\n", + "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", + "what features and how these features should be joined to the observation data. \n", + "\n", + "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "output_path = 'dbfs:/feathrazure_test.avro'\n", + "\n", + "\n", + "feature_query = FeatureQuery(\n", + " feature_list=[\"f_location_avg_fare\", \"f_trip_distance_rounded\", \"f_is_long_trip_distance\"], key=location_id)\n", + "settings = ObservationSettings(\n", + " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", + " event_timestamp_column=\"lpep_dropoff_datetime\",\n", + " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")\n", + "client.get_offline_features(observation_settings=settings,\n", + " feature_query=feature_query,\n", + " output_path=output_path\n", + " )\n", + "client.wait_job_to_finish(timeout_sec=500)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "51f078e3-3f8f-4f10-b7f1-499ac8a9ff07", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Download the result and show the result\n", + "\n", + "Let's use the helper function `get_result_df` to download the result and view it:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "23c797b2-ac1a-4cf3-b0ed-c05216de3f37", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + 
"errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "from feathr.utils.job_utils import get_result_df\n", + "df_res = get_result_df(client, format=\"avro\", res_url = output_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "b9be042e-eb12-46b9-9d91-a0e5dd0c704f", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_res" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Train a machine learning model\n", + "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "84745f36-5bac-49c0-903b-38828b923c7c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# remove columns\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "final_df = df_res\n", + "final_df.drop([\"lpep_pickup_datetime\", \"lpep_dropoff_datetime\",\n", + " \"store_and_fwd_flag\"], axis=1, inplace=True, errors='ignore')\n", + "final_df.fillna(0, inplace=True)\n", + "final_df['fare_amount'] = final_df['fare_amount'].astype(\"float64\")\n", + "\n", + "\n", + "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"fare_amount\"], axis=1),\n", + " final_df[\"fare_amount\"],\n", + " test_size=0.2,\n", + " random_state=42)\n", + "model = GradientBoostingRegressor()\n", + "model.fit(train_x, train_y)\n", + "\n", + "y_predict = model.predict(test_x)\n", + "\n", + "y_actual = test_y.values.flatten().tolist()\n", + "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", + "\n", + "sum_actuals = sum_errors = 0\n", + "\n", + "for actual_val, predict_val in zip(y_actual, y_predict):\n", + " abs_error = actual_val - predict_val\n", + " if abs_error < 0:\n", + " abs_error = abs_error * -1\n", + "\n", + " sum_errors = sum_errors + abs_error\n", + " sum_actuals = sum_actuals + actual_val\n", + "\n", + "mean_abs_percent_error = sum_errors / sum_actuals\n", + "print(\"Model MAPE:\")\n", + "print(mean_abs_percent_error)\n", + "print()\n", + "print(\"Model Accuracy:\")\n", + "print(1 - mean_abs_percent_error)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "5a226026-1c7b-48db-8f91-88d5c2ddf023", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Materialize feature value into offline/online storage\n", + "\n", + "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", + "and 
materialize the feature value to offline and/or online storage. \n", + "\n", + "We can push the generated features to the online store like below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "3b924c66-8634-42fe-90f3-c844487d3f75", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "backfill_time = BackfillTime(start=datetime(\n", + " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", + "redisSink = RedisSink(table_name=\"nycTaxiDemoFeature\")\n", + "settings = MaterializationSettings(\"nycTaxiTable\",\n", + " backfill_time=backfill_time,\n", + " sinks=[redisSink],\n", + " feature_names=[\"f_location_avg_fare\", \"f_location_max_fare\"])\n", + "\n", + "client.materialize_features(settings)\n", + "client.wait_job_to_finish(timeout_sec=500)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd", + "showTitle": false, + "title": "" + } + }, + "source": [ + "We can then get the features from the online store (Redis):" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "bef93538-9591-4247-97b6-289d2055b7b1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Fetching feature value for online inference\n", + "\n", + "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", + "`get_online_features` or `multi_get_online_features` API." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "0c3d5f35-11a3-4644-9992-5860169d8302", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "res = client.get_online_features('nycTaxiDemoFeature', '265', [\n", + " 'f_location_avg_fare', 'f_location_max_fare'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "inputWidgets": {}, + "nuid": "4d4699ed-42e6-408f-903d-2f799284f4b6", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "arguments": {}, + "data": "", + "errorSummary": "", + "errorTraceType": null, + "metadata": {}, + "type": "ipynbError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "client.multi_get_online_features(\"nycTaxiDemoFeature\", [\"239\", \"265\"], [\n", + " 'f_location_avg_fare', 'f_location_max_fare'])" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "nyc_driver_demo", + "notebookOrigID": 930353059183053, + "widgets": {} + }, + "interpreter": { + "hash": "830c16c5b424e7ff512f67d4056b67cea1a756a7ad6a92c98b9e2b95c5e484ae" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/feathr_project/feathr/client.py b/feathr_project/feathr/client.py index dd39a70fa..b14bf868e 100644 --- a/feathr_project/feathr/client.py +++ b/feathr_project/feathr/client.py @@ -486,7 +486,7 @@ def _get_offline_features_with_config(self, job_tags = {OUTPUT_PATH_TAG:feature_join_job_params.job_output_path} # set output format in job tags if it's set by user, so that it can be used to parse the job result in the helper function if execution_configurations is not None and OUTPUT_FORMAT in execution_configurations: - job_tags[OUTPUT_FORMAT] = execution_configurations[OUTPUT_FORMAT] + job_tags[OUTPUT_FORMAT]= execution_configurations[OUTPUT_FORMAT] ''' - Job tags are for job metadata and it's not passed to the actual spark job (i.e. not visible to spark job), more like a platform related thing that Feathr want to add (currently job tags only have job output URL and job output format, ). They are carried over with the job and is visible to every Feathr client. Think this more like some customized metadata for the job which would be weird to be put in the spark job itself. - Job arguments (or sometimes called job parameters)are the arguments which are command line arguments passed into the actual spark job. This is usually highly related with the spark job. In Feathr it's like the input to the scala spark CLI. 
They are usually not spark specific (for example if we want to specify the location of the feature files, or want to diff --git a/feathr_project/feathr/datasets/__init__.py b/feathr_project/feathr/datasets/__init__.py deleted file mode 100644 index a1e2e5bf3..000000000 --- a/feathr_project/feathr/datasets/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Utilities for downloading sample datasets""" - -from feathr.datasets.constants import ( - NYC_TAXI_SMALL_URL -) - -__all__ = [ - "NYC_TAXI_SMALL_URL", -] diff --git a/feathr_project/feathr/datasets/constants.py b/feathr_project/feathr/datasets/constants.py deleted file mode 100644 index 849865570..000000000 --- a/feathr_project/feathr/datasets/constants.py +++ /dev/null @@ -1,3 +0,0 @@ -NYC_TAXI_SMALL_URL = ( - "https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv" -) diff --git a/feathr_project/feathr/datasets/nyc_taxi.py b/feathr_project/feathr/datasets/nyc_taxi.py deleted file mode 100644 index ec605aae6..000000000 --- a/feathr_project/feathr/datasets/nyc_taxi.py +++ /dev/null @@ -1,87 +0,0 @@ -from pathlib import Path -from tempfile import TemporaryDirectory -from threading import local -from urllib.parse import urlparse - -import pandas as pd -from pyspark.sql import DataFrame, SparkSession - -from feathr.datasets import NYC_TAXI_SMALL_URL -from feathr.datasets.utils import maybe_download -from feathr.utils.platform import is_databricks - - -def get_pandas_df( - local_cache_path: str = None, -) -> pd.DataFrame: - """Get NYC taxi fare prediction data samples as a pandas DataFrame. - - Refs: - https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page - - Args: - local_cache_path (optional): Local cache file path to download the data set. - If local_cache_path is a directory, the source file name will be added. - - Returns: - pandas DataFrame - """ - # if local_cache_path params is not provided then create a temporary folder - if local_cache_path is None: - local_cache_path = TemporaryDirectory().name - - # If local_cache_path is a directory, add the source file name. - src_filepath = Path(urlparse(NYC_TAXI_SMALL_URL).path) - dst_path = Path(local_cache_path) - if dst_path.suffix != src_filepath.suffix: - local_cache_path = str(dst_path.joinpath(src_filepath.name)) - - maybe_download(src_url=NYC_TAXI_SMALL_URL, dst_filepath=local_cache_path) - - pdf = pd.read_csv(local_cache_path) - - return pdf - - -def get_spark_df( - spark: SparkSession, - local_cache_path: str, -) -> DataFrame: - """Get NYC taxi fare prediction data samples as a spark DataFrame. - - Refs: - https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page - - Args: - spark: Spark session. - local_cache_path: Local cache file path to download the data set. - If local_cache_path is a directory, the source file name will be added. - - Returns: - Spark DataFrame - """ - # In spark, local_cache_path should be a persist directory or file path - if local_cache_path is None: - raise ValueError("In spark, `local_cache_path` should be a persist directory or file path.") - - # If local_cache_path is a directory, add the source file name. 
- src_filepath = Path(urlparse(NYC_TAXI_SMALL_URL).path) - dst_path = Path(local_cache_path) - if dst_path.suffix != src_filepath.suffix: - local_cache_path = str(dst_path.joinpath(src_filepath.name)) - - if is_databricks(): - # Databricks uses "dbfs:/" prefix for spark paths - if not local_cache_path.startswith("dbfs:"): - local_cache_path = str(Path("dbfs:", local_cache_path.lstrip("/"))) - # Databricks uses "/dbfs/" prefix for python paths - python_local_cache_path = local_cache_path.replace("dbfs:", "/dbfs") - # TODO add "if is_synapse()" - else: - python_local_cache_path = local_cache_path - - maybe_download(src_url=NYC_TAXI_SMALL_URL, dst_filepath=python_local_cache_path) - - df = spark.read.option("header", True).csv(local_cache_path) - - return df diff --git a/feathr_project/feathr/datasets/utils.py b/feathr_project/feathr/datasets/utils.py deleted file mode 100644 index 5dcfb6e87..000000000 --- a/feathr_project/feathr/datasets/utils.py +++ /dev/null @@ -1,64 +0,0 @@ -"""Dataset utilities -""" -import logging -import math -from pathlib import Path -import requests -from urllib.parse import urlparse - -from tqdm import tqdm - - -log = logging.getLogger(__name__) - - -def maybe_download(src_url: str, dst_filepath: str, expected_bytes=None) -> bool: - """Check if file exists. If not, download and return True. Else, return False. - - Refs: - https://github.com/microsoft/recommenders/blob/main/recommenders/datasets/download_utils.py - - Args: - src_url: Source file URL. - dst_filepath: Destination file path. - expected_bytes (optional): Expected bytes of the file to verify. - - Returns: - bool: Whether the file was downloaded or not - """ - dst_filepath = Path(dst_filepath) - - if dst_filepath.is_file(): - log.info(f"File {str(dst_filepath)} already exists") - return False - - # Check dir if exists. If not, create one - dst_filepath.parent.mkdir(parents=True, exist_ok=True) - - response = requests.get(src_url, stream=True) - if response.status_code == 200: - log.info(f"Downloading {src_url}") - total_size = int(response.headers.get("content-length", 0)) - block_size = 1024 - num_iterables = math.ceil(total_size / block_size) - with open(str(dst_filepath.resolve()), "wb") as file: - for data in tqdm( - response.iter_content(block_size), - total=num_iterables, - unit="KB", - unit_scale=True, - ): - file.write(data) - - # Verify the file size - if expected_bytes is not None and expected_bytes != dst_filepath.stat().st_size: - # Delete the file since the size is not the same as the expected one. - dst_filepath.unlink() - raise IOError(f"Failed to verify {str(dst_filepath)}. 
Maybe interrupted while downloading?") - else: - return True - - else: - response.raise_for_status() - # If not HTTPError yet still cannot download - raise Exception(f"Problem downloading {src_url}") diff --git a/feathr_project/feathr/spark_provider/_databricks_submission.py b/feathr_project/feathr/spark_provider/_databricks_submission.py index cc587e999..cfff0180e 100644 --- a/feathr_project/feathr/spark_provider/_databricks_submission.py +++ b/feathr_project/feathr/spark_provider/_databricks_submission.py @@ -1,65 +1,67 @@ -from collections import namedtuple +from ast import Raise import copy import json import os +import time +from collections import namedtuple from os.path import basename from pathlib import Path -import time -from typing import Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union from urllib.parse import urlparse from urllib.request import urlopen +import requests from databricks_cli.dbfs.api import DbfsApi from databricks_cli.runs.api import RunsApi from databricks_cli.sdk.api_client import ApiClient -from loguru import logger -import requests -from requests.structures import CaseInsensitiveDict - from feathr.constants import * from feathr.spark_provider._abc import SparkJobLauncher +from loguru import logger +from requests.structures import CaseInsensitiveDict class _FeathrDatabricksJobLauncher(SparkJobLauncher): """Class to interact with Databricks Spark cluster - This is a light-weight databricks job runner, users should use the provided template json string to get more fine controlled environment for databricks cluster. - For example, user can control whether to use a new cluster to run the job or not, specify the cluster ID, running frequency, node size, workder no., whether to send out failed notification email, etc. - This runner will only fill in necessary arguments in the JSON template. - - This class will read from the provided configs string, and do the following steps. - This default template can be overwritten by users, but users need to make sure the template is compatible with the default template. Specifically: - 1. it's a SparkJarTask (rather than other types of jobs, say NotebookTask or others). See https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit for more details - 2. Use the Feathr Jar to run the job (hence will add an entry in `libraries` section) - 3. Will override `main_class_name` and `parameters` field in the JSON template `spark_jar_task` field - 4. will override the name of this job - - Args: - workspace_instance_url (str): the workinstance url. Document to get workspace_instance_url: https://docs.microsoft.com/en-us/azure/databricks/workspace/workspace-details#workspace-url - token_value (str): see here on how to get tokens: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/authentication - config_template (str): config template for databricks cluster. See https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit for more details. - databricks_work_dir (_type_, optional): databricks_work_dir must start with dbfs:/. Defaults to 'dbfs:/feathr_jobs'. - """ + This is a light-weight databricks job runner, users should use the provided template json string to get more fine controlled environment for databricks cluster. + For example, user can control whether to use a new cluster to run the job or not, specify the cluster ID, running frequency, node size, workder no., whether to send out failed notification email, etc. 
+ This runner will only fill in necessary arguments in the JSON template. + + This class will read from the provided configs string, and do the following steps. + This default template can be overwritten by users, but users need to make sure the template is compatible with the default template. Specifically: + 1. it's a SparkJarTask (rather than other types of jobs, say NotebookTask or others). See https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit for more details + 2. Use the Feathr Jar to run the job (hence will add an entry in `libraries` section) + 3. Only supports `new_cluster` type for now + 4. Will override `main_class_name` and `parameters` field in the JSON template `spark_jar_task` field + 5. will override the name of this job + Args: + workspace_instance_url (str): the workinstance url. Document to get workspace_instance_url: https://docs.microsoft.com/en-us/azure/databricks/workspace/workspace-details#workspace-url + token_value (str): see here on how to get tokens: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/authentication + config_template (str): config template for databricks cluster. See https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runs-submit for more details. + databricks_work_dir (_type_, optional): databricks_work_dir must start with dbfs:/. Defaults to 'dbfs:/feathr_jobs'. + """ def __init__( - self, - workspace_instance_url: str, - token_value: str, - config_template: Union[str, Dict], - databricks_work_dir: str = "dbfs:/feathr_jobs", + self, + workspace_instance_url: str, + token_value: str, + config_template: Union[str,Dict], + databricks_work_dir: str = 'dbfs:/feathr_jobs', ): + + # Below we will use Databricks job APIs (as well as many other APIs) to submit jobs or transfer files # For Job APIs, see https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs # for DBFS APIs, see: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/dbfs self.config_template = config_template # remove possible trailing '/' due to wrong input format - self.workspace_instance_url = workspace_instance_url.rstrip("/") + self.workspace_instance_url = workspace_instance_url.rstrip('/') self.auth_headers = CaseInsensitiveDict() # Authenticate the REST APIs. 
Documentation: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/authentication - self.auth_headers["Accept"] = "application/json" - self.auth_headers["Authorization"] = f"Bearer {token_value}" + self.auth_headers['Accept'] = 'application/json' + self.auth_headers['Authorization'] = f'Bearer {token_value}' self.databricks_work_dir = databricks_work_dir - self.api_client = ApiClient(host=self.workspace_instance_url, token=token_value) + self.api_client = ApiClient(host=self.workspace_instance_url,token=token_value) def upload_or_get_cloud_path(self, local_path_or_http_path: str): """ @@ -75,7 +77,7 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): with urlopen(local_path_or_http_path) as f: # use REST API to avoid local temp file data = f.read() - files = {"file": data} + files = {'file': data} # for DBFS APIs, see: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/latest/dbfs r = requests.post(url=self.workspace_instance_url+'/api/2.0/dbfs/put', headers=self.auth_headers, files=files, data={'overwrite': 'true', 'path': cloud_dest_path}) @@ -88,12 +90,8 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str): cloud_dest_path = local_path_or_http_path elif src_parse_result.scheme.startswith(('wasb','s3','gs')): # if the path starts with a location that's not a local path - logger.error( - "File {} cannot be downloaded. Please upload the file to dbfs manually.", local_path_or_http_path - ) - raise RuntimeError( - f"File {local_path_or_http_path} cannot be downloaded. Please upload the file to dbfs manually." - ) + logger.error("File {} cannot be downloaded. Please upload the file to dbfs manually.", local_path_or_http_path) + raise RuntimeError(f"File {local_path_or_http_path} cannot be downloaded. Please upload the file to dbfs manually.") else: # else it should be a local file path or dir if os.path.isdir(local_path_or_http_path): @@ -124,18 +122,7 @@ def _upload_local_file_to_workspace(self, local_path: str) -> str: raise RuntimeError(f"The source path: {local_path}, or the destination path: {cloud_dest_path}, is/are not valid.") from e return cloud_dest_path - def submit_feathr_job( - self, - job_name: str, - main_jar_path: str, - main_class_name: str, - arguments: List[str], - python_files: List[str], - reference_files_path: List[str] = [], - job_tags: Dict[str, str] = None, - configuration: Dict[str, str] = {}, - properties: Dict[str, str] = {}, - ): + def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str], python_files: List[str], reference_files_path: List[str] = [], job_tags: Dict[str, str] = None, configuration: Dict[str, str] = {}, properties: Dict[str, str] = {}): """ submit the feathr job to databricks Refer to the databricks doc for more details on the meaning of the parameters: @@ -159,93 +146,72 @@ def submit_feathr_job( # otherwise users might have missed the quotes in the config. Treat them as dict # Note that we need to use deep copy here, in order to make `self.config_template` immutable # Otherwise, since we need to change submission_params later, which will modify `self.config_template` and cause unexpected behaviors - submission_params = copy.deepcopy(self.config_template) - - submission_params["run_name"] = job_name - cfg = configuration.copy() - if "existing_cluster_id" in submission_params: - logger.info("Using an existing general purpose cluster to run the feathr job...") - if cfg: - logger.warning( - "Spark execution configuration will be ignored. 
To use job-specific spark configs, please use a new job cluster or set the configs via Databricks UI." - ) - if job_tags: - logger.warning( - "Job tags will be ignored. To assign job tags to the cluster, please use a new job cluster." - ) - elif "new_cluster" in submission_params: - logger.info("Using a new job cluster to run the feathr job...") + submission_params = copy.deepcopy(self.config_template) + + submission_params['run_name'] = job_name + if 'existing_cluster_id' not in submission_params: # if users don't specify existing_cluster_id # Solving this issue: Handshake fails trying to connect from Azure Databricks to Azure PostgreSQL with SSL # https://docs.microsoft.com/en-us/answers/questions/170730/handshake-fails-trying-to-connect-from-azure-datab.html - cfg["spark.executor.extraJavaOptions"] = "-Djava.security.properties=" - cfg["spark.driver.extraJavaOptions"] = "-Djava.security.properties=" - submission_params["new_cluster"]["spark_conf"] = cfg + configuration['spark.executor.extraJavaOptions'] = '-Djava.security.properties=' + configuration['spark.driver.extraJavaOptions'] = '-Djava.security.properties=' + submission_params['new_cluster']['spark_conf'] = configuration if job_tags: - custom_tags = submission_params["new_cluster"].get("custom_tags", {}) + custom_tags = submission_params['new_cluster'].get('custom_tags', {}) for tag, value in job_tags.items(): custom_tags[tag] = value - submission_params["new_cluster"]["custom_tags"] = custom_tags - else: - # TODO we should fail fast -- maybe check this in config verification while initializing the client. - raise ValueError( - "No cluster specifications are found. Either 'existing_cluster_id' or 'new_cluster' should be configured via feathr config." - ) + submission_params['new_cluster']['custom_tags'] = custom_tags # the feathr main jar file is anyway needed regardless it's pyspark or scala spark if not main_jar_path: logger.info(f"Main JAR file is not set, using default package '{FEATHR_MAVEN_ARTIFACT}' from Maven") - submission_params["libraries"][0]["maven"] = {"coordinates": FEATHR_MAVEN_ARTIFACT} + submission_params['libraries'][0]['maven'] = { "coordinates": FEATHR_MAVEN_ARTIFACT } else: - submission_params["libraries"][0]["jar"] = self.upload_or_get_cloud_path(main_jar_path) + submission_params['libraries'][0]['jar'] = self.upload_or_get_cloud_path(main_jar_path) # see here for the submission parameter definition https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--request-structure-6 if python_files: # this is a pyspark job. definition here: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--sparkpythontask # the first file is the pyspark driver code. 
we only need the driver code to execute pyspark - param_and_file_dict = { - "parameters": arguments, - "python_file": self.upload_or_get_cloud_path(python_files[0]), - } + param_and_file_dict = {"parameters": arguments, "python_file": self.upload_or_get_cloud_path(python_files[0])} # indicates this is a pyspark job # `setdefault` method will get the value of the "spark_python_task" item, if the "spark_python_task" item does not exist, insert "spark_python_task" with the value "param_and_file_dict": - submission_params.setdefault("spark_python_task", param_and_file_dict) + submission_params.setdefault('spark_python_task',param_and_file_dict) else: # this is a scala spark job - submission_params["spark_jar_task"]["parameters"] = arguments - submission_params["spark_jar_task"]["main_class_name"] = main_class_name + submission_params['spark_jar_task']['parameters'] = arguments + submission_params['spark_jar_task']['main_class_name'] = main_class_name result = RunsApi(self.api_client).submit_run(submission_params) try: # see if we can parse the returned result - self.res_job_id = result["run_id"] + self.res_job_id = result['run_id'] except: - logger.error( - "Submitting Feathr job to Databricks cluster failed. Message returned from Databricks: {}", result - ) + logger.error("Submitting Feathr job to Databricks cluster failed. Message returned from Databricks: {}", result) exit(1) result = RunsApi(self.api_client).get_run(self.res_job_id) - self.job_url = result["run_page_url"] - logger.info("Feathr job Submitted Successfully. View more details here: {}", self.job_url) + self.job_url = result['run_page_url'] + logger.info('Feathr job Submitted Successfully. View more details here: {}', self.job_url) # return ID as the submission result return self.res_job_id def wait_for_completion(self, timeout_seconds: Optional[int] = 600) -> bool: - """Returns true if the job completed successfully""" + """ Returns true if the job completed successfully + """ start_time = time.time() while (timeout_seconds is None) or (time.time() - start_time < timeout_seconds): status = self.get_status() - logger.debug("Current Spark job status: {}", status) + logger.debug('Current Spark job status: {}', status) # see all the status here: # https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runlifecyclestate # https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--runresultstate - if status in {"SUCCESS"}: + if status in {'SUCCESS'}: return True - elif status in {"INTERNAL_ERROR", "FAILED", "TIMEDOUT", "CANCELED"}: + elif status in {'INTERNAL_ERROR', 'FAILED', 'TIMEDOUT', 'CANCELED'}: result = RunsApi(self.api_client).get_run_output(self.res_job_id) # See here for the returned fields: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--response-structure-8 # print out logs and stack trace if the job has failed @@ -258,14 +224,14 @@ def wait_for_completion(self, timeout_seconds: Optional[int] = 600) -> bool: else: time.sleep(30) else: - raise TimeoutError("Timeout waiting for Feathr job to complete") + raise TimeoutError('Timeout waiting for Feathr job to complete') def get_status(self) -> str: assert self.res_job_id is not None result = RunsApi(self.api_client).get_run(self.res_job_id) # first try to get result state. 
it might not be available, and if that's the case, try to get life_cycle_state # see result structure: https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--response-structure-6 - res_state = result["state"].get("result_state") or result["state"]["life_cycle_state"] + res_state = result['state'].get('result_state') or result['state']['life_cycle_state'] assert res_state is not None return res_state @@ -279,6 +245,7 @@ def get_job_result_uri(self) -> str: # in case users call this API even when there's no tags available return None if custom_tags is None else custom_tags[OUTPUT_PATH_TAG] + def get_job_tags(self) -> Dict[str, str]: """Get job tags @@ -289,23 +256,21 @@ def get_job_tags(self) -> Dict[str, str]: # For result structure, see https://docs.microsoft.com/en-us/azure/databricks/dev-tools/api/2.0/jobs#--response-structure-6 result = RunsApi(self.api_client).get_run(self.res_job_id) - if "new_cluster" in result["cluster_spec"]: - custom_tags = result["cluster_spec"]["new_cluster"]["custom_tags"] + if 'new_cluster' in result['cluster_spec']: + custom_tags = result['cluster_spec']['new_cluster']['custom_tags'] return custom_tags else: # this is not a new cluster; it's an existing cluster. - logger.warning( - "Job tags are not available since you are using an existing Databricks cluster. Consider using 'new_cluster' in databricks configuration." - ) + logger.warning("Job tags are not available since you are using an existing Databricks cluster. Consider using 'new_cluster' in databricks configuration.") return None + def download_result(self, result_path: str, local_folder: str): """ Supports downloading files from the result folder. Only support paths starts with `dbfs:/` and only support downloading files in one folder (per Spark's design, everything will be in the result folder in a flat manner) """ - if not result_path.startswith("dbfs"): - raise RuntimeError( - 'Currently only paths starting with dbfs is supported for downloading results from a databricks cluster. The path should start with "dbfs:" .' - ) + if not result_path.startswith('dbfs'): + raise RuntimeError('Currently only paths starting with dbfs is supported for downloading results from a databricks cluster. The path should start with \"dbfs:\" .') DbfsApi(self.api_client).cp(recursive=True, overwrite=True, src=result_path, dst=local_folder) + diff --git a/feathr_project/feathr/utils/config.py b/feathr_project/feathr/utils/config.py deleted file mode 100644 index 9a9438567..000000000 --- a/feathr_project/feathr/utils/config.py +++ /dev/null @@ -1,61 +0,0 @@ -from tempfile import NamedTemporaryFile - - -FEATHR_CONFIG_TEMPLATE = """ -api_version: 1 - -project_config: - project_name: {project_name} - -feature_registry: - api_endpoint: 'https://{resource_prefix}webapp.azurewebsites.net/api/v1' - -spark_config: - # Currently support: 'azure_synapse', 'databricks', and 'local' - spark_cluster: {spark_cluster} - spark_result_output_parts: '1' - -offline_store: - wasb: - wasb_enabled: true - -online_store: - # You can skip this part if you don't have Redis and skip materialization later in this notebook. - redis: - host: '{resource_prefix}redis.redis.cache.windows.net' - port: 6380 - ssl_enabled: true -""" - - -def generate_config( - resource_prefix: str, - project_name: str, - spark_cluster: str, - output_filepath: str = None, -) -> str: - """Generate a feathr config yaml file - - Args: - resource_prefix: Resource name prefix. - project_name: Project name. - spark_cluster: Spark cluster to use. 
Either 'local', 'databricks', or 'azure_synapse'. - output_filepath: Output filepath. - - Returns: - str: Generated config file path. output_filepath if provided. Otherwise, NamedTemporaryFile path. - """ - - conf_str = FEATHR_CONFIG_TEMPLATE.format( - resource_prefix=resource_prefix, - project_name=project_name, - spark_cluster=spark_cluster, - ) - - if not output_filepath: - output_filepath = NamedTemporaryFile(mode="w", delete=False).name - - with open(output_filepath, "w") as conf_file: - conf_file.write(conf_str) - - return output_filepath diff --git a/feathr_project/feathr/utils/job_utils.py b/feathr_project/feathr/utils/job_utils.py index 815e26c21..6a6bd63c0 100644 --- a/feathr_project/feathr/utils/job_utils.py +++ b/feathr_project/feathr/utils/job_utils.py @@ -1,187 +1,77 @@ -import glob +from feathr.client import FeathrClient import os -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Union - +import glob +from feathr.constants import OUTPUT_FORMAT from loguru import logger import pandas as pd +import tempfile from pandas.errors import EmptyDataError -from pyspark.sql import DataFrame, SparkSession - -from feathr.client import FeathrClient -from feathr.constants import OUTPUT_FORMAT -def get_result_pandas_df( - client: FeathrClient, - data_format: str = None, - res_url: str = None, - local_cache_path: str = None, -) -> pd.DataFrame: - """Download the job result dataset from cloud as a Pandas DataFrame. - Args: - client: Feathr client - data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. - Default to `avro` if not specified. - res_url: Result URL to download files from. Note that this will not block the job so you need to make sure - the job is finished and the result URL contains actual data. - local_cache_path (optional): Specify the absolute download path. if the user does not provide this, - the function will create a temporary directory. +def get_result_df(client: FeathrClient, format: str = None, res_url: str = None, local_folder: str = None) -> pd.DataFrame: + """Download the job result dataset from cloud as a Pandas dataframe to make it easier for the client to read. - Returns: - pandas DataFrame + format: format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. Default to `avro` if not specified. + res_url: output URL to download files. Note that this will not block the job so you need to make sure the job is finished and result URL contains actual data. + local_folder: optional parameter to specify the absolute download path. if the user does not provide this, function will create a temporary directory and delete it after reading the dataframe. """ - return get_result_df(client, data_format, res_url, local_cache_path) - - -def get_result_spark_df( - spark: SparkSession, - client: FeathrClient, - data_format: str = None, - res_url: str = None, - local_cache_path: str = None, -) -> DataFrame: - """Download the job result dataset from cloud as a Spark DataFrame. - - Args: - spark: Spark session - client: Feathr client - data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. - Default to `avro` if not specified. - res_url: Result URL to download files from. Note that this will not block the job so you need to make sure - the job is finished and the result URL contains actual data. - local_cache_path (optional): Specify the absolute download path. 
if the user does not provide this, - the function will create a temporary directory. - - Returns: - Spark DataFrame - """ - return get_result_df(client, data_format, res_url, local_cache_path, spark=spark) - - -def get_result_df( - client: FeathrClient, - data_format: str = None, - res_url: str = None, - local_cache_path: str = None, - spark: SparkSession = None, -) -> Union[DataFrame, pd.DataFrame]: - """Download the job result dataset from cloud as a Spark DataFrame or pandas DataFrame. - - Args: - client: Feathr client - data_format: Format to read the downloaded files. Currently support `parquet`, `delta`, `avro`, and `csv`. - Default to `avro` if not specified. - res_url: Result URL to download files from. Note that this will not block the job so you need to make sure - the job is finished and the result URL contains actual data. - local_cache_path (optional): Specify the absolute download path. if the user does not provide this, - the function will create a temporary directory. - spark (optional): Spark session. If provided, the function returns spark Dataframe. - Otherwise, it returns pd.DataFrame. - - Returns: - Either Spark or pandas DataFrame. - """ - # use a result url if it's provided by the user, otherwise use the one provided by the job + # use a result url if it's provided by the user, otherwise use the one provided by the job res_url: str = res_url or client.get_job_result_uri(block=True, timeout_sec=1200) if res_url is None: - raise RuntimeError( - "res_url is None. Please make sure either you provide a res_url or make sure the job finished in FeathrClient has a valid result URI." - ) - - if client.spark_runtime == "local": - if local_cache_path is not None: - logger.warning( - "In local spark mode, the result files are expected to be stored at a local storage and thus `local_cache_path` argument will be ignored." - ) - local_cache_path = res_url - elif client.spark_runtime == "databricks": - if res_url.startswith("dbfs:"): - logger.warning( - "Result files are already in DBFS and thus `local_cache_path` will be ignored." - ) - local_cache_path = res_url - else: - # if local_cache_path params is not provided then create a temporary folder - if local_cache_path is None: - # We'll just use the name of a local TemporaryDirectory to cache the data into DBFS. - local_cache_path = TemporaryDirectory().name - - # Databricks uses "dbfs:/" prefix for spark paths - if not local_cache_path.startswith("dbfs:"): - local_cache_path = str(Path("dbfs:", local_cache_path.lstrip("/"))) - # TODO elif azure_synapse - - if local_cache_path != res_url: - logger.info(f"{res_url} files will be downloaded into {local_cache_path}") - client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=local_cache_path) + raise RuntimeError("res_url is None. Please make sure either you provide a res_url or make sure the job finished in FeathrClient has a valid result URI.") - # use user provided format, if there isn't one, then otherwise use the one provided by the job; + # use user provided format, if there isn't one, then otherwise use the one provided by the job; # if none of them is available, "avro" is the default format. 
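As an illustrative aside, the precedence the comment above describes — an explicit format argument first, then the job's OUTPUT_FORMAT tag, then an "avro" default — can be sketched on its own. The helper name below is hypothetical, and the plain "output_format" string stands in for the OUTPUT_FORMAT constant from feathr.constants.

def resolve_output_format(requested: str = None, job_tags: dict = None, default: str = "avro") -> str:
    # Return the first non-empty value among the caller's request, the job tag, and the default.
    tag_value = (job_tags or {}).get("output_format", "")  # stand-in for feathr.constants.OUTPUT_FORMAT
    return requested or tag_value or default

assert resolve_output_format("parquet", {"output_format": "csv"}) == "parquet"
assert resolve_output_format(None, {"output_format": "csv"}) == "csv"
assert resolve_output_format() == "avro"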
- data_format: str = data_format or client.get_job_tags().get(OUTPUT_FORMAT, "") - if data_format is None or data_format == "": - data_format = "avro" + format: str = format or client.get_job_tags().get(OUTPUT_FORMAT, "") + if format is None or format == "": + format = "avro" - result_df = None - - if spark is not None: - result_df = spark.read.format(data_format).load(local_cache_path) + # if local_folder params is not provided then create a temporary folder + if local_folder is not None: + local_dir_path = local_folder else: - result_df = _read_files_to_pandas_df( - dir_path=local_cache_path.replace("dbfs:", "/dbfs"), # replace to python path if spark path is provided. - data_format=data_format, - ) - - return result_df - - -def _read_files_to_pandas_df(dir_path: str, data_format: str = "avro") -> pd.DataFrame: - - if data_format == "parquet": + tmp_dir = tempfile.TemporaryDirectory() + local_dir_path = tmp_dir.name + + client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=local_dir_path) + dataframe_list = [] + # by default the result are in avro format + if format.casefold()=="parquet": + files = glob.glob(os.path.join(local_dir_path, '*.parquet')) from pyarrow.parquet import ParquetDataset - - files = glob.glob(os.path.join(dir_path, "*.parquet")) ds = ParquetDataset(files) - return ds.read().to_pandas() - - elif data_format == "delta": + result_df = ds.read().to_pandas() + elif format.casefold()=="delta": from deltalake import DeltaTable - - delta = DeltaTable(dir_path) - # if client.spark_runtime != "azure_synapse": - # don't detect for synapse result with Delta as there's a problem with underlying system - # Issues are tracked here: https://github.com/delta-io/delta-rs/issues/582 - return delta.to_pyarrow_table().to_pandas() - # else: - # TODO -- Proper warning messages. Is this applied to all the other formats? - # raise RuntimeError( - # "Please use Azure Synapse to read the result in the Azure Synapse cluster. Reading local results is not supported for Azure Synapse." - # ) - - elif data_format == "avro": + delta = DeltaTable(local_dir_path) + if not client.spark_runtime == 'azure_synapse': + # don't detect for synapse result with Delta as there's a problem with underlying system + # Issues are tracked here: https://github.com/delta-io/delta-rs/issues/582 + result_df = delta.to_pyarrow_table().to_pandas() + else: + logger.info("Please use Azure Synapse to read the result in the Azure Synapse cluster. Reading local results is not supported for Azure Synapse. Empty DataFrame is returned.") + result_df = pd.DataFrame() + elif format.casefold()=="avro": import pandavro as pdx - - dataframe_list = [pdx.read_avro(file) for file in glob.glob(os.path.join(dir_path, "*.avro"))] - return pd.concat(dataframe_list, axis=0) - - elif data_format == "csv": - dataframe_list = [] - for file in glob.glob(os.path.join(dir_path, "*.csv")): + for file in glob.glob(os.path.join(local_dir_path, '*.avro')): + dataframe_list.append(pdx.read_avro(file)) + result_df = pd.concat(dataframe_list, axis=0) + elif format.casefold()=="csv": + for file in glob.glob(os.path.join(local_dir_path, '*.csv')): try: - dataframe_list.append(pd.read_csv(file, index_col=None, header=None)) + df = pd.read_csv(file, index_col=None, header=None) except EmptyDataError: # in case there are empty files - pass - - if dataframe_list: - # Reset index to avoid duplicated indices -- TODO don't we need reset_index when reading avro too? 
- return pd.concat(dataframe_list, axis=0).reset_index(drop=True) - else: - raise ValueError(f"Empty files in {dir_path}.") - + df = pd.DataFrame() + dataframe_list.append(df) + result_df = pd.concat(dataframe_list, axis=0) + # Reset index to avoid duplicated indices + result_df.reset_index(drop=True) else: - raise ValueError( - f"{data_format} is currently not supported in get_result_df. Currently only parquet, delta, avro, and csv are supported, please consider writing a customized function to read the result." - ) + raise RuntimeError(f"{format} is currently not supported in get_result_df. Currently only parquet, delta, avro, and csv are supported, please consider writing a customized function to read the result.") + + + if local_folder is None: + tmp_dir.cleanup() + return result_df \ No newline at end of file diff --git a/feathr_project/feathr/utils/platform.py b/feathr_project/feathr/utils/platform.py deleted file mode 100644 index 8f832f22d..000000000 --- a/feathr_project/feathr/utils/platform.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Platform utilities. -Refs: https://github.com/microsoft/recommenders/blob/main/recommenders/utils/notebook_utils.py -""" -from pathlib import Path - - -def is_jupyter() -> bool: - """Check if the module is running on Jupyter notebook/console. - Note - there might be better way to check if the code is running on a jupyter notebook or not, - but this hacky way still works. - - Ref: - https://stackoverflow.com/questions/15411967/how-can-i-check-if-code-is-executed-in-the-ipython-notebook - - Returns: - bool: True if the module is running on Jupyter notebook or Jupyter console, False otherwise. - """ - try: - # Pre-loaded module `get_ipython()` tells you whether you are running inside IPython or not. - shell_name = get_ipython().__class__.__name__ - # `ZMQInteractiveShell` tells you if this is an interactive mode (notebook). - if shell_name == "ZMQInteractiveShell": - return True - else: - return False - except NameError: - return False - - -def is_databricks() -> bool: - """Check if the module is running on Databricks. - - Returns: - bool: True if the module is running on Databricks notebook, False otherwise. 
- """ - try: - if str(Path(".").resolve()) == "/databricks/driver": - return True - else: - return False - except NameError: - return False - - -# TODO maybe add is_synapse() diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/green_tripdata_2020-04.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/green_tripdata_2020-04.csv new file mode 100644 index 000000000..ce34f255a --- /dev/null +++ b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/demo_data/green_tripdata_2020-04.csv @@ -0,0 +1,14 @@ +VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge +2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1,43,151,1,1.01,5.5,0.5,0.5,0,0,,0.3,6.8,2,1,0 +22,2021-01-01 11:25:59,2021-01-01 11:34:44,N,1,166,239,1,2.53,10,0.5,0.5,2.81,0,,0.3,16.86,1,1,2.75 +23,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1,41,42,1,1.12,6,0.5,0.5,1,0,,0.3,8.3,1,1,0 +24,2020-12-31 23:57:51,2021-01-01 23:04:56,N,1,168,75,1,1.99,8,0.5,0.5,0,0,,0.3,9.3,2,1,0 +25,2021-01-01 17:16:36,2021-01-01 17:16:40,N,2,265,265,3,.00,-52,0,-0.5,0,0,,-0.3,-52.8,3,1,0 +12,2021-01-01 00:16:36,2021-01-01 00:16:40,N,2,265,265,3,.00,52,0,0.5,0,0,,0.3,52.8,2,1,0 +42,2021-01-01 05:19:14,2021-01-01 00:19:21,N,5,265,265,1,.00,180,0,0,36.06,0,,0.3,216.36,1,2,0 +52,2021-01-01 00:26:31,2021-01-01 00:28:50,N,1,75,75,6,.45,3.5,0.5,0.5,0.96,0,,0.3,5.76,1,1,0 +2,2021-01-01 00:57:46,2021-01-01 00:57:57,N,1,225,225,1,.00,2.5,0.5,0.5,0,0,,0.3,3.8,2,1,0 +32,2021-01-01 00:58:32,2021-01-01 01:32:34,N,1,225,265,1,12.19,38,0.5,0.5,2.75,0,,0.3,42.05,1,1,0 +2,2021-01-01 18:39:57,2021-01-01 18:55:25,N,1,74,60,1,5.48,18,0.5,0.5,0,0,,0.3,19.3,2,1,0 +15,2021-01-01 00:51:27,2021-01-01 00:57:20,N,1,42,41,2,.90,6,0.5,0.5,0,0,,0.3,7.3,1,1,0 +15,2021-01-01 00:29:05,2021-01-01 00:29:07,N,5,42,264,1,9.00E-02,10,0,0,2.06,0,,0.3,12.36,1,2,0 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv new file mode 100644 index 000000000..476ea06f3 --- /dev/null +++ b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/product_detail_mock_data.csv @@ -0,0 +1,11 @@ +product_id,category,price,quantity,recent_sold,made_in_state,discount +1,1,22,100,0,CA,7.5 +2,2,17,300,1,CA,7.5 +3,1,40,0,2,WA,7.5 +4,1,25,100,3,WA,7.5 +5,1,33,0,2,PA,0 +6,2,19,0,2,CA,7.5 +7,2,22,200,1,WA,7.5 +8,2,59,300,0,PA,8.5 +9,0,80,100,1,WA,8.5 +10,0,39,100,0,WA,7.5 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv 
b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv new file mode 100644 index 000000000..38fe25ceb --- /dev/null +++ b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_observation_mock_data.csv @@ -0,0 +1,35 @@ +user_id,product_id,event_timestamp,product_rating +1,1,2021-04-01,4 +1,2,2021-04-01,4 +1,3,2021-04-01,4 +1,4,2021-04-01,4 +1,5,2021-04-01,4 +2,1,2021-04-01,5 +2,2,2021-04-01,5 +2,3,2021-04-01,5 +2,4,2021-04-01,5 +2,5,2021-04-01,5 +3,1,2021-04-01,5 +3,2,2021-04-01,5 +3,3,2021-04-01,5 +3,4,2021-04-01,5 +3,5,2021-04-01,5 +4,1,2021-04-01,1 +4,2,2021-04-01,1 +4,3,2021-04-01,1 +4,4,2021-04-01,1 +4,5,2021-04-01,1 +5,1,2021-04-01,5 +5,2,2021-04-01,5 +6,1,2021-04-01,2 +7,1,2021-04-01,5 +7,2,2021-04-01,5 +7,3,2021-04-01,5 +8,1,2021-04-01,2 +8,2,2021-04-01,2 +8,3,2021-04-01,2 +9,1,2021-04-01,5 +9,2,2021-04-01,5 +9,3,2021-04-01,5 +9,4,2021-04-01,5 +10,1,2021-04-01,3 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_profile_mock_data.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_profile_mock_data.csv new file mode 100644 index 000000000..6c38f51d7 --- /dev/null +++ b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_profile_mock_data.csv @@ -0,0 +1,11 @@ +user_id,gender,age,gift_card_balance,number_of_credit_cards,state,tax_rate +1,1,22,100,0,CA,7.5 +2,2,17,300,1,CA,7.5 +3,1,40,0,2,WA,7.5 +4,1,25,100,3,WA,7.5 +5,1,33,0,2,PA,0 +6,2,19,0,2,CA,7.5 +7,2,22,200,1,WA,7.5 +8,2,59,300,0,PA,8.5 +9,0,80,100,1,WA,8.5 +10,0,39,100,0,WA,7.5 \ No newline at end of file diff --git a/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_purchase_history_mock_data.csv b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_purchase_history_mock_data.csv new file mode 100644 index 000000000..8c8481d1f --- /dev/null +++ b/feathr_project/feathrcli/data/feathr_user_workspace/mockdata/feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/product_recommendation_sample/user_purchase_history_mock_data.csv @@ -0,0 +1,31 @@ +user_id,purchase_date,purchase_amount +1,2021-01-01,0.33 +1,2021-03-03,574.35 +1,2021-01-03,796.07 +2,2021-01-04,342.15 +2,2021-03-05,280.46 +2,2021-01-06,664.18 +3,2021-01-07,359.02 +3,2021-01-08,357.12 +3,2021-01-09,845.40 +4,2021-01-10,103.92 +4,2021-02-21,670.12 +4,2021-02-12,698.65 +5,2021-01-13,110.52 +5,2021-01-14,931.72 +5,2021-02-15,388.14 +6,2021-01-16,822.96 +6,2021-01-17,292.39 +6,2021-01-18,524.76 +7,2021-01-19,262.00 +7,2021-03-20,715.94 +7,2021-01-21,345.70 +8,2021-01-22,379.00 +8,2021-01-23,194.96 +8,2021-01-24,862.33 +9,2021-01-25,430.41 +9,2021-01-26,398.72 +9,2021-02-27,158.52 +10,2021-01-28,550.01 +10,2021-03-02,157.88 +10,2021-03-03,528.43 \ No newline at end of file diff --git 
a/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb b/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb new file mode 100644 index 000000000..38cec2ca9 --- /dev/null +++ b/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb @@ -0,0 +1,720 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feathr Feature Store on Azure Demo Notebook\n", + "\n", + "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. It includes these steps:\n", + "\n", + "\n", + "This tutorial demonstrates the key capabilities of Feathr, including:\n", + "\n", + "1. Install and set up Feathr with Azure\n", + "2. Create shareable features with Feathr feature definition configs.\n", + "3. Create a training dataset via point-in-time feature join.\n", + "4. Compute and write features.\n", + "5. Train a model using these features to predict fares.\n", + "6. Materialize feature value to online store.\n", + "7. Fetch feature value in real-time from online store for online scoring.\n", + "\n", + "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The feature flow is as below:\n", + "\n", + "![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/feature_flow.png?raw=true)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Use Quick Start Template to Provision Azure Resources\n", + "First step is to provision required cloud resources if you want to use Feathr. Feathr provides a python based client to interact with cloud resources.\n", + "\n", + "Please follow the steps [here](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html) to provision required cloud resources. Due to the complexity of the possible cloud environment, it is almost impossible to create a script that works for all the use cases. Because of this, [azure_resource_provision.sh](https://github.com/linkedin/feathr/blob/main/docs/how-to-guides/azure_resource_provision.sh) is a full end to end command line to create all the required resources, and you can tailor the script as needed, while [the companion documentation](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-cli.html) can be used as a complete guide for using that shell script.\n", + "\n", + "\n", + "![Architecture](https://github.com/linkedin/feathr/blob/main/docs/images/architecture.png?raw=true)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Install Feathr \n", + "\n", + "Install Feathr using pip:\n", + "\n", + "`pip install -U feathr pandavro scikit-learn`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Configure the required environment with Feathr Quick Start Template\n", + "\n", + "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. Run the code below to install Feathr, login to Azure to get the required credentials to access more cloud resources." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**REQUIRED STEP: Fill in the resource prefix when provisioning the resources**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "resource_prefix = \"feathr_resource_prefix\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install feathr azure-cli pandavro scikit-learn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Login to Azure with a device code (You will see instructions in the output):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! az login --use-device-code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "import os\n", + "import tempfile\n", + "from datetime import datetime, timedelta\n", + "from math import sqrt\n", + "\n", + "import pandas as pd\n", + "import pandavro as pdx\n", + "from feathr import FeathrClient\n", + "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", + "from feathr import Feature, DerivedFeature, FeatureAnchor\n", + "from feathr import BackfillTime, MaterializationSettings\n", + "from feathr import FeatureQuery, ObservationSettings\n", + "from feathr import RedisSink\n", + "from feathr import INPUT_CONTEXT, HdfsSource\n", + "from feathr import WindowAggTransformation\n", + "from feathr import TypedKey\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.model_selection import train_test_split\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.keyvault.secrets import SecretClient\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get all the required credentials from Azure KeyVault" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get all the required credentials from Azure Key Vault\n", + "key_vault_name=resource_prefix+\"kv\"\n", + "synapse_workspace_url=resource_prefix+\"syws\"\n", + "adls_account=resource_prefix+\"dls\"\n", + "adls_fs_name=resource_prefix+\"fs\"\n", + "purview_name=resource_prefix+\"purview\"\n", + "key_vault_uri = f\"https://{key_vault_name}.vault.azure.net\"\n", + "credential = DefaultAzureCredential(exclude_interactive_browser_credential=False)\n", + "client = SecretClient(vault_url=key_vault_uri, credential=credential)\n", + "secretName = \"FEATHR-ONLINE-STORE-CONN\"\n", + "retrieved_secret = client.get_secret(secretName).value\n", + "\n", + "# Get redis credentials; This is to parse Redis connection string.\n", + "redis_port=retrieved_secret.split(',')[0].split(\":\")[1]\n", + "redis_host=retrieved_secret.split(',')[0].split(\":\")[0]\n", + "redis_password=retrieved_secret.split(',')[1].split(\"password=\",1)[1]\n", + "redis_ssl=retrieved_secret.split(',')[2].split(\"ssl=\",1)[1]\n", + "\n", + "# Set the resource link\n", + "os.environ['spark_config__azure_synapse__dev_url'] = f'https://{synapse_workspace_url}.dev.azuresynapse.net'\n", + "os.environ['spark_config__azure_synapse__pool_name'] = 'spark31'\n", + "os.environ['spark_config__azure_synapse__workspace_dir'] = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_project'\n", + "os.environ['online_store__redis__host'] = redis_host\n", + "os.environ['online_store__redis__port'] = redis_port\n", + 
"os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", + "os.environ['REDIS_PASSWORD']=redis_password\n", + "feathr_output_path = f'abfss://{adls_fs_name}@{adls_account}.dfs.core.windows.net/feathr_output'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisite: Configure the required environment (Don't need to update if using the above Quick Start Template)\n", + "\n", + "In the first step (Provision cloud resources), you should have provisioned all the required cloud resources. If you use Feathr CLI to create a workspace, you should have a folder with a file called `feathr_config.yaml` in it with all the required configurations. Otherwise, update the configuration below.\n", + "\n", + "The code below will write this configuration string to a temporary location and load it to Feathr. Please still refer to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It should also have more explanations on the meaning of each variable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tempfile\n", + "yaml_config = \"\"\"\n", + "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", + "api_version: 1\n", + "project_config:\n", + " project_name: 'feathr_getting_started'\n", + " required_environment_variables:\n", + " - 'REDIS_PASSWORD'\n", + " - 'AZURE_CLIENT_ID'\n", + " - 'AZURE_TENANT_ID'\n", + " - 'AZURE_CLIENT_SECRET'\n", + "offline_store:\n", + " adls:\n", + " adls_enabled: true\n", + " wasb:\n", + " wasb_enabled: true\n", + " s3:\n", + " s3_enabled: false\n", + " s3_endpoint: 's3.amazonaws.com'\n", + " jdbc:\n", + " jdbc_enabled: false\n", + " jdbc_database: 'feathrtestdb'\n", + " jdbc_table: 'feathrtesttable'\n", + " snowflake:\n", + " url: \"dqllago-ol19457.snowflakecomputing.com\"\n", + " user: \"feathrintegration\"\n", + " role: \"ACCOUNTADMIN\"\n", + "spark_config:\n", + " spark_cluster: 'azure_synapse'\n", + " spark_result_output_parts: '1'\n", + " azure_synapse:\n", + " dev_url: 'https://feathrazuretest3synapse.dev.azuresynapse.net'\n", + " pool_name: 'spark3'\n", + " workspace_dir: 'abfss://feathrazuretest3fs@feathrazuretest3storage.dfs.core.windows.net/feathr_getting_started'\n", + " executor_size: 'Small'\n", + " executor_num: 1\n", + " databricks:\n", + " workspace_instance_url: 'https://adb-2474129336842816.16.azuredatabricks.net'\n", + " config_template: {'run_name':'','new_cluster':{'spark_version':'9.1.x-scala2.12','node_type_id':'Standard_D3_v2','num_workers':2,'spark_conf':{}},'libraries':[{'jar':''}],'spark_jar_task':{'main_class_name':'','parameters':['']}}\n", + " work_dir: 'dbfs:/feathr_getting_started'\n", + "online_store:\n", + " redis:\n", + " host: 'feathrazuretest3redis.redis.cache.windows.net'\n", + " port: 6380\n", + " ssl_enabled: True\n", + "feature_registry:\n", + " api_endpoint: \"https://feathr-sql-registry.azurewebsites.net/api/v1\"\n", + "\"\"\"\n", + "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", + "with open(tmp.name, \"w\") as text_file:\n", + " text_file.write(yaml_config)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup necessary environment variables (Skip if using the above Quick Start Template)\n", + "\n", + "You should setup the 
environment variables in order to run this sample. More environment variables can be set by referring to [feathr_config.yaml](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) and use that as the source of truth. It also has more explanations on the meaning of each variable.\n", + "\n", + "To run this notebook, for Azure users, you need AZURE_CLIENT_ID, AZURE_TENANT_ID, AZURE_CLIENT_SECRET and REDIS_PASSWORD.\n", + "To run this notebook, for Databricks useres, you need DATABRICKS_WORKSPACE_TOKEN_VALUE and REDIS_PASSWORD." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Initialize Feathr Client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = FeathrClient(config_path=tmp.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## View the data\n", + "\n", + "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The data is as below" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Defining Features with Feathr\n", + "\n", + "In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n", + "\n", + "\n", + "1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n", + "2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", + "3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n", + "It is merely a function/transformation executing against request data at runtime.\n", + "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Sources Section with UDFs\n", + "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. 
See the python documentation to get the details on each input column.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession, DataFrame\n", + "def feathr_udf_day_calc(df: DataFrame) -> DataFrame:\n", + " from pyspark.sql.functions import dayofweek, dayofyear, col\n", + " df = df.withColumn(\"fare_amount_cents\", col(\"fare_amount\")*100)\n", + " return df\n", + "\n", + "batch_source = HdfsSource(name=\"nycTaxiBatchSource\",\n", + " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", + " event_timestamp_column=\"lpep_dropoff_datetime\",\n", + " preprocessing=feathr_udf_day_calc,\n", + " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Anchors and Features\n", + "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f_trip_distance = Feature(name=\"f_trip_distance\",\n", + " feature_type=FLOAT, transform=\"trip_distance\")\n", + "f_trip_time_duration = Feature(name=\"f_trip_time_duration\",\n", + " feature_type=INT32,\n", + " transform=\"(to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime))/60\")\n", + "\n", + "features = [\n", + " f_trip_distance,\n", + " f_trip_time_duration,\n", + " Feature(name=\"f_is_long_trip_distance\",\n", + " feature_type=BOOLEAN,\n", + " transform=\"cast_float(trip_distance)>30\"),\n", + " Feature(name=\"f_day_of_week\",\n", + " feature_type=INT32,\n", + " transform=\"dayofweek(lpep_dropoff_datetime)\"),\n", + "]\n", + "\n", + "request_anchor = FeatureAnchor(name=\"request_features\",\n", + " source=INPUT_CONTEXT,\n", + " features=features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Window aggregation features\n", + "\n", + "For window aggregation features, see the supported fields below:\n", + "\n", + "Note that the `agg_func` should be any of these:\n", + "\n", + "| Aggregation Type | Input Type | Description |\n", + "| --- | --- | --- |\n", + "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", + "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", + "|LATEST| Any |Returns the latest not-null values from within the defined time window |\n", + "\n", + "\n", + "After you have defined features and sources, bring them together to build an anchor:\n", + "\n", + "\n", + "Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." 
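As a complement to the aggregation table above, the other aggregation types follow the same pattern as the AVG/MAX/SUM features defined in the next cell. Below is a minimal sketch of a LATEST aggregation over the same 90-day window; the feature name is hypothetical, and the location_id key definition is repeated from the next cell only so the snippet stands on its own.

from feathr import Feature, TypedKey, ValueType, FLOAT, WindowAggTransformation

location_id = TypedKey(key_column="DOLocationID",
                       key_column_type=ValueType.INT32,
                       description="location id in NYC",
                       full_name="nyc_taxi.location_id")

# Latest non-null fare observed for a location within the 90-day window.
f_location_latest_fare = Feature(name="f_location_latest_fare",
                                 key=location_id,
                                 feature_type=FLOAT,
                                 transform=WindowAggTransformation(agg_expr="cast_float(fare_amount)",
                                                                   agg_func="LATEST",
                                                                   window="90d"))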
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "location_id = TypedKey(key_column=\"DOLocationID\",\n", + " key_column_type=ValueType.INT32,\n", + " description=\"location id in NYC\",\n", + " full_name=\"nyc_taxi.location_id\")\n", + "agg_features = [Feature(name=\"f_location_avg_fare\",\n", + " key=location_id,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", + " agg_func=\"AVG\",\n", + " window=\"90d\")),\n", + " Feature(name=\"f_location_max_fare\",\n", + " key=location_id,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", + " agg_func=\"MAX\",\n", + " window=\"90d\")),\n", + " Feature(name=\"f_location_total_fare_cents\",\n", + " key=location_id,\n", + " feature_type=FLOAT,\n", + " transform=WindowAggTransformation(agg_expr=\"fare_amount_cents\",\n", + " agg_func=\"SUM\",\n", + " window=\"90d\")),\n", + " ]\n", + "\n", + "agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", + " source=batch_source,\n", + " features=agg_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Derived Features Section\n", + "Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "f_trip_time_distance = DerivedFeature(name=\"f_trip_time_distance\",\n", + " feature_type=FLOAT,\n", + " input_features=[\n", + " f_trip_distance, f_trip_time_duration],\n", + " transform=\"f_trip_distance * f_trip_time_duration\")\n", + "\n", + "f_trip_time_rounded = DerivedFeature(name=\"f_trip_time_rounded\",\n", + " feature_type=INT32,\n", + " input_features=[f_trip_time_duration],\n", + " transform=\"f_trip_time_duration % 10\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n", + " f_trip_time_distance, f_trip_time_rounded])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create training data using point-in-time correct feature join\n", + "\n", + "A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n", + "\n", + "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", + "what features and how these features should be joined to the observation data. 
\n", + "\n", + "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if client.spark_runtime == 'databricks':\n", + " output_path = 'dbfs:/feathrazure_test.avro'\n", + "else:\n", + " output_path = feathr_output_path\n", + "\n", + "\n", + "feature_query = FeatureQuery(\n", + " feature_list=[\"f_location_avg_fare\", \"f_trip_time_rounded\", \"f_is_long_trip_distance\", \"f_location_total_fare_cents\"], key=location_id)\n", + "settings = ObservationSettings(\n", + " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", + " event_timestamp_column=\"lpep_dropoff_datetime\",\n", + " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")\n", + "client.get_offline_features(observation_settings=settings,\n", + " feature_query=feature_query,\n", + " output_path=output_path)\n", + "client.wait_job_to_finish(timeout_sec=500)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download the result and show the result\n", + "\n", + "Let's use the helper function `get_result_df` to download the result and view it:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_result_df(client: FeathrClient) -> pd.DataFrame:\n", + " \"\"\"Download the job result dataset from cloud as a Pandas dataframe.\"\"\"\n", + " res_url = client.get_job_result_uri(block=True, timeout_sec=600)\n", + " tmp_dir = tempfile.TemporaryDirectory()\n", + " client.feathr_spark_launcher.download_result(result_path=res_url, local_folder=tmp_dir.name)\n", + " dataframe_list = []\n", + " # assuming the result are in avro format\n", + " for file in glob.glob(os.path.join(tmp_dir.name, '*.avro')):\n", + " dataframe_list.append(pdx.read_avro(file))\n", + " vertical_concat_df = pd.concat(dataframe_list, axis=0)\n", + " tmp_dir.cleanup()\n", + " return vertical_concat_df\n", + "\n", + "df_res = get_result_df(client)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_res" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train a machine learning model\n", + "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# remove columns\n", + "from sklearn.ensemble import GradientBoostingRegressor\n", + "final_df = df_res\n", + "final_df.drop([\"lpep_pickup_datetime\", \"lpep_dropoff_datetime\",\n", + " \"store_and_fwd_flag\"], axis=1, inplace=True, errors='ignore')\n", + "final_df.fillna(0, inplace=True)\n", + "final_df['fare_amount'] = final_df['fare_amount'].astype(\"float64\")\n", + "\n", + "\n", + "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"fare_amount\"], axis=1),\n", + " final_df[\"fare_amount\"],\n", + " test_size=0.2,\n", + " random_state=42)\n", + "model = GradientBoostingRegressor()\n", + "model.fit(train_x, train_y)\n", + "\n", + "y_predict = model.predict(test_x)\n", + "\n", + "y_actual = test_y.values.flatten().tolist()\n", + "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", + "\n", + "sum_actuals = sum_errors = 0\n", + "\n", + "for actual_val, predict_val in 
zip(y_actual, y_predict):\n", + " abs_error = actual_val - predict_val\n", + " if abs_error < 0:\n", + " abs_error = abs_error * -1\n", + "\n", + " sum_errors = sum_errors + abs_error\n", + " sum_actuals = sum_actuals + actual_val\n", + "\n", + "mean_abs_percent_error = sum_errors / sum_actuals\n", + "print(\"Model MAPE:\")\n", + "print(mean_abs_percent_error)\n", + "print()\n", + "print(\"Model Accuracy:\")\n", + "print(1 - mean_abs_percent_error)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Materialize feature value into offline/online storage\n", + "\n", + "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", + "and materialize the feature value to offline and/or online storage. \n", + "\n", + "We can push the generated features to the online store like below:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "backfill_time = BackfillTime(start=datetime(\n", + " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", + "redisSink = RedisSink(table_name=\"nycTaxiDemoFeature\")\n", + "settings = MaterializationSettings(\"nycTaxiTable\",\n", + " backfill_time=backfill_time,\n", + " sinks=[redisSink],\n", + " feature_names=[\"f_location_avg_fare\", \"f_location_max_fare\"])\n", + "\n", + "client.materialize_features(settings)\n", + "client.wait_job_to_finish(timeout_sec=500)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then get the features from the online store (Redis):\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetching feature value for online inference\n", + "\n", + "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", + "`get_online_features` or `multi_get_online_features` API." 
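A minimal sketch of shaping the fetched values for online scoring is shown below. It assumes multi_get_online_features returns a mapping from each key to the feature values in the order requested — verify that shape against the client documentation before relying on it — and it omits the column alignment that would be needed before calling the earlier model's predict.

import pandas as pd

feature_names = ['f_location_avg_fare', 'f_location_max_fare']
keys = ['239', '265']

# Assumed return shape: {key: [value_for_each_requested_feature, ...]}
online_values = client.multi_get_online_features("nycTaxiDemoFeature", keys, feature_names)

scoring_df = pd.DataFrame.from_dict(online_values, orient="index", columns=feature_names)
print(scoring_df)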
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "res = client.get_online_features('nycTaxiDemoFeature', '265', [\n", + " 'f_location_avg_fare', 'f_location_max_fare'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.multi_get_online_features(\"nycTaxiDemoFeature\", [\"239\", \"265\"], [\n", + " 'f_location_avg_fare', 'f_location_max_fare'])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Registering and Fetching features\n", + "\n", + "We can also register the features with an Apache Atlas compatible service, such as Azure Purview, and share the registered features across teams:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.register_features()\n", + "client.list_registered_features(project_name=\"feathr_getting_started\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.5 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + }, + "vscode": { + "interpreter": { + "hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/feathr_project/setup.py b/feathr_project/setup.py index 0a6b38d49..69a99351f 100644 --- a/feathr_project/setup.py +++ b/feathr_project/setup.py @@ -3,7 +3,6 @@ from setuptools import setup, find_packages from pathlib import Path - # Use the README.md from /docs root_path = Path(__file__).resolve().parent.parent long_description = (root_path / "docs/README.md").read_text(encoding="utf8") @@ -16,7 +15,7 @@ sys.exit(-1) VERSION = __version__ # noqa -os.environ["FEATHR_VERSION"] = VERSION +os.environ["FEATHR_VERSION]"] = VERSION extras_require=dict( dev=[ diff --git a/feathr_project/test/samples/test_notebooks.py b/feathr_project/test/samples/test_notebooks.py deleted file mode 100644 index 778b157d7..000000000 --- a/feathr_project/test/samples/test_notebooks.py +++ /dev/null @@ -1,56 +0,0 @@ -from pathlib import Path -from tempfile import TemporaryDirectory - -import pytest -try: - import papermill as pm - import scrapbook as sb -except ImportError: - pass # disable error while collecting tests for non-notebook environments - - -SAMPLES_DIR = ( - Path(__file__) - .parent # .../samples - .parent # .../test - .parent # .../feathr_project - .parent # .../feathr (root of the repo) - .joinpath("docs", "samples") -) -NOTEBOOK_PATHS = { - "nyc_taxi_demo": str(SAMPLES_DIR.joinpath("nyc_taxi_demo.ipynb")), -} - - -@pytest.mark.notebooks -def test__nyc_taxi_demo(tmp_path): - notebook_name = "nyc_taxi_demo" - - output_tmpdir = TemporaryDirectory() - output_notebook_path = str(tmp_path.joinpath(f"{notebook_name}.ipynb")) - - pm.execute_notebook( - input_path=NOTEBOOK_PATHS[notebook_name], - output_path=output_notebook_path, - # kernel_name="python3", - parameters=dict( - RESOURCE_PREFIX="feathrazuretest3", # Use the test resource group - PROJECT_NAME=notebook_name, - DATA_STORE_PATH=output_tmpdir.name, - SPARK_CLUSTER="local", - USE_CLI_AUTH=False, - SCRAP_RESULTS=True, - ), - ) - - # Read results from the Scrapbook and assert expected values - nb = 
sb.read_notebook(output_notebook_path) - outputs = nb.scraps - - assert outputs["materialized_feature_values"].data["239"] == pytest.approx([5707., 1480.], abs=1.) - assert outputs["materialized_feature_values"].data["265"] == pytest.approx([10000., 4160.], abs=1.) - assert outputs["rmse"].data == pytest.approx(5., abs=2.) - assert outputs["mae"].data == pytest.approx(2., abs=1.) - - # clean up - output_tmpdir.cleanup() diff --git a/feathr_project/test/unit/datasets/test_dataset_utils.py b/feathr_project/test/unit/datasets/test_dataset_utils.py deleted file mode 100644 index 2aabaa9a1..000000000 --- a/feathr_project/test/unit/datasets/test_dataset_utils.py +++ /dev/null @@ -1,53 +0,0 @@ -from pathlib import Path -from tempfile import TemporaryDirectory -from urllib.parse import urlparse - -import pytest - -from feathr.datasets.nyc_taxi import NYC_TAXI_SMALL_URL -from feathr.datasets.utils import maybe_download - - -@pytest.mark.parametrize( - # 3924447 is the nyc_taxi sample data's bytes - "expected_bytes", [3924447, None] -) -def test__maybe_download(expected_bytes: int): - """Test maybe_download utility function w/ nyc_taxi data cached at Azure blob.""" - - tmpdir = TemporaryDirectory() - dst_filepath = Path(tmpdir.name, "data.csv") - - # Assert the data is downloaded - assert maybe_download( - src_url=NYC_TAXI_SMALL_URL, - dst_filepath=str(dst_filepath), - expected_bytes=expected_bytes, - ) - - # Assert the downloaded file exists. - assert dst_filepath.is_file() - - # Assert the data is already exists and thus the function does not download - assert not maybe_download( - src_url=NYC_TAXI_SMALL_URL, - dst_filepath=str(dst_filepath), - expected_bytes=expected_bytes, - ) - - tmpdir.cleanup() - - -def test__maybe_download__raise_exception(): - """Test maby_download utility function to raise IOError when the expected bytes mismatches.""" - - tmpdir = TemporaryDirectory() - - with pytest.raises(IOError): - maybe_download( - src_url=NYC_TAXI_SMALL_URL, - dst_filepath=Path(tmpdir.name, "data.csv").resolve(), - expected_bytes=10, - ) - - tmpdir.cleanup() diff --git a/feathr_project/test/unit/datasets/test_datasets.py b/feathr_project/test/unit/datasets/test_datasets.py deleted file mode 100644 index c1ac49a9b..000000000 --- a/feathr_project/test/unit/datasets/test_datasets.py +++ /dev/null @@ -1,106 +0,0 @@ -from pathlib import Path -from unittest.mock import MagicMock - -from pyspark.sql import SparkSession -import pytest -from pytest_mock import MockerFixture - -from feathr.datasets import nyc_taxi - - -TEST_DATASET_DIR = Path(__file__).parent.parent.parent.joinpath("test_user_workspace") -NYC_TAXI_FILE_PATH = str(TEST_DATASET_DIR.joinpath("green_tripdata_2020-04_with_index.csv").resolve()) - - -@pytest.fixture(scope="module") -def spark() -> SparkSession: - """Generate a spark session for tests.""" - # Set ui port other than the default one (4040) so that feathr spark job may not fail. - spark_session = SparkSession.builder.appName("tests").config("spark.ui.port", "8080").getOrCreate() - yield spark_session - spark_session.stop() - - -@pytest.mark.parametrize( - "local_cache_path", - [ - None, # default temporary directory - NYC_TAXI_FILE_PATH, # full filepath - str(Path(NYC_TAXI_FILE_PATH).parent), # directory - ], -) -def test__nyc_taxi__get_pandas_df( - mocker: MockerFixture, - local_cache_path: str, -): - """Test if nyc_taxi.get_pandas_df returns pd.DataFrame. 
Also check if the proper modules are being called.""" - # Mock maybe_download and TempDirectory - mocked_maybe_download = mocker.patch("feathr.datasets.nyc_taxi.maybe_download") - mocked_tmpdir = MagicMock() - mocked_tmpdir.name = NYC_TAXI_FILE_PATH - mocked_TemporaryDirectory = mocker.patch("feathr.datasets.nyc_taxi.TemporaryDirectory", return_value=mocked_tmpdir) - - pdf = nyc_taxi.get_pandas_df(local_cache_path=local_cache_path) - assert len(pdf) == 35612 - - # Assert mock called - if local_cache_path: - mocked_TemporaryDirectory.assert_not_called() - else: - mocked_TemporaryDirectory.assert_called_once() - - # TODO check this is called w/ file extension added - mocked_maybe_download.assert_called_once_with(src_url=nyc_taxi.NYC_TAXI_SMALL_URL, dst_filepath=NYC_TAXI_FILE_PATH) - - -@pytest.mark.parametrize( - "local_cache_path", [ - NYC_TAXI_FILE_PATH, # full filepath - str(Path(NYC_TAXI_FILE_PATH).parent), # directory - ], -) -def test__nyc_taxi__get_spark_df( - spark, - mocker: MockerFixture, - local_cache_path: str, -): - """Test if nyc_taxi.get_spark_df returns spark.sql.DataFrame.""" - # Mock maybe_download - mocked_maybe_download = mocker.patch("feathr.datasets.nyc_taxi.maybe_download") - - df = nyc_taxi.get_spark_df(spark=spark, local_cache_path=local_cache_path) - assert df.count() == 35612 - - mocked_maybe_download.assert_called_once_with( - src_url=nyc_taxi.NYC_TAXI_SMALL_URL, dst_filepath=NYC_TAXI_FILE_PATH - ) - - -@pytest.mark.parametrize( - "local_cache_path", [ - NYC_TAXI_FILE_PATH, # full filepath - str(Path(NYC_TAXI_FILE_PATH).parent), # directory - ], -) -def test__nyc_taxi__get_spark_df__with_databricks( - mocker: MockerFixture, - local_cache_path: str, -): - # Mock maybe_download and spark session - mocked_maybe_download = mocker.patch("feathr.datasets.nyc_taxi.maybe_download") - mocked_is_databricks = mocker.patch("feathr.datasets.nyc_taxi.is_databricks", return_value=True) - mocked_spark = MagicMock(spec=SparkSession) - - nyc_taxi.get_spark_df(spark=mocked_spark, local_cache_path=local_cache_path) - - # Assert mock called with databricks paths - mocked_is_databricks.assert_called_once() - - expected_dst_filepath = str(Path("/dbfs", NYC_TAXI_FILE_PATH.lstrip("/"))) - mocked_maybe_download.assert_called_once_with( - src_url=nyc_taxi.NYC_TAXI_SMALL_URL, dst_filepath=expected_dst_filepath - ) - - mocked_spark.read.option.return_value.csv.assert_called_once_with( - str(Path("dbfs:", NYC_TAXI_FILE_PATH.lstrip("/"))) - ) diff --git a/feathr_project/test/unit/utils/test_config.py b/feathr_project/test/unit/utils/test_config.py deleted file mode 100644 index 502a3a01d..000000000 --- a/feathr_project/test/unit/utils/test_config.py +++ /dev/null @@ -1,31 +0,0 @@ -from pathlib import Path -from tempfile import NamedTemporaryFile - -import pytest - -from feathr.utils.config import FEATHR_CONFIG_TEMPLATE, generate_config - - -@pytest.mark.parametrize( - "output_filepath", [None, NamedTemporaryFile().name], -) -def test__generate_config(output_filepath: str): - - config = FEATHR_CONFIG_TEMPLATE.format( - resource_prefix="test_prefix", - project_name="test_project", - spark_cluster="local", - ) - - config_filepath = generate_config( - resource_prefix="test_prefix", - project_name="test_project", - spark_cluster="local", - output_filepath=output_filepath, - ) - - if output_filepath: - assert output_filepath == config_filepath - - with open(config_filepath, "r") as f: - assert config == f.read() From f50e3320e169242cef1e50e506801cf0d9465969 Mon Sep 17 00:00:00 2001 From: Blair Chen 
Date: Mon, 31 Oct 2022 13:12:36 +0800 Subject: [PATCH 2/2] Resolve conflict --- ...atabricks_quickstart_nyc_taxi_driver.ipynb | 1442 ----------------- docs/samples/nyc_taxi_demo.ipynb | 1110 ------------- 2 files changed, 2552 deletions(-) delete mode 100644 docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb delete mode 100644 docs/samples/nyc_taxi_demo.ipynb diff --git a/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb b/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb deleted file mode 100644 index 52790f884..000000000 --- a/docs/samples/databricks/databricks_quickstart_nyc_taxi_driver.ipynb +++ /dev/null @@ -1,1442 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "384e5e16-7213-4186-9d04-09d03b155534", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Feathr Feature Store on Databricks Demo Notebook\n", - "\n", - "This notebook illustrates the use of Feature Store to create a model that predicts NYC Taxi fares. This is a notebook that's specially designed for databricks clusters and is relying on some of the databricks packages such as dbutils.\n", - "\n", - "The intent of this notebook is like \"one click run\" without configuring anything, so it has relatively limited capability. \n", - "\n", - "- For example, in this notebook there's no feature registry available since that requires running Azure Purview. \n", - "- Also for online store (Redis), you need to configure the Redis endpoint, otherwise that part will not work. \n", - "\n", - "However, the core part of Feathr, especially defining features, get offline features, point-in-time joins etc., should \"just work\". The full-fledged notebook is [located here](https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/nyc_driver_demo.ipynb)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# Notebook Steps\n", - "\n", - "This tutorial demonstrates the key capabilities of Feathr, including:\n", - "\n", - "1. Install and set up Feathr with Azure\n", - "2. Create shareable features with Feathr feature definition configs.\n", - "3. Create a training dataset via point-in-time feature join.\n", - "4. Compute and write features.\n", - "5. Train a model using these features to predict fares.\n", - "6. Materialize feature value to online store.\n", - "7. Fetch feature value in real-time from online store for online scoring.\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The feature flow is as below:\n", - "\n", - "![Feature Flow](https://github.com/linkedin/feathr/blob/main/docs/images/feature_flow.png?raw=true)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "f00b9d0b-94d1-418f-89b9-25bbacb8b068", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "! 
pip install feathr pandavro scikit-learn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "80223a02-631c-40c8-91b3-a037249ffff9", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "import glob\n", - "import os\n", - "import tempfile\n", - "from datetime import datetime, timedelta\n", - "from math import sqrt\n", - "\n", - "import pandas as pd\n", - "import pandavro as pdx\n", - "from feathr import FeathrClient\n", - "from feathr import BOOLEAN, FLOAT, INT32, ValueType\n", - "from feathr import Feature, DerivedFeature, FeatureAnchor\n", - "from feathr import BackfillTime, MaterializationSettings\n", - "from feathr import FeatureQuery, ObservationSettings\n", - "from feathr import RedisSink\n", - "from feathr import INPUT_CONTEXT, HdfsSource\n", - "from feathr import WindowAggTransformation\n", - "from feathr import TypedKey\n", - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.model_selection import train_test_split\n", - "from azure.identity import DefaultAzureCredential\n", - "from azure.keyvault.secrets import SecretClient\n", - "import json\n", - "import requests" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "41d3648a-9bc9-40dc-90da-bc82b21ef9b3", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Get the required databricks credentials automatically:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "331753d6-1850-47b5-ad97-84b7c01d79d1", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# Get current databricks notebook context\n", - "ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n", - "host_name = ctx.tags().get(\"browserHostName\").get()\n", - "host_token = ctx.apiToken().get()\n", - "cluster_id = ctx.tags().get(\"clusterId\").get()\n", - "\n", - "\n", - "\n", - "# databricks_config = {'run_name':'FEATHR_FILL_IN','existing_cluster_id':cluster_id,'libraries':[{'jar':'FEATHR_FILL_IN'}],'spark_jar_task':{'main_class_name':'FEATHR_FILL_IN','parameters':['FEATHR_FILL_IN']}}\n", - "os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + host_name\n", - "os.environ['spark_config__databricks__config_template']='{\"run_name\":\"FEATHR_FILL_IN\",\"new_cluster\":{\"spark_version\":\"10.4.x-scala2.12\",\"node_type_id\":\"Standard_D3_v2\",\"num_workers\":2,\"spark_conf\":{\"FEATHR_FILL_IN\":\"FEATHR_FILL_IN\"}},\"libraries\":[{\"jar\":\"FEATHR_FILL_IN\"}],\"spark_jar_task\":{\"main_class_name\":\"FEATHR_FILL_IN\",\"parameters\":[\"FEATHR_FILL_IN\"]}}'\n", - "# os.environ['spark_config__databricks__config_template']=json.dumps(databricks_config)\n", - 
"os.environ['spark_config__databricks__work_dir']='dbfs:/feathr_getting_started'\n", - "os.environ['project_config__project_name']='feathr_getting_started'\n", - "os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = host_token" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You need to setup the Redis credentials below in order to push features to online store. You can skip this part if you don't have Redis, but there will be failures for `client.materialize_features(settings)` API." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get redis credentials; This is to parse Redis connection string.\n", - "redis_port=\"\"\n", - "redis_host=\"\"\n", - "redis_password=\"\"\n", - "redis_ssl=\"\"\n", - "\n", - "# Set the resource link\n", - "os.environ['online_store__redis__host'] = redis_host\n", - "os.environ['online_store__redis__port'] = redis_port\n", - "os.environ['online_store__redis__ssl_enabled'] = redis_ssl\n", - "os.environ['REDIS_PASSWORD']=redis_password" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "08bc3b7e-bbf5-4e3a-9978-fe1aef8c1aee", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Configure required credentials (skip if you don't use those):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "8cd64e3a-376c-48e6-ba41-5197f3591d48", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "import tempfile\n", - "yaml_config = \"\"\"\n", - "# Please refer to https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml for explanations on the meaning of each field.\n", - "api_version: 1\n", - "project_config:\n", - " project_name: 'feathr_getting_started2'\n", - " required_environment_variables:\n", - " - 'REDIS_PASSWORD'\n", - "offline_store:\n", - " adls:\n", - " adls_enabled: true\n", - " wasb:\n", - " wasb_enabled: true\n", - " s3:\n", - " s3_enabled: false\n", - " s3_endpoint: ''\n", - " jdbc:\n", - " jdbc_enabled: false\n", - " jdbc_database: ''\n", - " jdbc_table: ''\n", - " snowflake:\n", - " snowflake_enabled: false\n", - " url: \".snowflakecomputing.com\"\n", - " user: \"\"\n", - " role: \"\"\n", - "spark_config:\n", - " # choice for spark runtime. 
Currently support: azure_synapse, databricks\n", - " # The `databricks` configs will be ignored if `azure_synapse` is set and vice versa.\n", - " spark_cluster: \"databricks\"\n", - " spark_result_output_parts: \"1\"\n", - "\n", - "online_store:\n", - " redis:\n", - " host: '.redis.cache.windows.net'\n", - " port: 6380\n", - " ssl_enabled: True\n", - "feature_registry:\n", - " api_endpoint: \"https://.azurewebsites.net/api/v1\"\n", - "\"\"\"\n", - "tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)\n", - "with open(tmp.name, \"w\") as text_file:\n", - " text_file.write(yaml_config)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3fef7f2f-df19-4f53-90a5-ff7999ed983d", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Initialize Feathr Client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "9713a2df-c7b2-4562-88b0-b7acce3cc43a", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "client = FeathrClient(config_path=tmp.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c3b64bda-d42c-4a64-b976-0fb604cf38c5", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## View the data\n", - "\n", - "In this tutorial, we use Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page). The data is as below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c4ccd7b3-298a-4e5a-8eec-b7e309db393e", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "import pandas as pd\n", - "pd.read_csv(\"https://azurefeathrstorage.blob.core.windows.net/public/sample_data/green_tripdata_2020-04_with_index.csv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "7430c942-64e5-4b70-b823-16ce1d1b3cee", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Defining Features with Feathr\n", - "\n", - "In Feathr, a feature is viewed as a function, mapping from entity id or key, and timestamp to a feature value. For more details on feature definition, please refer to the [Feathr Feature Definition Guide](https://github.com/linkedin/feathr/blob/main/docs/concepts/feature-definition.md)\n", - "\n", - "\n", - "1. The typed key (a.k.a. entity id) identifies the subject of feature, e.g. a user id, 123.\n", - "2. The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", - "3. The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022." 
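As a minimal sketch of how the three pieces above (typed key, feature name, feature value) map onto the Feathr Python API; the feature name and transform expression here are illustrative only, and the notebook defines its own anchors in the cells that follow:

```python
from feathr import Feature, FLOAT, TypedKey, ValueType

# 1. Typed key (entity id): identifies the subject of the feature.
location_id = TypedKey(
    key_column="DOLocationID",
    key_column_type=ValueType.INT32,
    description="location id in NYC",
)

# 2. Feature name: the aspect of the entity being described.
# 3. Feature value: whatever the transform evaluates to for a given key at a
#    given point in time. The name and transform below are hypothetical.
f_location_total_amount = Feature(
    name="f_location_total_amount",
    key=location_id,
    feature_type=FLOAT,
    transform="cast_float(total_amount)",
)
```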
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "16420730-582e-4e11-a343-efc0ddd35108", - "showTitle": false, - "title": "" - } - }, - "source": [ - "Note that, in some cases, such as features defined on top of request data, may have no entity key or timestamp.\n", - "It is merely a function/transformation executing against request data at runtime.\n", - "For example, the day of week of the request, which is calculated by converting the request UNIX timestamp." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "728d2d5f-c11f-4941-bdc5-48507f5749f1", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Define Sources Section with UDFs\n", - "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. See the python documentation to get the details on each input column." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3cc59a0e-a41b-480e-a84e-ca5443d63143", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "batch_source = HdfsSource(name=\"nycTaxiBatchSource\",\n", - " path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "46f863c4-bb81-434a-a448-6b585031a221", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Define Anchors and Features\n", - "A feature is called an anchored feature when the feature is directly extracted from the source data, rather than computed on top of other features. The latter case is called derived feature." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "a373ecbe-a040-4cd3-9d87-0d5f4c5ba553", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "f_trip_distance = Feature(name=\"f_trip_distance\",\n", - " feature_type=FLOAT, transform=\"trip_distance\")\n", - "\n", - "features = [\n", - " f_trip_distance,\n", - " Feature(name=\"f_is_long_trip_distance\",\n", - " feature_type=BOOLEAN,\n", - " transform=\"cast_float(trip_distance)>30\"),\n", - " Feature(name=\"f_day_of_week\",\n", - " feature_type=INT32,\n", - " transform=\"dayofweek(lpep_dropoff_datetime)\"),\n", - "]\n", - "\n", - "request_anchor = FeatureAnchor(name=\"request_features\",\n", - " source=INPUT_CONTEXT,\n", - " features=features)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "149f85e2-fa3c-4895-b0c5-de5543ca9b6d", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Window aggregation features\n", - "\n", - "For window aggregation features, see the supported fields below:\n", - "\n", - "Note that the `agg_func` should be any of these:\n", - "\n", - "| Aggregation Type | Input Type | Description |\n", - "| --- | --- | --- |\n", - "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. |\n", - "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", - "|LATEST| Any |Returns the latest not-null values from within the defined time window |\n", - "\n", - "\n", - "After you have defined features and sources, bring them together to build an anchor:\n", - "\n", - "\n", - "Note that if the data source is from the observation data, the `source` section should be `INPUT_CONTEXT` to indicate the source of those defined anchors." 
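As a quick, hedged illustration of how the `agg_func` values in the table above map onto `WindowAggTransformation` (the feature name, column, and 7-day window below are made up for illustration; the notebook's actual aggregation anchor is defined in the next cell):

```python
from feathr import Feature, FLOAT, TypedKey, ValueType, WindowAggTransformation

location_id = TypedKey(
    key_column="DOLocationID",
    key_column_type=ValueType.INT32,
    description="location id in NYC",
)

# Hypothetical feature: the latest non-null payment type observed for a
# location within the last 7 days.
f_location_latest_payment = Feature(
    name="f_location_latest_payment",
    key=location_id,
    feature_type=FLOAT,
    transform=WindowAggTransformation(
        agg_expr="cast_float(payment_type)",
        agg_func="LATEST",  # any value from the table: SUM, COUNT, MAX, MIN, AVG, *_POOLING, LATEST
        window="7d",
    ),
)
```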
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "05633bc3-9118-449b-9562-45fc437576c2", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "location_id = TypedKey(key_column=\"DOLocationID\",\n", - " key_column_type=ValueType.INT32,\n", - " description=\"location id in NYC\",\n", - " full_name=\"nyc_taxi.location_id\")\n", - "agg_features = [Feature(name=\"f_location_avg_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"AVG\",\n", - " window=\"90d\")),\n", - " Feature(name=\"f_location_max_fare\",\n", - " key=location_id,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(agg_expr=\"cast_float(fare_amount)\",\n", - " agg_func=\"MAX\",\n", - " window=\"90d\")),\n", - " ]\n", - "\n", - "agg_anchor = FeatureAnchor(name=\"aggregationFeatures\",\n", - " source=batch_source,\n", - " features=agg_features)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "d2ecaca9-057e-4b36-811f-320f66f753ed", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Derived Features Section\n", - "Derived features are the features that are computed from other features. They could be computed from anchored features, or other derived features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "270fb11e-8a71-404f-9639-ad29d8e6a2c1", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "f_trip_distance_rounded = DerivedFeature(name=\"f_trip_distance_rounded\",\n", - " feature_type=INT32,\n", - " input_features=[f_trip_distance],\n", - " transform=\"f_trip_distance * 10\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "ad102c45-586d-468c-85f0-9454401ef10b", - "showTitle": false, - "title": "" - } - }, - "source": [ - "And then we need to build those features so that it can be consumed later. Note that we have to build both the \"anchor\" and the \"derived\" features (which is not anchored to a source)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "91bb5ebb-87e4-470b-b8eb-1c89b351740e", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "client.build_features(anchor_list=[agg_anchor, request_anchor], derived_feature_list=[\n", - " f_trip_distance_rounded])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "632d5f46-f9e2-41a8-aab7-34f75206e2aa", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Create training data using point-in-time correct feature join\n", - "\n", - "A training dataset usually contains entity id columns, multiple feature columns, event timestamp column and label/target column. \n", - "\n", - "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", - "what features and how these features should be joined to the observation data. \n", - "\n", - "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "output_path = 'dbfs:/feathrazure_test.avro'\n", - "\n", - "\n", - "feature_query = FeatureQuery(\n", - " feature_list=[\"f_location_avg_fare\", \"f_trip_distance_rounded\", \"f_is_long_trip_distance\"], key=location_id)\n", - "settings = ObservationSettings(\n", - " observation_path=\"wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv\",\n", - " event_timestamp_column=\"lpep_dropoff_datetime\",\n", - " timestamp_format=\"yyyy-MM-dd HH:mm:ss\")\n", - "client.get_offline_features(observation_settings=settings,\n", - " feature_query=feature_query,\n", - " output_path=output_path\n", - " )\n", - "client.wait_job_to_finish(timeout_sec=500)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "51f078e3-3f8f-4f10-b7f1-499ac8a9ff07", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Download the result and show the result\n", - "\n", - "Let's use the helper function `get_result_df` to download the result and view it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "23c797b2-ac1a-4cf3-b0ed-c05216de3f37", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - 
"errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "from feathr.utils.job_utils import get_result_df\n", - "df_res = get_result_df(client, format=\"avro\", res_url = output_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "b9be042e-eb12-46b9-9d91-a0e5dd0c704f", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "df_res" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Train a machine learning model\n", - "After getting all the features, let's train a machine learning model with the converted feature by Feathr:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "84745f36-5bac-49c0-903b-38828b923c7c", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "# remove columns\n", - "from sklearn.ensemble import GradientBoostingRegressor\n", - "final_df = df_res\n", - "final_df.drop([\"lpep_pickup_datetime\", \"lpep_dropoff_datetime\",\n", - " \"store_and_fwd_flag\"], axis=1, inplace=True, errors='ignore')\n", - "final_df.fillna(0, inplace=True)\n", - "final_df['fare_amount'] = final_df['fare_amount'].astype(\"float64\")\n", - "\n", - "\n", - "train_x, test_x, train_y, test_y = train_test_split(final_df.drop([\"fare_amount\"], axis=1),\n", - " final_df[\"fare_amount\"],\n", - " test_size=0.2,\n", - " random_state=42)\n", - "model = GradientBoostingRegressor()\n", - "model.fit(train_x, train_y)\n", - "\n", - "y_predict = model.predict(test_x)\n", - "\n", - "y_actual = test_y.values.flatten().tolist()\n", - "rmse = sqrt(mean_squared_error(y_actual, y_predict))\n", - "\n", - "sum_actuals = sum_errors = 0\n", - "\n", - "for actual_val, predict_val in zip(y_actual, y_predict):\n", - " abs_error = actual_val - predict_val\n", - " if abs_error < 0:\n", - " abs_error = abs_error * -1\n", - "\n", - " sum_errors = sum_errors + abs_error\n", - " sum_actuals = sum_actuals + actual_val\n", - "\n", - "mean_abs_percent_error = sum_errors / sum_actuals\n", - "print(\"Model MAPE:\")\n", - "print(mean_abs_percent_error)\n", - "print()\n", - "print(\"Model Accuracy:\")\n", - "print(1 - mean_abs_percent_error)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "5a226026-1c7b-48db-8f91-88d5c2ddf023", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Materialize feature value into offline/online storage\n", - "\n", - "While Feathr can compute the feature value from the feature definition on-the-fly at request time, it can also pre-compute\n", - "and 
materialize the feature value to offline and/or online storage. \n", - "\n", - "We can push the generated features to the online store like below:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3b924c66-8634-42fe-90f3-c844487d3f75", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "backfill_time = BackfillTime(start=datetime(\n", - " 2020, 5, 20), end=datetime(2020, 5, 20), step=timedelta(days=1))\n", - "redisSink = RedisSink(table_name=\"nycTaxiDemoFeature\")\n", - "settings = MaterializationSettings(\"nycTaxiTable\",\n", - " backfill_time=backfill_time,\n", - " sinks=[redisSink],\n", - " feature_names=[\"f_location_avg_fare\", \"f_location_max_fare\"])\n", - "\n", - "client.materialize_features(settings)\n", - "client.wait_job_to_finish(timeout_sec=500)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "6a3e2ab1-5c66-4d27-a737-c5e2af03b1dd", - "showTitle": false, - "title": "" - } - }, - "source": [ - "We can then get the features from the online store (Redis):" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "bef93538-9591-4247-97b6-289d2055b7b1", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## Fetching feature value for online inference\n", - "\n", - "For features that are already materialized by the previous step, their latest value can be queried via the client's\n", - "`get_online_features` or `multi_get_online_features` API." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "0c3d5f35-11a3-4644-9992-5860169d8302", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "res = client.get_online_features('nycTaxiDemoFeature', '265', [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "4d4699ed-42e6-408f-903d-2f799284f4b6", - "showTitle": false, - "title": "" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ] - }, - "metadata": { - "application/vnd.databricks.v1+output": { - "arguments": {}, - "data": "", - "errorSummary": "", - "errorTraceType": null, - "metadata": {}, - "type": "ipynbError" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "client.multi_get_online_features(\"nycTaxiDemoFeature\", [\"239\", \"265\"], [\n", - " 'f_location_avg_fare', 'f_location_max_fare'])" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 4 - }, - "notebookName": "nyc_driver_demo", - "notebookOrigID": 930353059183053, - "widgets": {} - }, - "interpreter": { - "hash": "830c16c5b424e7ff512f67d4056b67cea1a756a7ad6a92c98b9e2b95c5e484ae" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/docs/samples/nyc_taxi_demo.ipynb b/docs/samples/nyc_taxi_demo.ipynb deleted file mode 100644 index b789e9bf2..000000000 --- a/docs/samples/nyc_taxi_demo.ipynb +++ /dev/null @@ -1,1110 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "384e5e16-7213-4186-9d04-09d03b155534", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# Feathr Quick Start Notebook\n", - "\n", - "This notebook illustrates the use of Feathr Feature Store to create a model that predicts NYC Taxi fares. The dataset comes from [here](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).\n", - "\n", - "The major problems Feathr solves are:\n", - "\n", - "1. Create, share and manage useful features from raw source data.\n", - "2. Provide Point-in-time feature join to create training dataset to ensure no data leakage.\n", - "3. Deploy the same feature data to online store to eliminate training and inference data skew." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisite\n", - "\n", - "Feathr has native cloud integration. First step is to provision required cloud resources if you want to use Feathr.\n", - "\n", - "Follow the [Feathr ARM deployment guide](https://feathr-ai.github.io/feathr/how-to-guides/azure-deployment-arm.html) to run Feathr on Azure. 
This allows you to quickly get started with automated deployment using Azure Resource Manager template. For more details, please refer [README.md](https://github.com/feathr-ai/feathr#%EF%B8%8F-running-feathr-on-cloud-with-a-few-simple-steps).\n", - "\n", - "Additionally, to run this notebook, you'll need to install `feathr` pip package. For local spark, simply run `pip install feathr` on the machine that runs this notebook. To use Databricks or Azure Synapse Analytics, please see dependency management documents:\n", - "- [Azure Databricks dependency management](https://learn.microsoft.com/en-us/azure/databricks/libraries/)\n", - "- [Azure Synapse Analytics dependency management](https://learn.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-azure-portal-add-libraries)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Notebook Steps\n", - "\n", - "This tutorial demonstrates the key capabilities of Feathr, including:\n", - "\n", - "1. Install Feathr and necessary dependencies\n", - "2. Create shareable features with Feathr feature definition configs\n", - "3. Create training data using point-in-time correct feature join\n", - "4. Train a prediction model and evaluate the model and features\n", - "5. Register the features to share across teams\n", - "6. Materialize feature values for online scoring\n", - "\n", - "The overall data flow is as follows:\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Install Feathr and Necessary Dependancies\n", - "\n", - "Install feathr and necessary packages by running `pip install feathr[notebook]` if you haven't installed them already." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "80223a02-631c-40c8-91b3-a037249ffff9", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "from datetime import datetime, timedelta\n", - "import glob\n", - "import json\n", - "from math import sqrt\n", - "import os\n", - "from pathlib import Path\n", - "import requests\n", - "from tempfile import TemporaryDirectory\n", - "\n", - "from azure.identity import AzureCliCredential, DefaultAzureCredential \n", - "from azure.keyvault.secrets import SecretClient\n", - "import pandas as pd\n", - "from pyspark.ml import Pipeline\n", - "from pyspark.ml.evaluation import RegressionEvaluator\n", - "from pyspark.ml.feature import VectorAssembler\n", - "from pyspark.ml.regression import GBTRegressor\n", - "from pyspark.sql import DataFrame, SparkSession\n", - "import pyspark.sql.functions as F\n", - "\n", - "import feathr\n", - "from feathr import (\n", - " FeathrClient,\n", - " # Feature data types\n", - " BOOLEAN, FLOAT, INT32, ValueType,\n", - " # Feature data sources\n", - " INPUT_CONTEXT, HdfsSource,\n", - " # Feature aggregations\n", - " TypedKey, WindowAggTransformation,\n", - " # Feature types and anchor\n", - " DerivedFeature, Feature, FeatureAnchor,\n", - " # Materialization\n", - " BackfillTime, MaterializationSettings, RedisSink,\n", - " # Offline feature computation\n", - " FeatureQuery, ObservationSettings,\n", - ")\n", - "from feathr.datasets import nyc_taxi\n", - "from feathr.spark_provider.feathr_configurations import SparkExecutionConfiguration\n", - "from feathr.utils.config import 
generate_config\n", - "from feathr.utils.job_utils import get_result_df\n", - "from feathr.utils.platform import is_databricks, is_jupyter\n", - "\n", - "print(f\"Feathr version: {feathr.__version__}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Create Shareable Features with Feathr Feature Definition Configs\n", - "\n", - "First, we define all the necessary resource key values for authentication. These values are retrieved by using [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) cloud key value store. For authentication, we use Azure CLI credential in this notebook, but you may add secrets' list and get permission for the necessary service principal instead of running `az login --use-device-code`.\n", - "\n", - "Please refer to [A note on using azure key vault to store credentials](https://github.com/feathr-ai/feathr/blob/41e7496b38c43af6d7f8f1de842f657b27840f6d/docs/how-to-guides/feathr-configuration-and-env.md#a-note-on-using-azure-key-vault-to-store-credentials) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "RESOURCE_PREFIX = None # TODO fill the value\n", - "PROJECT_NAME = \"feathr_getting_started\"\n", - "\n", - "# Data store root path. Could be a local file system path or Azure storage path like abfs or wasbs\n", - "DATA_STORE_PATH = TemporaryDirectory().name\n", - "\n", - "# Currently support: 'azure_synapse', 'databricks', and 'local' \n", - "SPARK_CLUSTER = \"local\"\n", - "# TODO -- Synapse spark pool name or Databricks cluster id\n", - "CLUSTER_NAME = None\n", - "\n", - "# If set True, use an interactive browser authentication\n", - "USE_CLI_AUTH = False\n", - "\n", - "# (For the notebook test pipeline) If true, use ScrapBook package to collect the results.\n", - "SCRAP_RESULTS = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "KEY_VAULT = f\"{RESOURCE_PREFIX}kv\"\n", - "KEY_VAULT_URI = f\"https://{KEY_VAULT}.vault.azure.net\"\n", - "\n", - "ADLS_PATH = f\"abfss://{RESOURCE_PREFIX}fs@{RESOURCE_PREFIX}dls.dfs.core.windows.net/feathr_project\"\n", - "\n", - "if SPARK_CLUSTER == \"azure_synapse\":\n", - " os.environ['spark_config__azure_synapse__dev_url'] = f\"https://{resource_prefix}syws.dev.azuresynapse.net\"\n", - " os.environ['spark_config__azure_synapse__pool_name'] = CLUSTER_NAME\n", - " os.environ['spark_config__azure_synapse__workspace_dir'] = f\"abfss://{adls_fs_name}@{resource_prefix}dls.dfs.core.windows.net/{PROJECT_NAME}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if USE_CLI_AUTH:\n", - " !az login --use-device-code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "# Get all the required credentials from Azure Key Vault\n", - "credential = AzureCliCredential() if USE_CLI_AUTH else DefaultAzureCredential()\n", - "secret_client = SecretClient(vault_url=KEY_VAULT_URI, credential=credential)\n", - "retrieved_secret = secret_client.get_secret('FEATHR-ONLINE-STORE-CONN').value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Redis credential\n", - "os.environ['REDIS_PASSWORD'] = retrieved_secret.split(\",\")[1].split(\"password=\", 1)[1]\n", - "\n", - "if SPARK_CLUSTER == \"local\":\n", - " 
os.environ['SPARK_LOCAL_IP'] = \"127.0.0.1\"\n", - "\n", - "elif SPARK_CLUSTER == \"databricks\" and is_databricks():\n", - " ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()\n", - " databricks_config = {\n", - " 'run_name': \"FEATHR_FILL_IN\",\n", - " 'existing_cluster_id': ctx.tags().get('clusterId').get(),\n", - " 'libraries': [{'jar': \"FEATHR_FILL_IN\"}],\n", - " 'spark_jar_task': {\n", - " 'main_class_name': \"FEATHR_FILL_IN\",\n", - " 'parameters': [\"FEATHR_FILL_IN\"],\n", - " },\n", - " }\n", - " os.environ['spark_config__databricks__workspace_instance_url'] = \"https://\" + ctx.tags().get('browserHostName').get()\n", - " os.environ['spark_config__databricks__config_template'] = json.dumps(databricks_config)\n", - " os.environ['spark_config__databricks__work_dir'] = \"dbfs:/feathr_getting_started\"\n", - " os.environ['DATABRICKS_WORKSPACE_TOKEN_VALUE'] = ctx.apiToken().get()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "41d3648a-9bc9-40dc-90da-bc82b21ef9b3", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Configurations\n", - "\n", - "Feathr uses a yaml file to define configurations. Please refer to [feathr_config.yaml]( https://github.com/linkedin/feathr/blob/main/feathr_project/feathrcli/data/feathr_user_workspace/feathr_config.yaml) for the meaning of each field." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "8cd64e3a-376c-48e6-ba41-5197f3591d48", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "config_path = generate_config(project_name=PROJECT_NAME, spark_cluster=SPARK_CLUSTER, resource_prefix=RESOURCE_PREFIX)\n", - "\n", - "with open(config_path, 'r') as f: \n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All the configurations can be overwritten by environment variables with concatenation of `__` for different layers of the config file. For example, `feathr_runtime_location` for databricks config can be overwritten by setting `spark_config__databricks__feathr_runtime_location` environment variable." 
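For example, a minimal sketch of overriding the Databricks Feathr runtime location through this `__` concatenation rule (the jar path is a placeholder, not a real artifact):

```python
import os

# Layers of the generated YAML config are joined with "__":
# spark_config -> databricks -> feathr_runtime_location
os.environ["spark_config__databricks__feathr_runtime_location"] = (
    "dbfs:/FileStore/jars/feathr-assembly.jar"  # placeholder path
)
```

Since the next cell creates the `FeathrClient` from the generated config, any such override would need to be set before that point.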
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "3fef7f2f-df19-4f53-90a5-ff7999ed983d", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Initialize Feathr client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "9713a2df-c7b2-4562-88b0-b7acce3cc43a", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "client = FeathrClient(config_path=config_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c3b64bda-d42c-4a64-b976-0fb604cf38c5", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Prepare the NYC taxi fare dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# To run on a local spark, start a spark session:\n", - "if SPARK_CLUSTER == \"local\":\n", - " spark = (\n", - " SparkSession\n", - " .builder\n", - " .appName(\"feathr\")\n", - " .config(\"spark.jars.packages\", \"org.apache.spark:spark-avro_2.12:3.3.0\")\n", - " .config(\"spark.ui.port\", \"8080\") # Set ui port other than the default one (4040) so that feathr spark job doesn't fail. \n", - " .getOrCreate()\n", - " )\n", - " \n", - "# Else, you must already have spark session object available in databricks or synapse." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "c4ccd7b3-298a-4e5a-8eec-b7e309db393e", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "DATA_FILE_PATH = str(Path(DATA_STORE_PATH, \"nyc_taxi.csv\"))\n", - "\n", - "# Download the data file\n", - "df_raw = nyc_taxi.get_spark_df(spark=spark, local_cache_path=DATA_FILE_PATH)\n", - "df_raw.limit(5).toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "7430c942-64e5-4b70-b823-16ce1d1b3cee", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Defining features with Feathr\n", - "\n", - "In Feathr, a feature is viewed as a function, mapping a key and timestamp to a feature value. For more details, please see [Feathr Feature Definition Guide](https://github.com/feathr-ai/feathr/blob/main/docs/concepts/feature-definition.md).\n", - "\n", - "* The feature key (a.k.a. entity id) identifies the subject of feature, e.g. a user_id or location_id.\n", - "* The feature name is the aspect of the entity that the feature is indicating, e.g. the age of the user.\n", - "* The feature value is the actual value of that aspect at a particular time, e.g. the value is 30 at year 2022.\n", - "\n", - "Note that, in some cases, a feature could be just a transformation function that has no entity key or timestamp involved, e.g. *the day of week of the request timestamp*.\n", - "\n", - "There are two types of features -- anchored features and derivated features:\n", - "\n", - "* **Anchored features**: Features that are directly extracted from sources. Could be with or without aggregation. \n", - "* **Derived features**: Features that are computed on top of other features.\n", - "\n", - "#### Define anchored features\n", - "\n", - "A feature source is needed for anchored features that describes the raw data in which the feature values are computed from. 
A source value should be either `INPUT_CONTEXT` (the features that will be extracted from the observation data directly) or `feathr.source.Source` object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TIMESTAMP_COL = \"lpep_dropoff_datetime\"\n", - "TIMESTAMP_FORMAT = \"yyyy-MM-dd HH:mm:ss\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "a373ecbe-a040-4cd3-9d87-0d5f4c5ba553", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# We define f_trip_distance and f_trip_time_duration features separately\n", - "# so that we can reuse them later for the derived features.\n", - "f_trip_distance = Feature(\n", - " name=\"f_trip_distance\",\n", - " feature_type=FLOAT,\n", - " transform=\"trip_distance\",\n", - ")\n", - "f_trip_time_duration = Feature(\n", - " name=\"f_trip_time_duration\",\n", - " feature_type=FLOAT,\n", - " transform=\"cast_float((to_unix_timestamp(lpep_dropoff_datetime) - to_unix_timestamp(lpep_pickup_datetime)) / 60)\",\n", - ")\n", - "\n", - "features = [\n", - " f_trip_distance,\n", - " f_trip_time_duration,\n", - " Feature(\n", - " name=\"f_is_long_trip_distance\",\n", - " feature_type=BOOLEAN,\n", - " transform=\"trip_distance > 30.0\",\n", - " ),\n", - " Feature(\n", - " name=\"f_day_of_week\",\n", - " feature_type=INT32,\n", - " transform=\"dayofweek(lpep_dropoff_datetime)\",\n", - " ),\n", - " Feature(\n", - " name=\"f_day_of_month\",\n", - " feature_type=INT32,\n", - " transform=\"dayofmonth(lpep_dropoff_datetime)\",\n", - " ),\n", - " Feature(\n", - " name=\"f_hour_of_day\",\n", - " feature_type=INT32,\n", - " transform=\"hour(lpep_dropoff_datetime)\",\n", - " ),\n", - "]\n", - "\n", - "# After you have defined features, bring them together to build the anchor to the source.\n", - "feature_anchor = FeatureAnchor(\n", - " name=\"feature_anchor\",\n", - " source=INPUT_CONTEXT, # Pass through source, i.e. observation data.\n", - " features=features,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can define the source with a preprocessing python function." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def preprocessing(df: DataFrame) -> DataFrame:\n", - " import pyspark.sql.functions as F\n", - " df = df.withColumn(\"fare_amount_cents\", (F.col(\"fare_amount\") * 100.0).cast(\"float\"))\n", - " return df\n", - "\n", - "batch_source = HdfsSource(\n", - " name=\"nycTaxiBatchSource\",\n", - " path=DATA_FILE_PATH,\n", - " event_timestamp_column=TIMESTAMP_COL,\n", - " preprocessing=preprocessing,\n", - " timestamp_format=TIMESTAMP_FORMAT,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For the features with aggregation, the supported functions are as follows:\n", - "\n", - "| Aggregation Function | Input Type | Description |\n", - "| --- | --- | --- |\n", - "|SUM, COUNT, MAX, MIN, AVG\t|Numeric|Applies the the numerical operation on the numeric inputs. 
|\n", - "|MAX_POOLING, MIN_POOLING, AVG_POOLING\t| Numeric Vector | Applies the max/min/avg operation on a per entry bassis for a given a collection of numbers.|\n", - "|LATEST| Any |Returns the latest not-null values from within the defined time window |" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "agg_key = TypedKey(\n", - " key_column=\"DOLocationID\",\n", - " key_column_type=ValueType.INT32,\n", - " description=\"location id in NYC\",\n", - " full_name=\"nyc_taxi.location_id\",\n", - ")\n", - "\n", - "agg_window = \"90d\"\n", - "\n", - "# Anchored features with aggregations\n", - "agg_features = [\n", - " Feature(\n", - " name=\"f_location_avg_fare\",\n", - " key=agg_key,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(\n", - " agg_expr=\"fare_amount_cents\",\n", - " agg_func=\"AVG\",\n", - " window=agg_window,\n", - " ),\n", - " ),\n", - " Feature(\n", - " name=\"f_location_max_fare\",\n", - " key=agg_key,\n", - " feature_type=FLOAT,\n", - " transform=WindowAggTransformation(\n", - " agg_expr=\"fare_amount_cents\",\n", - " agg_func=\"MAX\",\n", - " window=agg_window,\n", - " ),\n", - " ),\n", - "]\n", - "\n", - "agg_feature_anchor = FeatureAnchor(\n", - " name=\"agg_feature_anchor\",\n", - " source=batch_source, # External data source for feature. Typically a data table.\n", - " features=agg_features,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "d2ecaca9-057e-4b36-811f-320f66f753ed", - "showTitle": false, - "title": "" - } - }, - "source": [ - "#### Define derived features\n", - "\n", - "We also define a derived feature, `f_trip_time_distance`, from the anchored features `f_trip_distance` and `f_trip_time_duration` as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "270fb11e-8a71-404f-9639-ad29d8e6a2c1", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "derived_features = [\n", - " DerivedFeature(\n", - " name=\"f_trip_time_distance\",\n", - " feature_type=FLOAT,\n", - " input_features=[\n", - " f_trip_distance,\n", - " f_trip_time_duration,\n", - " ],\n", - " transform=\"f_trip_distance / f_trip_time_duration\",\n", - " )\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "ad102c45-586d-468c-85f0-9454401ef10b", - "showTitle": false, - "title": "" - } - }, - "source": [ - "### Build features\n", - "\n", - "Finally, we build the features." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "91bb5ebb-87e4-470b-b8eb-1c89b351740e", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "client.build_features(\n", - " anchor_list=[feature_anchor, agg_feature_anchor],\n", - " derived_feature_list=derived_features,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "632d5f46-f9e2-41a8-aab7-34f75206e2aa", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## 3. 
Create Training Data Using Point-in-Time Correct Feature Join\n", - "\n", - "After the feature producers have defined the features (as described in the Feature Definition part), the feature consumers may want to consume those features. Feature consumers will use observation data to query from different feature tables using Feature Query.\n", - "\n", - "To create a training dataset using Feathr, one needs to provide a feature join configuration file to specify\n", - "what features and how these features should be joined to the observation data. \n", - "\n", - "To learn more on this topic, please refer to [Point-in-time Correctness](https://github.com/linkedin/feathr/blob/main/docs/concepts/point-in-time-join.md)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "feature_names = [feature.name for feature in features + agg_features + derived_features]\n", - "feature_names" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "DATA_FORMAT = \"parquet\"\n", - "offline_features_path = str(Path(DATA_STORE_PATH, \"feathr_output\", f\"features.{DATA_FORMAT}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "e438e6d8-162e-4aa3-b3b3-9d1f3b0d2b7f", - "showTitle": false, - "title": "" - }, - "scrolled": false - }, - "outputs": [], - "source": [ - "# Features that we want to request. Can use a subset of features\n", - "query = FeatureQuery(\n", - " feature_list=feature_names,\n", - " key=agg_key,\n", - ")\n", - "settings = ObservationSettings(\n", - " observation_path=DATA_FILE_PATH,\n", - " event_timestamp_column=TIMESTAMP_COL,\n", - " timestamp_format=TIMESTAMP_FORMAT,\n", - ")\n", - "client.get_offline_features(\n", - " observation_settings=settings,\n", - " feature_query=query,\n", - " # For more details, see https://feathr-ai.github.io/feathr/how-to-guides/feathr-job-configuration.html\n", - " execution_configurations=SparkExecutionConfiguration({\n", - " \"spark.feathr.outputFormat\": DATA_FORMAT,\n", - " }),\n", - " output_path=offline_features_path,\n", - ")\n", - "\n", - "client.wait_job_to_finish(timeout_sec=500)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Show feature results\n", - "df = get_result_df(\n", - " spark=spark,\n", - " client=client,\n", - " data_format=DATA_FORMAT,\n", - " res_url=offline_features_path,\n", - ")\n", - "df.select(feature_names).limit(5).toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "inputWidgets": {}, - "nuid": "dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f", - "showTitle": false, - "title": "" - } - }, - "source": [ - "## 4. Train a Prediction Model and Evaluate the Features\n", - "\n", - "After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare prediction. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n", - "\n", - "Note that designing features, training prediction models and evaluating them are an iterative process where the models' performance maybe used to modify the features as a part of the modeling process." 
- {
- "cell_type": "markdown",
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "dcbf17fc-7f79-4a65-a3af-9cffbd0b5d1f",
- "showTitle": false,
- "title": ""
- }
- },
- "source": [
- "## 4. Train a Prediction Model and Evaluate the Features\n",
- "\n",
- "After generating all the features, we train and evaluate a machine learning model to predict the NYC taxi fare. In this example, we use Spark MLlib's [GBTRegressor](https://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression).\n",
- "\n",
- "Note that designing features, training prediction models, and evaluating them is an iterative process where the models' performance may be used to modify the features as part of the modeling process."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Load Train and Test Data from the Offline Feature Values"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Train / test split\n",
- "train_df, test_df = (\n",
- " df # Dataframe that we generated from get_offline_features call.\n",
- " .withColumn(\"label\", F.col(\"fare_amount\").cast(\"double\"))\n",
- " .where(F.col(\"f_trip_time_duration\") > 0)\n",
- " .fillna(0)\n",
- " .randomSplit([0.8, 0.2])\n",
- ")\n",
- "\n",
- "print(f\"Num train samples: {train_df.count()}\")\n",
- "print(f\"Num test samples: {test_df.count()}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Build an ML Pipeline\n",
- "\n",
- "Here, we use a Spark ML Pipeline to assemble the feature vectors and feed them to the model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Generate a feature vector column for SparkML\n",
- "vector_assembler = VectorAssembler(\n",
- " inputCols=[x for x in df.columns if x in feature_names],\n",
- " outputCol=\"features\",\n",
- ")\n",
- "\n",
- "# Define a model\n",
- "gbt = GBTRegressor(\n",
- " featuresCol=\"features\",\n",
- " maxIter=100,\n",
- " maxDepth=5,\n",
- " maxBins=16,\n",
- ")\n",
- "\n",
- "# Create an ML pipeline\n",
- "ml_pipeline = Pipeline(stages=[\n",
- " vector_assembler,\n",
- " gbt,\n",
- "])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Train and Evaluate the Model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Train a model\n",
- "model = ml_pipeline.fit(train_df)\n",
- "\n",
- "# Make predictions\n",
- "predictions = model.transform(test_df)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Evaluate\n",
- "evaluator = RegressionEvaluator(\n",
- " labelCol=\"label\",\n",
- " predictionCol=\"prediction\",\n",
- ")\n",
- "\n",
- "rmse = evaluator.evaluate(predictions, {evaluator.metricName: \"rmse\"})\n",
- "mae = evaluator.evaluate(predictions, {evaluator.metricName: \"mae\"})\n",
- "print(f\"RMSE: {rmse}\\nMAE: {mae}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Predicted fare vs. actual fare plots (TODO: verify this works on Databricks / Synapse / local)\n",
- "predictions_pdf = predictions.select([\"label\", \"prediction\"]).toPandas().reset_index()\n",
- "\n",
- "predictions_pdf.plot(\n",
- " x=\"index\",\n",
- " y=[\"label\", \"prediction\"],\n",
- " style=['-', ':'],\n",
- " figsize=(20, 10),\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "predictions_pdf.plot.scatter(\n",
- " x=\"label\",\n",
- " y=\"prediction\",\n",
- " xlim=(0, 100),\n",
- " ylim=(0, 100),\n",
- " figsize=(10, 10),\n",
- ")"
- ]
- },
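To put the RMSE and MAE above into context, it can help to compare the model against a trivial baseline that always predicts the mean training fare. The sketch below is not part of the original notebook; it reuses `train_df`, `test_df`, `F`, and the `evaluator` defined in the preceding cells.

```python
# Sanity-check sketch: a naive baseline that always predicts the mean fare
# observed in the training set. A useful model should beat this comfortably.
mean_fare = train_df.agg(F.avg("label")).collect()[0][0]
baseline_predictions = test_df.withColumn("prediction", F.lit(float(mean_fare)))

baseline_rmse = evaluator.evaluate(baseline_predictions, {evaluator.metricName: "rmse"})
baseline_mae = evaluator.evaluate(baseline_predictions, {evaluator.metricName: "mae"})
print(f"Baseline RMSE: {baseline_rmse}\nBaseline MAE: {baseline_mae}")
```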
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 5. Register the Features to Share Across Teams\n",
- "\n",
- "You can register your features in the centralized registry and share the corresponding project with other team members who want to consume those features for further use."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "try:\n",
- " client.register_features()\n",
- "except KeyError:\n",
- " # TODO: temporary workaround for the \"Already exists\" error\n",
- " \n",
- " client.list_registered_features(project_name=PROJECT_NAME)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "5a226026-1c7b-48db-8f91-88d5c2ddf023",
- "showTitle": false,
- "title": ""
- }
- },
- "source": [
- "## 6. Materialize Feature Values for Online Scoring\n",
- "\n",
- "While we computed feature values on-the-fly at request time via Feathr, we can also pre-compute the feature values and materialize them to offline or online storage such as Redis.\n",
- "\n",
- "Note that only the features anchored to an offline data source can be materialized."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get registered features\n",
- "registered_features_dict = client.get_features_from_registry(PROJECT_NAME)\n",
- "\n",
- "observation_feature_names = []\n",
- "materialized_feature_names = []\n",
- "\n",
- "for feature_name, feature in registered_features_dict.items():\n",
- " if feature.key[0].key_column == \"NOT_NEEDED\":\n",
- " observation_feature_names.append(feature_name)\n",
- " else:\n",
- " materialized_feature_names.append(feature_name)\n",
- " \n",
- "print(f\"Features that will be extracted directly from the observation: {observation_feature_names}\")\n",
- "print(\"\")\n",
- "print(f\"Features that will be extracted from the source data and materialized to online storage: {materialized_feature_names}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get the last date from the dataset\n",
- "backfill_timestamp = (\n",
- " df_raw\n",
- " .select(F.to_timestamp(F.col(TIMESTAMP_COL), TIMESTAMP_FORMAT).alias(TIMESTAMP_COL))\n",
- " .agg({TIMESTAMP_COL: \"max\"})\n",
- " .collect()[0][0]\n",
- ")\n",
- "backfill_timestamp"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "inputWidgets": {},
- "nuid": "3b924c66-8634-42fe-90f3-c844487d3f75",
- "showTitle": false,
- "title": ""
- },
- "scrolled": false
- },
- "outputs": [],
- "source": [
- "FEATURE_TABLE_NAME = \"nycTaxiDemoFeature\"\n",
- "\n",
- "# Time range to materialize\n",
- "backfill_time = BackfillTime(\n",
- " start=backfill_timestamp,\n",
- " end=backfill_timestamp,\n",
- " step=timedelta(days=1),\n",
- ")\n",
- "\n",
- "# Destinations:\n",
- "# For online store,\n",
- "redis_sink = RedisSink(table_name=FEATURE_TABLE_NAME)\n",
- "\n",
- "# For offline store,\n",
- "# adls_sink = HdfsSink(output_path=)\n",
- "\n",
- "settings = MaterializationSettings(\n",
- " name=FEATURE_TABLE_NAME + \".job\", # job name\n",
- " backfill_time=backfill_time,\n",
- " sinks=[redis_sink], # or adls_sink\n",
- " feature_names=materialized_feature_names,\n",
- ")\n",
- "\n",
- "client.materialize_features(\n",
- " settings=settings,\n",
- " execution_configurations={\"spark.feathr.outputFormat\": \"parquet\"},\n",
- ")\n",
- "\n",
- "client.wait_job_to_finish(timeout_sec=500)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now, you can retrieve features for online scoring as follows:"
- ]
- },
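The materialization cell above only wires up the Redis online sink; the commented-out `adls_sink` line hints that the same job can also write to an offline store. A hedged sketch of what that could look like is below. The output path is purely a placeholder and would need to point at storage your cluster can actually write to; everything else reuses names defined above.

```python
from feathr import HdfsSink, MaterializationSettings

# Hypothetical offline sink: the path below is a placeholder, not a real
# location used by this notebook.
adls_sink = HdfsSink(
    output_path="abfss://<container>@<storage_account>.dfs.core.windows.net/feathr_materialized"
)

offline_settings = MaterializationSettings(
    name=FEATURE_TABLE_NAME + ".offline.job",
    backfill_time=backfill_time,
    sinks=[adls_sink],  # offline store instead of (or in addition to) Redis
    feature_names=materialized_feature_names,
)

# client.materialize_features(settings=offline_settings,
#                             execution_configurations={"spark.feathr.outputFormat": "parquet"})
```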
"source": [ - "# Note, to get a single key, you may use client.get_online_features instead\n", - "materialized_feature_values = client.multi_get_online_features(\n", - " feature_table=FEATURE_TABLE_NAME,\n", - " keys=[\"239\", \"265\"],\n", - " feature_names=materialized_feature_names,\n", - ")\n", - "materialized_feature_values" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cleanup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO: Unregister or any other cleanups." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Stop the spark session if it is a local session.\n", - "if is_jupyter():\n", - " spark.stop()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Scrap Variables for Testing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if SCRAP_RESULTS:\n", - " # Record results for test pipelines\n", - " import scrapbook as sb\n", - " sb.glue(\"materialized_feature_values\", materialized_feature_values)\n", - " sb.glue(\"rmse\", rmse)\n", - " sb.glue(\"mae\", mae)" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 4 - }, - "notebookName": "nyc_driver_demo", - "notebookOrigID": 930353059183053, - "widgets": {} - }, - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "Python 3.10.4 ('feathr')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - }, - "vscode": { - "interpreter": { - "hash": "ddb0e38f168d5afaa0b8ab4851ddd8c14364f1d087c15de6ff2ee5a559aec1f2" - } - } - }, - "nbformat": 4, - "nbformat_minor": 1 -}