diff --git a/.prow/config.yaml b/.prow/config.yaml index 7035d5cc9b..c5f127ab63 100644 --- a/.prow/config.yaml +++ b/.prow/config.yaml @@ -451,6 +451,11 @@ postsubmits: --file infra/docker/serving/Dockerfile \ --google-service-account-file /etc/gcloud/service-account.json + infra/scripts/publish-docker-image.sh \ + --repository gcr.io/kf-feast/feast-jupyter \ + --tag ${PULL_BASE_REF:1} \ + --file infra/docker/jupyter/Dockerfile \ + --google-service-account-file /etc/gcloud/service-account.json HIGHEST_SEMVER_TAG=$(git tag -l --sort -version:refname | head -n 1) echo "Only push to latest tag if tag is the highest semver version $HIGHEST_SEMVER_TAG" @@ -462,6 +467,9 @@ postsubmits: docker tag gcr.io/kf-feast/feast-serving:${PULL_BASE_REF:1} gcr.io/kf-feast/feast-serving:latest docker push gcr.io/kf-feast/feast-serving:latest + + docker tag gcr.io/kf-feast/feast-jupyter:${PULL_BASE_REF:1} gcr.io/kf-feast/feast-jupyter:latest + docker push gcr.io/kf-feast/feast-jupyter:latest fi fi diff --git a/examples/basic/basic.ipynb b/examples/basic/basic.ipynb index a56121328c..3d6bb3cc2a 100644 --- a/examples/basic/basic.ipynb +++ b/examples/basic/basic.ipynb @@ -11,7 +11,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This is a minimal example of using Feast. In this example we will\n", + "This is a minimal example of using Feast. The point is to show users how to get data into Feast and how to retrieve features for online serving and model training.\n", + "\n", + "In this example we will\n", "1. Create a synthetic customer feature dataset\n", "2. Register a feature set to represent these features in Feast\n", "3. Ingest these features into Feast\n", @@ -60,55 +62,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: feast in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (0.5.0.post0)\n", - "Requirement already satisfied: google in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (2.0.3)\n", - "Requirement already satisfied: tabulate==0.8.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (0.8.7)\n", - "Requirement already satisfied: pandavro==1.5.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.5.1)\n", - "Requirement already satisfied: pandas==0.* in /home/zzy/.local/lib/python3.7/site-packages (from feast) (0.25.0)\n", - "Requirement already satisfied: google-cloud-core==1.0.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.0.3)\n", - "Requirement already satisfied: grpcio==1.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.29.0)\n", - "Requirement already satisfied: fastavro<0.23,>=0.22.11 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (0.22.13)\n", - "Requirement already satisfied: PyYAML==5.1.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (5.1.2)\n", - "Requirement already satisfied: googleapis-common-protos==1.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.51.0)\n", - "Requirement already satisfied: tqdm==4.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (4.46.0)\n", - "Requirement already satisfied: numpy in /home/zzy/.local/lib/python3.7/site-packages (from feast) (1.17.4)\n", - "Requirement already satisfied: confluent-kafka in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.4.2)\n", - "Requirement already satisfied: google-cloud-bigquery-storage==0.7.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (0.7.0)\n", - "Requirement already satisfied: toml==0.10.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (0.10.1)\n", - "Requirement already satisfied: protobuf>=3.10 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (3.12.1)\n", - "Requirement already satisfied: google-cloud-bigquery==1.18.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.18.1)\n", - "Requirement already satisfied: pyarrow>=0.15.1 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (0.17.1)\n", - "Requirement already satisfied: google-api-core==1.14.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.14.3)\n", - "Requirement already satisfied: google-cloud-storage==1.20.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.20.0)\n", - "Requirement already satisfied: google-auth==1.6.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.6.3)\n", - "Requirement already satisfied: kafka-python==1.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.4.7)\n", - "Requirement already satisfied: Click==7.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (7.1.2)\n", - "Requirement already satisfied: beautifulsoup4 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from google->feast) (4.9.1)\n", - "Requirement already satisfied: six>=1.9 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from pandavro==1.5.*->feast) (1.14.0)\n", - "Requirement already satisfied: python-dateutil>=2.6.1 in /home/zzy/.local/lib/python3.7/site-packages (from pandas==0.*->feast) (2.8.0)\n", - "Requirement already satisfied: pytz>=2017.2 in /home/zzy/.local/lib/python3.7/site-packages (from pandas==0.*->feast) (2019.1)\n", - "Requirement already satisfied: setuptools in /home/zzy/.local/lib/python3.7/site-packages (from protobuf>=3.10->feast) (41.6.0)\n", - "Requirement already satisfied: google-resumable-media<0.5.0dev,>=0.3.1 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from google-cloud-bigquery==1.18.*->feast) (0.4.1)\n", - "Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from google-api-core==1.14.*->feast) (2.23.0)\n", - "Requirement already satisfied: rsa>=3.1.4 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from google-auth==1.6.*->feast) (4.0)\n", - "Requirement already satisfied: cachetools>=2.0.0 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from google-auth==1.6.*->feast) (4.1.0)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from google-auth==1.6.*->feast) (0.2.8)\n", - "Requirement already satisfied: soupsieve>1.2 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from beautifulsoup4->google->feast) (2.0.1)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core==1.14.*->feast) (3.0.4)\n", - "Requirement already satisfied: idna<3,>=2.5 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core==1.14.*->feast) (2.9)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core==1.14.*->feast) (1.25.8)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /home/zzy/.local/lib/python3.7/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core==1.14.*->feast) (2019.6.16)\n", - "Requirement already satisfied: pyasn1>=0.1.3 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from rsa>=3.1.4->google-auth==1.6.*->feast) (0.4.8)\n" - ] - } - ], + "outputs": [], "source": [ "!pip install feast" ] @@ -122,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -149,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -172,165 +128,82 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "days = [datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0).replace(tzinfo=utc) \\\n", - " - timedelta(day) for day in range(3)][::-1]\n", + " - timedelta(day) for day in range(10)][::-1]\n", "\n", "customers = [1001, 1002, 1003, 1004, 1005]" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datetimecustomer_iddaily_transactionstotal_transactions
02020-05-25 00:00:00+00:0010018.70480277
12020-05-25 00:00:00+00:0010027.16388731
22020-05-25 00:00:00+00:0010039.93597668
32020-05-25 00:00:00+00:0010041.10798078
42020-05-25 00:00:00+00:0010058.30738136
52020-05-26 00:00:00+00:0010012.41681114
62020-05-26 00:00:00+00:0010024.8177359
72020-05-26 00:00:00+00:0010034.40971495
82020-05-26 00:00:00+00:0010046.6173176
92020-05-26 00:00:00+00:0010051.03252586
\n", - "
" - ], - "text/plain": [ - " datetime customer_id daily_transactions \\\n", - "0 2020-05-25 00:00:00+00:00 1001 8.704802 \n", - "1 2020-05-25 00:00:00+00:00 1002 7.163887 \n", - "2 2020-05-25 00:00:00+00:00 1003 9.935976 \n", - "3 2020-05-25 00:00:00+00:00 1004 1.107980 \n", - "4 2020-05-25 00:00:00+00:00 1005 8.307381 \n", - "5 2020-05-26 00:00:00+00:00 1001 2.416811 \n", - "6 2020-05-26 00:00:00+00:00 1002 4.817735 \n", - "7 2020-05-26 00:00:00+00:00 1003 4.409714 \n", - "8 2020-05-26 00:00:00+00:00 1004 6.617317 \n", - "9 2020-05-26 00:00:00+00:00 1005 1.032525 \n", - "\n", - " total_transactions \n", - "0 77 \n", - "1 31 \n", - "2 68 \n", - "3 78 \n", - "4 36 \n", - "5 14 \n", - "6 9 \n", - "7 95 \n", - "8 6 \n", - "9 86 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + " datetime customer_id daily_transactions \\\n", + "0 2020-06-09 00:00:00+00:00 1001 0.564751 \n", + "1 2020-06-09 00:00:00+00:00 1002 3.945566 \n", + "2 2020-06-09 00:00:00+00:00 1003 7.291928 \n", + "3 2020-06-09 00:00:00+00:00 1004 6.690477 \n", + "4 2020-06-09 00:00:00+00:00 1005 6.415899 \n", + "5 2020-06-10 00:00:00+00:00 1001 0.347294 \n", + "6 2020-06-10 00:00:00+00:00 1002 5.363853 \n", + "7 2020-06-10 00:00:00+00:00 1003 0.538129 \n", + "8 2020-06-10 00:00:00+00:00 1004 4.755425 \n", + "9 2020-06-10 00:00:00+00:00 1005 2.867527 \n", + "10 2020-06-11 00:00:00+00:00 1001 9.493098 \n", + "11 2020-06-11 00:00:00+00:00 1002 5.130665 \n", + "12 2020-06-11 00:00:00+00:00 1003 1.794191 \n", + "13 2020-06-11 00:00:00+00:00 1004 4.698504 \n", + "14 2020-06-11 00:00:00+00:00 1005 2.908603 \n", + "15 2020-06-12 00:00:00+00:00 1001 9.857894 \n", + "16 2020-06-12 00:00:00+00:00 1002 5.416553 \n", + "17 2020-06-12 00:00:00+00:00 1003 5.374058 \n", + "18 2020-06-12 00:00:00+00:00 1004 9.834441 \n", + "19 2020-06-12 00:00:00+00:00 1005 0.480373 \n", + "\n", + " total_transactions \n", + "0 73 \n", + "1 75 \n", + "2 95 \n", + "3 50 \n", + "4 65 \n", + "5 28 \n", + "6 76 \n", + "7 42 \n", + "8 53 \n", + "9 61 \n", + "10 86 \n", + "11 31 \n", + "12 69 \n", + "13 9 \n", + "14 51 \n", + "15 23 \n", + "16 1 \n", + "17 34 \n", + "18 13 \n", + "19 50 \n" + ] } ], "source": [ "customer_features = pd.DataFrame(\n", " {\n", - " \"datetime\": [day for day in days for customer in customers],\n", - " \"customer_id\": [customer for day in days for customer in customers],\n", - " \"daily_transactions\": [np.random.rand() * 10 for _ in range(len(days) * len(customers))],\n", - " \"total_transactions\": [np.random.randint(100) for _ in range(len(days) * len(customers))],\n", + " \"datetime\": [day for day in days for customer in customers], # Datetime is required\n", + " \"customer_id\": [customer for day in days for customer in customers], # Customer is the entity\n", + " \"daily_transactions\": [np.random.rand() * 10 for _ in range(len(days) * len(customers))], # Feature 1\n", + " \"total_transactions\": [np.random.randint(100) for _ in range(len(days) * len(customers))], # Feature 2\n", " }\n", ")\n", "\n", - "customer_features.head(10)" + "print(customer_features.head(20))" ] }, { @@ -350,14 +223,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "customer_fs = FeatureSet(\n", " \"customer_transactions\",\n", - " entities=[Entity(name='customer_id', dtype=ValueType.INT64)],\n", - " max_age=Duration(seconds=432000) \n", + " entities=[Entity(name='customer_id', dtype=ValueType.INT64)]\n", ")" ] }, @@ -370,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -398,7 +270,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The apply() method will register the provided feature set with Feast core, allowing users to retrieve features from this feature set" + "The apply() method will register the provided feature set with Feast Core (the feature registry)." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature set updated: \"customer_transactions\"\n" + ] + } + ], + "source": [ + "client.apply(customer_fs)" ] }, { @@ -410,7 +299,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "Feature set created: \"customer_transactions\"\n", "{\n", " \"spec\": {\n", " \"name\": \"customer_transactions\",\n", @@ -422,15 +310,15 @@ " ],\n", " \"features\": [\n", " {\n", - " \"name\": \"daily_transactions\",\n", - " \"valueType\": \"DOUBLE\"\n", - " },\n", - " {\n", " \"name\": \"total_transactions\",\n", " \"valueType\": \"INT64\"\n", + " },\n", + " {\n", + " \"name\": \"daily_transactions\",\n", + " \"valueType\": \"DOUBLE\"\n", " }\n", " ],\n", - " \"maxAge\": \"432000s\",\n", + " \"maxAge\": \"0s\",\n", " \"source\": {\n", " \"type\": \"KAFKA\",\n", " \"kafkaSourceConfig\": {\n", @@ -441,15 +329,14 @@ " \"project\": \"default\"\n", " },\n", " \"meta\": {\n", - " \"createdTimestamp\": \"2020-05-27T03:58:07Z\",\n", - " \"status\": \"STATUS_PENDING\"\n", + " \"createdTimestamp\": \"2020-06-18T12:04:08Z\",\n", + " \"status\": \"STATUS_READY\"\n", " }\n", "}\n" ] } ], "source": [ - "client.apply(customer_fs)\n", "customer_fs = client.get_feature_set(\"customer_transactions\")\n", "print(customer_fs)" ] @@ -461,11 +348,26 @@ "### 7. Ingest data into Feast for a feature set" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we will ingest/load data into Feast. This process populates all registered stores (BigQuery, Redis) with your feature data." + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " 0%| | 0/50 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datetimecustomer_id
02020-05-25 00:00:00+00:001001
12020-05-25 00:00:00+00:001002
22020-05-25 00:00:00+00:001003
32020-05-25 00:00:00+00:001004
42020-05-25 00:00:00+00:001005
52020-05-26 00:00:00+00:001001
62020-05-26 00:00:00+00:001002
72020-05-26 00:00:00+00:001003
82020-05-26 00:00:00+00:001004
92020-05-26 00:00:00+00:001005
\n", - "" - ], - "text/plain": [ - " datetime customer_id\n", - "0 2020-05-25 00:00:00+00:00 1001\n", - "1 2020-05-25 00:00:00+00:00 1002\n", - "2 2020-05-25 00:00:00+00:00 1003\n", - "3 2020-05-25 00:00:00+00:00 1004\n", - "4 2020-05-25 00:00:00+00:00 1005\n", - "5 2020-05-26 00:00:00+00:00 1001\n", - "6 2020-05-26 00:00:00+00:00 1002\n", - "7 2020-05-26 00:00:00+00:00 1003\n", - "8 2020-05-26 00:00:00+00:00 1004\n", - "9 2020-05-26 00:00:00+00:00 1005" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + " datetime customer_id\n", + "0 2020-06-15 10:35:10.918716+00:00 1001\n", + "1 2020-06-15 14:00:10.918758+00:00 1002\n", + "2 2020-06-17 08:59:10.918767+00:00 1003\n", + "3 2020-06-13 16:51:10.918774+00:00 1004\n", + "4 2020-06-17 06:14:10.918780+00:00 1005\n", + "5 2020-06-17 14:33:10.918786+00:00 1001\n", + "6 2020-06-14 23:15:10.918792+00:00 1002\n", + "7 2020-06-15 11:25:10.918798+00:00 1003\n", + "8 2020-06-18 09:04:10.918804+00:00 1004\n", + "9 2020-06-16 10:27:10.918810+00:00 1005\n" + ] } ], "source": [ + "event_timestamps = [datetime.utcnow().replace(tzinfo=utc) - timedelta(days=randrange(5), hours=randrange(24), minutes=randrange(60)) for day in range(30)]\n", + "\n", "entity_rows = pd.DataFrame(\n", " {\n", - " \"datetime\": [day for day in days for customer in customers],\n", - " \"customer_id\": [customer for day in days for customer in customers],\n", + " \"datetime\": event_timestamps,\n", + " \"customer_id\": [customers[idx % len(customers)] for idx in range(len(event_timestamps))],\n", " }\n", ")\n", "\n", - "entity_rows.head(10)" + "print(entity_rows.head(10))" ] }, { @@ -741,12 +556,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Next we will create a new client object, but this time we will configure it to connect to the Batch Serving Service. This service will allow us to retrieve historical feature data." + "Next we will create a new client object, but this time we will configure it to connect to the Feast Batch Serving. This service will allow us to retrieve historical feature data." ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -757,12 +572,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By calling the `get_batch_features` method we are able to retrieve a `job` object for the exporting of feature data. For every entity and timestamp combination in `entity_rows` we will be receiving a row with feature values joined to it." + "By calling the `get_batch_features` method we are able to retrieve a `job` object. This object can be used to retrieve the resulting training dataset that is exported by Feast. \n", + "\n", + "The dataset that is returned will contain feature values for each entity and timestamp combination in `entity_rows`." ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 16, "metadata": { "scrolled": true }, @@ -786,108 +603,114 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "df = job.to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
event_timestampcustomer_iddaily_transactionstotal_transactions
02020-05-26 00:00:00+00:0010012.41681114
12020-05-26 00:00:00+00:0010046.6173176
22020-05-26 00:00:00+00:0010034.40971495
32020-05-26 00:00:00+00:0010051.03252586
42020-05-26 00:00:00+00:0010024.8177359
\n", - "
" - ], - "text/plain": [ - " event_timestamp customer_id daily_transactions \\\n", - "0 2020-05-26 00:00:00+00:00 1001 2.416811 \n", - "1 2020-05-26 00:00:00+00:00 1004 6.617317 \n", - "2 2020-05-26 00:00:00+00:00 1003 4.409714 \n", - "3 2020-05-26 00:00:00+00:00 1005 1.032525 \n", - "4 2020-05-26 00:00:00+00:00 1002 4.817735 \n", - "\n", - " total_transactions \n", - "0 14 \n", - "1 6 \n", - "2 95 \n", - "3 86 \n", - "4 9 " - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + " event_timestamp customer_id daily_transactions \\\n", + "0 2020-06-13 23:45:10.918874+00:00 1001 1.879220 \n", + "1 2020-06-18 12:11:10.918845+00:00 1001 5.122846 \n", + "2 2020-06-17 20:46:10.918903+00:00 1001 2.145294 \n", + "3 2020-06-18 02:50:10.918816+00:00 1001 5.122846 \n", + "4 2020-06-15 10:35:10.918716+00:00 1001 5.758472 \n", + "5 2020-06-17 14:33:10.918786+00:00 1001 2.145294 \n", + "6 2020-06-14 23:15:10.918792+00:00 1002 5.467141 \n", + "7 2020-06-14 07:22:10.918851+00:00 1002 5.467141 \n", + "8 2020-06-17 23:40:10.918880+00:00 1002 3.338614 \n", + "9 2020-06-15 14:00:10.918758+00:00 1002 4.921264 \n", + "10 2020-06-15 18:22:10.918909+00:00 1002 4.921264 \n", + "11 2020-06-16 21:10:10.918822+00:00 1002 1.838186 \n", + "12 2020-06-18 05:47:10.918886+00:00 1003 2.702916 \n", + "13 2020-06-17 08:59:10.918767+00:00 1003 0.211125 \n", + "14 2020-06-15 11:25:10.918798+00:00 1003 4.476252 \n", + "15 2020-06-16 09:56:10.918857+00:00 1003 9.123597 \n", + "16 2020-06-14 11:39:10.918915+00:00 1003 6.353373 \n", + "17 2020-06-15 03:21:10.918828+00:00 1003 4.476252 \n", + "18 2020-06-18 09:04:10.918804+00:00 1004 8.756623 \n", + "19 2020-06-14 14:18:10.918834+00:00 1004 8.647374 \n", + "20 2020-06-17 03:10:10.918863+00:00 1004 2.377199 \n", + "21 2020-06-13 16:51:10.918774+00:00 1004 6.362085 \n", + "22 2020-06-15 03:54:10.918892+00:00 1004 8.235070 \n", + "23 2020-06-17 19:01:10.918921+00:00 1004 2.377199 \n", + "24 2020-06-17 06:14:10.918780+00:00 1005 9.958688 \n", + "25 2020-06-16 08:23:10.918839+00:00 1005 0.006388 \n", + "26 2020-06-16 00:30:10.918927+00:00 1005 0.006388 \n", + "27 2020-06-16 10:27:10.918810+00:00 1005 0.006388 \n", + "28 2020-06-17 01:50:10.918869+00:00 1005 9.958688 \n", + "29 2020-06-17 08:42:10.918897+00:00 1005 9.958688 \n", + "\n", + " total_transactions \n", + "0 7 \n", + "1 96 \n", + "2 63 \n", + "3 96 \n", + "4 85 \n", + "5 63 \n", + "6 10 \n", + "7 10 \n", + "8 50 \n", + "9 55 \n", + "10 55 \n", + "11 83 \n", + "12 50 \n", + "13 96 \n", + "14 61 \n", + "15 85 \n", + "16 69 \n", + "17 61 \n", + "18 84 \n", + "19 95 \n", + "20 25 \n", + "21 2 \n", + "22 58 \n", + "23 25 \n", + "24 6 \n", + "25 36 \n", + "26 36 \n", + "27 36 \n", + "28 6 \n", + "29 6 \n" + ] } ], "source": [ - "df = job.to_dataframe()\n", - "df.head()" + "print(df.head(50))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The training dataset is staged on Google Cloud Storage and can be accessed directly if it is too large to load into memory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "job.get_avro_files()" ] } ], "metadata": { "kernelspec": { - "display_name": "feast-ml-py374", + "display_name": "Python 3", "language": "python", - "name": "feast-ml-py374" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -899,9 +722,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.7.6" } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 2 } diff --git a/examples/feast-xgboost-churn-prediction-tutorial/Telecom Customer Churn Prediction (with Feast and XGBoost).ipynb b/examples/feast-xgboost-churn-prediction-tutorial/Telecom Customer Churn Prediction (with Feast and XGBoost).ipynb index c29c01efff..8bc1f1edc1 100644 --- a/examples/feast-xgboost-churn-prediction-tutorial/Telecom Customer Churn Prediction (with Feast and XGBoost).ipynb +++ b/examples/feast-xgboost-churn-prediction-tutorial/Telecom Customer Churn Prediction (with Feast and XGBoost).ipynb @@ -11,12 +11,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This tutorial will demonstrate the use of Feast in productionising a churn model. The tutorial is broken down into two sections\n", + "This tutorial will demonstrate the use of Feast in productionizing a churn model. The tutorial will walk through:\n", "\n", "1. Churn Modelling (without Feast): In this section we explore the data, refine it, train a model, and evaluate its performance.\n", "2. Churn Modelling (with Feast): In this section we introduce Feast for feature storage, management, as well as serving.\n", "\n", - "\n", "This tutorial is an extension of [this](https://www.kaggle.com/pavanraj159/telecom-customer-churn-prediction/comments#6.-Model-Performances) Kaggle notebook" ] }, @@ -29,9 +28,36 @@ "## 1. Churn Modelling (without Feast)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install dependencies for tutorial" + ] + }, { "cell_type": "code", "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!pip install sklearn python-slugify plotly statsmodels yellowbrick xgboost" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_kg_hide-input": false, @@ -39,27 +65,20 @@ }, "outputs": [], "source": [ - "import numpy as np\n", - "import pandas as pd\n", "import os\n", - "import matplotlib.pyplot as plt\n", - "from PIL import Image\n", - "%matplotlib inline\n", + "import numpy as np\n", "import pandas as pd\n", - "import seaborn as sns\n", - "import itertools\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", - "import io\n", "import plotly.offline as py\n", - "py.init_notebook_mode(connected=True)\n", "import plotly.graph_objs as go\n", "import plotly.tools as tls\n", - "import plotly.figure_factory as ff\n", - "import statsmodels, yellowbrick\n", - "import sklearn # Tested with 0.22.1\n", - "import imblearn\n", - "from slugify import slugify" + "from slugify import slugify\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.preprocessing import StandardScaler\n", + "from xgboost import XGBClassifier\n", + "from feast import Client, FeatureSet, Entity, ValueType\n", + "import datetime as dt" ] }, { @@ -69,9 +88,16 @@ "### 1.1 Data" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Retrieve the raw telco dataset" + ] + }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" @@ -279,7 +305,7 @@ "[5 rows x 21 columns]" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -300,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "_uuid": "8b10c13086dff7182e399b849e31bc03df54a14e" }, @@ -350,7 +376,7 @@ "target_col = [\"Churn\"]\n", "cat_cols = telcom.nunique()[telcom.nunique() < 6].keys().tolist()\n", "cat_cols = [x for x in cat_cols if x not in target_col]\n", - "num_cols = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col]\n" + "num_cols = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col]" ] }, { @@ -364,7 +390,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "_uuid": "8921591320c5e336ec5a2e1efc5ed3cb0f9ec1b2" }, @@ -585,15 +611,12 @@ "[5 rows x 35 columns]" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from sklearn.preprocessing import LabelEncoder\n", - "from sklearn.preprocessing import StandardScaler\n", - "\n", "# Customer id col\n", "Id_col = ['customer_id']\n", "\n", @@ -647,7 +670,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -689,7 +712,7 @@ " customer_id\n", " 7032\n", " 7032\n", - " 0835-DUUIQ\n", + " 0256-LTHVJ\n", " 1\n", " NaN\n", " NaN\n", @@ -1181,7 +1204,7 @@ ], "text/plain": [ " count unique top freq \\\n", - "customer_id 7032 7032 0835-DUUIQ 1 \n", + "customer_id 7032 7032 0256-LTHVJ 1 \n", "gender 7032 NaN NaN NaN \n", "seniorcitizen 7032 NaN NaN NaN \n", "partner 7032 NaN NaN NaN \n", @@ -1292,7 +1315,7 @@ "totalcharges -0.830249 -0.390815 0.666827 2.82426 " ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -1312,11 +1335,93 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "_uuid": "b52cf9c7f402ed706e82221e3f8601fdeea9ab27" }, "outputs": [ + { + "data": { + "text/html": [ + " \n", + " " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "application/vnd.plotly.v1+json": { @@ -1457,7 +1562,7 @@ -0.007514979909200033, -0.01632782307070617, -0.013092839264555001, - -0.0008067457759124324, + -8.067457759124324E-4, -0.008507162405232782, -0.0071243969867245535, -0.010105418366566195, @@ -1474,16 +1579,16 @@ -0.003603167413572989, -0.015973079031173835, 0.001631872518613598, - 0.0008437084888753327, + 8.437084888753327E-4, 0.01319936726545174, -0.0010503798619876774, - -0.0006494991207074467, + -6.494991207074467E-4, -0.010516394125351592, -0.004318975744551275, 0.01627881894278637, 0.005285371870295646, -0.013779327268354416, - 4.783950839776602e-05 + 4.783950839776602E-5 ], [ -0.001819390613419179, @@ -1540,7 +1645,7 @@ 0.1425612874681736, -0.0010430787434336079, 0.0012346095228208073, - -0.0002855204740384597, + -2.855204740384597E-4, -0.2802019157901561, 0.08306706395255747, 0.24733370647615796, @@ -1690,7 +1795,7 @@ 0.11139068731904943, 0.0869415675760231, 0.09045518641091457, - -0.00036426636786500763, + -3.6426636786500763E-4, -0.17407470231312427, -0.26736609150996915, -0.08408097138202993, @@ -1702,7 +1807,7 @@ 0.510100290145439 ], [ - -0.0008067457759124324, + -8.067457759124324E-4, 0.059513871482029225, 0.1535564364182745, 0.013899668260943368, @@ -2054,8 +2159,8 @@ 0.04275388869901973, -0.0014704079413737543, -0.013428683581908657, - -0.0004678565419111244, - -0.00079001062411148, + -4.678565419111244E-4, + -7.9001062411148E-4, 0.014665913237832425, 0.013786269825793156, -0.16136793538251534, @@ -2100,7 +2205,7 @@ [ 0.004744965758849955, -0.18251949495535458, - -0.0002855204740384597, + -2.855204740384597E-4, 0.13838288994798562, 0.1718171065699321, -0.33279949932167546, @@ -2195,7 +2300,7 @@ 0.05762874929825631, 0.06758968145792751, -0.10954646682258033, - 0.0001971241890499786, + 1.971241890499786E-4, -0.2512993032332009, -0.017196458507178478, 0.15389327309228798, @@ -2314,13 +2419,13 @@ 0.1826633671546002 ], [ - 0.0008437084888753327, + 8.437084888753327E-4, 0.17132216591713703, -0.08320661736633733, -0.1492739811934336, 0.0027471183312986857, -0.11229466175861408, - -0.00036426636786500763, + -3.6426636786500763E-4, -0.003308493511411254, -0.11480726996437085, 0.1447470086556032, @@ -2370,7 +2475,7 @@ -0.3059839224841771, 0.3196937439459745, 0.006208692442055045, - 0.0001971241890499786, + 1.971241890499786E-4, -0.007422540118618717, -0.28809669563791657, -0.28558254792863036, @@ -2422,7 +2527,7 @@ -0.5924430690900127 ], [ - -0.0006494991207074467, + -6.494991207074467E-4, 0.0018604411671567017, -0.048481275955609554, -0.0014594010172321779, @@ -2474,7 +2579,7 @@ -0.03215707580783382, 0.014777815951004133, 0.023690157563696343, - -0.0004678565419111244, + -4.678565419111244E-4, 0.005613761063888361, -0.006230482005525746, -0.05215631834144274, @@ -2510,7 +2615,7 @@ -0.08063023581160907, 0.009750281447949528, 0.07573985650934924, - -0.00079001062411148, + -7.9001062411148E-4, 0.01746632456448475, -0.020153136101165186, -0.19870886170688193, @@ -2638,7 +2743,7 @@ 0.6510648032262032 ], [ - 4.783950839776602e-05, + 4.783950839776602E-5, 0.10241060539532633, 0.31907236323857324, 0.0646532494217739, @@ -3510,20 +3615,20 @@ "
\n", " \n", " \n", - "
\n", + "
\n", "