diff --git a/.prow/config.yaml b/.prow/config.yaml
index 7035d5cc9b..c5f127ab63 100644
--- a/.prow/config.yaml
+++ b/.prow/config.yaml
@@ -451,6 +451,11 @@ postsubmits:
--file infra/docker/serving/Dockerfile \
--google-service-account-file /etc/gcloud/service-account.json
+ infra/scripts/publish-docker-image.sh \
+ --repository gcr.io/kf-feast/feast-jupyter \
+ --tag ${PULL_BASE_REF:1} \
+ --file infra/docker/jupyter/Dockerfile \
+ --google-service-account-file /etc/gcloud/service-account.json
HIGHEST_SEMVER_TAG=$(git tag -l --sort -version:refname | head -n 1)
echo "Only push to latest tag if tag is the highest semver version $HIGHEST_SEMVER_TAG"
@@ -462,6 +467,9 @@ postsubmits:
docker tag gcr.io/kf-feast/feast-serving:${PULL_BASE_REF:1} gcr.io/kf-feast/feast-serving:latest
docker push gcr.io/kf-feast/feast-serving:latest
+
+ docker tag gcr.io/kf-feast/feast-jupyter:${PULL_BASE_REF:1} gcr.io/kf-feast/feast-jupyter:latest
+ docker push gcr.io/kf-feast/feast-jupyter:latest
fi
fi
diff --git a/examples/basic/basic.ipynb b/examples/basic/basic.ipynb
index a56121328c..3d6bb3cc2a 100644
--- a/examples/basic/basic.ipynb
+++ b/examples/basic/basic.ipynb
@@ -11,7 +11,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "This is a minimal example of using Feast. In this example we will\n",
+ "This is a minimal example of using Feast. The point is to show users how to get data into Feast and how to retrieve features for online serving and model training.\n",
+ "\n",
+ "In this example we will\n",
"1. Create a synthetic customer feature dataset\n",
"2. Register a feature set to represent these features in Feast\n",
"3. Ingest these features into Feast\n",
@@ -60,55 +62,9 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: feast in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (0.5.0.post0)\n",
- "Requirement already satisfied: google in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (2.0.3)\n",
- "Requirement already satisfied: tabulate==0.8.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (0.8.7)\n",
- "Requirement already satisfied: pandavro==1.5.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.5.1)\n",
- "Requirement already satisfied: pandas==0.* in /home/zzy/.local/lib/python3.7/site-packages (from feast) (0.25.0)\n",
- "Requirement already satisfied: google-cloud-core==1.0.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.0.3)\n",
- "Requirement already satisfied: grpcio==1.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.29.0)\n",
- "Requirement already satisfied: fastavro<0.23,>=0.22.11 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (0.22.13)\n",
- "Requirement already satisfied: PyYAML==5.1.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (5.1.2)\n",
- "Requirement already satisfied: googleapis-common-protos==1.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.51.0)\n",
- "Requirement already satisfied: tqdm==4.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (4.46.0)\n",
- "Requirement already satisfied: numpy in /home/zzy/.local/lib/python3.7/site-packages (from feast) (1.17.4)\n",
- "Requirement already satisfied: confluent-kafka in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.4.2)\n",
- "Requirement already satisfied: google-cloud-bigquery-storage==0.7.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (0.7.0)\n",
- "Requirement already satisfied: toml==0.10.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (0.10.1)\n",
- "Requirement already satisfied: protobuf>=3.10 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (3.12.1)\n",
- "Requirement already satisfied: google-cloud-bigquery==1.18.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.18.1)\n",
- "Requirement already satisfied: pyarrow>=0.15.1 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (0.17.1)\n",
- "Requirement already satisfied: google-api-core==1.14.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.14.3)\n",
- "Requirement already satisfied: google-cloud-storage==1.20.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.20.0)\n",
- "Requirement already satisfied: google-auth==1.6.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.6.3)\n",
- "Requirement already satisfied: kafka-python==1.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (1.4.7)\n",
- "Requirement already satisfied: Click==7.* in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from feast) (7.1.2)\n",
- "Requirement already satisfied: beautifulsoup4 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from google->feast) (4.9.1)\n",
- "Requirement already satisfied: six>=1.9 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from pandavro==1.5.*->feast) (1.14.0)\n",
- "Requirement already satisfied: python-dateutil>=2.6.1 in /home/zzy/.local/lib/python3.7/site-packages (from pandas==0.*->feast) (2.8.0)\n",
- "Requirement already satisfied: pytz>=2017.2 in /home/zzy/.local/lib/python3.7/site-packages (from pandas==0.*->feast) (2019.1)\n",
- "Requirement already satisfied: setuptools in /home/zzy/.local/lib/python3.7/site-packages (from protobuf>=3.10->feast) (41.6.0)\n",
- "Requirement already satisfied: google-resumable-media<0.5.0dev,>=0.3.1 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from google-cloud-bigquery==1.18.*->feast) (0.4.1)\n",
- "Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from google-api-core==1.14.*->feast) (2.23.0)\n",
- "Requirement already satisfied: rsa>=3.1.4 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from google-auth==1.6.*->feast) (4.0)\n",
- "Requirement already satisfied: cachetools>=2.0.0 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from google-auth==1.6.*->feast) (4.1.0)\n",
- "Requirement already satisfied: pyasn1-modules>=0.2.1 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from google-auth==1.6.*->feast) (0.2.8)\n",
- "Requirement already satisfied: soupsieve>1.2 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from beautifulsoup4->google->feast) (2.0.1)\n",
- "Requirement already satisfied: chardet<4,>=3.0.2 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core==1.14.*->feast) (3.0.4)\n",
- "Requirement already satisfied: idna<3,>=2.5 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core==1.14.*->feast) (2.9)\n",
- "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core==1.14.*->feast) (1.25.8)\n",
- "Requirement already satisfied: certifi>=2017.4.17 in /home/zzy/.local/lib/python3.7/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core==1.14.*->feast) (2019.6.16)\n",
- "Requirement already satisfied: pyasn1>=0.1.3 in /home/zzy/.conda/envs/feast-ml/lib/python3.7/site-packages (from rsa>=3.1.4->google-auth==1.6.*->feast) (0.4.8)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"!pip install feast"
]
@@ -122,7 +78,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -149,7 +105,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -172,165 +128,82 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"days = [datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0).replace(tzinfo=utc) \\\n",
- " - timedelta(day) for day in range(3)][::-1]\n",
+ " - timedelta(day) for day in range(10)][::-1]\n",
"\n",
"customers = [1001, 1002, 1003, 1004, 1005]"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " datetime | \n",
- " customer_id | \n",
- " daily_transactions | \n",
- " total_transactions | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 2020-05-25 00:00:00+00:00 | \n",
- " 1001 | \n",
- " 8.704802 | \n",
- " 77 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2020-05-25 00:00:00+00:00 | \n",
- " 1002 | \n",
- " 7.163887 | \n",
- " 31 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2020-05-25 00:00:00+00:00 | \n",
- " 1003 | \n",
- " 9.935976 | \n",
- " 68 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 2020-05-25 00:00:00+00:00 | \n",
- " 1004 | \n",
- " 1.107980 | \n",
- " 78 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 2020-05-25 00:00:00+00:00 | \n",
- " 1005 | \n",
- " 8.307381 | \n",
- " 36 | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1001 | \n",
- " 2.416811 | \n",
- " 14 | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1002 | \n",
- " 4.817735 | \n",
- " 9 | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1003 | \n",
- " 4.409714 | \n",
- " 95 | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1004 | \n",
- " 6.617317 | \n",
- " 6 | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1005 | \n",
- " 1.032525 | \n",
- " 86 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " datetime customer_id daily_transactions \\\n",
- "0 2020-05-25 00:00:00+00:00 1001 8.704802 \n",
- "1 2020-05-25 00:00:00+00:00 1002 7.163887 \n",
- "2 2020-05-25 00:00:00+00:00 1003 9.935976 \n",
- "3 2020-05-25 00:00:00+00:00 1004 1.107980 \n",
- "4 2020-05-25 00:00:00+00:00 1005 8.307381 \n",
- "5 2020-05-26 00:00:00+00:00 1001 2.416811 \n",
- "6 2020-05-26 00:00:00+00:00 1002 4.817735 \n",
- "7 2020-05-26 00:00:00+00:00 1003 4.409714 \n",
- "8 2020-05-26 00:00:00+00:00 1004 6.617317 \n",
- "9 2020-05-26 00:00:00+00:00 1005 1.032525 \n",
- "\n",
- " total_transactions \n",
- "0 77 \n",
- "1 31 \n",
- "2 68 \n",
- "3 78 \n",
- "4 36 \n",
- "5 14 \n",
- "6 9 \n",
- "7 95 \n",
- "8 6 \n",
- "9 86 "
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " datetime customer_id daily_transactions \\\n",
+ "0 2020-06-09 00:00:00+00:00 1001 0.564751 \n",
+ "1 2020-06-09 00:00:00+00:00 1002 3.945566 \n",
+ "2 2020-06-09 00:00:00+00:00 1003 7.291928 \n",
+ "3 2020-06-09 00:00:00+00:00 1004 6.690477 \n",
+ "4 2020-06-09 00:00:00+00:00 1005 6.415899 \n",
+ "5 2020-06-10 00:00:00+00:00 1001 0.347294 \n",
+ "6 2020-06-10 00:00:00+00:00 1002 5.363853 \n",
+ "7 2020-06-10 00:00:00+00:00 1003 0.538129 \n",
+ "8 2020-06-10 00:00:00+00:00 1004 4.755425 \n",
+ "9 2020-06-10 00:00:00+00:00 1005 2.867527 \n",
+ "10 2020-06-11 00:00:00+00:00 1001 9.493098 \n",
+ "11 2020-06-11 00:00:00+00:00 1002 5.130665 \n",
+ "12 2020-06-11 00:00:00+00:00 1003 1.794191 \n",
+ "13 2020-06-11 00:00:00+00:00 1004 4.698504 \n",
+ "14 2020-06-11 00:00:00+00:00 1005 2.908603 \n",
+ "15 2020-06-12 00:00:00+00:00 1001 9.857894 \n",
+ "16 2020-06-12 00:00:00+00:00 1002 5.416553 \n",
+ "17 2020-06-12 00:00:00+00:00 1003 5.374058 \n",
+ "18 2020-06-12 00:00:00+00:00 1004 9.834441 \n",
+ "19 2020-06-12 00:00:00+00:00 1005 0.480373 \n",
+ "\n",
+ " total_transactions \n",
+ "0 73 \n",
+ "1 75 \n",
+ "2 95 \n",
+ "3 50 \n",
+ "4 65 \n",
+ "5 28 \n",
+ "6 76 \n",
+ "7 42 \n",
+ "8 53 \n",
+ "9 61 \n",
+ "10 86 \n",
+ "11 31 \n",
+ "12 69 \n",
+ "13 9 \n",
+ "14 51 \n",
+ "15 23 \n",
+ "16 1 \n",
+ "17 34 \n",
+ "18 13 \n",
+ "19 50 \n"
+ ]
}
],
"source": [
"customer_features = pd.DataFrame(\n",
" {\n",
- " \"datetime\": [day for day in days for customer in customers],\n",
- " \"customer_id\": [customer for day in days for customer in customers],\n",
- " \"daily_transactions\": [np.random.rand() * 10 for _ in range(len(days) * len(customers))],\n",
- " \"total_transactions\": [np.random.randint(100) for _ in range(len(days) * len(customers))],\n",
+ " \"datetime\": [day for day in days for customer in customers], # Datetime is required\n",
+ " \"customer_id\": [customer for day in days for customer in customers], # Customer is the entity\n",
+ " \"daily_transactions\": [np.random.rand() * 10 for _ in range(len(days) * len(customers))], # Feature 1\n",
+ " \"total_transactions\": [np.random.randint(100) for _ in range(len(days) * len(customers))], # Feature 2\n",
" }\n",
")\n",
"\n",
- "customer_features.head(10)"
+ "print(customer_features.head(20))"
]
},
{
@@ -350,14 +223,13 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"customer_fs = FeatureSet(\n",
" \"customer_transactions\",\n",
- " entities=[Entity(name='customer_id', dtype=ValueType.INT64)],\n",
- " max_age=Duration(seconds=432000) \n",
+ " entities=[Entity(name='customer_id', dtype=ValueType.INT64)]\n",
")"
]
},
@@ -370,7 +242,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -398,7 +270,24 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The apply() method will register the provided feature set with Feast core, allowing users to retrieve features from this feature set"
+ "The apply() method will register the provided feature set with Feast Core (the feature registry)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Feature set updated: \"customer_transactions\"\n"
+ ]
+ }
+ ],
+ "source": [
+ "client.apply(customer_fs)"
]
},
{
@@ -410,7 +299,6 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Feature set created: \"customer_transactions\"\n",
"{\n",
" \"spec\": {\n",
" \"name\": \"customer_transactions\",\n",
@@ -422,15 +310,15 @@
" ],\n",
" \"features\": [\n",
" {\n",
- " \"name\": \"daily_transactions\",\n",
- " \"valueType\": \"DOUBLE\"\n",
- " },\n",
- " {\n",
" \"name\": \"total_transactions\",\n",
" \"valueType\": \"INT64\"\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"daily_transactions\",\n",
+ " \"valueType\": \"DOUBLE\"\n",
" }\n",
" ],\n",
- " \"maxAge\": \"432000s\",\n",
+ " \"maxAge\": \"0s\",\n",
" \"source\": {\n",
" \"type\": \"KAFKA\",\n",
" \"kafkaSourceConfig\": {\n",
@@ -441,15 +329,14 @@
" \"project\": \"default\"\n",
" },\n",
" \"meta\": {\n",
- " \"createdTimestamp\": \"2020-05-27T03:58:07Z\",\n",
- " \"status\": \"STATUS_PENDING\"\n",
+ " \"createdTimestamp\": \"2020-06-18T12:04:08Z\",\n",
+ " \"status\": \"STATUS_READY\"\n",
" }\n",
"}\n"
]
}
],
"source": [
- "client.apply(customer_fs)\n",
"customer_fs = client.get_feature_set(\"customer_transactions\")\n",
"print(customer_fs)"
]
@@ -461,11 +348,26 @@
"### 7. Ingest data into Feast for a feature set"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next we will ingest/load data into Feast. This process populates all registered stores (BigQuery, Redis) with your feature data."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 11,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\r",
+ " 0%| | 0/50 [00:00, ?rows/s]"
+ ]
+ },
{
"name": "stdout",
"output_type": "stream",
@@ -477,7 +379,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|██████████| 15/15 [00:01<00:00, 13.99rows/s]"
+ "100%|██████████| 50/50 [00:01<00:00, 48.83rows/s]"
]
},
{
@@ -487,7 +389,7 @@
"Ingestion complete!\n",
"\n",
"Ingestion statistics:\n",
- "Success: 15/15\n",
+ "Success: 50/50\n",
"Removing temporary file(s)...\n"
]
},
@@ -501,10 +403,10 @@
{
"data": {
"text/plain": [
- "'3b988d56-6885-36c6-804e-73ea76b7eae6'"
+ "'8da648b7-d6ac-3970-90c6-cf789078c869'"
]
},
- "execution_count": 10,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -524,19 +426,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The process of retrieving features from the online API is very similar to that of the batch API. The only major difference is that users do not have to provide timestamps (only the latest features are returned, as long as they are within the maximum age window)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The example below retrieves online features for a single customer: \"1001\". It is possible to retrieve any features from feast, even outside of the current project."
+ "The example below retrieves online features for a single customer: \"1001\". Retrieval of features is not limited to a single feature set. Users can provide any features as long as they are present on the provided entities."
]
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -553,13 +448,13 @@
" fields {\n",
" key: \"daily_transactions\"\n",
" value {\n",
- " double_val: 2.460333315469021\n",
+ " double_val: 5.1228456657485495\n",
" }\n",
" }\n",
" fields {\n",
" key: \"total_transactions\"\n",
" value {\n",
- " int64_val: 11\n",
+ " int64_val: 96\n",
" }\n",
" }\n",
"}\n",
@@ -596,138 +491,58 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### The following section requires Google Cloud Platform (Google Cloud Storage and BigQuery)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 9. Create a batch retrieval query"
+ "### 9. Retrieve training features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "In order to retrieve historical feature data, the user must provide an entity_rows dataframe. This dataframe contains a combination of timestamps and entities. In this case, the user must provide both customer_ids and timestamps. \n",
+ "(Requires Google Cloud Platform)\n",
+ "\n",
+ "In order to retrieve historical feature data, the user must provide an `entity_rows` DataFrame. This DataFrame contains a combination of timestamps and entities (customers, in this case).\n",
+ "\n",
+ "Each timestamp is the event time at which a prediction needs to be made. At each of these points in time you need to know the \"current\" feature values.\n",
"\n",
- "We will randomly generate timestamps over the last 30 days, and assign customer_ids to them. When these entity rows are sent to the Feast Serving API to retrieve feature values, along with a list of feature ids, Feast is then able to attach the correct feature values to each entity row. "
+ "We will randomly generate timestamps over the last 5 days and assign `customer_ids` to them.\n",
+ "\n",
+ "When these entity rows are sent to the Feast Serving API to retrieve feature values, along with a list of feature ids, Feast is then able to attach the correct feature values to each entity row. It will join the correct feature values at each point in time for each entity onto these entity rows."
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " datetime | \n",
- " customer_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 2020-05-25 00:00:00+00:00 | \n",
- " 1001 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2020-05-25 00:00:00+00:00 | \n",
- " 1002 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2020-05-25 00:00:00+00:00 | \n",
- " 1003 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 2020-05-25 00:00:00+00:00 | \n",
- " 1004 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 2020-05-25 00:00:00+00:00 | \n",
- " 1005 | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1001 | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1002 | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1003 | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1004 | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1005 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " datetime customer_id\n",
- "0 2020-05-25 00:00:00+00:00 1001\n",
- "1 2020-05-25 00:00:00+00:00 1002\n",
- "2 2020-05-25 00:00:00+00:00 1003\n",
- "3 2020-05-25 00:00:00+00:00 1004\n",
- "4 2020-05-25 00:00:00+00:00 1005\n",
- "5 2020-05-26 00:00:00+00:00 1001\n",
- "6 2020-05-26 00:00:00+00:00 1002\n",
- "7 2020-05-26 00:00:00+00:00 1003\n",
- "8 2020-05-26 00:00:00+00:00 1004\n",
- "9 2020-05-26 00:00:00+00:00 1005"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " datetime customer_id\n",
+ "0 2020-06-15 10:35:10.918716+00:00 1001\n",
+ "1 2020-06-15 14:00:10.918758+00:00 1002\n",
+ "2 2020-06-17 08:59:10.918767+00:00 1003\n",
+ "3 2020-06-13 16:51:10.918774+00:00 1004\n",
+ "4 2020-06-17 06:14:10.918780+00:00 1005\n",
+ "5 2020-06-17 14:33:10.918786+00:00 1001\n",
+ "6 2020-06-14 23:15:10.918792+00:00 1002\n",
+ "7 2020-06-15 11:25:10.918798+00:00 1003\n",
+ "8 2020-06-18 09:04:10.918804+00:00 1004\n",
+ "9 2020-06-16 10:27:10.918810+00:00 1005\n"
+ ]
}
],
"source": [
+ "event_timestamps = [datetime.utcnow().replace(tzinfo=utc) - timedelta(days=randrange(5), hours=randrange(24), minutes=randrange(60)) for day in range(30)]\n",
+ "\n",
"entity_rows = pd.DataFrame(\n",
" {\n",
- " \"datetime\": [day for day in days for customer in customers],\n",
- " \"customer_id\": [customer for day in days for customer in customers],\n",
+ " \"datetime\": event_timestamps,\n",
+ " \"customer_id\": [customers[idx % len(customers)] for idx in range(len(event_timestamps))],\n",
" }\n",
")\n",
"\n",
- "entity_rows.head(10)"
+ "print(entity_rows.head(10))"
]
},
{
@@ -741,12 +556,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Next we will create a new client object, but this time we will configure it to connect to the Batch Serving Service. This service will allow us to retrieve historical feature data."
+ "Next we will create a new client object, but this time we will configure it to connect to the Feast Batch Serving service. This service will allow us to retrieve historical feature data."
]
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -757,12 +572,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "By calling the `get_batch_features` method we are able to retrieve a `job` object for the exporting of feature data. For every entity and timestamp combination in `entity_rows` we will be receiving a row with feature values joined to it."
+ "By calling the `get_batch_features` method we are able to retrieve a `job` object. This object can be used to retrieve the resulting training dataset that is exported by Feast. \n",
+ "\n",
+ "The dataset that is returned will contain feature values for each entity and timestamp combination in `entity_rows`."
]
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 16,
"metadata": {
"scrolled": true
},
@@ -786,108 +603,114 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = job.to_dataframe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " event_timestamp | \n",
- " customer_id | \n",
- " daily_transactions | \n",
- " total_transactions | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1001 | \n",
- " 2.416811 | \n",
- " 14 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1004 | \n",
- " 6.617317 | \n",
- " 6 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1003 | \n",
- " 4.409714 | \n",
- " 95 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1005 | \n",
- " 1.032525 | \n",
- " 86 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 2020-05-26 00:00:00+00:00 | \n",
- " 1002 | \n",
- " 4.817735 | \n",
- " 9 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " event_timestamp customer_id daily_transactions \\\n",
- "0 2020-05-26 00:00:00+00:00 1001 2.416811 \n",
- "1 2020-05-26 00:00:00+00:00 1004 6.617317 \n",
- "2 2020-05-26 00:00:00+00:00 1003 4.409714 \n",
- "3 2020-05-26 00:00:00+00:00 1005 1.032525 \n",
- "4 2020-05-26 00:00:00+00:00 1002 4.817735 \n",
- "\n",
- " total_transactions \n",
- "0 14 \n",
- "1 6 \n",
- "2 95 \n",
- "3 86 \n",
- "4 9 "
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " event_timestamp customer_id daily_transactions \\\n",
+ "0 2020-06-13 23:45:10.918874+00:00 1001 1.879220 \n",
+ "1 2020-06-18 12:11:10.918845+00:00 1001 5.122846 \n",
+ "2 2020-06-17 20:46:10.918903+00:00 1001 2.145294 \n",
+ "3 2020-06-18 02:50:10.918816+00:00 1001 5.122846 \n",
+ "4 2020-06-15 10:35:10.918716+00:00 1001 5.758472 \n",
+ "5 2020-06-17 14:33:10.918786+00:00 1001 2.145294 \n",
+ "6 2020-06-14 23:15:10.918792+00:00 1002 5.467141 \n",
+ "7 2020-06-14 07:22:10.918851+00:00 1002 5.467141 \n",
+ "8 2020-06-17 23:40:10.918880+00:00 1002 3.338614 \n",
+ "9 2020-06-15 14:00:10.918758+00:00 1002 4.921264 \n",
+ "10 2020-06-15 18:22:10.918909+00:00 1002 4.921264 \n",
+ "11 2020-06-16 21:10:10.918822+00:00 1002 1.838186 \n",
+ "12 2020-06-18 05:47:10.918886+00:00 1003 2.702916 \n",
+ "13 2020-06-17 08:59:10.918767+00:00 1003 0.211125 \n",
+ "14 2020-06-15 11:25:10.918798+00:00 1003 4.476252 \n",
+ "15 2020-06-16 09:56:10.918857+00:00 1003 9.123597 \n",
+ "16 2020-06-14 11:39:10.918915+00:00 1003 6.353373 \n",
+ "17 2020-06-15 03:21:10.918828+00:00 1003 4.476252 \n",
+ "18 2020-06-18 09:04:10.918804+00:00 1004 8.756623 \n",
+ "19 2020-06-14 14:18:10.918834+00:00 1004 8.647374 \n",
+ "20 2020-06-17 03:10:10.918863+00:00 1004 2.377199 \n",
+ "21 2020-06-13 16:51:10.918774+00:00 1004 6.362085 \n",
+ "22 2020-06-15 03:54:10.918892+00:00 1004 8.235070 \n",
+ "23 2020-06-17 19:01:10.918921+00:00 1004 2.377199 \n",
+ "24 2020-06-17 06:14:10.918780+00:00 1005 9.958688 \n",
+ "25 2020-06-16 08:23:10.918839+00:00 1005 0.006388 \n",
+ "26 2020-06-16 00:30:10.918927+00:00 1005 0.006388 \n",
+ "27 2020-06-16 10:27:10.918810+00:00 1005 0.006388 \n",
+ "28 2020-06-17 01:50:10.918869+00:00 1005 9.958688 \n",
+ "29 2020-06-17 08:42:10.918897+00:00 1005 9.958688 \n",
+ "\n",
+ " total_transactions \n",
+ "0 7 \n",
+ "1 96 \n",
+ "2 63 \n",
+ "3 96 \n",
+ "4 85 \n",
+ "5 63 \n",
+ "6 10 \n",
+ "7 10 \n",
+ "8 50 \n",
+ "9 55 \n",
+ "10 55 \n",
+ "11 83 \n",
+ "12 50 \n",
+ "13 96 \n",
+ "14 61 \n",
+ "15 85 \n",
+ "16 69 \n",
+ "17 61 \n",
+ "18 84 \n",
+ "19 95 \n",
+ "20 25 \n",
+ "21 2 \n",
+ "22 58 \n",
+ "23 25 \n",
+ "24 6 \n",
+ "25 36 \n",
+ "26 36 \n",
+ "27 36 \n",
+ "28 6 \n",
+ "29 6 \n"
+ ]
}
],
"source": [
- "df = job.to_dataframe()\n",
- "df.head()"
+ "print(df.head(50))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The training dataset is staged on Google Cloud Storage and can be accessed directly if it is too large to load into memory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "job.get_avro_files()"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "feast-ml-py374",
+ "display_name": "Python 3",
"language": "python",
- "name": "feast-ml-py374"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -899,9 +722,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.4"
+ "version": "3.7.6"
}
},
"nbformat": 4,
- "nbformat_minor": 4
+ "nbformat_minor": 2
}
diff --git a/examples/feast-xgboost-churn-prediction-tutorial/Telecom Customer Churn Prediction (with Feast and XGBoost).ipynb b/examples/feast-xgboost-churn-prediction-tutorial/Telecom Customer Churn Prediction (with Feast and XGBoost).ipynb
index c29c01efff..8bc1f1edc1 100644
--- a/examples/feast-xgboost-churn-prediction-tutorial/Telecom Customer Churn Prediction (with Feast and XGBoost).ipynb
+++ b/examples/feast-xgboost-churn-prediction-tutorial/Telecom Customer Churn Prediction (with Feast and XGBoost).ipynb
@@ -11,12 +11,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "This tutorial will demonstrate the use of Feast in productionising a churn model. The tutorial is broken down into two sections\n",
+ "This tutorial will demonstrate the use of Feast in productionizing a churn model. The tutorial will walk through:\n",
"\n",
"1. Churn Modelling (without Feast): In this section we explore the data, refine it, train a model, and evaluate its performance.\n",
"2. Churn Modelling (with Feast): In this section we introduce Feast for feature storage, management, as well as serving.\n",
"\n",
- "\n",
"This tutorial is an extension of [this](https://www.kaggle.com/pavanraj159/telecom-customer-churn-prediction/comments#6.-Model-Performances) Kaggle notebook"
]
},
@@ -29,9 +28,36 @@
"## 1. Churn Modelling (without Feast)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install dependencies for tutorial"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!pip install sklearn python-slugify plotly statsmodels yellowbrick xgboost"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Import dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
"metadata": {
"_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
"_kg_hide-input": false,
@@ -39,27 +65,20 @@
},
"outputs": [],
"source": [
- "import numpy as np\n",
- "import pandas as pd\n",
"import os\n",
- "import matplotlib.pyplot as plt\n",
- "from PIL import Image\n",
- "%matplotlib inline\n",
+ "import numpy as np\n",
"import pandas as pd\n",
- "import seaborn as sns\n",
- "import itertools\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
- "import io\n",
"import plotly.offline as py\n",
- "py.init_notebook_mode(connected=True)\n",
"import plotly.graph_objs as go\n",
"import plotly.tools as tls\n",
- "import plotly.figure_factory as ff\n",
- "import statsmodels, yellowbrick\n",
- "import sklearn # Tested with 0.22.1\n",
- "import imblearn\n",
- "from slugify import slugify"
+ "from slugify import slugify\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from xgboost import XGBClassifier\n",
+ "from feast import Client, FeatureSet, Entity, ValueType\n",
+ "import datetime as dt"
]
},
{
@@ -69,9 +88,16 @@
"### 1.1 Data"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Retrieve the raw telco dataset"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {
"_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
"_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
@@ -279,7 +305,7 @@
"[5 rows x 21 columns]"
]
},
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -300,7 +326,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {
"_uuid": "8b10c13086dff7182e399b849e31bc03df54a14e"
},
@@ -350,7 +376,7 @@
"target_col = [\"Churn\"]\n",
"cat_cols = telcom.nunique()[telcom.nunique() < 6].keys().tolist()\n",
"cat_cols = [x for x in cat_cols if x not in target_col]\n",
- "num_cols = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col]\n"
+ "num_cols = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col]"
]
},
{
@@ -364,7 +390,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {
"_uuid": "8921591320c5e336ec5a2e1efc5ed3cb0f9ec1b2"
},
@@ -585,15 +611,12 @@
"[5 rows x 35 columns]"
]
},
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "from sklearn.preprocessing import LabelEncoder\n",
- "from sklearn.preprocessing import StandardScaler\n",
- "\n",
"# Customer id col\n",
"Id_col = ['customer_id']\n",
"\n",
@@ -647,7 +670,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -689,7 +712,7 @@
" customer_id | \n",
" 7032 | \n",
" 7032 | \n",
- " 0835-DUUIQ | \n",
+ " 0256-LTHVJ | \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
@@ -1181,7 +1204,7 @@
],
"text/plain": [
" count unique top freq \\\n",
- "customer_id 7032 7032 0835-DUUIQ 1 \n",
+ "customer_id 7032 7032 0256-LTHVJ 1 \n",
"gender 7032 NaN NaN NaN \n",
"seniorcitizen 7032 NaN NaN NaN \n",
"partner 7032 NaN NaN NaN \n",
@@ -1292,7 +1315,7 @@
"totalcharges -0.830249 -0.390815 0.666827 2.82426 "
]
},
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -1312,11 +1335,93 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {
"_uuid": "b52cf9c7f402ed706e82221e3f8601fdeea9ab27"
},
"outputs": [
+ {
+ "data": {
+ "text/html": [
+ " \n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
{
"data": {
"application/vnd.plotly.v1+json": {
@@ -1457,7 +1562,7 @@
-0.007514979909200033,
-0.01632782307070617,
-0.013092839264555001,
- -0.0008067457759124324,
+ -8.067457759124324E-4,
-0.008507162405232782,
-0.0071243969867245535,
-0.010105418366566195,
@@ -1474,16 +1579,16 @@
-0.003603167413572989,
-0.015973079031173835,
0.001631872518613598,
- 0.0008437084888753327,
+ 8.437084888753327E-4,
0.01319936726545174,
-0.0010503798619876774,
- -0.0006494991207074467,
+ -6.494991207074467E-4,
-0.010516394125351592,
-0.004318975744551275,
0.01627881894278637,
0.005285371870295646,
-0.013779327268354416,
- 4.783950839776602e-05
+ 4.783950839776602E-5
],
[
-0.001819390613419179,
@@ -1540,7 +1645,7 @@
0.1425612874681736,
-0.0010430787434336079,
0.0012346095228208073,
- -0.0002855204740384597,
+ -2.855204740384597E-4,
-0.2802019157901561,
0.08306706395255747,
0.24733370647615796,
@@ -1690,7 +1795,7 @@
0.11139068731904943,
0.0869415675760231,
0.09045518641091457,
- -0.00036426636786500763,
+ -3.6426636786500763E-4,
-0.17407470231312427,
-0.26736609150996915,
-0.08408097138202993,
@@ -1702,7 +1807,7 @@
0.510100290145439
],
[
- -0.0008067457759124324,
+ -8.067457759124324E-4,
0.059513871482029225,
0.1535564364182745,
0.013899668260943368,
@@ -2054,8 +2159,8 @@
0.04275388869901973,
-0.0014704079413737543,
-0.013428683581908657,
- -0.0004678565419111244,
- -0.00079001062411148,
+ -4.678565419111244E-4,
+ -7.9001062411148E-4,
0.014665913237832425,
0.013786269825793156,
-0.16136793538251534,
@@ -2100,7 +2205,7 @@
[
0.004744965758849955,
-0.18251949495535458,
- -0.0002855204740384597,
+ -2.855204740384597E-4,
0.13838288994798562,
0.1718171065699321,
-0.33279949932167546,
@@ -2195,7 +2300,7 @@
0.05762874929825631,
0.06758968145792751,
-0.10954646682258033,
- 0.0001971241890499786,
+ 1.971241890499786E-4,
-0.2512993032332009,
-0.017196458507178478,
0.15389327309228798,
@@ -2314,13 +2419,13 @@
0.1826633671546002
],
[
- 0.0008437084888753327,
+ 8.437084888753327E-4,
0.17132216591713703,
-0.08320661736633733,
-0.1492739811934336,
0.0027471183312986857,
-0.11229466175861408,
- -0.00036426636786500763,
+ -3.6426636786500763E-4,
-0.003308493511411254,
-0.11480726996437085,
0.1447470086556032,
@@ -2370,7 +2475,7 @@
-0.3059839224841771,
0.3196937439459745,
0.006208692442055045,
- 0.0001971241890499786,
+ 1.971241890499786E-4,
-0.007422540118618717,
-0.28809669563791657,
-0.28558254792863036,
@@ -2422,7 +2527,7 @@
-0.5924430690900127
],
[
- -0.0006494991207074467,
+ -6.494991207074467E-4,
0.0018604411671567017,
-0.048481275955609554,
-0.0014594010172321779,
@@ -2474,7 +2579,7 @@
-0.03215707580783382,
0.014777815951004133,
0.023690157563696343,
- -0.0004678565419111244,
+ -4.678565419111244E-4,
0.005613761063888361,
-0.006230482005525746,
-0.05215631834144274,
@@ -2510,7 +2615,7 @@
-0.08063023581160907,
0.009750281447949528,
0.07573985650934924,
- -0.00079001062411148,
+ -7.9001062411148E-4,
0.01746632456448475,
-0.020153136101165186,
-0.19870886170688193,
@@ -2638,7 +2743,7 @@
0.6510648032262032
],
[
- 4.783950839776602e-05,
+ 4.783950839776602E-5,
0.10241060539532633,
0.31907236323857324,
0.0646532494217739,
@@ -3510,20 +3615,20 @@
"\n",
" \n",
" \n",
- "
\n",
+ "
\n",
"