From 7d4f482548fd6a9818d6196d383dd09057f731c7 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 4 May 2022 03:07:09 +0000 Subject: [PATCH 01/25] initial refactor --- ...h_data.ipynb => 01_data_exploration.ipynb} | 36 +- ...s.ipynb => 02_export_feature_groups.ipynb} | 771 +++++++++--- ...y_debugger_explain_monitor_registry.ipynb} | 1056 ++++++++++++----- .../{06_pipeline.ipynb => 03b_pipeline.ipynb} | 69 +- 4 files changed, 1397 insertions(+), 535 deletions(-) rename end_to_end/music_recommendation/{00_overview_arch_data.ipynb => 01_data_exploration.ipynb} (92%) rename end_to_end/music_recommendation/{02a_export_fg_tracks.ipynb => 02_export_feature_groups.ipynb} (54%) rename end_to_end/music_recommendation/{03_train_model_lineage_registry_debugger.ipynb => 03a_train_deploy_debugger_explain_monitor_registry.ipynb} (97%) rename end_to_end/music_recommendation/{06_pipeline.ipynb => 03b_pipeline.ipynb} (95%) diff --git a/end_to_end/music_recommendation/00_overview_arch_data.ipynb b/end_to_end/music_recommendation/01_data_exploration.ipynb similarity index 92% rename from end_to_end/music_recommendation/00_overview_arch_data.ipynb rename to end_to_end/music_recommendation/01_data_exploration.ipynb index 89e1474f31..d2ba68193d 100644 --- a/end_to_end/music_recommendation/00_overview_arch_data.ipynb +++ b/end_to_end/music_recommendation/01_data_exploration.ipynb @@ -60,11 +60,7 @@ "source": [ "import sys\n", "import pprint\n", - "sys.path.insert(1, './code')\n", - "from parameter_store import ParameterStore\n", - "\n", - "ps = ParameterStore()\n", - "ps.create(namespace='music-rec')" + "sys.path.insert(1, './code')" ] }, { @@ -114,9 +110,7 @@ "# s3 client\n", "s3_client = boto3.client(\"s3\")\n", "\n", - "print(f\"this is your default SageMaker Studio bucket name: {bucket}\") \n", - "\n", - "ps.add({'bucket': bucket, 'prefix': prefix}, namespace='music-rec')" + "print(f\"this is your default SageMaker Studio bucket name: {bucket}\") \n" ] }, { @@ -228,9 +222,7 @@ "source": [ "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n", "tracks_data_source = f's3://{bucket}/{prefix}/tracks.csv'\n", - "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'\n", - "\n", - "ps.add({'tracks_data_source': tracks_data_source, 'ratings_data_source': ratings_data_source}, namespace='music-rec')" + "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'" ] }, { @@ -246,10 +238,7 @@ "metadata": {}, "outputs": [], "source": [ - "pretrained_model_path = get_model('./model/model.tar.gz', bucket)\n", - "\n", - "ps.add({'pretrained_model_path': pretrained_model_path}, namespace='music-rec')\n", - "ps.store()" + "pretrained_model_path = get_model('./model/model.tar.gz', bucket)" ] }, { @@ -381,25 +370,14 @@ "s3_client.upload_file(Filename=\"./data/tracks_new.csv\", Bucket=bucket, Key=f'{prefix}/data/tracks_new.csv')\n", "s3_client.upload_file(Filename=\"./data/ratings_new.csv\", Bucket=bucket, Key=f'{prefix}/data/ratings_new.csv')" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "----\n", - "\n", - "# Music Recommender Part 1: Data Prep using Data Wrangler\n", - "\n", - "After you completed running this notebook, you can open the Data Wrangler file `01_music_dataprep.flow`." 
- ] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3 (Data Science)", + "display_name": "conda_python3", "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -411,7 +389,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.10" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/end_to_end/music_recommendation/02a_export_fg_tracks.ipynb b/end_to_end/music_recommendation/02_export_feature_groups.ipynb similarity index 54% rename from end_to_end/music_recommendation/02a_export_fg_tracks.ipynb rename to end_to_end/music_recommendation/02_export_feature_groups.ipynb index edd8ec7a87..99c75046d3 100644 --- a/end_to_end/music_recommendation/02a_export_fg_tracks.ipynb +++ b/end_to_end/music_recommendation/02_export_feature_groups.ipynb @@ -48,43 +48,85 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "## Create Feature Group\n", + "import sys\n", + "import pprint\n", + "sys.path.insert(1, './code')\n", + "# from parameter_store import ParameterStore\n", "\n", - "_What is a feature group_\n", + "# ps = ParameterStore()\n", + "# ps.create(namespace='music-rec')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# update pandas to avoid data type issues in older 1.0 version\n", + "!pip install pandas --upgrade --quiet\n", + "import pandas as pd\n", + "print(pd.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create data folder\n", + "!mkdir data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", "\n", - "A single feature corresponds to a column in your dataset. A feature group is a predefined schema for a \n", - "collection of features - each feature in the feature group has a specified data type and name. \n", - "A single record in a feature group corresponds to a row in your dataframe. A feature store is a \n", - "collection of feature groups. To learn more about SageMaker Feature Store, see \n", - "[Amazon Feature Store Documentation](http://docs.aws.amazon.com/sagemaker/latest/dg/feature-store.html)." + "import json\n", + "import sagemaker \n", + "import boto3\n", + "import os\n", + "from awscli.customizations.s3.utils import split_s3_bucket_key\n", + "\n", + "# Sagemaker session\n", + "sess = sagemaker.Session()\n", + "# get session bucket name\n", + "bucket = sess.default_bucket()\n", + "# bucket prefix or the subfolder for everything we produce\n", + "prefix='music-recommendation'\n", + "# s3 client\n", + "s3_client = boto3.client(\"s3\")\n", + "\n", + "print(f\"this is your default SageMaker Studio bucket name: {bucket}\") \n", + "\n", + "# ps.add({'bucket': bucket, 'prefix': prefix}, namespace='music-rec')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", - "### Define Feature Group \n", + "## Prereqs: Get Data \n", "\n", - "##### [back to top](#02a-nb)\n", + "##### [back to top](#00-nb)\n", "\n", "----\n", - "Select Record identifier and Event time feature name. 
These are required parameters for feature group\n", - "creation.\n", - "* **Record identifier name** is the name of the feature defined in the feature group's feature definitions \n", - "whose value uniquely identifies a Record defined in the feature group's feature definitions.\n", - "* **Event time feature name** is the name of the EventTime feature of a Record in FeatureGroup. An EventTime \n", - "is a timestamp that represents the point in time when a new event occurs that corresponds to the creation or \n", - "update of a Record in the FeatureGroup. All Records in the FeatureGroup must have a corresponding EventTime.\n", "\n", - "
💡Record identifier and Event time feature name are required \n", - "for feature group. After filling in the values, you can choose Run Selected Cell and All Below \n", - "from the Run Menu from the menu bar. \n", - "
" + "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. " ] }, { @@ -93,7 +135,64 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install sagemaker boto3 --upgrade --quiet" + "def get_data(public_s3_data, to_bucket, sample_data=1):\n", + " new_paths = []\n", + " for f in public_s3_data:\n", + " bucket_name, key_name = split_s3_bucket_key(f)\n", + " filename = f.split('/')[-1]\n", + " new_path = \"s3://{}/{}/{}\".format(to_bucket, prefix, filename)\n", + " new_paths.append(new_path)\n", + " \n", + " # only download if not already downloaded\n", + " if not os.path.exists('./data/{}'.format(filename)):\n", + " # download s3 data\n", + " print(\"Downloading file from {}\".format(f))\n", + " s3_client.download_file(bucket_name, key_name, './data/{}'.format(filename))\n", + " \n", + " # subsample the data to create a smaller datatset for this demo\n", + " new_df = pd.read_csv('./data/{}'.format(filename))\n", + " new_df = new_df.sample(frac=sample_data)\n", + " new_df.to_csv('./data/{}'.format(filename), index=False)\n", + " \n", + " # upload s3 data to our default s3 bucket for SageMaker Studio\n", + " print(\"Uploading {} to {}\\n\".format(filename, new_path))\n", + " s3_client.upload_file('./data/{}'.format(filename), to_bucket, os.path.join(prefix,filename))\n", + " \n", + " return new_paths\n", + "\n", + "\n", + "def get_model(model_path, to_bucket):\n", + " # upload model to our default s3 bucket for SageMaker Studio\n", + " filename = model_path.split('/')[-1]\n", + " print(\"Uploading {} to {}\\n\".format(model_path, os.path.join(to_bucket,prefix,filename)))\n", + " s3_client.upload_file(model_path, to_bucket, os.path.join(prefix,filename))\n", + " return \"s://{}\".format(os.path.join(to_bucket,prefix,filename))\n", + " \n", + "\n", + "def update_data_sources(flow_path, tracks_data_source, ratings_data_source):\n", + " with open(flow_path) as flowf:\n", + " flow = json.load(flowf)\n", + " \n", + " for node in flow['nodes']:\n", + " # if the key exists for our s3 endpoint\n", + " try:\n", + " if node['parameters']['dataset_definition']['name'] == 'tracks.csv':\n", + " # reset the s3 data source for tracks data\n", + " old_source = node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri']\n", + " print(\"Changed {} to {}\".format(old_source, tracks_data_source))\n", + " node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] = tracks_data_source\n", + " elif node['parameters']['dataset_definition']['name'] == 'ratings.csv':\n", + " # reset the s3 data source for ratings data\n", + " old_source = node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri']\n", + " print(\"Changed {} to {}\".format(old_source, ratings_data_source))\n", + " node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] = ratings_data_source\n", + " except:\n", + " continue\n", + " # write out the updated json flow file\n", + " with open(flow_path, 'w') as outfile:\n", + " json.dump(flow, outfile)\n", + " \n", + " return flow" ] }, { @@ -102,12 +201,18 @@ "metadata": {}, "outputs": [], "source": [ - "import sys\n", - "import pprint\n", - "sys.path.insert(1, './code')\n", - "from parameter_store import ParameterStore\n", - "ps = ParameterStore(verbose=False)\n", - "parameters = ps.read('music-rec')" + "# public S3 bucket that contains our music data\n", + "s3_bucket_music_data = 
\"s3://sagemaker-sample-files/datasets/tabular/synthetic-music\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_data_paths = get_data([f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, sample_data=0.70)\n", + "print(new_data_paths)" ] }, { @@ -116,11 +221,70 @@ "metadata": {}, "outputs": [], "source": [ - "bucket = parameters['bucket']\n", - "prefix = parameters['prefix']\n", - "pretrained_model_path = parameters['pretrained_model_path']\n", - "ratings_data_source = parameters['ratings_data_source']\n", - "tracks_data_source = parameters['tracks_data_source']" + "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n", + "tracks_data_source = f's3://{bucket}/{prefix}/tracks.csv'\n", + "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'\n", + "\n", + "# ps.add({'tracks_data_source': tracks_data_source, 'ratings_data_source': ratings_data_source}, namespace='music-rec')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upload pretrained model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pretrained_model_path = get_model('./model/model.tar.gz', bucket)\n", + "\n", + "# ps.add({'pretrained_model_path': pretrained_model_path}, namespace='music-rec')\n", + "# ps.store()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Feature Group\n", + "\n", + "_What is a feature group_\n", + "\n", + "A single feature corresponds to a column in your dataset. A feature group is a predefined schema for a \n", + "collection of features - each feature in the feature group has a specified data type and name. \n", + "A single record in a feature group corresponds to a row in your dataframe. A feature store is a \n", + "collection of feature groups. To learn more about SageMaker Feature Store, see \n", + "[Amazon Feature Store Documentation](http://docs.aws.amazon.com/sagemaker/latest/dg/feature-store.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Define Feature Group \n", + "\n", + "##### [back to top](#02a-nb)\n", + "\n", + "----\n", + "Select Record identifier and Event time feature name. These are required parameters for feature group\n", + "creation.\n", + "* **Record identifier name** is the name of the feature defined in the feature group's feature definitions \n", + "whose value uniquely identifies a Record defined in the feature group's feature definitions.\n", + "* **Event time feature name** is the name of the EventTime feature of a Record in FeatureGroup. An EventTime \n", + "is a timestamp that represents the point in time when a new event occurs that corresponds to the creation or \n", + "update of a Record in the FeatureGroup. All Records in the FeatureGroup must have a corresponding EventTime.\n", + "\n", + "
💡Record identifier and Event time feature name are required \n", + "for feature group. After filling in the values, you can choose Run Selected Cell and All Below \n", + "from the Run Menu from the menu bar. \n", + "
" ] }, { @@ -129,13 +293,14 @@ "metadata": {}, "outputs": [], "source": [ - "record_identifier_feature_name = 'trackId'\n", - "if record_identifier_feature_name is None:\n", - " raise SystemExit(\"Select a column name as the feature group record identifier.\")\n", + "# feature group name, with flow_name and an unique id. You can give it a customized name\n", + "feature_group_names = ['track-features-music-rec', 'user-5star-track-features-music-rec', 'ratings-features-music-rec']\n", + "print(f\"Feature Group Name: {feature_group_names}\")\n", "\n", - "event_time_feature_name = 'EventTime'\n", - "if event_time_feature_name is None:\n", - " raise SystemExit(\"Select a column name as the event time feature name.\")" + "record_identifier_feature_names = {'track-features-music-rec': 'trackId', \n", + " 'user-5star-track-features-music-rec': 'userId', \n", + " 'ratings-features-music-rec': \"ratingEventId\"}\n", + "event_time_feature_name = 'EventTime'" ] }, { @@ -179,7 +344,7 @@ "metadata": {}, "outputs": [], "source": [ - "column_schemas = [\n", + "track_column_schemas = [\n", " {\n", " \"name\": \"trackId\",\n", " \"type\": \"string\"\n", @@ -264,7 +429,131 @@ " \"name\": \"EventTime\",\n", " \"type\": \"float\"\n", " }\n", - "]" + "]\n", + "\n", + "user_column_schemas = [\n", + " {\n", + " \"name\": \"userId\",\n", + " \"type\": \"long\"\n", + " },\n", + " {\n", + " \"name\": \"energy_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"acousticness_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"valence_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"speechiness_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"instrumentalness_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"liveness_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"tempo_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"danceability_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"genre_Latin_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"genre_Folk_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"genre_Blues_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"genre_Rap_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"genre_Reggae_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"genre_Jazz_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"genre_RnB_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"genre_Country_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"genre_Electronic_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"genre_Pop_Rock_5star\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"EventTime\",\n", + " \"type\": \"float\"\n", + " }\n", + "]\n", + "\n", + "rating_column_schemas = [\n", + " {\n", + " \"name\": \"ratingEventId\",\n", + " \"type\": \"string\"\n", + " },\n", + " {\n", + " \"name\": \"ts\",\n", + " \"type\": \"long\"\n", + " },\n", + " {\n", + " \"name\": \"userId\",\n", + " \"type\": \"long\"\n", + " },\n", + " {\n", + " \"name\": \"trackId\",\n", + " \"type\": \"string\"\n", + " },\n", + " {\n", + " \"name\": \"sessionId\",\n", + " \"type\": \"long\"\n", + " },\n", + " {\n", + " \"name\": 
\"itemInSession\",\n", + " \"type\": \"long\"\n", + " },\n", + " {\n", + " \"name\": \"Rating\",\n", + " \"type\": \"float\"\n", + " },\n", + " {\n", + " \"name\": \"EventTime\",\n", + " \"type\": \"float\"\n", + " }\n", + "]\n", + "\n", + "column_schemas = {\n", + " 'track-features-music-rec': track_column_schemas, \n", + " 'user-5star-track-features-music-rec': user_column_schemas, \n", + " 'ratings-features-music-rec': rating_column_schemas,\n", + "}" ] }, { @@ -290,12 +579,15 @@ " \"long\": FeatureTypeEnum.INTEGRAL\n", "}\n", "\n", - "feature_definitions = [\n", - " FeatureDefinition(\n", - " feature_name=column_schema['name'], \n", - " feature_type=column_to_feature_type_mapping.get(column_schema['type'], default_feature_type)\n", - " ) for column_schema in column_schemas\n", - "]" + "feature_definitions = {}\n", + "for feature_group_name in feature_group_names:\n", + " feature_definition = [\n", + " FeatureDefinition(\n", + " feature_name=column_schema['name'], \n", + " feature_type=column_to_feature_type_mapping.get(column_schema['type'], default_feature_type)\n", + " ) for column_schema in column_schemas[feature_group_name]\n", + " ]\n", + " feature_definitions[feature_group_name] = feature_definition" ] }, { @@ -325,8 +617,7 @@ "outputs": [], "source": [ "from time import gmtime, strftime\n", - "import uuid\n", - "import sagemaker \n" + "import uuid" ] }, { @@ -335,9 +626,6 @@ "metadata": {}, "outputs": [], "source": [ - "# Sagemaker session\n", - "sess = sagemaker.Session()\n", - "\n", "# IAM role for executing the processing job.\n", "iam_role = sagemaker.get_execution_role()\n", "\n", @@ -346,40 +634,21 @@ "flow_export_id = f\"{strftime('%d-%H-%M-%S', gmtime())}-{str(uuid.uuid4())[:8]}\"\n", "flow_export_name = f\"flow-{flow_export_id}\"\n", "\n", - "# feature group name, with flow_name and an unique id. You can give it a customized name\n", - "feature_group_name = 'track-features-music-rec'\n", - "print(f\"Feature Group Name: {feature_group_name}\")\n", - "\n", "# SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a \n", "# S3 location owned by you.\n", "feature_store_offline_s3_uri = 's3://' + bucket\n", "\n", "# controls if online store is enabled. 
Enabling the online store allows quick access to \n", "# the latest value for a Record via the GetRecord API.\n", - "enable_online_store = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "enable_online_store = True\n", "fg_name_tracks = feature_group_name\n", - "\n", - "ps.add({'fg_name_tracks': fg_name_tracks}, namespace='music-rec')\n", - "ps.add({'flow_export_id': flow_export_id}, namespace='music-rec')\n", - "\n", "dw_ecrlist = {\n", " 'region':{'us-west-2':'174368400705',\n", " 'us-east-2':'415577184552',\n", " 'us-west-1':'926135532090',\n", " 'us-east-1':'663277389841'\n", " }\n", - "}\n", - "\n", - "ps.add({'dw_ecrlist': dw_ecrlist}, namespace='music-rec')\n", - "ps.store()" + "}" ] }, { @@ -418,13 +687,6 @@ ")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Feature group is initialized and created below" - ] - }, { "cell_type": "code", "execution_count": null, @@ -432,37 +694,56 @@ "outputs": [], "source": [ "from sagemaker.feature_store.feature_group import FeatureGroup\n", + "import time\n", + "\n", + "def wait_for_feature_group_creation_complete(feature_group):\n", + " \"\"\"Helper function to wait for the completions of creating a feature group\"\"\"\n", + " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", + " while status == \"Creating\":\n", + " print(\"Waiting for Feature Group Creation\")\n", + " time.sleep(5)\n", + " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", + " if status != \"Created\":\n", + " raise SystemExit(f\"Failed to create feature group {feature_group.name}: {status}\")\n", + " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n", + "\n", + "\n", + "def create_feature_group(feature_group_name, feature_store_session, feature_definitions):\n", + " feature_group = FeatureGroup(\n", + " name=feature_group_name, sagemaker_session=feature_store_session, feature_definitions=feature_definitions[feature_group_name])\n", "\n", - "feature_group = FeatureGroup(\n", - " name=feature_group_name, sagemaker_session=feature_store_session, feature_definitions=feature_definitions)\n", - "\n", - "# only create feature group if it doesn't already exist\n", - "try:\n", - " sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name, NextToken='string')\n", - " feature_group_exists=True\n", - " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))\n", - "except Exception as e:\n", - " error = e.response.get('Error').get('Code')\n", - " if error == \"ResourceNotFound\":\n", - " feature_group_exists=False\n", - " print(\"Creating Feature Group {}\".format(feature_group_name))\n", - " feature_group.create(\n", - " s3_uri=feature_store_offline_s3_uri,\n", - " record_identifier_name=record_identifier_feature_name,\n", - " event_time_feature_name=event_time_feature_name,\n", - " role_arn=iam_role,\n", - " enable_online_store=enable_online_store\n", - " )\n", - " if error == 'ResourceInUse':\n", + " # only create feature group if it doesn't already exist\n", + " try:\n", + " sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name, NextToken='string')\n", " feature_group_exists=True\n", - " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))" + " print(\"Feature Group {0} already exists. 
Using {0}\".format(feature_group_name))\n", + " except Exception as e:\n", + " error = e.response.get('Error').get('Code')\n", + " if error == \"ResourceNotFound\":\n", + " feature_group_exists=False\n", + " print(\"Creating Feature Group {}\".format(feature_group_name))\n", + " feature_group.create(\n", + " s3_uri=feature_store_offline_s3_uri,\n", + " record_identifier_name=record_identifier_feature_names[feature_group_name],\n", + " event_time_feature_name=event_time_feature_name,\n", + " role_arn=iam_role,\n", + " enable_online_store=enable_online_store\n", + " )\n", + " # Invoke the Feature Store API to create the feature group and wait until it is ready\n", + " wait_for_feature_group_creation_complete(feature_group=feature_group)\n", + " if error == 'ResourceInUse':\n", + " feature_group_exists=True\n", + " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))\n", + " \n", + " return feature_group_exists\n", + " \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Invoke the Feature Store API to create the feature group and wait until it is ready" + "Feature group is initialized and created below" ] }, { @@ -471,20 +752,10 @@ "metadata": {}, "outputs": [], "source": [ - "import time\n", - "\n", - "def wait_for_feature_group_creation_complete(feature_group):\n", - " \"\"\"Helper function to wait for the completions of creating a feature group\"\"\"\n", - " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", - " while status == \"Creating\":\n", - " print(\"Waiting for Feature Group Creation\")\n", - " time.sleep(5)\n", - " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", - " if status != \"Created\":\n", - " raise SystemExit(f\"Failed to create feature group {feature_group.name}: {status}\")\n", - " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n", - "\n", - "wait_for_feature_group_creation_complete(feature_group=feature_group)" + "feature_group_existence = {}\n", + "for feature_group_name in feature_group_names:\n", + " feature_group_exists = create_feature_group(feature_group_name, feature_store_session, feature_definitions)\n", + " feature_group_existence[feature_group_name] = feature_group_exists" ] }, { @@ -598,13 +869,21 @@ "from sagemaker.processing import FeatureStoreOutput\n", "\n", "# Output name is auto-generated from the select node's ID + output name from the .flow file\n", - "output_name = \"19ad8e80-2002-4ee9-9753-fe9a384b1166.default\" # tracks node\n", + "output_names = {\n", + " \"track-features-music-rec\": \"19ad8e80-2002-4ee9-9753-fe9a384b1166.default\", \n", + " \"user-5star-track-features-music-rec\": \"7a6dad19-2c80-43e3-b03d-ec23c3842ae9.default\", \n", + " \"ratings-features-music-rec\": \"9a283380-91ca-478e-be99-6ba3bf57c680.default\",\n", + "}\n", "\n", - "processing_job_output = ProcessingOutput(\n", - " output_name=output_name,\n", - " app_managed=True,\n", - " feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),\n", - ")" + "processing_job_outputs = {}\n", + "\n", + "for feature_group_name in feature_group_names:\n", + " processing_job_output = ProcessingOutput(\n", + " output_name=output_names[feature_group_name],\n", + " app_managed=True,\n", + " feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),\n", + " )\n", + " processing_job_outputs[feature_group_name] = processing_job_output" ] }, { @@ -646,10 +925,6 @@ "\n", "flow_s3_uri = f\"s3://{bucket}/{prefix}/data_wrangler_flows/{flow_export_name}.flow\"\n", 
"\n", - "\n", - "ps.add({'flow_s3_uri': flow_s3_uri}, namespace='music-rec')\n", - "ps.store()\n", - "\n", "print(f\"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}\")" ] }, @@ -711,10 +986,6 @@ "metadata": {}, "outputs": [], "source": [ - "# Unique processing job name. Give a unique name every time you re-execute processing jobs\n", - "processing_job_name = \"dw-flow-proc-music-rec-tracks-{}-{}\".format(flow_export_id, str(uuid.uuid4())[:8])\n", - "print (f\"{processing_job_name}\")\n", - "\n", "# Data Wrangler Container URL.\n", "container_uri = f\"{dw_ecrlist['region'][region]}.dkr.ecr.{region}.amazonaws.com/sagemaker-data-wrangler-container:1.x\"\n", "\n", @@ -729,14 +1000,7 @@ "output_content_type = \"CSV\"\n", "\n", "# Network Isolation mode; default is off\n", - "enable_network_isolation = False\n", - "\n", - "# Output configuration used as processing job container arguments \n", - "output_config = {\n", - " output_name: {\n", - " \"content_type\": output_content_type\n", - " }\n", - "}" + "enable_network_isolation = False\n" ] }, { @@ -786,22 +1050,36 @@ "source": [ "%%time\n", "\n", - "# Run Processing Job if job not already previously ran\n", - "if feature_group_exists:\n", - " print(\"Feature Group {0} already exists therefore we will not run a processing job to create it again\".format(feature_group_name))\n", - "else:\n", - " print(\"Creating Processing Job: {}\".format(feature_group_name))\n", - " processor.run(\n", - " inputs=[flow_input] + data_sources, \n", - " outputs=[processing_job_output],\n", - " arguments=[f\"--output-config '{json.dumps(output_config)}'\"],\n", - " wait=False,\n", - " logs=False,\n", - " job_name=processing_job_name\n", - " ) \n", + "feature_group_exists = False\n", + "for feature_group_name in feature_group_names:\n", + " print(f\"Processing {feature_group_name}\")\n", + " # Unique processing job name. Give a unique name every time you re-execute processing jobs\n", + " processing_job_name = \"dw-flow-proc-music-rec-tracks-{}-{}\".format(flow_export_id, str(uuid.uuid4())[:8])\n", + " print (f\"{processing_job_name}\")\n", " \n", - " job_result = sess.wait_for_processing_job(processing_job_name)\n", - " print(job_result)" + " # Output configuration used as processing job container arguments \n", + " output_config = {\n", + " output_names[feature_group_name]: {\n", + " \"content_type\": output_content_type\n", + " }\n", + " }\n", + "\n", + " # Run Processing Job if job not already previously ran\n", + " if feature_group_exists: #feature_group_existence[feature_group_name]\n", + " print(\"Feature Group {0} already exists therefore we will not run a processing job to create it again\".format(feature_group_name))\n", + " else:\n", + " print(\"Creating Processing Job: {}\".format(feature_group_name))\n", + " processor.run(\n", + " inputs=[flow_input] + data_sources, \n", + " outputs=[processing_job_outputs[feature_group_name]],\n", + " arguments=[f\"--output-config '{json.dumps(output_config)}'\"],\n", + " wait=False,\n", + " logs=False,\n", + " job_name=processing_job_name\n", + " ) \n", + "\n", + " job_result = sess.wait_for_processing_job(processing_job_name)\n", + " print(job_result)" ] }, { @@ -811,14 +1089,191 @@ "You can view newly created feature group in Studio, refer to [Use Amazon SageMaker Feature Store with Amazon SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-use-with-studio.html)\n", "for detailed guide. 
[Learn more about SageMaker Feature Store](https://github.com/aws/amazon-sagemaker-examples/tree/master/sagemaker-featurestore)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Fetch Data from Offline Feature Store\n", + "\n", + "##### [back to top](#03-nb)\n", + "\n", + "----\n", + "There are 3 feature stores for the ratings, tracks, and user preferences data. We retrieve data from all 3 before joining them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_groups = []\n", + "for name in feature_group_names:\n", + " feature_group = FeatureGroup(name=name, sagemaker_session=feature_store_session)\n", + " feature_groups.append(feature_group)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_client = boto3.client('s3')\n", + "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]\n", + "\n", + "sagemaker_role = sagemaker.get_execution_role()\n", + "\n", + "s3_output_path = 's3://' + bucket" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_group_s3_prefixes = []\n", + "for feature_group in feature_groups:\n", + " feature_group_table_name = feature_group.describe().get(\"OfflineStoreConfig\").get(\"DataCatalogConfig\").get(\"TableName\")\n", + " feature_group_s3_prefix = f'{account_id}/sagemaker/{region}/offline-store/{feature_group_table_name}'\n", + " feature_group_s3_prefixes.append(feature_group_s3_prefix)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# wait for data to be added to offline feature store\n", + "def wait_for_offline_store(feature_group_s3_prefix):\n", + " print(feature_group_s3_prefix)\n", + " offline_store_contents = None\n", + " while (offline_store_contents is None):\n", + " objects_in_bucket = s3_client.list_objects(Bucket=bucket, Prefix=feature_group_s3_prefix)\n", + " if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):\n", + " offline_store_contents = objects_in_bucket['Contents']\n", + " else:\n", + " print('Waiting for data in offline store...')\n", + " time.sleep(60)\n", + " print('Data available.')\n", + " \n", + "for s3_prefix in feature_group_s3_prefixes:\n", + " wait_for_offline_store(s3_prefix)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tables = { \n", + " 'ratings': {'feature_group': feature_groups[2],\n", + " 'cols': ['userId', 'trackid', 'rating']\n", + " },\n", + " 'tracks': {'feature_group': feature_groups[0],\n", + " 'cols': ['trackid', 'length', 'energy', 'acousticness', 'valence', 'speechiness', 'instrumentalness', \n", + " 'liveness', 'tempo', 'danceability', 'genre_latin', 'genre_folk', 'genre_blues', 'genre_rap', \n", + " 'genre_reggae', 'genre_jazz', 'genre_rnb', 'genre_country', 'genre_electronic', 'genre_pop_rock']\n", + " },\n", + " 'user_5star_features': {'feature_group': feature_groups[1],\n", + " 'cols': ['userId', 'energy_5star', 'acousticness_5star', 'valence_5star', 'speechiness_5star', 'instrumentalness_5star', \n", + " 'liveness_5star','tempo_5star', 'danceability_5star', 'genre_latin_5star', 'genre_folk_5star', 'genre_blues_5star', \n", + " 'genre_rap_5star','genre_reggae_5star', 'genre_jazz_5star', 'genre_rnb_5star', 'genre_country_5star', \n", + " 'genre_electronic_5star', 
'genre_pop_rock_5star']\n", + " },\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "check if the athena queres have been done and the data sets exist, then just do train test split or just proceed to training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_train_val():\n", + " for k, v in tables.items():\n", + " query = v['feature_group'].athena_query()\n", + " joined_cols = \", \".join(v['cols'])\n", + " # limit number of datapoints for training time\n", + " query_string = \"SELECT {} FROM \\\"{}\\\" LIMIT 500000\".format(joined_cols, query.table_name)\n", + " print(query_string,'\\n')\n", + "\n", + " output_location = f's3://{bucket}/{prefix}/query_results/'\n", + " query.run(query_string=query_string, output_location=output_location)\n", + " query.wait()\n", + "\n", + " tables[k]['df'] = query.as_dataframe() \n", + " \n", + " ratings = tables['ratings']['df']\n", + " tracks = tables['tracks']['df']\n", + " user_prefs = tables['user_5star_features']['df']\n", + " \n", + " print('Merging datasets...')\n", + " print(f'Ratings: {ratings.shape}\\nTracks: {tracks.shape}\\nUser Prefs: {user_prefs.shape}\\n')\n", + " \n", + " dataset = pd.merge(ratings, tracks, on='trackid', how='inner')\n", + " dataset = pd.merge(dataset, user_prefs, on='userId', how='inner')\n", + " dataset.drop_duplicates(inplace=True)\n", + " dataset.drop(['userId', 'trackid'], axis=1, inplace=True)\n", + "\n", + " # split data\n", + " from sklearn.model_selection import train_test_split\n", + " train, val = train_test_split(dataset, test_size=0.2, random_state=42)\n", + " print(\"Training dataset shape: {}\\nValidation dataset shape: {}\\n\".format(train.shape, val.shape))\n", + "\n", + " return train, val" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "import pandas as pd\n", + "import glob\n", + "\n", + "\n", + "print('Creating training and validation sets...\\n')\n", + "train, val = get_train_val()\n", + "# Write to csv in S3 without headers and index column\n", + "train.to_csv('./data/train_data.csv', header=False, index=False)\n", + "val.to_csv('./data/val_data.csv', header=False, index=False)\n", + "\n", + "pd.DataFrame({\"ColumnName\": train.columns}).to_csv(\"./data/train_data_headers.csv\", header=False, index=False)\n", + "pd.DataFrame({\"ColumnName\": val.columns}).to_csv(\"./data/val_data_headers.csv\", header=False, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3 (Data Science)", + "display_name": "conda_python3", "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -830,7 +1285,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.10" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/end_to_end/music_recommendation/03_train_model_lineage_registry_debugger.ipynb b/end_to_end/music_recommendation/03a_train_deploy_debugger_explain_monitor_registry.ipynb similarity index 97% rename from end_to_end/music_recommendation/03_train_model_lineage_registry_debugger.ipynb rename to 
end_to_end/music_recommendation/03a_train_deploy_debugger_explain_monitor_registry.ipynb index 713761b922..e32bccf6b6 100644 --- a/end_to_end/music_recommendation/03_train_model_lineage_registry_debugger.ipynb +++ b/end_to_end/music_recommendation/03a_train_deploy_debugger_explain_monitor_registry.ipynb @@ -70,7 +70,10 @@ "import pandas as pd\n", "import pathlib\n", "import sagemaker\n", - "import glob" + "import glob\n", + "import json\n", + "from datetime import datetime\n", + "import matplotlib.pyplot as plt" ] }, { @@ -81,30 +84,7 @@ "source": [ "import sys\n", "import pprint\n", - "sys.path.insert(1, './code')\n", - "from parameter_store import ParameterStore\n", - "ps = ParameterStore(verbose=False)\n", - "\n", - "parameters = ps.read('music-rec')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dw_ecrlist = parameters['dw_ecrlist']\n", - "fg_name_ratings = parameters['fg_name_ratings']\n", - "fg_name_tracks = parameters['fg_name_tracks']\n", - "fg_name_user_preferences = parameters['fg_name_user_preferences']\n", - "flow_export_id = parameters['flow_export_id']\n", - "flow_s3_uri = parameters['flow_s3_uri']\n", - "pretrained_model_path = parameters['pretrained_model_path']\n", - "prefix = parameters['prefix']\n", - "bucket = parameters['bucket']\n", - "ratings_data_source = parameters['ratings_data_source']\n", - "tracks_data_source = parameters['tracks_data_source']" + "sys.path.insert(1, './code')" ] }, { @@ -126,6 +106,17 @@ "metadata": {}, "outputs": [], "source": [ + "# Sagemaker session\n", + "sess = sagemaker.Session()\n", + "# get session bucket name\n", + "bucket = sess.default_bucket()\n", + "# bucket prefix or the subfolder for everything we produce\n", + "prefix='music-recommendation'\n", + "# get sagemaker role\n", + "sagemaker_role = sagemaker.get_execution_role()\n", + "# s3 client\n", + "s3_client = boto3.client(\"s3\")\n", + "\n", "region = boto3.Session().region_name\n", "boto_session = boto3.Session(region_name=region)\n", "\n", @@ -148,14 +139,47 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "## Save data to S3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "train_headers = pd.read_csv('data/train_data_headers.csv', header=None)[0].tolist()\n", + "val_headers = pd.read_csv('data/val_data_headers.csv', header=None)[0].tolist()\n", + "train = pd.read_csv('data/train_data.csv', names=train_headers)\n", + "val = pd.read_csv('data/val_data.csv', names=val_headers)\n", + "\n", + "s3_client.upload_file('data/train_data.csv', bucket, f'{prefix}/data/train/train_data.csv')\n", + "s3_client.upload_file('data/val_data.csv', bucket, f'{prefix}/data/val/val_data.csv')\n", + "\n", + "\n", + "train_data_uri = f's3://{bucket}/{prefix}/data/train/train_data.csv'\n", + "val_data_uri = f's3://{bucket}/{prefix}/data/val/val_data.csv'\n", + "print (f\"Saving training data to {train_data_uri}\")\n", + "\n", + "# configure data inputs for SageMaker training\n", + "from sagemaker.inputs import TrainingInput\n", + "train_input = TrainingInput(train_data_uri, content_type=\"text/csv\")\n", + "val_input = TrainingInput(val_data_uri, content_type=\"text/csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", "\n", - "## Fetch Data from Offline Feature Store\n", + "## Train Model \n", "\n", "##### [back to top](#03-nb)\n", "\n", - "----\n", - "There are 3 feature stores for the ratings, 
tracks, and user preferences data. We retrieve data from all 3 before joining them." + "----" ] }, { @@ -164,13 +188,9 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.feature_store.feature_group import FeatureGroup\n", - "\n", - "feature_group_names = [fg_name_ratings, fg_name_tracks, fg_name_user_preferences]\n", - "feature_groups = []\n", - "for name in feature_group_names:\n", - " feature_group = FeatureGroup(name=name, sagemaker_session=feature_store_session)\n", - " feature_groups.append(feature_group)" + "from sagemaker.debugger import Rule, rule_configs\n", + "from sagemaker.debugger import DebuggerHookConfig, CollectionConfig\n", + "from sagemaker.estimator import Estimator" ] }, { @@ -179,13 +199,85 @@ "metadata": {}, "outputs": [], "source": [ - "s3_client = boto3.client('s3')\n", - "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]\n", + "# variables used for parameterizing the notebook run\n", + "estimator_output_path = f's3://{bucket}/{prefix}/training_jobs'\n", + "train_instance_count = 2\n", + "train_instance_type = 'ml.m5.4xlarge'\n", + "save_interval = 2\n", + "image = sagemaker.image_uris.retrieve(\"xgboost\", region, \"0.90-2\")\n", + "model_name = 'music-rec-model-{}'.format(datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n", "\n", - "sagemaker_role = sagemaker.get_execution_role()\n", + "hyperparameters = {\n", + " \"max_depth\": \"4\",\n", + " \"eta\": \"0.2\",\n", + " \"objective\": \"reg:squarederror\",\n", + " \"num_round\": \"100\",\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "xgb_estimator = Estimator(\n", + " role=sagemaker_role,\n", + " instance_count=train_instance_count,\n", + " instance_type=train_instance_type,\n", + " image_uri=image,\n", + " hyperparameters=hyperparameters,\n", + "# base_job_name=model_name,\n", + " output_path=estimator_output_path,\n", + " \n", + " debugger_hook_config=DebuggerHookConfig(\n", + " s3_output_path=estimator_output_path+'/debugger', \n", + " collection_configs=[\n", + " CollectionConfig(\n", + " name=\"metrics\",\n", + " parameters={\n", + " \"save_interval\": str(save_interval)\n", + " }\n", + " ),\n", + " CollectionConfig(\n", + " name=\"feature_importance\",\n", + " parameters={\n", + " \"save_interval\": str(save_interval)\n", + " }\n", + " ),\n", + " CollectionConfig(\n", + " name=\"full_shap\",\n", + " parameters={\n", + " \"save_interval\": str(save_interval)\n", + " }\n", + " ),\n", + " CollectionConfig(\n", + " name=\"average_shap\",\n", + " parameters={\n", + " \"save_interval\": str(save_interval)\n", + " }\n", + " ),\n", + " ],\n", + " ),\n", "\n", - "s3_output_path = 's3://' + bucket\n", - "ps.add({'s3_output_path': s3_output_path}, namespace='music-rec')" + " rules=[\n", + " Rule.sagemaker(\n", + " rule_configs.loss_not_decreasing(),\n", + " rule_parameters={\n", + " \"collection_names\": \"metrics\",\n", + " \"num_steps\": str(save_interval * 2),\n", + " },\n", + " )\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Train the model here in order to access SageMaker Debugger\n", + "or skip to the next notebook now to deploy a pretrained model" ] }, { @@ -194,11 +286,12 @@ "metadata": {}, "outputs": [], "source": [ - "feature_group_s3_prefixes = []\n", - "for feature_group in feature_groups:\n", - " feature_group_table_name = feature_group.describe().get(\"OfflineStoreConfig\").get(\"DataCatalogConfig\").get(\"TableName\")\n", - " 
feature_group_s3_prefix = f'{account_id}/sagemaker/{region}/offline-store/{feature_group_table_name}'\n", - " feature_group_s3_prefixes.append(feature_group_s3_prefix)" + "response = sagemaker_client.list_training_jobs(\n", + " NameContains = model_name,\n", + " StatusEquals = 'Completed',\n", + " SortBy='CreationTime',\n", + " SortOrder='Descending'\n", + ")" ] }, { @@ -209,21 +302,30 @@ }, "outputs": [], "source": [ - "# wait for data to be added to offline feature store\n", - "def wait_for_offline_store(feature_group_s3_prefix):\n", - " print(feature_group_s3_prefix)\n", - " offline_store_contents = None\n", - " while (offline_store_contents is None):\n", - " objects_in_bucket = s3_client.list_objects(Bucket=bucket, Prefix=feature_group_s3_prefix)\n", - " if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):\n", - " offline_store_contents = objects_in_bucket['Contents']\n", - " else:\n", - " print('Waiting for data in offline store...')\n", - " time.sleep(60)\n", - " print('Data available.')\n", - " \n", - "for s3_prefix in feature_group_s3_prefixes:\n", - " wait_for_offline_store(s3_prefix)" + "%%time\n", + "\n", + "train_model = True # True if training a new model, False if wanting to use an existing estimator once you've already trained\n", + "\n", + "if train_model:\n", + " print('Training the model')\n", + " xgb_estimator.fit(inputs = {'train': train_input, 'validation': val_input}, job_name=model_name)\n", + " s3_debugger_output_path = xgb_estimator.latest_job_debugger_artifacts_path()\n", + "elif len(response['TrainingJobSummaries']) > 0:\n", + " training_job_name = response['TrainingJobSummaries'][0]['TrainingJobName']\n", + " xgb_estimator = Estimator.attach(training_job_name)\n", + " s3_debugger_output_path = xgb_estimator.latest_job_debugger_artifacts_path()\n", + "else:\n", + " print(\"No existing estimator found. 
You'll need to run as train = True\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "training_job_name = xgb_estimator.latest_training_job.job_name\n", + "print(training_job_name)" ] }, { @@ -232,29 +334,22 @@ "metadata": {}, "outputs": [], "source": [ - "tables = { \n", - " 'ratings': {'feature_group': feature_groups[0],\n", - " 'cols': ['userid', 'trackid', 'rating']\n", - " },\n", - " 'tracks': {'feature_group': feature_groups[1],\n", - " 'cols': ['trackid', 'length', 'energy', 'acousticness', 'valence', 'speechiness', 'instrumentalness', \n", - " 'liveness', 'tempo', 'danceability', 'genre_latin', 'genre_folk', 'genre_blues', 'genre_rap', \n", - " 'genre_reggae', 'genre_jazz', 'genre_rnb', 'genre_country', 'genre_electronic', 'genre_pop_rock']\n", - " },\n", - " 'user_5star_features': {'feature_group': feature_groups[2],\n", - " 'cols': ['userid', 'energy_5star', 'acousticness_5star', 'valence_5star', 'speechiness_5star', 'instrumentalness_5star', \n", - " 'liveness_5star','tempo_5star', 'danceability_5star', 'genre_latin_5star', 'genre_folk_5star', 'genre_blues_5star', \n", - " 'genre_rap_5star','genre_reggae_5star', 'genre_jazz_5star', 'genre_rnb_5star', 'genre_country_5star', \n", - " 'genre_electronic_5star', 'genre_pop_rock_5star']\n", - " },\n", - " }" + "import pprint\n", + "training_job_info = sagemaker_client.describe_training_job(TrainingJobName=training_job_name)\n", + "pprint.pprint(f\"{training_job_info}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "check if the athena queres have been done and the data sets exist, then just do train test split or just proceed to training" + "\n", + "\n", + "## Deploy Model\n", + "\n", + "##### [back to top](#04-nb)\n", + "\n", + "----" ] }, { @@ -263,38 +358,8 @@ "metadata": {}, "outputs": [], "source": [ - "def get_train_val():\n", - " for k, v in tables.items():\n", - " query = v['feature_group'].athena_query()\n", - " joined_cols = \", \".join(v['cols'])\n", - " # limit number of datapoints for training time\n", - " query_string = \"SELECT {} FROM \\\"{}\\\" LIMIT 500000\".format(joined_cols, query.table_name)\n", - " print(query_string,'\\n')\n", - "\n", - " output_location = f's3://{bucket}/{prefix}/query_results/'\n", - " query.run(query_string=query_string, output_location=output_location)\n", - " query.wait()\n", - "\n", - " tables[k]['df'] = query.as_dataframe() \n", - " \n", - " ratings = tables['ratings']['df']\n", - " tracks = tables['tracks']['df']\n", - " user_prefs = tables['user_5star_features']['df']\n", - " \n", - " print('Merging datasets...')\n", - " print(f'Ratings: {ratings.shape}\\nTracks: {tracks.shape}\\nUser Prefs: {user_prefs.shape}\\n')\n", - " \n", - " dataset = pd.merge(ratings, tracks, on='trackid', how='inner')\n", - " dataset = pd.merge(dataset, user_prefs, on='userid', how='inner')\n", - " dataset.drop_duplicates(inplace=True)\n", - " dataset.drop(['userid', 'trackid'], axis=1, inplace=True)\n", - "\n", - " # split data\n", - " from sklearn.model_selection import train_test_split\n", - " train, val = train_test_split(dataset, test_size=0.2, random_state=42)\n", - " print(\"Training dataset shape: {}\\nValidation dataset shape: {}\\n\".format(train.shape, val.shape))\n", - "\n", - " return train, val" + "endpoint_name = 'music-rec-endpoint-{}'.format(datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n", + "print(endpoint_name)" ] }, { @@ -303,27 +368,20 @@ "metadata": {}, "outputs": [], "source": [ - "%%time\n", - "import 
pandas as pd\n", - "\n", - "# create the training data if it has not been created already\n", - "if glob.glob('data/train_data.csv') and 'feature_names' in parameters:\n", - " print('Using existing files: train_data.csv & val_data.csv')\n", - " train = pd.read_csv('data/train_data.csv', names=[\"rating\"]+parameters['feature_names'])\n", - " val = pd.read_csv('data/val_data.csv', names=[\"rating\"]+parameters['feature_names'])\n", - "else:\n", - " print('Creating training and validation sets...\\n')\n", - " train, val = get_train_val()\n", - " # Write to csv in S3 without headers and index column\n", - " train.to_csv('./data/train_data.csv', header=False, index=False)\n", - " val.to_csv('./data/val_data.csv', header=False, index=False)" + "endpoint_list = sagemaker_client.list_endpoints(\n", + " SortBy='CreationTime',\n", + " SortOrder='Descending',\n", + " NameContains=endpoint_name,\n", + " StatusEquals='InService'\n", + ")\n", + "endpoint_list" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Save data to S3" + "### Create endpoint" ] }, { @@ -333,18 +391,27 @@ "outputs": [], "source": [ "%%time\n", - "s3_client.upload_file('data/train_data.csv', bucket, f'{prefix}/data/train/train_data.csv')\n", - "s3_client.upload_file('data/val_data.csv', bucket, f'{prefix}/data/val/val_data.csv')\n", + "if len(endpoint_list['Endpoints']) > 0:\n", + " print(f\"Using existing endpoint: {endpoint_list['Endpoints'][0]['EndpointName']}\")\n", + "else:\n", + " # deploy endpoint for model if it doesn't already exist\n", + " xgb_estimator.deploy(initial_instance_count=1,\n", + " instance_type='ml.m4.xlarge',\n", + " endpoint_name=endpoint_name\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " \n", "\n", + "## Create a predictor\n", "\n", - "train_data_uri = f's3://{bucket}/{prefix}/data/train/train_data.csv'\n", - "val_data_uri = f's3://{bucket}/{prefix}/data/val/val_data.csv'\n", - "print (f\"Saving training data to {train_data_uri}\")\n", + "##### [back to top](#04-nb)\n", "\n", - "# configure data inputs for SageMaker training\n", - "from sagemaker.inputs import TrainingInput\n", - "train_input = TrainingInput(train_data_uri, content_type=\"text/csv\")\n", - "val_input = TrainingInput(val_data_uri, content_type=\"text/csv\")" + "----" ] }, { @@ -353,18 +420,135 @@ "metadata": {}, "outputs": [], "source": [ - "ps.add({'train_data_uri': train_data_uri, 'val_data_uri': val_data_uri}, namespace='music-rec')" + "predictor = sagemaker.predictor.Predictor(\n", + " endpoint_name=endpoint_name,\n", + " sagemaker_session=sagemaker_session)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "### Pull user data from feature group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # random user ID. 
You can try any other ID\n", + "# sample_user_id = 11005" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n", "\n", - "## Train Model \n", + "# feature_store_session = sagemaker.Session(\n", + "# boto_session=boto_session,\n", + "# sagemaker_client=sagemaker_client,\n", + "# sagemaker_featurestore_runtime_client=featurestore_runtime\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # pull the sample user's 5 star preferences record from the feature store\n", + "# fg_response = featurestore_runtime.get_record(\n", + "# FeatureGroupName='user-5star-track-features-music-rec', \n", + "# RecordIdentifierValueAsString=str(sample_user_id)\n", + "# )\n", + "\n", + "# record = fg_response['Record']\n", + "# df_user = pd.DataFrame(record).set_index('FeatureName')\n", + "# df_user.to_csv(\"./data/sample_user.csv\")\n", + "df_user = pd.read_csv(\"./data/sample_user.csv\")\n", + "df_user = df_user.set_index('FeatureName')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pull sample of 1000 tracks from feature group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # pull a sample of the tracks data (multiple records) from the feature store using athena query\n", + "# fg_name_tracks_obj = FeatureGroup(name='track-features-music-rec', sagemaker_session=feature_store_session)\n", + "# tracks_query = fg_name_tracks_obj.athena_query()\n", + "# tracks_table = tracks_query.table_name\n", + "\n", + "# # use escaped quotes aound table name since it contains '-' symbols\n", + "# query_string = (\"SELECT * FROM \\\"{}\\\" LIMIT 1000\".format(tracks_table))\n", + "# print(\"Running \" + query_string)\n", + "\n", + "# # run Athena query. The output is loaded to a Pandas dataframe.\n", + "# tracks_query.run(query_string=query_string, output_location=f\"s3://{bucket}/{prefix}/query_results/\")\n", + "# tracks_query.wait()\n", + "# df_tracks = tracks_query.as_dataframe()\n", + "# df_tracks.to_csv(\"./data/sample_tracks.csv\")\n", + "df_tracks = pd.read_csv(\"./data/sample_tracks.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = pd.read_csv('data/train_data_headers.csv', header=None)[0].tolist()[1:]\n", + "data = (\n", + " df_tracks.assign(key=1)\n", + " .merge(pd.DataFrame(df_user['ValueAsString']).T.assign(key=1), on=\"key\")\n", + " .drop(\"key\", axis=1)\n", + ")\n", + "data.columns = [c.lower() for c in data.columns]\n", + "inference_df = data[feature_names]\n", + "inference_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Format the datapoint\n", + "The datapoint must match the exact input format as the model was trained--with all features in the correct order. In this example, the `col_order` variable was saved when you created the train and test datasets earlier in the guide." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_inputs = [','.join([str(i) for i in row]) for row in inference_df.values]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " \n", "\n", - "##### [back to top](#03-nb)\n", + "## Infer (predict) new songs using model\n", + "\n", + "##### [back to top](#04-nb)\n", "\n", "----" ] @@ -375,9 +559,12 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.debugger import Rule, rule_configs\n", - "from sagemaker.debugger import DebuggerHookConfig, CollectionConfig\n", - "from sagemaker.estimator import Estimator" + "predictions = []\n", + "for data_input in data_inputs:\n", + " results = predictor.predict(data_input, initial_args = {\"ContentType\": \"text/csv\"})\n", + " prediction = json.loads(results)\n", + " predictions.append(prediction)\n", + "print(f'Predicted rating for sample user:', prediction)" ] }, { @@ -386,85 +573,38 @@ "metadata": {}, "outputs": [], "source": [ - "# variables used for parameterizing the notebook run\n", - "estimator_output_path = f's3://{bucket}/{prefix}/training_jobs'\n", - "train_instance_count = 2\n", - "train_instance_type = 'ml.m5.4xlarge'\n", - "save_interval = 2\n", - "image = sagemaker.image_uris.retrieve(\"xgboost\", region, \"0.90-2\")\n", - "model_name = 'music-recommendation-model'\n", + "# Write to csv in S3 without headers and index column.\n", + "inference_df['rating'] = predictions\n", + "inference_df = inference_df[['rating']+feature_names]\n", + "inference_df.to_csv('data/prediction_data.csv', header=False, index=False)\n", "\n", - "hyperparameters = {\n", - " \"max_depth\": \"4\",\n", - " \"eta\": \"0.2\",\n", - " \"objective\": \"reg:squarederror\",\n", - " \"num_round\": \"100\",\n", - "}" + "s3_client.upload_file('data/prediction_data.csv', bucket, f'{prefix}/data/pred/prediction_data.csv')\n", + "\n", + "pred_data_uri = f's3://{bucket}/{prefix}/data/pred/prediction_data.csv'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], - "source": [ - "xgb_estimator = Estimator(\n", - " role=sagemaker_role,\n", - " instance_count=train_instance_count,\n", - " instance_type=train_instance_type,\n", - " image_uri=image,\n", - " hyperparameters=hyperparameters,\n", - " base_job_name=model_name,\n", - " output_path=estimator_output_path,\n", - " \n", - " debugger_hook_config=DebuggerHookConfig(\n", - " s3_output_path=estimator_output_path+'/debugger', \n", - " collection_configs=[\n", - " CollectionConfig(\n", - " name=\"metrics\",\n", - " parameters={\n", - " \"save_interval\": str(save_interval)\n", - " }\n", - " ),\n", - " CollectionConfig(\n", - " name=\"feature_importance\",\n", - " parameters={\n", - " \"save_interval\": str(save_interval)\n", - " }\n", - " ),\n", - " CollectionConfig(\n", - " name=\"full_shap\",\n", - " parameters={\n", - " \"save_interval\": str(save_interval)\n", - " }\n", - " ),\n", - " CollectionConfig(\n", - " name=\"average_shap\",\n", - " parameters={\n", - " \"save_interval\": str(save_interval)\n", - " }\n", - " ),\n", - " ],\n", - " ),\n", + "outputs": [], + "source": [ + "df_train = pd.read_csv(train_data_uri)\n", "\n", - " rules=[\n", - " Rule.sagemaker(\n", - " rule_configs.loss_not_decreasing(),\n", - " rule_parameters={\n", - " \"collection_names\": \"metrics\",\n", - " \"num_steps\": str(save_interval * 2),\n", - " },\n", - " )\n", - " ],\n", - ")" + "label = 'rating'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - 
"#### Train the model here in order to access SageMaker Debugger\n", - "or skip to the next notebook now to deploy a pretrained model" + " \n", + "\n", + "## Explain model predictions\n", + "\n", + "##### [back to top](#04-nb)\n", + "\n", + "----" ] }, { @@ -473,12 +613,49 @@ "metadata": {}, "outputs": [], "source": [ - "response = sagemaker_client.list_training_jobs(\n", - " NameContains = model_name,\n", - " StatusEquals = 'Completed',\n", - " SortBy='CreationTime',\n", - " SortOrder='Descending'\n", - ")" + "model = xgb_estimator.create_model(name=model_name)\n", + "container_def = model.prepare_container_def()\n", + "sess.create_model(model_name, sagemaker_role, container_def)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "explainability_output_path = f's3://{bucket}/{prefix}/clarify-output/explainability'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "clarify_processor = sagemaker.clarify.SageMakerClarifyProcessor(\n", + " role=sagemaker_role,\n", + " instance_count=1,\n", + " instance_type='ml.c4.xlarge',\n", + " sagemaker_session=sagemaker_session)\n", + "\n", + "model_config = sagemaker.clarify.ModelConfig(\n", + " model_name=model_name,\n", + " instance_type='ml.m4.xlarge',\n", + " instance_count=1,\n", + " accept_type='text/csv')\n", + "\n", + "shap_config = sagemaker.clarify.SHAPConfig(\n", + " baseline=[df_train.median().values[1:].tolist()], # ignore the first column since that is that target\n", + " num_samples=100,\n", + " agg_method='mean_abs')\n", + "\n", + "explainability_data_config = sagemaker.clarify.DataConfig(\n", + " s3_data_input_path=pred_data_uri,\n", + " s3_output_path=explainability_output_path,\n", + " label=label,\n", + " headers=[label]+feature_names,\n", + " dataset_type='text/csv')\n" ] }, { @@ -490,23 +667,21 @@ "outputs": [], "source": [ "%%time\n", - "\n", - "train_model = True # True if training a new model, False if wanting to use an existing estimator once you've already trained\n", - "\n", - "if train_model:\n", - " print('Training the model')\n", - " xgb_estimator.fit(inputs = {'train': train_input, 'validation': val_input})\n", - " s3_debugger_output_path = xgb_estimator.latest_job_debugger_artifacts_path()\n", - " ps.add({'s3_debugger_output_path': s3_debugger_output_path}, namespace='music-rec')\n", - " ps.store()\n", - "elif len(response['TrainingJobSummaries']) > 0:\n", - " training_job_name = response['TrainingJobSummaries'][0]['TrainingJobName']\n", - " xgb_estimator = Estimator.attach(training_job_name)\n", - " parameters = ps.read('music-rec')\n", - " s3_debugger_output_path = parameters['s3_debugger_output_path']\n", - " print(f'Using estimator from completed training job: {training_job_name}\\nwith debugger path {s3_debugger_output_path}')\n", - "else:\n", - " print(\"No existing estimator found. 
You'll need to run as train = True\")\n" + "try:\n", + " s3_client.download_file(\n", + " Bucket = bucket, \n", + " Key = f'{prefix}/clarify-output/explainability/explanations_shap/out.csv', \n", + " Filename = 'data/shap_output.csv'\n", + " )\n", + " print('Downloaded output from previous explainability job')\n", + "except Exception as e:\n", + " error = e.response.get('Error').get('Code')\n", + " if error == '404':\n", + " print('Running explainability job')\n", + " clarify_processor.run_explainability(\n", + " data_config=explainability_data_config,\n", + " model_config=model_config,\n", + " explainability_config=shap_config)" ] }, { @@ -515,11 +690,7 @@ "metadata": {}, "outputs": [], "source": [ - "training_job_name = xgb_estimator.latest_training_job.job_name\n", - "print(training_job_name)\n", - "\n", - "ps.add({'training_job_name': training_job_name}, namespace='music-rec')\n", - "ps.store()" + "inference_df['trackid'] = data['trackid']" ] }, { @@ -528,9 +699,23 @@ "metadata": {}, "outputs": [], "source": [ - "import pprint\n", - "training_job_info = sagemaker_client.describe_training_job(TrainingJobName=training_job_name)\n", - "pprint.pprint(f\"{training_job_info}\")" + "playlist_length = 10 # number of songs to recommend in playlist\n", + "playlist = inference_df.sort_values(by='rating', ascending=False).head(playlist_length)\n", + "print('Curated Playlist:\\n', playlist['trackid'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "local_explanations_out = pd.read_csv(explainability_output_path+'/explanations_shap/out.csv')\n", + "local_explanations_out.columns = feature_names\n", + "\n", + "print(\"Model prediction:\", playlist.iloc[0, 0])\n", + "plt.figure(figsize=(12,6))\n", + "local_explanations_out.iloc[0].sort_values().plot.barh(title='Local explanation for prediction')" ] }, { @@ -579,10 +764,7 @@ "outputs": [], "source": [ "feature_names = list(train.drop('rating', axis=1).columns)\n", - "print(feature_names)\n", - "\n", - "ps.add({'feature_names': feature_names}, namespace='music-rec')\n", - "ps.store()" + "print(feature_names)" ] }, { @@ -765,21 +947,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "\n", - "## Examine Lineage\n", "\n", - "##### [back to top](#03-nb)\n", + "\n", + "## Model Monitor\n", "\n", - "----\n", - "Though you already know the training job details from above, if we were just given the model uri, we could use SageMaker Lineage to produce the training job details which produced the model." + "## Step 1: Enable real-time inference data capture\n", + "\n", + "##### [back to top](#05-nb)\n", + "\n", + "----" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Data Lineage and Metrics for Best Model" + "To enable data capture for monitoring the model data quality, you specify the new capture option called `DataCaptureConfig`. You can capture the request payload, the response payload or both with this configuration. The capture config applies to all variants. 
Please provide the Endpoint name in the following cell:" ] }, { @@ -788,14 +972,90 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.lineage import context, artifact, association, action" + "from sagemaker.model_monitor import DataCaptureConfig\n", + "\n", + "# Please fill in the following for enabling data capture\n", + "s3_capture_upload_path = f's3://{bucket}/{prefix}/endpoint-data-capture/' #example: s3://bucket-name/path/to/endpoint-data-capture/\n", + "\n", + "##### \n", + "## IMPORTANT\n", + "##\n", + "## Please make sure to add the \"s3:PutObject\" permission to the \"role' you provided in the SageMaker Model \n", + "## behind this Endpoint. Otherwise, Endpoint data capture will not work.\n", + "## \n", + "##### " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "# Change parameters as you would like - adjust sampling percentage, \n", + "# chose to capture request or response or both\n", + "data_capture_config = DataCaptureConfig(\n", + " enable_capture = True,\n", + " sampling_percentage=25,\n", + " destination_s3_uri=s3_capture_upload_path,\n", + " kms_key_id=None,\n", + " capture_options=[\"REQUEST\", \"RESPONSE\"],\n", + " csv_content_types=[\"text/csv\"],\n", + " json_content_types=[\"application/json\"]\n", + ")\n", + "\n", + "# Now it is time to apply the new configuration and wait for it to be applied\n", + "predictor.update_data_capture_config(data_capture_config=data_capture_config)\n", + "sess.wait_for_endpoint(endpoint=endpoint_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Before you proceed:\n", + "Currently SageMaker supports monitoring Endpoints out of the box only for **tabular (csv, flat-json)** datasets. If your Endpoint uses some other datasets, these following steps will NOT work for you.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Step 2: Model Monitor - Baselining\n", + "\n", + "##### [back to top](#05-nb)\n", + "\n", + "----" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In addition to collecting the data, SageMaker allows you to monitor and evaluate the data observed by the Endpoints. For this :\n", + "1. We need to create a baseline with which we compare the realtime traffic against. \n", + "1. Once a baseline is ready, we can setup a schedule to continously evaluate/compare against the baseline." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Constraint suggestion with baseline/training dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Training data artifact" + "The training dataset with which you trained the model is usually a good baseline dataset. Note that the training dataset's data schema and the inference dataset schema should exactly match (i.e. number and order of the features).\n", + "\n", + "Using our training dataset, we'll ask SageMaker to suggest a set of baseline constraints and generate descriptive statistics to explore the data." 
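
Before kicking off the baselining job, a quick sanity check of that schema match can save a failed run. The sketch below is illustrative only; it assumes the `val_data_uri` and `feature_names` variables from earlier in this notebook and that the validation CSV was written without a header row, with the `rating` label as its first column.

```python
# Illustrative check (not in the original flow): the baseline CSV should carry the
# label plus the same features, in the same order, as the data the endpoint sees.
import pandas as pd

val_sample = pd.read_csv(val_data_uri, header=None, nrows=5)  # assumed: no header row
expected_cols = len(feature_names) + 1                        # assumed: 'rating' label + features
assert val_sample.shape[1] == expected_cols, (
    f"baseline has {val_sample.shape[1]} columns, expected {expected_cols}"
)
```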
] }, { @@ -804,34 +1064,81 @@ "metadata": {}, "outputs": [], "source": [ - "data_artifact_list = []\n", - "for data_input in training_job_info['InputDataConfig']:\n", - " channel = data_input['ChannelName']\n", - " data_s3_uri = data_input['DataSource']['S3DataSource']['S3Uri']\n", + "##'s3://bucketname/path/to/baseline/data' - Where your validation data is\n", + "baseline_data_uri = val_data_uri \n", + "##'s3://bucketname/path/to/baseline/data' - Where the results are to be stored in\n", + "baseline_results_uri = f's3://{bucket}/{prefix}/baseline/results' \n", "\n", - " matching_artifacts = list(artifact.Artifact.list(\n", - " source_uri=data_s3_uri,\n", - " sagemaker_session=sagemaker_session)\n", - " )\n", - " \n", - " if matching_artifacts:\n", - " data_artifact = matching_artifacts[0]\n", - " print(f'Using existing artifact: {data_artifact.artifact_arn}')\n", - " else:\n", - " data_artifact = artifact.Artifact.create(\n", - " artifact_name=channel,\n", - " source_uri=data_s3_uri,\n", - " artifact_type='Dataset',\n", - " sagemaker_session=sagemaker_session)\n", - " print(f'Create artifact {data_artifact.artifact_arn}: SUCCESSFUL')\n", - " data_artifact_list.append(data_artifact)" + "print('Baseline data uri: {}'.format(baseline_data_uri))\n", + "print('Baseline results uri: {}'.format(baseline_results_uri))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a baselining job with the validation dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Model artifact" + "Now that we have the training data ready in S3, let's kick off a job to `suggest` constraints. `DefaultModelMonitor.suggest_baseline(..)` kicks off a `ProcessingJob` using a SageMaker provided Model Monitor container to generate the constraints. Please edit the configurations to fit your needs." 
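
Once the baselining job finishes, it typically writes `statistics.json` and `constraints.json` under the output prefix. The sketch below (not part of the original notebook) shows one way to peek at those raw artifacts, assuming the `baseline_results_uri`, `bucket`, and `s3_client` values defined earlier.

```python
# Sketch: inspect the raw baseline artifacts that suggest_baseline() writes to S3.
import json

results_prefix = baseline_results_uri.split(f's3://{bucket}/')[-1]
for fname in ('statistics.json', 'constraints.json'):
    obj = s3_client.get_object(Bucket=bucket, Key=f'{results_prefix}/{fname}')
    doc = json.loads(obj['Body'].read())
    print(fname, '->', list(doc.keys()))
```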
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from sagemaker.model_monitor import DefaultModelMonitor\n", + "from sagemaker.model_monitor.dataset_format import DatasetFormat\n", + "from sagemaker import get_execution_role\n", + "import datetime\n", + "\n", + "role = get_execution_role(sagemaker_session=sess)\n", + "\n", + "datetime_stamp = datetime.datetime.now().strftime(\"%Y-%m-%d-%H%M%S\")\n", + "\n", + "my_default_monitor = DefaultModelMonitor(\n", + " role=role,\n", + " instance_count=2,\n", + " instance_type='ml.m5.xlarge',\n", + " volume_size_in_gb=20,\n", + " max_runtime_in_seconds=1800,\n", + " base_job_name=f\"{prefix}-monitor-{datetime_stamp}\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "monitor_baseline = my_default_monitor.suggest_baseline(\n", + " baseline_dataset=baseline_data_uri,\n", + " dataset_format=DatasetFormat.csv(header=False),\n", + " output_s3_uri=baseline_results_uri,\n", + " job_name=f\"{prefix}-monitor-baseline-{datetime_stamp}\",\n", + " wait=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exploratory Analysis of the Processing Jobs underlying SageMaker Monitor\n", + "In this short section [next few cells] we will be showing you how to further view the underlying jobs for the monitoring job" ] }, { @@ -840,30 +1147,69 @@ "metadata": {}, "outputs": [], "source": [ - "trained_model_s3_uri = training_job_info['ModelArtifacts']['S3ModelArtifacts']\n", + "from time import gmtime, strftime\n", + "import boto3\n", "\n", - "matching_artifacts = list(artifact.Artifact.list(\n", - " source_uri=trained_model_s3_uri,\n", - " sagemaker_session=sagemaker_session)\n", - ")\n", + "client = boto3.client('sagemaker')\n", "\n", - "if matching_artifacts:\n", - " model_artifact = matching_artifacts[0]\n", - " print(f'Using existing artifact: {model_artifact.artifact_arn}')\n", - "else:\n", - " model_artifact = artifact.Artifact.create(\n", - " artifact_name='TrainedModel',\n", - " source_uri=trained_model_s3_uri,\n", - " artifact_type='Model',\n", - " sagemaker_session=sagemaker_session)\n", - " print(f'Create artifact {model_artifact.artifact_arn}: SUCCESSFUL')" + "def get_last_processing_job():\n", + " \n", + " response = client.list_processing_jobs(\n", + " NameContains=f\"{prefix}-monitor-baseline-{datetime_stamp}\",\n", + " StatusEquals='Completed',\n", + " SortBy='CreationTime',\n", + " SortOrder='Descending',\n", + " MaxResults=20\n", + " )\n", + " pprint.pprint(response['ProcessingJobSummaries'][0])\n", + " return response['ProcessingJobSummaries'][0]['ProcessingJobName']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.processing import ProcessingJob \n", + "from sagemaker.estimator import Estimator\n", + "from sagemaker.model_monitor.model_monitoring import ModelMonitor\n", + "\n", + "my_default_monitor_name = get_last_processing_job()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "my_default_monitor_reload = ProcessingJob.from_processing_name(sess, my_default_monitor_name)\n", + "\n", + "response = client.describe_processing_job(\n", + " ProcessingJobName=my_default_monitor_name\n", + ")\n", + "pprint.pprint(response)" ] }, { "cell_type": "markdown", "metadata": 
{}, "source": [ - "#### Set artifact associations" + "### Explore the generated constraints and statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "baseline_job = my_default_monitor.latest_baselining_job\n", + "schema_df = pd.io.json.json_normalize(baseline_job.baseline_statistics().body_dict[\"features\"])\n", + "schema_df.head(10)" ] }, { @@ -872,15 +1218,78 @@ "metadata": {}, "outputs": [], "source": [ - "trial_component = sagemaker_client.describe_trial_component(TrialComponentName=training_job_info['TrainingJobName']+'-aws-training-job')\n", - "trial_component_arn = trial_component['TrialComponentArn']" + "constraints_df = pd.io.json.json_normalize(baseline_job.suggested_constraints().body_dict[\"features\"])\n", + "constraints_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before proceeding to enable monitoring, you could chose to edit the constraint file as required to fine tune the constraints." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Step 3: Enable continous monitoring\n", + "\n", + "##### [back to top](#05-nb)\n", + "\n", + "----\n", + "\n", + "We have collected the data above, here we proceed to analyze and monitor the data with MonitoringSchedules." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a schedule" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Store artifacts" + "We are ready to create a model monitoring schedule for the Endpoint created earlier with the baseline resources (constraints and statistics)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.model_monitor import CronExpressionGenerator\n", + "import datetime as datetime\n", + "from time import gmtime, strftime\n", + "\n", + "\n", + "mon_schedule_name = 'music-rec-monitor-schedule-{}'.format(datetime.datetime.now().strftime(\"%Y-%m-%d-%H%M%S\"))\n", + "s3_report_path = f's3://{bucket}/{prefix}/monitor/report'\n", + "\n", + "try:\n", + " my_default_monitor.create_monitoring_schedule(\n", + " monitor_schedule_name=mon_schedule_name,\n", + " endpoint_input=endpoint_name,\n", + " output_s3_uri=s3_report_path,\n", + " statistics=my_default_monitor.baseline_statistics(),\n", + " constraints=my_default_monitor.suggested_constraints(),\n", + " schedule_cron_expression=CronExpressionGenerator.daily(),\n", + " enable_cloudwatch_metrics=True,\n", + " )\n", + " print(f\"Created monitoring schedule {mon_schedule_name}\")\n", + "except:\n", + " my_default_monitor.update_monitoring_schedule(\n", + " endpoint_input=endpoint_name,\n", + " schedule_cron_expression=CronExpressionGenerator.daily(),\n", + " enable_cloudwatch_metrics=True,\n", + " )\n", + " print(f\"Updated monitoring schedule {my_default_monitor.monitoring_schedule_name}\")" ] }, { @@ -889,22 +1298,22 @@ "metadata": {}, "outputs": [], "source": [ - "artifact_list = data_artifact_list + [model_artifact]\n", + "import time\n", "\n", - "for artif in artifact_list:\n", - " if artif.artifact_type == 'Dataset':\n", - " assoc = 'ContributedTo'\n", - " else:\n", - " assoc = 'Produced'\n", - " try:\n", - " association.Association.create(\n", - " source_arn=artif.artifact_arn,\n", - " destination_arn=trial_component_arn,\n", - " association_type=assoc,\n", - " sagemaker_session=sagemaker_session)\n", - " print(f\"Association with 
{artif.artifact_type}: SUCCESSFUL\")\n", - " except:\n", - " print(f\"Association already exists with {artif.artifact_type}\")" + "desc_schedule_result = my_default_monitor.describe_schedule()\n", + "while desc_schedule_result['MonitoringScheduleStatus'] != 'Scheduled':\n", + " print('Schedule status: {}'.format(desc_schedule_result['MonitoringScheduleStatus']))\n", + " desc_schedule_result = my_default_monitor.describe_schedule()\n", + " time.sleep(30)\n", + "print('Schedule status: {}'.format(desc_schedule_result['MonitoringScheduleStatus']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### All set\n", + "Now that your monitoring schedule has been created. Please return to the Amazon SageMaker Studio to list the executions for this Schedule and observe the results going forward." ] }, { @@ -928,9 +1337,6 @@ "source": [ "mpg_name = prefix+'-notebooks'\n", "\n", - "ps.add({'mpg_name':mpg_name}, namespace='music-rec')\n", - "\n", - "\n", "model_packages = sagemaker_client.list_model_packages(ModelPackageGroupName=mpg_name)['ModelPackageSummaryList']\n", "\n", "if model_packages:\n", @@ -1005,10 +1411,32 @@ " image_uri=training_job_info['AlgorithmSpecification']['TrainingImage']\n", ")\n", "\n", - "print(f\"Created new model: {model_name}\")\n", - "\n", - "ps.add({'model_name':model_name}, namespace='music-rec')\n", - "ps.store()" + "print(f\"Created new model: {model_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean Up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import demo_helpers # our custom set of functions\n", + "demo_helpers.delete_project_resources(\n", + " sagemaker_boto_client=sagemaker_client, \n", + " sagemaker_session=sagemaker_session,\n", + " endpoint_names=[endpoint_name],\n", + " mpg_name=mpg_name,\n", + " prefix=prefix,\n", + " delete_s3_objects=True,\n", + " bucket_name=bucket\n", + ")" ] }, { @@ -1022,9 +1450,9 @@ "metadata": { "instance_type": "ml.m5.4xlarge", "kernelspec": { - "display_name": "Python 3 (Data Science)", + "display_name": "conda_python3", "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -1036,7 +1464,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.10" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/end_to_end/music_recommendation/06_pipeline.ipynb b/end_to_end/music_recommendation/03b_pipeline.ipynb similarity index 95% rename from end_to_end/music_recommendation/06_pipeline.ipynb rename to end_to_end/music_recommendation/03b_pipeline.ipynb index 522b8e7b81..696757b051 100644 --- a/end_to_end/music_recommendation/06_pipeline.ipynb +++ b/end_to_end/music_recommendation/03b_pipeline.ipynb @@ -102,21 +102,7 @@ "source": [ "import sys\n", "import pprint\n", - "sys.path.insert(1, './code')\n", - "from parameter_store import ParameterStore\n", - "\n", - "ps = ParameterStore(verbose=False)\n", - "\n", - "parameters = ps.read('music-rec')\n", - "\n", - "bucket = parameters['bucket']\n", - "prefix = parameters['prefix']\n", - "ratings_data_source = parameters['ratings_data_source']\n", - "tracks_data_source = parameters['tracks_data_source']\n", - "val_data_uri = f\"s3://{bucket}/{prefix}/data/val/val_data.csv\"\n", - "endpoint_name = parameters['endpoint_name']\n", - "mpg_name = parameters['mpg_name']\n", - 
"dw_ecrlist = parameters['dw_ecrlist']" + "sys.path.insert(1, './code')" ] }, { @@ -319,10 +305,16 @@ "outputs": [], "source": [ "# Define feature group names we previously created in notebooks 02a-c\n", - "fg_name_tracks = parameters['fg_name_tracks']\n", - "fg_name_ratings = parameters['fg_name_ratings']\n", - "fg_name_user_preferences = parameters['fg_name_user_preferences']\n", - "dw_ecrlist = parameters['dw_ecrlist']" + "fg_name_tracks = 'track-features-music-rec'\n", + "fg_name_ratings = 'ratings-features-music-rec'\n", + "fg_name_user_preferences = 'user-5star-track-features-music-rec'\n", + "dw_ecrlist = {\n", + " 'region':{'us-west-2':'174368400705',\n", + " 'us-east-2':'415577184552',\n", + " 'us-west-1':'926135532090',\n", + " 'us-east-1':'663277389841'\n", + " }\n", + "}" ] }, { @@ -750,14 +742,7 @@ "source": [ "pipeline_name = f'MusicRecommendationPipeline'\n", "dataprep_pipeline_name = f'MusicRecommendationDataPrepPipeline'\n", - "train_deploy_pipeline_name = f'MusicRecommendationTrainDeployPipeline'\n", - "\n", - "ps.add({'pipeline_name':pipeline_name, 'dataprep_pipeline_name':dataprep_pipeline_name, \n", - " 'train_deploy_pipeline_name':train_deploy_pipeline_name,\n", - " 'pipeline_endpoint_name':pipeline_endpoint_name}, \n", - " namespace='music-rec'\n", - ")\n", - "ps.store()" + "train_deploy_pipeline_name = f'MusicRecommendationTrainDeployPipeline'" ] }, { @@ -773,9 +758,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_name = f'MusicRecommendationPipeline'\n", - "\n", - "ps.add({'pipeline_name':pipeline_name}, namespace='music-rec')" + "pipeline_name = f'MusicRecommendationPipeline'" ] }, { @@ -988,20 +971,38 @@ "After completion we can use Sagemaker Studio's **Components and Registries** tab to see our Pipeline graph and any further error or log messages." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean Up" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "demo_helpers.delete_project_resources(\n", + " sagemaker_boto_client=sagemaker_boto_client, \n", + " sagemaker_session=sagemaker_session,\n", + " endpoint_names=[pipeline_endpoint_name],\n", + " pipeline_names=[pipeline_name, dataprep_pipeline_name, train_deploy_pipeline_name], \n", + " mpg_name=mpg_name,\n", + " prefix=prefix,\n", + " delete_s3_objects=True,\n", + " bucket_name=bucket\n", + ")" + ] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3 (Data Science)", + "display_name": "conda_python3", "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -1013,7 +1014,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.10" + "version": "3.6.13" } }, "nbformat": 4, From 77b9e7ae4fd88d42fbc335068c4a444b6c14077b Mon Sep 17 00:00:00 2001 From: atqy Date: Wed, 4 May 2022 15:35:19 +0000 Subject: [PATCH 02/25] another refactor --- .../01_data_exploration.ipynb | 138 ++------------ .../02_export_feature_groups.ipynb | 174 +++++------------- ...y_debugger_explain_monitor_registry.ipynb} | 121 ++++++++++-- .../music_recommendation/code/demo_helpers.py | 65 ++++++- ...peline.ipynb => end_to_end_pipeline.ipynb} | 88 +++------ 5 files changed, 264 insertions(+), 322 deletions(-) rename end_to_end/music_recommendation/{03a_train_deploy_debugger_explain_monitor_registry.ipynb => 03_train_deploy_debugger_explain_monitor_registry.ipynb} (99%) rename end_to_end/music_recommendation/{03b_pipeline.ipynb => end_to_end_pipeline.ipynb} (90%) diff --git a/end_to_end/music_recommendation/01_data_exploration.ipynb b/end_to_end/music_recommendation/01_data_exploration.ipynb index d2ba68193d..4257f40f0e 100644 --- a/end_to_end/music_recommendation/01_data_exploration.ipynb +++ b/end_to_end/music_recommendation/01_data_exploration.ipynb @@ -6,50 +6,21 @@ "source": [ "\n", "\n", - "# Architect and Build a Music Recommender System across the Entire ML-Lifecycle with Amazon SageMaker\n", + "# Music Recommender Data Exploration\n", "\n", - "## Overview\n", + "## Background\n", "\n", - "----\n", - "\n", - "Welcome of the Music Recommender use-case with Amazon SageMaker. In this series of notebooks we will go through the ML Lifecycle and show how we can build a Music Recommender System using a combination of SageMaker Services and features. IN each phase, we will have relevant notebooks that show you how easy it is to implement that phase of the lifecycle.\n", + "This notebook is part of a notebook series that goes through the ML Lifecycle and show how we can build a Music Recommender System using a combination of SageMaker Services and features. In this notebook, we will be focusing on exploring the data. It is the first notebook in a series of notebooks. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case implement of this sequence of notebooks. \n", "\n", + "1. [Music Recommender Data Exploration](01_data_exploration.ipynb) (current notebook)\n", + "1. 
[Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_data_exploration.ipynb)\n", + "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb)\n", "\n", - "----\n", "\n", "### Contents\n", - "\n", - "- [Overview](00_overview_arch_data.ipynb)\n", - " - [Architecture](#arch-overview)\n", - " - [Get the Data](#get-the-data)\n", - " - [Update the data sources](#update-data-sources)\n", - " - [Explore the Data](#explore-data)\n", - "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n", - "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n", - "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n", - "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n", - "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n", - "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_infer_explain.ipynb)\n", - "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n", - "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n", - "- [Part 7: Resource Cleanup](07_clean_up.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Architecture\n", - "\n", - "Let's look at the overall solution architecure for this use case. We will start by doing each of these tasks within the exploratoyr phase of the ML Lifecycle, then when we are done with Experimentation and Trials, we can develop an automated pipeline such as the one depicted here to prepare data, deposit in feature store, train and tune the model, deposit it in the registry, then deploy it to a SageMaker hosted endpoint, and run Monitoring on it.\n", - "\n", - "##### [back to top](#00-nb)\n", - "\n", - "----\n", - "\n", - "![Solution Architecure](./images/music-rec.png)" + "1. [Prereqs: Get Data](#Prereqs:-Get-Data)\n", + "1. [Update the Data Source in the .flow File](#Update-the-Data-Source-in-the-.flow-File)\n", + "1. 
[Explore the Data](#Explore-the-Data)\n" ] }, { @@ -99,7 +70,6 @@ "import sagemaker \n", "import boto3\n", "import os\n", - "from awscli.customizations.s3.utils import split_s3_bucket_key\n", "\n", "# Sagemaker session\n", "sess = sagemaker.Session()\n", @@ -114,84 +84,23 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "def get_data(public_s3_data, to_bucket, sample_data=1):\n", - " new_paths = []\n", - " for f in public_s3_data:\n", - " bucket_name, key_name = split_s3_bucket_key(f)\n", - " filename = f.split('/')[-1]\n", - " new_path = \"s3://{}/{}/{}\".format(to_bucket, prefix, filename)\n", - " new_paths.append(new_path)\n", - " \n", - " # only download if not already downloaded\n", - " if not os.path.exists('./data/{}'.format(filename)):\n", - " # download s3 data\n", - " print(\"Downloading file from {}\".format(f))\n", - " s3_client.download_file(bucket_name, key_name, './data/{}'.format(filename))\n", - " \n", - " # subsample the data to create a smaller datatset for this demo\n", - " new_df = pd.read_csv('./data/{}'.format(filename))\n", - " new_df = new_df.sample(frac=sample_data)\n", - " new_df.to_csv('./data/{}'.format(filename), index=False)\n", - " \n", - " # upload s3 data to our default s3 bucket for SageMaker Studio\n", - " print(\"Uploading {} to {}\\n\".format(filename, new_path))\n", - " s3_client.upload_file('./data/{}'.format(filename), to_bucket, os.path.join(prefix,filename))\n", - " \n", - " return new_paths\n", - "\n", + "## Prereqs: Get Data \n", "\n", - "def get_model(model_path, to_bucket):\n", - " # upload model to our default s3 bucket for SageMaker Studio\n", - " filename = model_path.split('/')[-1]\n", - " print(\"Uploading {} to {}\\n\".format(model_path, os.path.join(to_bucket,prefix,filename)))\n", - " s3_client.upload_file(model_path, to_bucket, os.path.join(prefix,filename))\n", - " return \"s://{}\".format(os.path.join(to_bucket,prefix,filename))\n", - " \n", + "----\n", "\n", - "def update_data_sources(flow_path, tracks_data_source, ratings_data_source):\n", - " with open(flow_path) as flowf:\n", - " flow = json.load(flowf)\n", - " \n", - " for node in flow['nodes']:\n", - " # if the key exists for our s3 endpoint\n", - " try:\n", - " if node['parameters']['dataset_definition']['name'] == 'tracks.csv':\n", - " # reset the s3 data source for tracks data\n", - " old_source = node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri']\n", - " print(\"Changed {} to {}\".format(old_source, tracks_data_source))\n", - " node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] = tracks_data_source\n", - " elif node['parameters']['dataset_definition']['name'] == 'ratings.csv':\n", - " # reset the s3 data source for ratings data\n", - " old_source = node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri']\n", - " print(\"Changed {} to {}\".format(old_source, ratings_data_source))\n", - " node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] = ratings_data_source\n", - " except:\n", - " continue\n", - " # write out the updated json flow file\n", - " with open(flow_path, 'w') as outfile:\n", - " json.dump(flow, outfile)\n", - " \n", - " return flow" + "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. 
" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "\n", - "\n", - "## Prereqs: Get Data \n", - "\n", - "##### [back to top](#00-nb)\n", - "\n", - "----\n", - "\n", - "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. " + "from demo_helpers import get_data, get_model, update_data_sources" ] }, { @@ -245,13 +154,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "## Update the data source in the `.flow` file\n", - "\n", - "##### [back to top](#00-nb)\n", + "## Update the Data Source in the .flow File\n", "\n", "----\n", + "\n", "The `01_music_datapred.flow` file is a JSON file containing instructions for where to find your data sources and how to transform the data. We'll be updating the object telling Data Wrangler where to find the input data on S3. We will set this to your default S3 bucket. With this update to the `.flow` file it now points to your new S3 bucket as the data source used by SageMaker Data Wrangler.\n", "\n", "Make sure the `.flow` file is closed before running this next step or it won't update the new s3 file locations in the file" @@ -272,14 +178,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Explore the Data\n", "\n", - "\n", - "##### [back to top](#00-nb)\n", - "\n", - "\n", "----" ] }, diff --git a/end_to_end/music_recommendation/02_export_feature_groups.ipynb b/end_to_end/music_recommendation/02_export_feature_groups.ipynb index 99c75046d3..6921795ab8 100644 --- a/end_to_end/music_recommendation/02_export_feature_groups.ipynb +++ b/end_to_end/music_recommendation/02_export_feature_groups.ipynb @@ -6,35 +6,29 @@ "source": [ "\n", "\n", - "# Music Recommender Part 2a: Feature Store Creation - Tracks\n", + "# Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler\n", "\n", "----\n", "\n", - "This notebook creates a feature group for our tracks data to place in our feature store using the transformation instructions found in our `.flow` file. [Amazon SageMaker Feature Store](https://www.youtube.com/watch?v=pEg5c6d4etI) is a fully managed, purpose-built repository to store, update, retrieve, and share machine learning (ML) features.\n", - "\n", - "Features are the attributes or properties models use during training and inference to make predictions. For example, in a ML application that recommends a music playlist, features could include song ratings, which songs were listened to previously, and how long songs were listened to. The accuracy of a ML model is based on a precise set and composition of features. Often, these features are used repeatedly by multiple teams training multiple models. And whichever feature set was used to train the model needs to be available to make real-time predictions (inference). Keeping a single source of features that is consistent and up-to-date across these different access patterns is a challenge as most organizations keep two different feature stores, one for training and one for inference.\n", - "\n", - "Amazon SageMaker Feature Store is a purpose-built repository where you can store and access features so it’s much easier to name, organize, and reuse them across teams. 
SageMaker Feature Store provides a unified store for features during training and real-time inference without the need to write additional code or create manual processes to keep features consistent. SageMaker Feature Store keeps track of the metadata of stored features (e.g. feature name or version number) so that you can query the features for the right attributes in batches or in real time using Amazon Athena, an interactive query service. SageMaker Feature Store also keeps features updated, because as new data is generated during inference, the single repository is updated so new features are always available for models to use during training and inference.\n", + "## Background\n", "\n", + "This notebook is part of a notebook series that goes through the ML Lifecycle and show how we can build a Music Recommender System using a combination of SageMaker Services and features. This notebook uses Amazon SageMaker Feature Store (Feature Store) to create a feature group, \n", + "executes your Data Wrangler Flow `01_music_dataprep.flow` on the entire dataset using a SageMaker \n", + "Processing Job and ingest processed data to Feature Store. It is the second notebook in the series. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case implement of this sequence of notebooks. \n", "\n", + "1. [Music Recommender Data Exploration](01_data_exploration.ipynb)\n", + "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_data_exploration.ipynb) (current notebook)\n", + "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb)\n", + " \n", "----\n", "### Contents\n", - "- [Overview](00_overview_arch_data.ipynb)\n", - "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n", - "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n", - " - [Define Feature Group](#02a-define-fg)\n", - " - [Configure Feature Group](#02a-config-fg)\n", - " - [Initialize & Create Feature Group](#02a-init-create-fg)\n", - " - [Inputs and Outputs](#02a-input-output)\n", - " - [Upload flow file](#02a-upload-flow)\n", - " - [Run Processing Job](#02a-run-job)\n", - "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n", - "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n", - "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n", - "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_infer_explain.ipynb)\n", - "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n", - "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n", - "- [Part 7: Resource Cleanup](07_clean_up.ipynb)\n", + "1. [Define Feature Group](#Define-Feature-Group)\n", + "1. [Configure Feature Group](#Configure-Feature-Group)\n", + "1. [Initialize & Create Feature Group](#Initialize-&-Create-Feature-Group)\n", + "1. [Inputs and Outputs](#Inputs-and-Outputs)\n", + "1. [Upload Flow to S3](#Upload-Flow-to-S3)\n", + "1. [Run Processing Job](#Run-Processing-Job)\n", + "\n", "\n", "
💡 Quick Start \n", "To save your processed data to feature store, \n", @@ -42,9 +36,7 @@ "\n", "
\n", "\n", - "This notebook uses Amazon SageMaker Feature Store (Feature Store) to create a feature group, \n", - "executes your Data Wrangler Flow `01_music_dataprep.flow` on the entire dataset using a SageMaker \n", - "Processing Job and ingest processed data to Feature Store. \n" + "\n" ] }, { @@ -55,11 +47,7 @@ "source": [ "import sys\n", "import pprint\n", - "sys.path.insert(1, './code')\n", - "# from parameter_store import ParameterStore\n", - "\n", - "# ps = ParameterStore()\n", - "# ps.create(namespace='music-rec')" + "sys.path.insert(1, './code')" ] }, { @@ -135,64 +123,17 @@ "metadata": {}, "outputs": [], "source": [ - "def get_data(public_s3_data, to_bucket, sample_data=1):\n", - " new_paths = []\n", - " for f in public_s3_data:\n", - " bucket_name, key_name = split_s3_bucket_key(f)\n", - " filename = f.split('/')[-1]\n", - " new_path = \"s3://{}/{}/{}\".format(to_bucket, prefix, filename)\n", - " new_paths.append(new_path)\n", - " \n", - " # only download if not already downloaded\n", - " if not os.path.exists('./data/{}'.format(filename)):\n", - " # download s3 data\n", - " print(\"Downloading file from {}\".format(f))\n", - " s3_client.download_file(bucket_name, key_name, './data/{}'.format(filename))\n", - " \n", - " # subsample the data to create a smaller datatset for this demo\n", - " new_df = pd.read_csv('./data/{}'.format(filename))\n", - " new_df = new_df.sample(frac=sample_data)\n", - " new_df.to_csv('./data/{}'.format(filename), index=False)\n", - " \n", - " # upload s3 data to our default s3 bucket for SageMaker Studio\n", - " print(\"Uploading {} to {}\\n\".format(filename, new_path))\n", - " s3_client.upload_file('./data/{}'.format(filename), to_bucket, os.path.join(prefix,filename))\n", - " \n", - " return new_paths\n", - "\n", - "\n", - "def get_model(model_path, to_bucket):\n", - " # upload model to our default s3 bucket for SageMaker Studio\n", - " filename = model_path.split('/')[-1]\n", - " print(\"Uploading {} to {}\\n\".format(model_path, os.path.join(to_bucket,prefix,filename)))\n", - " s3_client.upload_file(model_path, to_bucket, os.path.join(prefix,filename))\n", - " return \"s://{}\".format(os.path.join(to_bucket,prefix,filename))\n", - " \n", - "\n", - "def update_data_sources(flow_path, tracks_data_source, ratings_data_source):\n", - " with open(flow_path) as flowf:\n", - " flow = json.load(flowf)\n", - " \n", - " for node in flow['nodes']:\n", - " # if the key exists for our s3 endpoint\n", - " try:\n", - " if node['parameters']['dataset_definition']['name'] == 'tracks.csv':\n", - " # reset the s3 data source for tracks data\n", - " old_source = node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri']\n", - " print(\"Changed {} to {}\".format(old_source, tracks_data_source))\n", - " node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] = tracks_data_source\n", - " elif node['parameters']['dataset_definition']['name'] == 'ratings.csv':\n", - " # reset the s3 data source for ratings data\n", - " old_source = node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri']\n", - " print(\"Changed {} to {}\".format(old_source, ratings_data_source))\n", - " node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] = ratings_data_source\n", - " except:\n", - " continue\n", - " # write out the updated json flow file\n", - " with open(flow_path, 'w') as outfile:\n", - " json.dump(flow, outfile)\n", - " \n", - " return flow" + "from demo_helpers import get_data, get_model, update_data_sources" + ] + }, + 
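
For orientation, the core Feature Store calls that the rest of this notebook drives through a Data Wrangler processing job look roughly like the sketch below. It assumes a prepared dataframe (here called `df`) with a record identifier and an event-time column; the names `df` and `'eventtime'` are placeholders rather than notebook variables.

```python
# Rough sketch of the Feature Store API exercised later in this notebook.
from sagemaker.feature_store.feature_group import FeatureGroup

fg = FeatureGroup(name='track-features-music-rec', sagemaker_session=sess)
fg.load_feature_definitions(data_frame=df)               # infer feature names and types
fg.create(
    s3_uri=f's3://{bucket}/{prefix}/feature-store',       # offline store location
    record_identifier_name='trackid',                     # assumed identifier column
    event_time_feature_name='eventtime',                  # assumed event-time column
    role_arn=sagemaker.get_execution_role(),
    enable_online_store=True,
)
fg.ingest(data_frame=df, max_workers=3, wait=True)        # write rows into the group
```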
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create data folder\n", + "!mkdir data" ] }, { @@ -211,7 +152,7 @@ "metadata": {}, "outputs": [], "source": [ - "new_data_paths = get_data([f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, sample_data=0.70)\n", + "new_data_paths = get_data(s3_client, [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, prefix, sample_data=0.70)\n", "print(new_data_paths)" ] }, @@ -223,28 +164,34 @@ "source": [ "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n", "tracks_data_source = f's3://{bucket}/{prefix}/tracks.csv'\n", - "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'\n", - "\n", - "# ps.add({'tracks_data_source': tracks_data_source, 'ratings_data_source': ratings_data_source}, namespace='music-rec')" + "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Upload pretrained model" + "\n", + "\n", + "## Update the data source in the `.flow` file\n", + "\n", + "##### [back to top](#00-nb)\n", + "\n", + "----\n", + "The `01_music_datapred.flow` file is a JSON file containing instructions for where to find your data sources and how to transform the data. We'll be updating the object telling Data Wrangler where to find the input data on S3. We will set this to your default S3 bucket. With this update to the `.flow` file it now points to your new S3 bucket as the data source used by SageMaker Data Wrangler.\n", + "\n", + "Make sure the `.flow` file is closed before running this next step or it won't update the new s3 file locations in the file" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "pretrained_model_path = get_model('./model/model.tar.gz', bucket)\n", - "\n", - "# ps.add({'pretrained_model_path': pretrained_model_path}, namespace='music-rec')\n", - "# ps.store()" + "update_data_sources('01_music_dataprep.flow', tracks_data_source, ratings_data_source)" ] }, { @@ -253,6 +200,10 @@ "source": [ "## Create Feature Group\n", "\n", + "[Amazon SageMaker Feature Store](https://www.youtube.com/watch?v=pEg5c6d4etI) is a fully managed, purpose-built repository to store, update, retrieve, and share machine learning (ML) features. Features are the attributes or properties models use during training and inference to make predictions. For example, in a ML application that recommends a music playlist, features could include song ratings, which songs were listened to previously, and how long songs were listened to. The accuracy of a ML model is based on a precise set and composition of features. Often, these features are used repeatedly by multiple teams training multiple models. And whichever feature set was used to train the model needs to be available to make real-time predictions (inference). Keeping a single source of features that is consistent and up-to-date across these different access patterns is a challenge as most organizations keep two different feature stores, one for training and one for inference.\n", + "\n", + "Amazon SageMaker Feature Store is a purpose-built repository where you can store and access features so it’s much easier to name, organize, and reuse them across teams. 
SageMaker Feature Store provides a unified store for features during training and real-time inference without the need to write additional code or create manual processes to keep features consistent. SageMaker Feature Store keeps track of the metadata of stored features (e.g. feature name or version number) so that you can query the features for the right attributes in batches or in real time using Amazon Athena, an interactive query service. SageMaker Feature Store also keeps features updated, because as new data is generated during inference, the single repository is updated so new features are always available for models to use during training and inference.\n", + "\n", "_What is a feature group_\n", "\n", "A single feature corresponds to a column in your dataset. A feature group is a predefined schema for a \n", @@ -266,12 +217,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "### Define Feature Group \n", - "\n", - "##### [back to top](#02a-nb)\n", - "\n", "----\n", "Select Record identifier and Event time feature name. These are required parameters for feature group\n", "creation.\n", @@ -594,12 +540,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Configure Feature Group\n", "\n", - "##### [back to top](#02a-nb)\n", - "\n", "----\n", "
💡 Configurable Settings \n", "\n", @@ -655,12 +597,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "### Initialize & Create Feature Group\n", "\n", - "##### [back to top](#02a-nb)\n", - "\n", "----" ] }, @@ -770,12 +708,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Inputs and Outputs\n", "\n", - "##### [back to top](#02a-nb)\n", - "\n", "----\n", "\n", "The below settings configure the inputs and outputs for the flow export.\n", @@ -890,12 +824,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Upload Flow to S3\n", "\n", - "##### [back to top](#02a-nb)\n", - "\n", "----\n", "To use the Data Wrangler as an input to the processing job, first upload your flow file to Amazon S3." ] @@ -956,11 +886,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "## Run Processing Job \n", - "\n", - "##### [back to top](#02a-nb)\n", + "## Run Processing Job\n", "\n", "----\n", "## Job Configurations\n", diff --git a/end_to_end/music_recommendation/03a_train_deploy_debugger_explain_monitor_registry.ipynb b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb similarity index 99% rename from end_to_end/music_recommendation/03a_train_deploy_debugger_explain_monitor_registry.ipynb rename to end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb index e32bccf6b6..d7390e33f3 100644 --- a/end_to_end/music_recommendation/03a_train_deploy_debugger_explain_monitor_registry.ipynb +++ b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb @@ -4,16 +4,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "# Music Recommender Part 3: Train Model with Debugger Hooks and Set Artifacts and Register Model\n", + "# Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK\n", "\n", "----\n", - "In this notebook, we'll train our model using the data we prepped with SageMaker Data Wrangler and stored in our Feature Store, attaching [SageMaker Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html) to the model training so that we can capture training metrics/statistics about the model. Then, we'll log more model artifacts using [SageMaker ML Lineage Tracking](https://docs.aws.amazon.com/sagemaker/latest/dg/lineage-tracking.html). Finally we'll register the model and save its version.\n", "\n", - "A machine learning training job can have problems such as system bottlenecks, overfitting, saturated activation functions, and vanishing gradients, which can compromise model performance. SageMaker Debugger profiles and debugs training jobs to help resolve such problems and improve your ML model's compute resource utilization and performance. Debugger offers tools to send alerts when training anomalies are found, take actions against the problems, and identify the root cause of them by visualizing collected metrics and tensors.\n", + "This notebook is part of a notebook series that goes through the ML Lifecycle and show how we can build a Music Recommender System using a combination of SageMaker Services and features. This notebook will train our model using the data we prepped with SageMaker Data Wrangler and stored in our Feature Store, attaching [SageMaker Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html) to the model training so that we can capture training metrics/statistics about the model. 
Then, we will deploy the model and use SageMaker Explainability and Model Monitor to examine our deployed model. After that, we'll log more model artifacts using [SageMaker ML Lineage Tracking](https://docs.aws.amazon.com/sagemaker/latest/dg/lineage-tracking.html). Finally we'll register the model and save its version. It is one of two notebooks you choose to run as the third notebook in the series. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case of this sequence of notebooks.\n", + "\n", + "1. [Music Recommender Data Exploration](01_data_exploration.ipynb)\n", + "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_data_exploration.ipynb) (current notebook)\n", + "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03a_train_deploy_debugger_explain_monitor_registry.ipynb)\n", "\n", - "Amazon SageMaker ML Lineage Tracking creates and stores information about the steps of a machine learning workflow from data preparation to model deployment. With the tracking information you can reproduce the workflow steps, track model and dataset lineage, and establish model governance and audit standards. \n", "\n", "
💡 Alert \n", "\n", @@ -40,14 +40,6 @@ "- [Part 7: Resource Cleanup](07_clean_up.ipynb)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load stored variables\n", - "If you ran this notebook before, you may want to re-use the resources you aready created with AWS. Run the cell below to load any prevously created variables. You should see a print-out of the existing variables. If you don't see anything you may need to create them again or it may be your first time running this notebook." - ] - }, { "cell_type": "code", "execution_count": null, @@ -135,6 +127,101 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Prereqs: Get Data \n", + "\n", + "##### [back to top](#00-nb)\n", + "\n", + "----\n", + "\n", + "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from demo_helpers import get_data, get_model, update_data_sources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create data folder\n", + "!mkdir data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# public S3 bucket that contains our music data\n", + "s3_bucket_music_data = \"s3://sagemaker-sample-files/datasets/tabular/synthetic-music\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_data_paths = get_data(s3_client, [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, prefix, sample_data=0.70)\n", + "print(new_data_paths)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files_to_download = [\n", + " f\"sample_tracks.csv\",\n", + " f\"sample_user.csv\",\n", + " f\"train_data_headers.csv\",\n", + " f\"train_data.zip\",\n", + " f\"val_data_headers.csv\",\n", + " f\"val_data.zip\",\n", + " \n", + "]\n", + "\n", + "for file in files_to_download:\n", + " s3_client.download_file(f\"sagemaker-sample-files\", f\"datasets/tabular/synthetic-music/{file}\", f\"./data/{file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! unzip './data/*.zip' -d './data'\n", + "! rm ./data/*.zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n", + "tracks_data_source = f's3://{bucket}/{prefix}/tracks.csv'\n", + "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -726,9 +813,9 @@ "\n", "## View SageMaker Debugger Reports\n", "\n", - "##### [back to top](#03-nb)\n", + "----\n", "\n", - "----" + "A machine learning training job can have problems such as system bottlenecks, overfitting, saturated activation functions, and vanishing gradients, which can compromise model performance. SageMaker Debugger profiles and debugs training jobs to help resolve such problems and improve your ML model's compute resource utilization and performance. 
Debugger offers tools to send alerts when training anomalies are found, take actions against the problems, and identify the root cause of them by visualizing collected metrics and tensors.. \n" ] }, { @@ -1324,7 +1411,7 @@ "\n", "## Model Registry\n", "\n", - "##### [back to top](#03-nb)\n", + "Amazon SageMaker ML Lineage Tracking creates and stores information about the steps of a machine learning workflow from data preparation to model deployment. With the tracking information you can reproduce the workflow steps, track model and dataset lineage, and establish model governance and audit standards\n", "\n", "----" ] diff --git a/end_to_end/music_recommendation/code/demo_helpers.py b/end_to_end/music_recommendation/code/demo_helpers.py index a68beda069..4a75022e99 100644 --- a/end_to_end/music_recommendation/code/demo_helpers.py +++ b/end_to_end/music_recommendation/code/demo_helpers.py @@ -1,9 +1,72 @@ +import os +import json import boto3 import time +import pandas as pd from sagemaker.lineage.context import Context from sagemaker.lineage.action import Action from sagemaker.lineage.association import Association from sagemaker.lineage.artifact import Artifact +from awscli.customizations.s3.utils import split_s3_bucket_key + +def get_data(s3_client, public_s3_data, to_bucket, to_prefix, sample_data=1): + new_paths = [] + for f in public_s3_data: + bucket_name, key_name = split_s3_bucket_key(f) + filename = f.split('/')[-1] + new_path = "s3://{}/{}/{}".format(to_bucket, to_prefix, filename) + new_paths.append(new_path) + + # only download if not already downloaded + if not os.path.exists('./data/{}'.format(filename)): + # download s3 data + print("Downloading file from {}".format(f)) + s3_client.download_file(bucket_name, key_name, './data/{}'.format(filename)) + + # subsample the data to create a smaller datatset for this demo + new_df = pd.read_csv('./data/{}'.format(filename)) + new_df = new_df.sample(frac=sample_data) + new_df.to_csv('./data/{}'.format(filename), index=False) + + # upload s3 data to our default s3 bucket for SageMaker Studio + print("Uploading {} to {}\n".format(filename, new_path)) + s3_client.upload_file('./data/{}'.format(filename), to_bucket, os.path.join(to_prefix,filename)) + + return new_paths + + +def get_model(model_path, to_bucket): + # upload model to our default s3 bucket for SageMaker Studio + filename = model_path.split('/')[-1] + print("Uploading {} to {}\n".format(model_path, os.path.join(to_bucket,prefix,filename))) + s3_client.upload_file(model_path, to_bucket, os.path.join(prefix,filename)) + return "s://{}".format(os.path.join(to_bucket,prefix,filename)) + + +def update_data_sources(flow_path, tracks_data_source, ratings_data_source): + with open(flow_path) as flowf: + flow = json.load(flowf) + + for node in flow['nodes']: + # if the key exists for our s3 endpoint + try: + if node['parameters']['dataset_definition']['name'] == 'tracks.csv': + # reset the s3 data source for tracks data + old_source = node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] + print("Changed {} to {}".format(old_source, tracks_data_source)) + node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] = tracks_data_source + elif node['parameters']['dataset_definition']['name'] == 'ratings.csv': + # reset the s3 data source for ratings data + old_source = node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] + print("Changed {} to {}".format(old_source, ratings_data_source)) + 
node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] = ratings_data_source + except: + continue + # write out the updated json flow file + with open(flow_path, 'w') as outfile: + json.dump(flow, outfile) + + return flow def delete_project_resources(sagemaker_boto_client, sagemaker_session, endpoint_names=None, pipeline_names=None, mpg_name=None, @@ -53,7 +116,7 @@ def delete_lineage_data(): delete_associations(summary.context_arn) ctx = Context(context_name=summary.context_name, sagemaker_session=sagemaker_session) ctx.delete() - time.sleep(1) + time.sleep(2) for summary in Action.list(): if prefix in summary.source.source_uri: diff --git a/end_to_end/music_recommendation/03b_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb similarity index 90% rename from end_to_end/music_recommendation/03b_pipeline.ipynb rename to end_to_end/music_recommendation/end_to_end_pipeline.ipynb index 696757b051..9961e1a5e8 100644 --- a/end_to_end/music_recommendation/03b_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -4,27 +4,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "# Music Recommender Part 6: SageMaker Pipelines\n", + "# Train, Deploy, and Monitor the Music Recommender Model using SageMaker Pipelines\n", "\n", "----\n", - "In this final notebook, we'll combine all the steps we've gone over in each individual notebook, and condense them down into a [SageMaker Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/pipelines.html) object which will automate the entire modeling process from the beginning of data ingestion to monitoring the model. SageMaker Pipelines is a tool for building machine learning pipelines that take advantage of direct SageMaker integration. Because of this integration, you can create a pipeline and set up SageMaker Projects for orchestration using a tool that handles much of the step creation and management for you.\n", + "## Background\n", + "\n", + "In this notebook, we'll build an end-to-end pipeline to create a music recommender using [SageMaker Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/pipelines.html), which will automate the entire modeling process from the beginning of data ingestion to monitoring the model. SageMaker Pipelines is a tool for building machine learning pipelines that take advantage of direct SageMaker integration. Because of this integration, you can create a pipeline and set up SageMaker Projects for orchestration using a tool that handles much of the step creation and management for you.\n", "\n", "----\n", "### Contents\n", - "- [Overview](00_overview_arch_data.ipynb)\n", - "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n", - "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n", - "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n", - "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n", - "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n", - "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_inference_explainability.ipynb)\n", - "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n", - "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n", - " - [Architecture](#06-arch)\n", - " - [Pipelines Overview](#pipelines)\n", - "- [Part 7: Resource Cleanup](07_clean_up.ipynb)" + "1. 
[Architecture: Create a SageMaker Pipeline to Automate All the Steps from Data Prep to Model Deployment](#Architecture:-Create-a-SageMaker-Pipeline-to-Automate-All-the-Steps-from-Data-Prep-to-Model-Deployment)\n", + "1. [SageMaker Pipeline Overview](#SageMaker-Pipeline-Overview)\n", + "1. [Clean Up](#Clean-Up)" ] }, { @@ -151,12 +142,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - " \n", - "\n", "## Architecture: Create a SageMaker Pipeline to Automate All the Steps from Data Prep to Model Deployment\n", "\n", - "##### [back to top](#06-nb)\n", - "\n", "----\n", "\n", "![arch diagram](./images/music-rec.png)" @@ -166,22 +153,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## SageMaker Pipeline Overview\n", "\n", - "##### [back to top](#06-nb)\n", - "\n", "---- \n", "\n", - "#### [Step 1: Data Wrangler Preprocessing Step](#data-wrangler)\n", - "#### [Step 2: Dataset and train test split](#dataset-train-test)\n", - "#### [Step 3: Train XGboost Model](#pipe-train-xgb)\n", - "#### [Step 4: Model Pre-deployment](#pipe-pre-deploy)\n", - "#### [Step 5: Register Model](#pipe-Register-Model)\n", - "#### [Step 6: Deploy Model](#deploy)\n", - "#### [Step 7: Monitor Model](#monitor)\n", - "#### [Combine Steps and Run Pipeline](#combine)" + "### List of Steps\n", + "\n", + "1. [Step 1: Data Wrangler Preprocessing Step](#Step-1:-Data-Wrangler-Preprocessing-Step)\n", + "1. [Step 2: Create Dataset and Train/Test Split](#Step-2:-Create-Dataset-and-Train/Test-Split)\n", + "1. [Step 3: Train XGBoost Model](#Step-3:-Train-XGBoost-Model)\n", + "1. [Step 4: Model Pre-Deployment Step](#Step-4:-Model-Pre-Deployment-Step)\n", + "1. [Step 5: Register Model](#Step-5:-Register-Model)\n", + "1. [Step 6: Deploy Model](#Step-6:-Deploy-Model)\n", + "1. [Step 7: Monitor Model Deployed to SageMaker Hosted Endpoint](#Step-7:-Monitor-Model-Deployed-to-SageMaker-Hosted-Endpoint)\n", + "1. [Combine Steps and Run Pipeline](#Combine-Steps-and-Run-Pipeline)" ] }, { @@ -221,9 +206,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "### Step 1: Data Wranger Preprocessing Step\n", - "[Pipeline Overview](#pipelines)\n", + "### Step 1: Data Wrangler Preprocessing Step\n", "\n", "#### Upload flow to S3\n", "This will become an input to the first step and, as such, needs to be in S3." @@ -438,10 +421,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "### Step 2: Create Dataset and Train/Test Split\n", - "\n", - "[Pipeline Overview](#pipelines)" + "### Step 2: Create Dataset and Train/Test Split" ] }, { @@ -485,11 +465,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "### Step 3: Train XGBoost Model\n", - "In this step we use the ParameterString `train_instance_param` defined at the beginning of the pipeline.\n", - "\n", - "[Pipeline Overview](#pipelines)" + "In this step we use the ParameterString `train_instance_param` defined at the beginning of the pipeline." 
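For readers new to pipeline parameters, here is a minimal, hypothetical sketch of how a ParameterString such as `train_instance_param` might be declared and threaded through a pipeline; the parameter name and default value below are illustrative assumptions, not the notebook's actual values.

# Hypothetical sketch: declare a pipeline parameter for the training instance type.
from sagemaker.workflow.parameters import ParameterString

train_instance_param = ParameterString(
    name="TrainingInstance",
    default_value="ml.m5.xlarge",  # assumed default; can be overridden per run
)

# The parameter is then passed wherever an instance type is expected, e.g.
# Estimator(..., instance_type=train_instance_param), and overridden at run time with
# pipeline.start(parameters={"TrainingInstance": "ml.m5.4xlarge"}).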
] }, { @@ -569,10 +546,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "### Step 4: Model Pre-Deployment Step\n", - "\n", - "[Pipeline Overview](#pipelines)" + "### Step 4: Model Pre-Deployment Step" ] }, { @@ -604,7 +578,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "### Step 5: Register Model\n", "In this step you will use the ParameterString `model_approval_status` defined at the outset of the pipeline code.\n", "\n", @@ -635,10 +608,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "### Step 6: Deploy Model\n", - "\n", - "[Pipeline Overview](#pipelines)" + "### Step 6: Deploy Model" ] }, { @@ -676,10 +646,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "### Step 7: Monitor Model Deployed to SageMaker Hosted Endpoint\n", - "\n", - "[Pipeline Overview](#pipelines)" + "### Step 7: Monitor Model Deployed to SageMaker Hosted Endpoint\n" ] }, { @@ -724,10 +691,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "### Combine the Pipeline Steps and Run\n", - "[Pipeline Overview](#pipelines)\n", + "### Combine Steps and Run Pipeline\n", "\n", "Once all of our steps are defined, we can put them together using the SageMaker `Pipeline` object. While we pass the steps in order so that it is easier to read, technically the order that we pass them does not matter since the pipeline DAG will parse it out properly based on any dependencies between steps. If the input of one step is the output of another step, the Pipelines understands which must come first." ] @@ -975,7 +939,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Clean Up" + "## Clean Up\n", + "\n", + "----" ] }, { From 896dee36892bf8bea7590b4e5a9779fed4344cc0 Mon Sep 17 00:00:00 2001 From: atqy Date: Wed, 4 May 2022 16:28:31 +0000 Subject: [PATCH 03/25] delete notebooks --- .../02b_export_fg_5star_features.ipynb | 762 ------------------ .../02c_export_fg_ratings.ipynb | 706 ---------------- .../04_deploy_infer_explain.ipynb | 567 ------------- .../05_model_monitor.ipynb | 494 ------------ .../music_recommendation/07_clean_up.ipynb | 187 ----- 5 files changed, 2716 deletions(-) delete mode 100644 end_to_end/music_recommendation/02b_export_fg_5star_features.ipynb delete mode 100644 end_to_end/music_recommendation/02c_export_fg_ratings.ipynb delete mode 100644 end_to_end/music_recommendation/04_deploy_infer_explain.ipynb delete mode 100644 end_to_end/music_recommendation/05_model_monitor.ipynb delete mode 100644 end_to_end/music_recommendation/07_clean_up.ipynb diff --git a/end_to_end/music_recommendation/02b_export_fg_5star_features.ipynb b/end_to_end/music_recommendation/02b_export_fg_5star_features.ipynb deleted file mode 100644 index 1a1e9ba331..0000000000 --- a/end_to_end/music_recommendation/02b_export_fg_5star_features.ipynb +++ /dev/null @@ -1,762 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "# Music Recommender Part 2b: Feature Store Creation - User Preferences\n", - "\n", - "----\n", - "\n", - "This notebook creates a feature group for our user music preference data to place in our feature store using the transformation instructions found in our `.flow` file.\n", - "\n", - "----\n", - "### Contents\n", - "- [Overview](00_overview_arch_data.ipynb)\n", - "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n", - "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n", - " - [Define Feature 
Group](#02b-define-fg)\n", - " - [Configure Feature Group](#02b-config-fg)\n", - " - [Initialize & Create Feature Group](#02b-init-create-fg)\n", - " - [Inputs and Outputs](#02b-input-output)\n", - " - [Run Processing Job](#02b-run-job)\n", - "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n", - "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n", - "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_infer_explain.ipynb)\n", - "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n", - "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n", - "- [Part 7: Resource Cleanup](07_clean_up.ipynb)\n", - "\n", - "\n", - "
💡 Quick Start \n", - "To save your processed data to feature store, \n", - " Click here to create a feature group and follow the instruction to run a SageMaker processing job.\n", - "\n", - "
\n", - "\n", - "This notebook uses Amazon SageMaker Feature Store (Feature Store) to create a feature group, \n", - "executes your Data Wrangler Flow `01_music_dataprep.flow` on the entire dataset using a SageMaker \n", - "Processing Job and ingest processed data to Feature Store. \n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Feature Group\n", - "\n", - "_What is a feature group_\n", - "\n", - "A single feature corresponds to a column in your dataset. A feature group is a predefined schema for a \n", - "collection of features - each feature in the feature group has a specified data type and name. \n", - "A single record in a feature group corresponds to a row in your dataframe. A feature store is a \n", - "collection of feature groups. To learn more about SageMaker Feature Store, see \n", - "[Amazon Feature Store Documentation](http://docs.aws.amazon.com/sagemaker/latest/dg/feature-store.html)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Define Feature Group \n", - "\n", - "##### [back to top](#02b-nb)\n", - "\n", - "----\n", - "Select Record identifier and Event time feature name. These are required parameters for feature group\n", - "creation.\n", - "* **Record identifier name** is the name of the feature defined in the feature group's feature definitions \n", - "whose value uniquely identifies a Record defined in the feature group's feature definitions.\n", - "* **Event time feature name** is the name of the EventTime feature of a Record in FeatureGroup. An EventTime \n", - "is a timestamp that represents the point in time when a new event occurs that corresponds to the creation or \n", - "update of a Record in the FeatureGroup. All Records in the FeatureGroup must have a corresponding EventTime.\n", - "\n", - "
💡Record identifier and Event time feature name are required \n", - "for feature group. After filling in the values, you can choose Run Selected Cell and All Below \n", - "from the Run Menu from the menu bar. \n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "import pprint\n", - "sys.path.insert(1, './code')\n", - "from parameter_store import ParameterStore\n", - "ps = ParameterStore(verbose=False)\n", - "\n", - "parameters = ps.read('music-rec')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bucket = parameters['bucket']\n", - "dw_ecrlist = parameters['dw_ecrlist']\n", - "fg_name_tracks = parameters['fg_name_tracks']\n", - "flow_export_id = parameters['flow_export_id']\n", - "flow_s3_uri = parameters['flow_s3_uri']\n", - "pretrained_model_path = parameters['pretrained_model_path']\n", - "prefix = parameters['prefix']\n", - "ratings_data_source = parameters['ratings_data_source']\n", - "tracks_data_source = parameters['tracks_data_source']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "record_identifier_feature_name = 'userId'\n", - "if record_identifier_feature_name is None:\n", - " raise SystemExit(\"Select a column name as the feature group record identifier.\")\n", - "\n", - "event_time_feature_name = 'EventTime'\n", - "if event_time_feature_name is None:\n", - " raise SystemExit(\"Select a column name as the event time feature name.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Feature Definitions\n", - "The following is a list of the feature names and feature types of the final dataset that will be produced \n", - "when your data flow is used to process your input dataset. These are automatically generated from the \n", - "step `Custom Pyspark` from `Source: Answers.Csv`. To save from a different step, go to Data Wrangler to \n", - "select a new step to export.\n", - "\n", - "
💡 Configurable Settings \n", - "\n", - "1. You can select a subset of the features. By default all columns of the result dataframe will be used as \n", - "features.\n", - "2. You can change the Data Wrangler data type to one of the Feature Store supported types \n", - "(Integral, Fractional, or String). The default type is set to String. \n", - "This means that, if a column in your dataset is not a float or long type, it will default \n", - "to String in your Feature Store.\n", - "\n", - "For Event Time features, make sure the format follows the feature store\n", - "\n", - " \n", - " Event Time feature format\n", - " \n", - "\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following is a list of the feature names and data types of the final dataset that will be produced when your data flow is used to process your input dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "column_schemas = [\n", - " {\n", - " \"name\": \"userId\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"energy_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"acousticness_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"valence_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"speechiness_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"instrumentalness_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"liveness_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"tempo_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"danceability_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Latin_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Folk_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Blues_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Rap_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Reggae_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Jazz_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_RnB_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Country_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Electronic_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Pop_Rock_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"EventTime\",\n", - " \"type\": \"float\"\n", - " }\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below we create the SDK input for those feature definitions. Some schema types in Data Wrangler are not \n", - "supported by Feature Store. The following will create a default_FG_type set to String for these types." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.feature_store.feature_definition import FeatureDefinition\n", - "from sagemaker.feature_store.feature_definition import FeatureTypeEnum\n", - "\n", - "default_feature_type = FeatureTypeEnum.STRING\n", - "column_to_feature_type_mapping = {\n", - " \"float\": FeatureTypeEnum.FRACTIONAL,\n", - " \"long\": FeatureTypeEnum.INTEGRAL\n", - "}\n", - "\n", - "feature_definitions = [\n", - " FeatureDefinition(\n", - " feature_name=column_schema['name'], \n", - " feature_type=column_to_feature_type_mapping.get(column_schema['type'], default_feature_type)\n", - " ) for column_schema in column_schemas\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Configure Feature Group\n", - "\n", - "##### [back to top](#02b-nb)\n", - "\n", - "----\n", - "\n", - "
💡 Configurable Settings \n", - "\n", - "1. feature_group_name: name of the feature group.\n", - "1. feature_store_offline_s3_uri: SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a S3 location owned by you.\n", - "1. enable_online_store: controls if online store is enabled. Enabling the online store allows quick access to the latest value for a Record via the GetRecord API.\n", - "1. iam_role: IAM role for executing the processing job.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from time import gmtime, strftime\n", - "import sagemaker \n", - "\n", - "# Sagemaker session\n", - "sess = sagemaker.Session()\n", - "\n", - "# IAM role for executing the processing job.\n", - "iam_role = sagemaker.get_execution_role()\n", - "\n", - "# flow name and an unique ID for this export (used later as the processing job name for the export)\n", - "flow_name = \"01_music_dataprep\"\n", - "flow_export_name = f\"flow-{flow_export_id}\"\n", - "\n", - "# feature group name, with flow_name and an unique id. You can give it a customized name\n", - "feature_group_name = 'user-5star-track-features-music-rec'\n", - "print(f\"Feature Group Name: {feature_group_name}\")\n", - "\n", - "# SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a \n", - "# S3 location owned by you.\n", - "feature_store_offline_s3_uri = 's3://' + bucket\n", - "\n", - "# controls if online store is enabled. Enabling the online store allows quick access to \n", - "# the latest value for a Record via the GetRecord API.\n", - "enable_online_store = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fg_name_user_preferences = feature_group_name\n", - "\n", - "ps.add({'fg_name_user_preferences': fg_name_user_preferences}, namespace='music-rec')\n", - "ps.store()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize & Create Feature Group\n", - "\n", - "##### [back to top](#02b-nb)\n", - "\n", - "----" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize Boto3 session that is required to create feature group\n", - "import boto3\n", - "from sagemaker.session import Session\n", - "\n", - "region = boto3.Session().region_name\n", - "boto_session = boto3.Session(region_name=region)\n", - "\n", - "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)\n", - "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n", - "\n", - "feature_store_session = Session(\n", - " boto_session=boto_session,\n", - " sagemaker_client=sagemaker_client,\n", - " sagemaker_featurestore_runtime_client=featurestore_runtime\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Feature group is initialized and created below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.feature_store.feature_group import FeatureGroup\n", - "\n", - "feature_group = FeatureGroup(\n", - " name=feature_group_name, sagemaker_session=feature_store_session, feature_definitions=feature_definitions)\n", - "\n", - "# only create feature group if it doesn't already exist\n", - "try:\n", - " sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name, NextToken='string')\n", - " feature_group_exists=True\n", - " print(\"Feature Group {0} already exists. 
Using {0}\".format(feature_group_name))\n", - "except Exception as e:\n", - " error = e.response.get('Error').get('Code')\n", - " if error == \"ResourceNotFound\":\n", - " feature_group_exists=False\n", - " print(\"Creating Feature Group {}\".format(feature_group_name))\n", - " feature_group.create(\n", - " s3_uri=feature_store_offline_s3_uri,\n", - " record_identifier_name=record_identifier_feature_name,\n", - " event_time_feature_name=event_time_feature_name,\n", - " role_arn=iam_role,\n", - " enable_online_store=enable_online_store\n", - " )\n", - " if error == 'ResourceInUse':\n", - " feature_group_exists=True\n", - " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Invoke the Feature Store API to create the feature group and wait until it is ready" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "def wait_for_feature_group_creation_complete(feature_group):\n", - " \"\"\"Helper function to wait for the completions of creating a feature group\"\"\"\n", - " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", - " while status == \"Creating\":\n", - " print(\"Waiting for Feature Group Creation\")\n", - " time.sleep(5)\n", - " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", - " if status != \"Created\":\n", - " raise SystemExit(f\"Failed to create feature group {feature_group.name}: {status}\")\n", - " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n", - "\n", - "wait_for_feature_group_creation_complete(feature_group=feature_group)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that the feature group is created, You will use a processing job to process your \n", - " data at scale and ingest the transformed data into this feature group." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Inputs and Outputs\n", - "\n", - "##### [back to top](#02b-nb)\n", - "\n", - "----\n", - "The below settings configure the inputs and outputs for the flow export.\n", - "\n", - "
💡 Configurable Settings \n", - "\n", - "In Input - Source you can configure the data sources that will be used as input by Data Wrangler\n", - "\n", - "1. For S3 sources, configure the source attribute that points to the input S3 prefixes\n", - "2. For all other sources, configure attributes like query_string, database in the source's \n", - "DatasetDefinition object.\n", - "\n", - "If you modify the inputs the provided data must have the same schema and format as the data used in the Flow. \n", - "You should also re-execute the cells in this section if you have modified the settings in any data sources.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", - "from sagemaker.dataset_definition.inputs import AthenaDatasetDefinition, DatasetDefinition, RedshiftDatasetDefinition\n", - "\n", - "data_sources = []" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Input - S3 Source: tracks.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_sources.append(ProcessingInput(\n", - " source=f\"{tracks_data_source}\", # You could override this to point to another dataset on S3\n", - " destination=\"/opt/ml/processing/tracks.csv\",\n", - " input_name=\"tracks.csv\",\n", - " s3_data_type=\"S3Prefix\",\n", - " s3_input_mode=\"File\",\n", - " s3_data_distribution_type=\"FullyReplicated\"\n", - "))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Input - S3 Source: ratings.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_sources.append(ProcessingInput(\n", - " source=f\"{ratings_data_source}\", # You could override this to point to another dataset on S3\n", - " destination=\"/opt/ml/processing/ratings.csv\",\n", - " input_name=\"ratings.csv\",\n", - " s3_data_type=\"S3Prefix\",\n", - " s3_input_mode=\"File\",\n", - " s3_data_distribution_type=\"FullyReplicated\"\n", - "))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Output: Feature Store \n", - "Below are the inputs required by the SageMaker Python SDK to launch a processing job with feature store as an output. Notice the `output_name` variable below; this ID is found within the `.flow` file at the node point you want to capture transformations up to. The `.flow` file contains instructions for SageMaker Data Wrangler to know where to look for data and how to transform it. Each data transformation action is associated with a node and therefore a node ID. Using the associated node ID + output name tells SageMaker up to what point in the transformation process you want to export to a feature store." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.processing import FeatureStoreOutput\n", - "\n", - "# Output name is auto-generated from the select node's ID + output name from the flow file.\n", - "output_name = \"7a6dad19-2c80-43e3-b03d-ec23c3842ae9.default\" # joined node\n", - "\n", - "processing_job_output = ProcessingOutput(\n", - " output_name=output_name,\n", - " app_managed=True,\n", - " feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We already uploaded our flow file in the previous notebook. Here the Data Wrangler Flow is also provided to the Processing Job as an input source which we configure below." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Input - Flow: 01_music_dataprep.flow\n", - "flow_input = ProcessingInput(\n", - " source=flow_s3_uri,\n", - " destination=\"/opt/ml/processing/flow\",\n", - " input_name=\"flow\",\n", - " s3_data_type=\"S3Prefix\",\n", - " s3_input_mode=\"File\",\n", - " s3_data_distribution_type=\"FullyReplicated\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Run Processing Job \n", - "\n", - "##### [back to top](#02b-nb)\n", - "\n", - "----\n", - "## Job Configurations\n", - "\n", - "
💡 Configurable Settings \n", - "\n", - "You can configure the following settings for Processing Jobs. If you change any configurations you will \n", - "need to re-execute this and all cells below it by selecting the Run menu above and click \n", - "Run Selected Cells and All Below\n", - "\n", - "1. IAM role for executing the processing job. \n", - "2. A unique name of the processing job. Give a unique name every time you re-execute processing jobs\n", - "3. Data Wrangler Container URL.\n", - "4. Instance count, instance type and storage volume size in GB.\n", - "5. Content type for each output. Data Wrangler supports CSV as default and Parquet.\n", - "6. Network Isolation settings\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import uuid\n", - "\n", - "# Unique processing job name. Give a unique name every time you re-execute processing jobs\n", - "#processing_job_name = \"data-wrangler-flow-processing-{}-{}\".format(flow_export_id, str(uuid.uuid4())[:8])\n", - "\n", - "processing_job_name = \"dw-flow-proc-music-rec-5star-{}-{}\".format(flow_export_id, str(uuid.uuid4())[:8])\n", - "print (f\"{processing_job_name}\")\n", - "\n", - "\n", - "# Data Wrangler Container URL.\n", - "container_uri = f\"{dw_ecrlist['region'][region]}.dkr.ecr.{region}.amazonaws.com/sagemaker-data-wrangler-container:1.x\"\n", - "\n", - "# Processing Job Instance count and instance type.\n", - "instance_count = 2\n", - "instance_type = \"ml.m5.4xlarge\"\n", - "\n", - "# Size in GB of the EBS volume to use for storing data during processing\n", - "volume_size_in_gb = 30\n", - "\n", - "# Content type for each output. Data Wrangler supports CSV as default and Parquet.\n", - "output_content_type = \"CSV\"\n", - "\n", - "# Network Isolation mode; default is off\n", - "enable_network_isolation = False\n", - "\n", - "# Output configuration used as processing job container arguments \n", - "output_config = {\n", - " output_name: {\n", - " \"content_type\": output_content_type\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Processing Job\n", - "\n", - "To launch a Processing Job, you will use the SageMaker Python SDK to create a Processor function." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.processing import Processor\n", - "from sagemaker.network import NetworkConfig\n", - "import json\n", - "\n", - "processor = Processor(\n", - " role=iam_role,\n", - " image_uri=container_uri,\n", - " instance_count=instance_count,\n", - " instance_type=instance_type,\n", - " volume_size_in_gb=volume_size_in_gb,\n", - " network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),\n", - " sagemaker_session=sess\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Job Status & S3 Output Location\n", - "\n", - "Below you wait for processing job to finish. If it finishes successfully, your feature group should be populated \n", - "with transformed feature values. In addition the raw parameters used by the Processing Job will be printed." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "\n", - "# Run Processing Job if job not already previously ran\n", - "if feature_group_exists:\n", - " print(\"Feature Group {0} already exists therefore we will not run a processing job to create it again\".format(feature_group_name))\n", - "else:\n", - " print(\"Creating Processing Job: {}\".format(feature_group_name))\n", - " processor.run(\n", - " inputs=[flow_input] + data_sources, \n", - " outputs=[processing_job_output],\n", - " arguments=[f\"--output-config '{json.dumps(output_config)}'\"],\n", - " wait=False,\n", - " logs=False,\n", - " job_name=processing_job_name\n", - " ) \n", - " \n", - " job_result = sess.wait_for_processing_job(processing_job_name)\n", - " print(job_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can view newly created feature group in Studio, refer to [Use Amazon SageMaker Feature Store with Amazon SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-use-with-studio.html)\n", - "for detailed guide.[Learn more about SageMaker Feature Store](https://github.com/aws/amazon-sagemaker-examples/tree/master/sagemaker-featurestore)" - ] - } - ], - "metadata": { - "instance_type": "ml.m5.large", - "kernelspec": { - "display_name": "Python 3 (Data Science)", - "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/end_to_end/music_recommendation/02c_export_fg_ratings.ipynb b/end_to_end/music_recommendation/02c_export_fg_ratings.ipynb deleted file mode 100644 index 69d0bc4b2d..0000000000 --- a/end_to_end/music_recommendation/02c_export_fg_ratings.ipynb +++ /dev/null @@ -1,706 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "# Music Recommender Part 2c: Feature Store Creation - Ratings\n", - "\n", - "----\n", - "\n", - "This notebook creates a feature group for our ratings data to place in our feature store using the transformation instructions found in our `.flow` file.\n", - "\n", - "----\n", - "### Contents\n", - "- [Overview](00_overview_arch_data.ipynb)\n", - "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n", - "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n", - "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n", - " - [Define Feature Group](#02c-define-fg)\n", - " - [Configure Feature Group](#02c-config-fg)\n", - " - [Initialize & Create Feature Group](#02c-init-create-fg)\n", - " - [Inputs and Outputs](#02c-input-output)\n", - " - [Run Processing Job](#02c-run-job)\n", - "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n", - "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_infer_explain.ipynb)\n", - "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n", - "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n", - "- [Part 7: Resource Cleanup](07_clean_up.ipynb)\n", - "\n", - "\n", - "
💡 Quick Start \n", - "To save your processed data to feature store, \n", - " Click here to create a feature group and follow the instruction to run a SageMaker processing job.\n", - "\n", - "
\n", - "\n", - "This notebook uses Amazon SageMaker Feature Store (Feature Store) to create a feature group, \n", - "executes your Data Wrangler Flow `01_music_dataprep.flow` on the entire dataset using a SageMaker \n", - "Processing Job and ingest processed data to Feature Store. \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "import pprint\n", - "sys.path.insert(1, './code')\n", - "from parameter_store import ParameterStore\n", - "ps = ParameterStore(verbose=False)\n", - "\n", - "parameters = ps.read('music-rec')\n", - "\n", - "bucket = parameters['bucket']\n", - "dw_ecrlist = parameters['dw_ecrlist']\n", - "fg_name_tracks = parameters['fg_name_tracks']\n", - "flow_export_id = parameters['flow_export_id']\n", - "flow_s3_uri = parameters['flow_s3_uri']\n", - "pretrained_model_path = parameters['pretrained_model_path']\n", - "prefix = parameters['prefix']\n", - "ratings_data_source = parameters['ratings_data_source']\n", - "tracks_data_source = parameters['tracks_data_source']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Feature Group\n", - "\n", - "_What is a feature group_\n", - "\n", - "A single feature corresponds to a column in your dataset. A feature group is a predefined schema for a \n", - "collection of features - each feature in the feature group has a specified data type and name. \n", - "A single record in a feature group corresponds to a row in your dataframe. A feature store is a \n", - "collection of feature groups. To learn more about SageMaker Feature Store, see \n", - "[Amazon Feature Store Documentation](http://docs.aws.amazon.com/sagemaker/latest/dg/feature-store.html)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Define Feature Group \n", - "\n", - "##### [back to top](#02c-nb)\n", - "\n", - "----\n", - "Select Record identifier and Event time feature name. These are required parameters for feature group\n", - "creation.\n", - "* **Record identifier name** is the name of the feature defined in the feature group's feature definitions \n", - "whose value uniquely identifies a Record defined in the feature group's feature definitions.\n", - "* **Event time feature name** is the name of the EventTime feature of a Record in FeatureGroup. An EventTime \n", - "is a timestamp that represents the point in time when a new event occurs that corresponds to the creation or \n", - "update of a Record in the FeatureGroup. All Records in the FeatureGroup must have a corresponding EventTime.\n", - "\n", - "
💡Record identifier and Event time feature name are required \n", - "for feature group. After filling in the values, you can choose Run Selected Cell and All Below \n", - "from the Run Menu from the menu bar. \n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "record_identifier_feature_name = \"ratingEventId\"\n", - "if record_identifier_feature_name is None:\n", - " raise SystemExit(\"Select a column name as the feature group record identifier.\")\n", - "\n", - "event_time_feature_name = \"EventTime\"\n", - "if event_time_feature_name is None:\n", - " raise SystemExit(\"Select a column name as the event time feature name.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Feature Definitions\n", - "The following is a list of the feature names and feature types of the final dataset that will be produced \n", - "when your data flow is used to process your input dataset. These are automatically generated from the \n", - "step `Custom Pyspark` from `Source: Answers.Csv`. To save from a different step, go to Data Wrangler to \n", - "select a new step to export.\n", - "\n", - "
💡 Configurable Settings \n", - "\n", - "1. You can select a subset of the features. By default all columns of the result dataframe will be used as \n", - "features.\n", - "2. You can change the Data Wrangler data type to one of the Feature Store supported types \n", - "(Integral, Fractional, or String). The default type is set to String. \n", - "This means that, if a column in your dataset is not a float or long type, it will default \n", - "to String in your Feature Store.\n", - "\n", - "For Event Time features, make sure the format follows the feature store\n", - "\n", - " \n", - " Event Time feature format\n", - " \n", - "\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following is a list of the feature names and data types of the final dataset that will be produced when your data flow is used to process your input dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "column_schemas = [\n", - " {\n", - " \"name\": \"ratingEventId\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"ts\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"userId\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"trackId\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"sessionId\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"itemInSession\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"Rating\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"EventTime\",\n", - " \"type\": \"float\"\n", - " }\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below we create the SDK input for those feature definitions. Some schema types in Data Wrangler are not \n", - "supported by Feature Store. The following will create a default_FG_type set to String for these types." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.feature_store.feature_definition import FeatureDefinition\n", - "from sagemaker.feature_store.feature_definition import FeatureTypeEnum\n", - "\n", - "default_feature_type = FeatureTypeEnum.STRING\n", - "column_to_feature_type_mapping = {\n", - " \"float\": FeatureTypeEnum.FRACTIONAL,\n", - " \"long\": FeatureTypeEnum.INTEGRAL\n", - "}\n", - "\n", - "feature_definitions = [\n", - " FeatureDefinition(\n", - " feature_name=column_schema['name'], \n", - " feature_type=column_to_feature_type_mapping.get(column_schema['type'], default_feature_type)\n", - " ) for column_schema in column_schemas\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Configure Feature Group\n", - "\n", - "##### [back to top](#02c-nb)\n", - "\n", - "----\n", - "\n", - "
💡 Configurable Settings \n", - "\n", - "1. feature_group_name: name of the feature group.\n", - "1. feature_store_offline_s3_uri: SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a S3 location owned by you.\n", - "1. enable_online_store: controls if online store is enabled. Enabling the online store allows quick access to the latest value for a Record via the GetRecord API.\n", - "1. iam_role: IAM role for executing the processing job.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from time import gmtime, strftime\n", - "import uuid\n", - "import sagemaker \n", - "\n", - "# Sagemaker session\n", - "sess = sagemaker.Session()\n", - "\n", - "# IAM role for executing the processing job.\n", - "iam_role = sagemaker.get_execution_role()\n", - "\n", - "# flow name and an unique ID for this export (used later as the processing job name for the export)\n", - "flow_name = \"01_music_dataprep\"\n", - "flow_export_name = f\"flow-{flow_export_id}\"\n", - "\n", - "# feature group name, with flow_name and an unique id. You can give it a customized name\n", - "feature_group_name = 'ratings-features-music-rec'\n", - "print(f\"Feature Group Name: {feature_group_name}\")\n", - "\n", - "# SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a \n", - "# S3 location owned by you.\n", - "feature_store_offline_s3_uri = 's3://' + bucket\n", - "\n", - "# controls if online store is enabled. Enabling the online store allows quick access to \n", - "# the latest value for a Record via the GetRecord API.\n", - "enable_online_store = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fg_name_ratings = feature_group_name\n", - "\n", - "ps.add({'fg_name_ratings': fg_name_ratings}, namespace='music-rec')\n", - "ps.store()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Initialize & Create Feature Group\n", - "\n", - "##### [back to top](#02c-nb)\n", - "\n", - "----" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize Boto3 session that is required to create feature group\n", - "import boto3\n", - "from sagemaker.session import Session\n", - "\n", - "region = boto3.Session().region_name\n", - "boto_session = boto3.Session(region_name=region)\n", - "\n", - "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)\n", - "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n", - "\n", - "feature_store_session = Session(\n", - " boto_session=boto_session,\n", - " sagemaker_client=sagemaker_client,\n", - " sagemaker_featurestore_runtime_client=featurestore_runtime\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Feature group is initialized and created below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.feature_store.feature_group import FeatureGroup\n", - "\n", - "feature_group = FeatureGroup(\n", - " name=feature_group_name, sagemaker_session=feature_store_session, feature_definitions=feature_definitions)\n", - "\n", - "# only create feature group if it doesn't already exist\n", - "try:\n", - " sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name, NextToken='string')\n", - " feature_group_exists=True\n", - " print(\"Feature Group {0} already exists. 
Using {0}\".format(feature_group_name))\n", - "except Exception as e:\n", - " error = e.response.get('Error').get('Code')\n", - " if error == \"ResourceNotFound\":\n", - " feature_group_exists=False\n", - " print(\"Creating Feature Group {}\".format(feature_group_name))\n", - " feature_group.create(\n", - " s3_uri=feature_store_offline_s3_uri,\n", - " record_identifier_name=record_identifier_feature_name,\n", - " event_time_feature_name=event_time_feature_name,\n", - " role_arn=iam_role,\n", - " enable_online_store=enable_online_store\n", - " )\n", - " if error == 'ResourceInUse':\n", - " feature_group_exists=True\n", - " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Invoke the Feature Store API to create the feature group and wait until it is ready" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "def wait_for_feature_group_creation_complete(feature_group):\n", - " \"\"\"Helper function to wait for the completions of creating a feature group\"\"\"\n", - " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", - " while status == \"Creating\":\n", - " print(\"Waiting for Feature Group Creation\")\n", - " time.sleep(5)\n", - " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", - " if status != \"Created\":\n", - " raise SystemExit(f\"Failed to create feature group {feature_group.name}: {status}\")\n", - " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n", - "\n", - "wait_for_feature_group_creation_complete(feature_group=feature_group)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that the feature group is created, You will use a processing job to process your \n", - " data at scale and ingest the transformed data into this feature group." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Inputs and Outputs\n", - "\n", - "##### [back to top](#02c-nb)\n", - "\n", - "----\n", - "The below settings configure the inputs and outputs for the flow export.\n", - "\n", - "
💡 Configurable Settings \n", - "\n", - "In Input - Source you can configure the data sources that will be used as input by Data Wrangler\n", - "\n", - "1. For S3 sources, configure the source attribute that points to the input S3 prefixes\n", - "2. For all other sources, configure attributes like query_string, database in the source's \n", - "DatasetDefinition object.\n", - "\n", - "If you modify the inputs the provided data must have the same schema and format as the data used in the Flow. \n", - "You should also re-execute the cells in this section if you have modified the settings in any data sources.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", - "from sagemaker.dataset_definition.inputs import AthenaDatasetDefinition, DatasetDefinition, RedshiftDatasetDefinition\n", - "\n", - "data_sources = []" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Input - S3 Source: tracks.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_sources.append(ProcessingInput(\n", - " source=f\"{tracks_data_source}\", # You could override this to point to another dataset on S3\n", - " destination=\"/opt/ml/processing/tracks.csv\",\n", - " input_name=\"tracks.csv\",\n", - " s3_data_type=\"S3Prefix\",\n", - " s3_input_mode=\"File\",\n", - " s3_data_distribution_type=\"FullyReplicated\"\n", - "))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Input - S3 Source: ratings.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_sources.append(ProcessingInput(\n", - " source=f\"{ratings_data_source}\", # You could override this to point to another dataset on S3\n", - " destination=\"/opt/ml/processing/ratings.csv\",\n", - " input_name=\"ratings.csv\",\n", - " s3_data_type=\"S3Prefix\",\n", - " s3_input_mode=\"File\",\n", - " s3_data_distribution_type=\"FullyReplicated\"\n", - "))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Output: Feature Store \n", - "\n", - "Below are the inputs required by the SageMaker Python SDK to launch a processing job with feature store as an output." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.processing import FeatureStoreOutput\n", - "\n", - "# Output name is auto-generated from the select node's ID + output name from the flow file.\n", - "output_name = \"9a283380-91ca-478e-be99-6ba3bf57c680.default\" # ratings node\n", - "\n", - "processing_job_output = ProcessingOutput(\n", - " output_name=output_name,\n", - " app_managed=True,\n", - " feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We already uploaded our flow file in the `02a` notebook. Here the Data Wrangler Flow is also provided to the Processing Job as an input source which we configure below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Input - Flow: 01_music_dataprep.flow\n", - "flow_input = ProcessingInput(\n", - " source=flow_s3_uri,\n", - " destination=\"/opt/ml/processing/flow\",\n", - " input_name=\"flow\",\n", - " s3_data_type=\"S3Prefix\",\n", - " s3_input_mode=\"File\",\n", - " s3_data_distribution_type=\"FullyReplicated\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Run Processing Job \n", - "\n", - "##### [back to top](#02c-nb)\n", - "\n", - "----\n", - "## Job Configurations\n", - "\n", - "
💡 Configurable Settings \n", - "\n", - "You can configure the following settings for Processing Jobs. If you change any configurations you will \n", - "need to re-execute this and all cells below it by selecting the Run menu above and click \n", - "Run Selected Cells and All Below\n", - "\n", - "1. IAM role for executing the processing job. \n", - "2. A unique name of the processing job. Give a unique name every time you re-execute processing jobs\n", - "3. Data Wrangler Container URL.\n", - "4. Instance count, instance type and storage volume size in GB.\n", - "5. Content type for each output. Data Wrangler supports CSV as default and Parquet.\n", - "6. Network Isolation settings\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# IAM role for executing the processing job.\n", - "iam_role = sagemaker.get_execution_role()\n", - "\n", - "processing_job_name = \"dw-flow-proc-music-rec-ratings-{}-{}\".format(flow_export_id, str(uuid.uuid4())[:8])\n", - "print (f\"{processing_job_name}\")\n", - "\n", - "# Data Wrangler Container URL.\n", - "container_uri = f\"{dw_ecrlist['region'][region]}.dkr.ecr.{region}.amazonaws.com/sagemaker-data-wrangler-container:1.x\"\n", - "\n", - "# Processing Job Instance count and instance type.\n", - "instance_count = 2\n", - "instance_type = \"ml.m5.4xlarge\"\n", - "\n", - "# Size in GB of the EBS volume to use for storing data during processing\n", - "volume_size_in_gb = 30\n", - "\n", - "# Content type for each output. Data Wrangler supports CSV as default and Parquet.\n", - "output_content_type = \"CSV\"\n", - "\n", - "# Network Isolation mode; default is off\n", - "enable_network_isolation = False\n", - "\n", - "# Output configuration used as processing job container arguments \n", - "output_config = {\n", - " output_name: {\n", - " \"content_type\": output_content_type\n", - " }\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Processing Job\n", - "\n", - "To launch a Processing Job, you will use the SageMaker Python SDK to create a Processor function." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.processing import Processor\n", - "from sagemaker.network import NetworkConfig\n", - "import json\n", - "\n", - "processor = Processor(\n", - " role=iam_role,\n", - " image_uri=container_uri,\n", - " instance_count=instance_count,\n", - " instance_type=instance_type,\n", - " volume_size_in_gb=volume_size_in_gb,\n", - " network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),\n", - " sagemaker_session=sess\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Job Status & S3 Output Location\n", - "\n", - "Below you wait for processing job to finish. If it finishes successfully, your feature group should be populated \n", - "with transformed feature values. In addition the raw parameters used by the Processing Job will be printed." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "\n", - "# Run Processing Job if job not already previously ran\n", - "if feature_group_exists:\n", - " print(\"Feature Group {0} already exists therefore we will not run a processing job to create it again\".format(feature_group_name))\n", - "else:\n", - " print(\"Creating Processing Job: {}\".format(feature_group_name))\n", - " processor.run(\n", - " inputs=[flow_input] + data_sources, \n", - " outputs=[processing_job_output],\n", - " arguments=[f\"--output-config '{json.dumps(output_config)}'\"],\n", - " wait=False,\n", - " logs=False,\n", - " job_name=processing_job_name\n", - " ) \n", - " \n", - " job_result = sess.wait_for_processing_job(processing_job_name)\n", - " print(job_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can view newly created feature group in Studio, refer to [Use Amazon SageMaker Feature Store with Amazon SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-use-with-studio.html)\n", - "for detailed guide.[Learn more about SageMaker Feature Store](https://github.com/aws/amazon-sagemaker-examples/tree/master/sagemaker-featurestore)" - ] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "Python 3 (Data Science)", - "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/end_to_end/music_recommendation/04_deploy_infer_explain.ipynb b/end_to_end/music_recommendation/04_deploy_infer_explain.ipynb deleted file mode 100644 index f8f6009869..0000000000 --- a/end_to_end/music_recommendation/04_deploy_infer_explain.ipynb +++ /dev/null @@ -1,567 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "# Music Recommender Part 4: Deploy Model & Inference using Online Feature Store\n", - "\n", - "----\n", - "\n", - "In this notebook, we'll deploy our chosen model as an endpoint so that we can make predictions/inferences against it. \n", - "Under the hood the *model.deploy* function creates a model, an endpoint configuration and an endpoint. \n", - "\n", - "Then we'll make music recommendations for a single user by inferencing against our model. We'll query our Feature Store to get some data to use for inferencing and show you how [SageMaker Clarify](https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-model-explainability.html) can explain which features were most useful in making the recommended music predictions using SHAP values.\n", - "\n", - "Amazon SageMaker Clarify provides tools to help explain how machine learning models make predictions. These tools can help ML modelers and developers and other internal stakeholders understand model characteristics as a whole prior to deployment and to debug predictions provided by the model after it's deployed. 
Transparency about how ML models arrive at their predictions is also critical to consumers and regulators who need to trust the model predictions if they are going to accept the decisions based on them.\n", - "\n", - "----\n", - "### Contents\n", - "- [Overview](00_overview_arch_data.ipynb)\n", - "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n", - "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n", - "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n", - "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n", - "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n", - "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_inference_explainability.ipynb)\n", - " - [Deploy model](#04-deploy)\n", - " - [Create predictor](#04-predictor)\n", - " - [Infer new songs](#04-infer)\n", - " - [Explain model predictions](#04-explain)\n", - "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n", - "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n", - "- [Part 7: Resource Cleanup](07_clean_up.ipynb)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "try:\n", - " !pip install -U awswrangler\n", - "except ModuleNotFoundError:\n", - " !pip install --no-input awswrangler" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# update pandas to avoid data type issues in older 1.0 version\n", - "!pip install -qU pandas==1.2.0\n", - "import pandas as pd\n", - "print(pd.__version__)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import boto3\n", - "import argparse\n", - "import pathlib\n", - "\n", - "import sagemaker\n", - "from sagemaker.feature_store.feature_group import FeatureGroup\n", - "from sagemaker.estimator import Estimator\n", - "import awswrangler as wr\n", - "\n", - "import os\n", - "import json\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "import pprint\n", - "sys.path.insert(1, './code')\n", - "from parameter_store import ParameterStore\n", - "ps = ParameterStore(verbose=False)\n", - "\n", - "parameters = ps.read('music-rec')\n", - "\n", - "bucket = parameters['bucket']\n", - "dw_ecrlist = parameters['dw_ecrlist']\n", - "fg_name_ratings = parameters['fg_name_ratings']\n", - "fg_name_tracks = parameters['fg_name_tracks']\n", - "fg_name_user_preferences = parameters['fg_name_user_preferences']\n", - "\n", - "flow_export_id = parameters['flow_export_id']\n", - "flow_s3_uri = parameters['flow_s3_uri']\n", - "pretrained_model_path = parameters['pretrained_model_path']\n", - "prefix = parameters['prefix']\n", - "ratings_data_source = parameters['ratings_data_source']\n", - "tracks_data_source = parameters['tracks_data_source']\n", - "model_name = parameters['model_name']\n", - "training_job_name = parameters['training_job_name']\n", - "mpg_name = parameters['mpg_name']\n", - "model_name = parameters['model_name']\n", - "feature_names = parameters['feature_names']\n", - "train_data_uri = parameters['train_data_uri']\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": 
[], - "source": [ - "sess = sagemaker.Session()\n", - "region = boto3.Session().region_name\n", - "boto3.setup_default_session(region_name=region)\n", - "\n", - "s3_client = boto3.client('s3')\n", - "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]\n", - "\n", - "boto_session = boto3.Session(region_name=region)\n", - "\n", - "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)\n", - "\n", - "sagemaker_session = sagemaker.session.Session(\n", - " boto_session=boto_session,\n", - " sagemaker_client=sagemaker_client\n", - ")\n", - "\n", - "sagemaker_role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Deploy Model\n", - "\n", - "##### [back to top](#04-nb)\n", - "\n", - "----" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "endpoint_name = '{}-endpoint-notebooks'.format(model_name)\n", - "print(endpoint_name)\n", - "\n", - "ps.add({'endpoint_name':endpoint_name}, namespace='music-rec')\n", - "ps.store()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# if you want to use a pretrained model, set use_pretrained = True\n", - "## else use_pretrained = False to use the model you trained in the previous notebook\n", - "use_pretrained = False\n", - "\n", - "if use_pretrained:\n", - " # or use a pretrained model if you skipped model training in the last notebook\n", - " xgb_estimator = sagemaker.model.Model(\n", - " image_uri=sagemaker.image_uris.retrieve(\"xgboost\", region, \"0.90-2\"),\n", - " model_data=pretrained_model_path,\n", - " role=sagemaker_role\n", - " )\n", - "else:\n", - " print(training_job_name)\n", - " # reinstantiate the estimator we trained in the previous notebook\n", - " xgb_estimator = Estimator.attach(training_job_name)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "endpoint_list = sagemaker_client.list_endpoints(\n", - " SortBy='CreationTime',\n", - " SortOrder='Descending',\n", - " NameContains=endpoint_name,\n", - " StatusEquals='InService'\n", - ")\n", - "endpoint_list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create endpoint" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "if len(endpoint_list['Endpoints']) > 0:\n", - " print(f\"Using existing endpoint: {endpoint_list['Endpoints'][0]['EndpointName']}\")\n", - "else:\n", - " # deploy endpoint for model if it doesn't already exist\n", - " xgb_estimator.deploy(initial_instance_count=1,\n", - " instance_type='ml.m4.xlarge',\n", - " model_name=model_name,\n", - " endpoint_name=endpoint_name\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model_package = sagemaker_client.list_model_packages(ModelPackageGroupName=mpg_name)['ModelPackageSummaryList'][0]\n", - "model_package_update = {\n", - " 'ModelPackageArn': model_package['ModelPackageArn'],\n", - " 'ModelApprovalStatus': 'Approved'\n", - "}\n", - "\n", - "update_response = sagemaker_client.update_model_package(**model_package_update)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " \n", - "\n", - "## Create a predictor\n", - "\n", - "##### [back to top](#04-nb)\n", - "\n", - "----" - ] - 
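The `Predictor` object created in the next cell is the most convenient way to call the endpoint, but the same request can also be made with the low-level `sagemaker-runtime` client. A rough sketch, assuming the `endpoint_name` set earlier and an illustrative feature vector (a real request must carry every feature in training order):

```python
import boto3

runtime = boto3.client("sagemaker-runtime")

# Illustrative values only -- substitute a real, correctly ordered feature vector.
payload = "0.12,0.98,0.45,0.33,0.77"

response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,  # the endpoint deployed above
    ContentType="text/csv",
    Body=payload,
)
print(response["Body"].read().decode("utf-8"))
```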
}, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "predictor = sagemaker.predictor.Predictor(\n", - " endpoint_name=endpoint_name,\n", - " sagemaker_session=sagemaker_session)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pull user data from feature group" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# random user ID. You can try any other ID\n", - "sample_user_id = 11005" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n", - "\n", - "feature_store_session = sagemaker.Session(\n", - " boto_session=boto_session,\n", - " sagemaker_client=sagemaker_client,\n", - " sagemaker_featurestore_runtime_client=featurestore_runtime\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# pull the sample user's 5 star preferences record from the feature store\n", - "fg_response = featurestore_runtime.get_record(\n", - " FeatureGroupName=fg_name_user_preferences, \n", - " RecordIdentifierValueAsString=str(sample_user_id)\n", - ")\n", - "\n", - "record = fg_response['Record']\n", - "df_user = pd.DataFrame(record).set_index('FeatureName')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pull sample of 1000 tracks from feature group" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# pull a sample of the tracks data (multiple records) from the feature store using athena query\n", - "fg_name_tracks_obj = FeatureGroup(name=fg_name_tracks, sagemaker_session=feature_store_session)\n", - "tracks_query = fg_name_tracks_obj.athena_query()\n", - "tracks_table = tracks_query.table_name\n", - "\n", - "# use escaped quotes aound table name since it contains '-' symbols\n", - "query_string = (\"SELECT * FROM \\\"{}\\\" LIMIT 1000\".format(tracks_table))\n", - "print(\"Running \" + query_string)\n", - "\n", - "# run Athena query. The output is loaded to a Pandas dataframe.\n", - "tracks_query.run(query_string=query_string, output_location=f\"s3://{bucket}/{prefix}/query_results/\")\n", - "tracks_query.wait()\n", - "df_tracks = tracks_query.as_dataframe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = df_tracks.merge(pd.DataFrame(df_user['ValueAsString']).T, how='cross')\n", - "data.columns = [c.lower() for c in data.columns]\n", - "inference_df = data[feature_names]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Format the datapoint\n", - "The datapoint must match the exact input format as the model was trained--with all features in the correct order. In this example, the `col_order` variable was saved when you created the train and test datasets earlier in the guide." 
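A quick way to guard against column-order mistakes is to compare the inference frame against the saved feature list before serializing. This sketch assumes the `inference_df` and `feature_names` variables defined earlier in the notebook:

```python
# Sanity check: the inference columns must match the training feature order exactly.
assert list(inference_df.columns) == list(feature_names), "feature columns are out of order"

# One CSV string per record, no header and no index -- the layout the XGBoost endpoint expects.
csv_rows = inference_df.head(3).to_csv(header=False, index=False).strip().split("\n")
print(csv_rows)
```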
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_inputs = [','.join([str(i) for i in row]) for row in inference_df.values]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " \n", - "\n", - "## Infer (predict) new songs using model\n", - "\n", - "##### [back to top](#04-nb)\n", - "\n", - "----" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "predictions = []\n", - "for data_input in data_inputs:\n", - " results = predictor.predict(data_input, initial_args = {\"ContentType\": \"text/csv\"})\n", - " prediction = json.loads(results)\n", - " predictions.append(prediction)\n", - "print(f'Predicted rating for user {int(sample_user_id)}:', prediction)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write to csv in S3 without headers and index column.\n", - "inference_df['rating'] = predictions\n", - "inference_df = inference_df[['rating']+feature_names]\n", - "inference_df.to_csv('data/prediction_data.csv', header=False, index=False)\n", - "\n", - "s3_client.upload_file('data/prediction_data.csv', bucket, f'{prefix}/data/pred/prediction_data.csv')\n", - "\n", - "pred_data_uri = f's3://{bucket}/{prefix}/data/pred/prediction_data.csv'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_train = pd.read_csv(train_data_uri)\n", - "\n", - "label = 'rating'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " \n", - "\n", - "## Explain model predictions\n", - "\n", - "##### [back to top](#04-nb)\n", - "\n", - "----" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "explainability_output_path = f's3://{bucket}/{prefix}/clarify-output/explainability'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "clarify_processor = sagemaker.clarify.SageMakerClarifyProcessor(\n", - " role=sagemaker_role,\n", - " instance_count=1,\n", - " instance_type='ml.c4.xlarge',\n", - " sagemaker_session=sagemaker_session)\n", - "\n", - "model_config = sagemaker.clarify.ModelConfig(\n", - " model_name=model_name,\n", - " instance_type='ml.m4.xlarge',\n", - " instance_count=1,\n", - " accept_type='text/csv')\n", - "\n", - "shap_config = sagemaker.clarify.SHAPConfig(\n", - " baseline=[df_train.median().values[1:].tolist()], # ignore the first column since that is that target\n", - " num_samples=100,\n", - " agg_method='mean_abs')\n", - "\n", - "explainability_data_config = sagemaker.clarify.DataConfig(\n", - " s3_data_input_path=pred_data_uri,\n", - " s3_output_path=explainability_output_path,\n", - " label=label,\n", - " headers=[label]+feature_names,\n", - " dataset_type='text/csv')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "%%time\n", - "try:\n", - " s3_client.download_file(\n", - " Bucket = bucket, \n", - " Key = f'{prefix}/clarify-output/explainability/explanations_shap/out.csv', \n", - " Filename = 'data/shap_output.csv'\n", - " )\n", - " print('Downloaded output from previous explainability job')\n", - "except Exception as e:\n", - " error = e.response.get('Error').get('Code')\n", - " if error == '404':\n", - " print('Running explainability job')\n", - " 
clarify_processor.run_explainability(\n", - " data_config=explainability_data_config,\n", - " model_config=model_config,\n", - " explainability_config=shap_config)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "inference_df['trackid'] = data['trackid']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "playlist_length = 10 # number of songs to recommend in playlist\n", - "playlist = inference_df.sort_values(by='rating', ascending=False).head(playlist_length)\n", - "print('Curated Playlist:\\n', playlist['trackid'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "local_explanations_out = pd.read_csv(explainability_output_path+'/explanations_shap/out.csv')\n", - "local_explanations_out.columns = feature_names\n", - "\n", - "print(\"Model prediction:\", playlist.iloc[0, 0])\n", - "plt.figure(figsize=(12,6))\n", - "local_explanations_out.iloc[0].sort_values().plot.barh(title='Local explanation for prediction')" - ] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "Python 3 (Data Science)", - "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/end_to_end/music_recommendation/05_model_monitor.ipynb b/end_to_end/music_recommendation/05_model_monitor.ipynb deleted file mode 100644 index 3517343b4e..0000000000 --- a/end_to_end/music_recommendation/05_model_monitor.ipynb +++ /dev/null @@ -1,494 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "# Music Recommender Part 5: Model Monitor\n", - "\n", - "----\n", - "In this notebook, we'll set up [SageMaker Model Monitor](https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor.html) to detect when our model or data significantly deviates from its \"normal\" behavior. SageMaker Model Monitor provides the ability to monitor machine learning models in production and detect deviations in data quality in comparison to a baseline dataset (e.g. training data set). This notebook walks you through enabling data capture and setting up continous monitoring for an existing Endpoint.\n", - "\n", - "This Notebook helps with the following:\n", - "* Update your existing SageMaker Endpoint to enable Model Monitoring\n", - "* Analyze the training dataset to generate a baseline constraint\n", - "* Setup a MonitoringSchedule for monitoring deviations from the specified baseline\n", - "\n", - "----\n", - "### Contents\n", - "- [Overview](00_overview_arch_data.ipynb)\n", - "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n", - "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n", - "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n", - "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n", - "- [Part 3: Train Model with Debugger Hooks. 
Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n", - "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_inference_explainability.ipynb)\n", - "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n", - " - [Enable data capture](#05-capture)\n", - " - [Baselining](#05-baseline)\n", - " - [Enable continous monitoring](#05-continuous)\n", - "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n", - "- [Part 7: Resource Cleanup](07_clean_up.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## Step 1: Enable real-time inference data capture\n", - "\n", - "##### [back to top](#05-nb)\n", - "\n", - "----" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To enable data capture for monitoring the model data quality, you specify the new capture option called `DataCaptureConfig`. You can capture the request payload, the response payload or both with this configuration. The capture config applies to all variants. Please provide the Endpoint name in the following cell:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.model_monitor import DataCaptureConfig\n", - "from sagemaker.predictor import Predictor\n", - "from sagemaker import session\n", - "import boto3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import sys\n", - "import pprint\n", - "sys.path.insert(1, './code')\n", - "from parameter_store import ParameterStore\n", - "ps = ParameterStore(verbose=False)\n", - "\n", - "parameters = ps.read('music-rec')\n", - "\n", - "bucket = parameters['bucket']\n", - "dw_ecrlist = parameters['dw_ecrlist']\n", - "fg_name_ratings = parameters['fg_name_ratings']\n", - "fg_name_tracks = parameters['fg_name_tracks']\n", - "fg_name_user_preferences = parameters['fg_name_user_preferences']\n", - "\n", - "flow_export_id = parameters['flow_export_id']\n", - "flow_s3_uri = parameters['flow_s3_uri']\n", - "pretrained_model_path = parameters['pretrained_model_path']\n", - "prefix = parameters['prefix']\n", - "ratings_data_source = parameters['ratings_data_source']\n", - "tracks_data_source = parameters['tracks_data_source']\n", - "endpoint_name = parameters['endpoint_name']\n", - "val_data_uri = parameters['val_data_uri']\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sm_session = session.Session(boto3.Session())\n", - "region = boto3.Session().region_name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Please fill in the following for enabling data capture\n", - "s3_capture_upload_path = f's3://{bucket}/{prefix}/endpoint-data-capture/' #example: s3://bucket-name/path/to/endpoint-data-capture/\n", - "\n", - "##### \n", - "## IMPORTANT\n", - "##\n", - "## Please make sure to add the \"s3:PutObject\" permission to the \"role' you provided in the SageMaker Model \n", - "## behind this Endpoint. 
Otherwise, Endpoint data capture will not work.\n", - "## \n", - "##### " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "%%time\n", - "# Change parameters as you would like - adjust sampling percentage, \n", - "# chose to capture request or response or both\n", - "data_capture_config = DataCaptureConfig(\n", - " enable_capture = True,\n", - " sampling_percentage=25,\n", - " destination_s3_uri=s3_capture_upload_path,\n", - " kms_key_id=None,\n", - " capture_options=[\"REQUEST\", \"RESPONSE\"],\n", - " csv_content_types=[\"text/csv\"],\n", - " json_content_types=[\"application/json\"]\n", - ")\n", - "\n", - "# Now it is time to apply the new configuration and wait for it to be applied\n", - "predictor = Predictor(endpoint_name=endpoint_name)\n", - "predictor.update_data_capture_config(data_capture_config=data_capture_config)\n", - "sm_session.wait_for_endpoint(endpoint=endpoint_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Before you proceed:\n", - "Currently SageMaker supports monitoring Endpoints out of the box only for **tabular (csv, flat-json)** datasets. If your Endpoint uses some other datasets, these following steps will NOT work for you.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Step 2: Model Monitor - Baselining\n", - "\n", - "##### [back to top](#05-nb)\n", - "\n", - "----" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In addition to collecting the data, SageMaker allows you to monitor and evaluate the data observed by the Endpoints. For this :\n", - "1. We need to create a baseline with which we compare the realtime traffic against. \n", - "1. Once a baseline is ready, we can setup a schedule to continously evaluate/compare against the baseline." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Constraint suggestion with baseline/training dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The training dataset with which you trained the model is usually a good baseline dataset. Note that the training dataset's data schema and the inference dataset schema should exactly match (i.e. number and order of the features).\n", - "\n", - "Using our training dataset, we'll ask SageMaker to suggest a set of baseline constraints and generate descriptive statistics to explore the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##'s3://bucketname/path/to/baseline/data' - Where your validation data is\n", - "baseline_data_uri = val_data_uri \n", - "##'s3://bucketname/path/to/baseline/data' - Where the results are to be stored in\n", - "baseline_results_uri = f's3://{bucket}/{prefix}/baseline/results' \n", - "\n", - "print('Baseline data uri: {}'.format(baseline_data_uri))\n", - "print('Baseline results uri: {}'.format(baseline_results_uri))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a baselining job with the validation dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we have the training data ready in S3, let's kick off a job to `suggest` constraints. `DefaultModelMonitor.suggest_baseline(..)` kicks off a `ProcessingJob` using a SageMaker provided Model Monitor container to generate the constraints. Please edit the configurations to fit your needs." 
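Once the baselining job in the next cells finishes, it writes `statistics.json` and `constraints.json` under `baseline_results_uri`. As a rough sketch (assuming the default output layout), the constraints file can be read straight from S3 instead of going through the monitor object:

```python
import json
from urllib.parse import urlparse

import boto3

s3 = boto3.client("s3")
parsed = urlparse(baseline_results_uri)  # defined above
key = parsed.path.strip("/") + "/constraints.json"

constraints = json.loads(s3.get_object(Bucket=parsed.netloc, Key=key)["Body"].read())

# Print a few of the suggested per-feature constraints.
for feature in constraints["features"][:5]:
    print(feature["name"], feature["inferred_type"], feature.get("completeness"))
```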
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "from sagemaker.model_monitor import DefaultModelMonitor\n", - "from sagemaker.model_monitor.dataset_format import DatasetFormat\n", - "from sagemaker import get_execution_role\n", - "import datetime\n", - "\n", - "role = get_execution_role(sagemaker_session=sm_session)\n", - "\n", - "datetime_stamp = datetime.datetime.now().strftime(\"%Y-%m-%d-%H%M%S\")\n", - "\n", - "my_default_monitor = DefaultModelMonitor(\n", - " role=role,\n", - " instance_count=2,\n", - " instance_type='ml.m5.xlarge',\n", - " volume_size_in_gb=20,\n", - " max_runtime_in_seconds=1800,\n", - " base_job_name=f\"{prefix}-monitor-{datetime_stamp}\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "%%time\n", - "\n", - "monitor_baseline = my_default_monitor.suggest_baseline(\n", - " baseline_dataset=baseline_data_uri,\n", - " dataset_format=DatasetFormat.csv(header=False),\n", - " output_s3_uri=baseline_results_uri,\n", - " job_name=f\"{prefix}-monitor-baseline-{datetime_stamp}\",\n", - " wait=True\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Exploratory Analysis of the Processing Jobs underlying SageMaker Monitor\n", - "In this short section [next few cells] we will be showing you how to further view the underlying jobs for the monitoring job" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from time import gmtime, strftime\n", - "import boto3\n", - "\n", - "client = boto3.client('sagemaker')\n", - "\n", - "def get_last_processing_job():\n", - " \n", - " response = client.list_processing_jobs(\n", - " NameContains=f\"{prefix}-monitor-baseline-{datetime_stamp}\",\n", - " StatusEquals='Completed',\n", - " SortBy='CreationTime',\n", - " SortOrder='Descending',\n", - " MaxResults=20\n", - " )\n", - " pprint.pprint(response['ProcessingJobSummaries'][0])\n", - " return response['ProcessingJobSummaries'][0]['ProcessingJobName']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.processing import ProcessingJob \n", - "from sagemaker.estimator import Estimator\n", - "from sagemaker.model_monitor.model_monitoring import ModelMonitor\n", - "\n", - "my_default_monitor_name = get_last_processing_job()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "my_default_monitor_reload = ProcessingJob.from_processing_name(sm_session, my_default_monitor_name)\n", - "\n", - "response = client.describe_processing_job(\n", - " ProcessingJobName=my_default_monitor_name\n", - ")\n", - "pprint.pprint(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Explore the generated constraints and statistics" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "baseline_job = my_default_monitor.latest_baselining_job\n", - "schema_df = pd.io.json.json_normalize(baseline_job.baseline_statistics().body_dict[\"features\"])\n", - "schema_df.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "constraints_df = 
pd.io.json.json_normalize(baseline_job.suggested_constraints().body_dict[\"features\"])\n", - "constraints_df.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before proceeding to enable monitoring, you could chose to edit the constraint file as required to fine tune the constraints." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Step 3: Enable continous monitoring\n", - "\n", - "##### [back to top](#05-nb)\n", - "\n", - "----\n", - "\n", - "We have collected the data above, here we proceed to analyze and monitor the data with MonitoringSchedules." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a schedule" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are ready to create a model monitoring schedule for the Endpoint created earlier with the baseline resources (constraints and statistics)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.model_monitor import CronExpressionGenerator\n", - "import datetime as datetime\n", - "from time import gmtime, strftime\n", - "\n", - "\n", - "mon_schedule_name = 'music-rec-monitor-schedule-{}'.format(datetime.datetime.now().strftime(\"%Y-%m-%d-%H%M%S\"))\n", - "s3_report_path = f's3://{bucket}/{prefix}/monitor/report'\n", - "\n", - "try:\n", - " my_default_monitor.create_monitoring_schedule(\n", - " monitor_schedule_name=mon_schedule_name,\n", - " endpoint_input=endpoint_name,\n", - " output_s3_uri=s3_report_path,\n", - " statistics=my_default_monitor.baseline_statistics(),\n", - " constraints=my_default_monitor.suggested_constraints(),\n", - " schedule_cron_expression=CronExpressionGenerator.daily(),\n", - " enable_cloudwatch_metrics=True,\n", - " )\n", - " print(f\"Created monitoring schedule {mon_schedule_name}\")\n", - "except:\n", - " my_default_monitor.update_monitoring_schedule(\n", - " endpoint_input=endpoint_name,\n", - " schedule_cron_expression=CronExpressionGenerator.daily(),\n", - " enable_cloudwatch_metrics=True,\n", - " )\n", - " print(f\"Updated monitoring schedule {my_default_monitor.monitoring_schedule_name}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Schedule status: Pending\n" - ] - } - ], - "source": [ - "import time\n", - "\n", - "desc_schedule_result = my_default_monitor.describe_schedule()\n", - "while desc_schedule_result['MonitoringScheduleStatus'] != 'Scheduled':\n", - " print('Schedule status: {}'.format(desc_schedule_result['MonitoringScheduleStatus']))\n", - " desc_schedule_result = my_default_monitor.describe_schedule()\n", - " time.sleep(30)\n", - "print('Schedule status: {}'.format(desc_schedule_result['MonitoringScheduleStatus']))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### All set\n", - "Now that your monitoring schedule has been created. Please return to the Amazon SageMaker Studio to list the executions for this Schedule and observe the results going forward." 
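The schedule and its executions can also be inspected from the notebook rather than the Studio UI. This assumes the `my_default_monitor` object configured above; executions only appear after the first scheduled run has actually happened:

```python
executions = my_default_monitor.list_executions()
print(f"Found {len(executions)} monitoring execution(s)")

# Show the status of the most recent executions (each one is a processing job).
for execution in executions[-3:]:
    job_desc = execution.describe()
    print(job_desc["ProcessingJobStatus"], job_desc.get("ExitMessage"))
```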
- ] - } - ], - "metadata": { - "anaconda-cloud": {}, - "instance_type": "ml.m5.large", - "kernelspec": { - "display_name": "Python 3 (Data Science)", - "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - }, - "notice": "Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/end_to_end/music_recommendation/07_clean_up.ipynb b/end_to_end/music_recommendation/07_clean_up.ipynb deleted file mode 100644 index a718a1218e..0000000000 --- a/end_to_end/music_recommendation/07_clean_up.ipynb +++ /dev/null @@ -1,187 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "# Music Recommender Part 7: Clean Up\n", - "\n", - "## Overview\n", - "\n", - "----\n", - "### Clean up : Delete all Resources Created in the past 8 notebooks (nb 00-06)\n", - "In the past notebooks we have created many Amazon Resources; represented by their ARNs : Amazon Resource Names.\n", - "In order not to incur any cost in keeping those resources running, such as endpoints etc. We will use this notebook as a reminder to clean up and delete all the resources you have created in this music recommendation example.\n", - "\n", - "First we will read in all parameters saved in the 'music-rec' namespace as we went from one notebook to the next,\n", - "second we will use a little utility under the `./code/demo_helpers.py` script file to actually delete all resources passed\n", - "----\n", - "### Contents\n", - "- [Overview](00_overview_arch_data.ipynb)\n", - "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n", - "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n", - "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n", - "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n", - "- [Part 3: Train Model with Debugger Hooks. 
Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n", - "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_infer_explain.ipynb)\n", - "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n", - "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n", - "- [Part 7: Resource Cleanup](07_clean_up.ipynb)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import boto3\n", - "import pathlib\n", - "import sagemaker\n", - "import numpy as np\n", - "import pandas as pd\n", - "import awswrangler as wr\n", - "\n", - "from sagemaker.estimator import Estimator\n", - "from sagemaker.workflow.pipeline import Pipeline\n", - "from sagemaker.workflow.steps import CreateModelStep\n", - "from sagemaker.sklearn.processing import SKLearnProcessor\n", - "from sagemaker.workflow.step_collections import RegisterModel\n", - "from sagemaker.workflow.steps import ProcessingStep, TrainingStep\n", - "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", - "from sagemaker.workflow.parameters import ParameterInteger, ParameterFloat, ParameterString\n", - "from sagemaker.feature_store.feature_group import FeatureGroup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import sys\n", - "import pprint\n", - "sys.path.insert(1, './code')\n", - "from parameter_store import ParameterStore\n", - "\n", - "ps = ParameterStore(verbose=False)\n", - "\n", - "parameters = ps.read('music-rec')\n", - "\n", - "bucket = parameters['bucket']\n", - "prefix = parameters['prefix']\n", - "ratings_data_source = parameters['ratings_data_source']\n", - "tracks_data_source = parameters['tracks_data_source']\n", - "val_data_uri = f\"s3://{bucket}/{prefix}/data/val/val_data.csv\"\n", - "\n", - "pipeline_endpoint_name = parameters['pipeline_endpoint_name']\n", - "pipeline_name = parameters['pipeline_name']\n", - "\n", - "fg_name_tracks = parameters['fg_name_tracks']\n", - "fg_name_ratings = parameters['fg_name_ratings']\n", - "fg_name_user_preferences = parameters['fg_name_user_preferences']\n", - "\n", - "dw_ecrlist = parameters['dw_ecrlist']\n", - "\n", - "pipeline_name = parameters['pipeline_name']\n", - "dataprep_pipeline_name = parameters['dataprep_pipeline_name']\n", - "train_deploy_pipeline_name = parameters['train_deploy_pipeline_name']\n", - "\n", - "endpoint_name = parameters['endpoint_name']\n", - "pipeline_endpoint_name = parameters['pipeline_endpoint_name']\n", - "\n", - "mpg_name = parameters['mpg_name']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "region = boto3.Session().region_name\n", - "boto3.setup_default_session(region_name=region)\n", - "boto_session = boto3.Session(region_name=region)\n", - "\n", - "s3_client = boto3.client('s3', region_name=region)\n", - "\n", - "sagemaker_boto_client = boto_session.client('sagemaker')\n", - "sagemaker_session = sagemaker.session.Session(\n", - " boto_session=boto_session,\n", - " sagemaker_client=sagemaker_boto_client)\n", - "sagemaker_role = sagemaker.get_execution_role()\n", - "\n", - "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# when demo_helpers.delete_project_resources() is ran it will delete all the resources created by this demo\n", - 
"sys.path.insert(1, './code')\n", - "import demo_helpers # our custom set of functions\n", - "\n", - "\n", - "def remove_all_resources():\n", - " demo_helpers.delete_project_resources(\n", - " sagemaker_boto_client=sagemaker_boto_client, \n", - " sagemaker_session=sagemaker_session,\n", - " endpoint_names=[pipeline_endpoint_name, endpoint_name],\n", - " pipeline_names=[pipeline_name, dataprep_pipeline_name, train_deploy_pipeline_name], \n", - " mpg_name=mpg_name,\n", - " feature_groups=[fg_name_ratings, fg_name_tracks, fg_name_user_preferences], \n", - " prefix=prefix,\n", - " delete_s3_objects=True,\n", - " bucket_name=bucket\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Uncomment the next line and run to delete all resources\n", - "# remove_all_resources()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "Python 3 (Data Science)", - "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From d483b1226dabfaaff481ab4989dcacc29499e6d4 Mon Sep 17 00:00:00 2001 From: atqy Date: Thu, 5 May 2022 19:41:06 +0000 Subject: [PATCH 04/25] make CI corrections --- .../01_data_exploration.ipynb | 2 +- ...oy_debugger_explain_monitor_registry.ipynb | 51 +------------------ .../end_to_end_pipeline.ipynb | 5 +- 3 files changed, 7 insertions(+), 51 deletions(-) diff --git a/end_to_end/music_recommendation/01_data_exploration.ipynb b/end_to_end/music_recommendation/01_data_exploration.ipynb index 4257f40f0e..20cc4be251 100644 --- a/end_to_end/music_recommendation/01_data_exploration.ipynb +++ b/end_to_end/music_recommendation/01_data_exploration.ipynb @@ -119,7 +119,7 @@ "metadata": {}, "outputs": [], "source": [ - "new_data_paths = get_data([f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, sample_data=0.70)\n", + "new_data_paths = get_data(s3_client, [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, prefix, sample_data=0.70)\n", "print(new_data_paths)" ] }, diff --git a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb index d7390e33f3..7e65dc0a4f 100644 --- a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb +++ b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb @@ -525,40 +525,6 @@ "metadata": {}, "outputs": [], "source": [ - "# # random user ID. 
You can try any other ID\n", - "# sample_user_id = 11005" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n", - "\n", - "# feature_store_session = sagemaker.Session(\n", - "# boto_session=boto_session,\n", - "# sagemaker_client=sagemaker_client,\n", - "# sagemaker_featurestore_runtime_client=featurestore_runtime\n", - "# )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # pull the sample user's 5 star preferences record from the feature store\n", - "# fg_response = featurestore_runtime.get_record(\n", - "# FeatureGroupName='user-5star-track-features-music-rec', \n", - "# RecordIdentifierValueAsString=str(sample_user_id)\n", - "# )\n", - "\n", - "# record = fg_response['Record']\n", - "# df_user = pd.DataFrame(record).set_index('FeatureName')\n", - "# df_user.to_csv(\"./data/sample_user.csv\")\n", "df_user = pd.read_csv(\"./data/sample_user.csv\")\n", "df_user = df_user.set_index('FeatureName')" ] @@ -576,20 +542,6 @@ "metadata": {}, "outputs": [], "source": [ - "# # pull a sample of the tracks data (multiple records) from the feature store using athena query\n", - "# fg_name_tracks_obj = FeatureGroup(name='track-features-music-rec', sagemaker_session=feature_store_session)\n", - "# tracks_query = fg_name_tracks_obj.athena_query()\n", - "# tracks_table = tracks_query.table_name\n", - "\n", - "# # use escaped quotes aound table name since it contains '-' symbols\n", - "# query_string = (\"SELECT * FROM \\\"{}\\\" LIMIT 1000\".format(tracks_table))\n", - "# print(\"Running \" + query_string)\n", - "\n", - "# # run Athena query. 
The output is loaded to a Pandas dataframe.\n", - "# tracks_query.run(query_string=query_string, output_location=f\"s3://{bucket}/{prefix}/query_results/\")\n", - "# tracks_query.wait()\n", - "# df_tracks = tracks_query.as_dataframe()\n", - "# df_tracks.to_csv(\"./data/sample_tracks.csv\")\n", "df_tracks = pd.read_csv(\"./data/sample_tracks.csv\")" ] }, @@ -676,7 +628,8 @@ "metadata": {}, "outputs": [], "source": [ - "df_train = pd.read_csv(train_data_uri)\n", + "s3_client.download_file(bucket, f\"{prefix}/data/train/train_data.csv\", f\"train_data.csv\")\n", + "df_train = pd.read_csv(\"train_data.csv\")\n", "\n", "label = 'rating'" ] diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index 9961e1a5e8..f8e130a4b1 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -114,7 +114,10 @@ " sagemaker_client=sagemaker_boto_client)\n", "sagemaker_role = sagemaker.get_execution_role()\n", "\n", - "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]" + "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]\n", + "\n", + "bucket = sess.default_bucket()\n", + "prefix='music-recommendation'" ] }, { From 7d9f91a97793840fce9b5a5d751c0cd74168f9c3 Mon Sep 17 00:00:00 2001 From: atqy Date: Thu, 5 May 2022 23:59:14 +0000 Subject: [PATCH 05/25] CI corrections --- end_to_end/music_recommendation/01_data_exploration.ipynb | 2 +- ...3_train_deploy_debugger_explain_monitor_registry.ipynb | 3 ++- end_to_end/music_recommendation/code/demo_helpers.py | 8 ++++---- end_to_end/music_recommendation/end_to_end_pipeline.ipynb | 1 + 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/end_to_end/music_recommendation/01_data_exploration.ipynb b/end_to_end/music_recommendation/01_data_exploration.ipynb index 20cc4be251..d8aaa6ee21 100644 --- a/end_to_end/music_recommendation/01_data_exploration.ipynb +++ b/end_to_end/music_recommendation/01_data_exploration.ipynb @@ -147,7 +147,7 @@ "metadata": {}, "outputs": [], "source": [ - "pretrained_model_path = get_model('./model/model.tar.gz', bucket)" + "pretrained_model_path = get_model('./model/model.tar.gz', bucket, prefix)" ] }, { diff --git a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb index 7e65dc0a4f..2dad2ab0f2 100644 --- a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb +++ b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb @@ -750,7 +750,8 @@ "metadata": {}, "outputs": [], "source": [ - "local_explanations_out = pd.read_csv(explainability_output_path+'/explanations_shap/out.csv')\n", + "s3_client.download_file(bucket, f\"{prefix}/clarify-output/explainability/explanations_shap/out.csv\", f\"out.csv\")\n", + "local_explanations_out = pd.read_csv('out.csv')\n", "local_explanations_out.columns = feature_names\n", "\n", "print(\"Model prediction:\", playlist.iloc[0, 0])\n", diff --git a/end_to_end/music_recommendation/code/demo_helpers.py b/end_to_end/music_recommendation/code/demo_helpers.py index 4a75022e99..15ea3545f0 100644 --- a/end_to_end/music_recommendation/code/demo_helpers.py +++ b/end_to_end/music_recommendation/code/demo_helpers.py @@ -35,12 +35,12 @@ def get_data(s3_client, public_s3_data, to_bucket, to_prefix, sample_data=1): return new_paths -def 
get_model(model_path, to_bucket): +def get_model(model_path, to_bucket, to_prefix): # upload model to our default s3 bucket for SageMaker Studio filename = model_path.split('/')[-1] - print("Uploading {} to {}\n".format(model_path, os.path.join(to_bucket,prefix,filename))) - s3_client.upload_file(model_path, to_bucket, os.path.join(prefix,filename)) - return "s://{}".format(os.path.join(to_bucket,prefix,filename)) + print("Uploading {} to {}\n".format(model_path, os.path.join(to_bucket,to_prefix,filename))) + s3_client.upload_file(model_path, to_bucket, os.path.join(to_prefix,filename)) + return "s://{}".format(os.path.join(to_bucket,to_prefix,filename)) def update_data_sources(flow_path, tracks_data_source, ratings_data_source): diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index f8e130a4b1..82425856c4 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -116,6 +116,7 @@ "\n", "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]\n", "\n", + "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", "prefix='music-recommendation'" ] From 73ca82fd8a155865dec63cb615f0fa7bae823341 Mon Sep 17 00:00:00 2001 From: atqy Date: Fri, 6 May 2022 17:38:01 +0000 Subject: [PATCH 06/25] Fix 01 --- .../01_data_exploration.ipynb | 17 ----------------- .../music_recommendation/code/demo_helpers.py | 2 +- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/end_to_end/music_recommendation/01_data_exploration.ipynb b/end_to_end/music_recommendation/01_data_exploration.ipynb index d8aaa6ee21..307d294dcd 100644 --- a/end_to_end/music_recommendation/01_data_exploration.ipynb +++ b/end_to_end/music_recommendation/01_data_exploration.ipynb @@ -134,22 +134,6 @@ "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Upload pretrained model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pretrained_model_path = get_model('./model/model.tar.gz', bucket, prefix)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -273,7 +257,6 @@ } ], "metadata": { - "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "conda_python3", "language": "python", diff --git a/end_to_end/music_recommendation/code/demo_helpers.py b/end_to_end/music_recommendation/code/demo_helpers.py index 15ea3545f0..ac51ae1cde 100644 --- a/end_to_end/music_recommendation/code/demo_helpers.py +++ b/end_to_end/music_recommendation/code/demo_helpers.py @@ -35,7 +35,7 @@ def get_data(s3_client, public_s3_data, to_bucket, to_prefix, sample_data=1): return new_paths -def get_model(model_path, to_bucket, to_prefix): +def get_model(s3_client, model_path, to_bucket, to_prefix): # upload model to our default s3 bucket for SageMaker Studio filename = model_path.split('/')[-1] print("Uploading {} to {}\n".format(model_path, os.path.join(to_bucket,to_prefix,filename))) From 1ca6e5881af4b2cc83184cb43a51bc9ae879262e Mon Sep 17 00:00:00 2001 From: atqy Date: Fri, 6 May 2022 21:28:55 +0000 Subject: [PATCH 07/25] add error handling --- end_to_end/music_recommendation/code/demo_helpers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/end_to_end/music_recommendation/code/demo_helpers.py b/end_to_end/music_recommendation/code/demo_helpers.py index ac51ae1cde..d3611e547b 100644 --- 
a/end_to_end/music_recommendation/code/demo_helpers.py +++ b/end_to_end/music_recommendation/code/demo_helpers.py @@ -133,8 +133,12 @@ def delete_lineage_data(): artfct = Artifact(artifact_arn=summary.artifact_arn, sagemaker_session=sagemaker_session) artfct.delete() time.sleep(1) + # Delete model lineage associations and artifacts created in demo - delete_lineage_data() + try: + delete_lineage_data() + except Expection as err: + print(f"Failed to delete lineage data: {err}") if endpoint_names is not None: try: From 762d26229c480d19952f66675ec112e2c783e0d3 Mon Sep 17 00:00:00 2001 From: atqy Date: Fri, 6 May 2022 22:53:49 +0000 Subject: [PATCH 08/25] change instance type --- end_to_end/music_recommendation/02_export_feature_groups.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/end_to_end/music_recommendation/02_export_feature_groups.ipynb b/end_to_end/music_recommendation/02_export_feature_groups.ipynb index 6921795ab8..8cd9cf45c1 100644 --- a/end_to_end/music_recommendation/02_export_feature_groups.ipynb +++ b/end_to_end/music_recommendation/02_export_feature_groups.ipynb @@ -1195,7 +1195,6 @@ } ], "metadata": { - "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "conda_python3", "language": "python", From 76dd78717be17564d2557c9583c508a356e9e010 Mon Sep 17 00:00:00 2001 From: atqy Date: Fri, 6 May 2022 22:54:24 +0000 Subject: [PATCH 09/25] fix spelling --- end_to_end/music_recommendation/code/demo_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/end_to_end/music_recommendation/code/demo_helpers.py b/end_to_end/music_recommendation/code/demo_helpers.py index d3611e547b..cd9bd416a1 100644 --- a/end_to_end/music_recommendation/code/demo_helpers.py +++ b/end_to_end/music_recommendation/code/demo_helpers.py @@ -137,7 +137,7 @@ def delete_lineage_data(): # Delete model lineage associations and artifacts created in demo try: delete_lineage_data() - except Expection as err: + except Exception as err: print(f"Failed to delete lineage data: {err}") if endpoint_names is not None: From bf80ec4935e515bfa294e65973b817e80229abee Mon Sep 17 00:00:00 2001 From: atqy Date: Fri, 6 May 2022 23:51:45 +0000 Subject: [PATCH 10/25] fix pipeline notebook --- .../end_to_end_pipeline.ipynb | 116 +++++++++++++++++- 1 file changed, 112 insertions(+), 4 deletions(-) diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index 82425856c4..a2a6c6d7e5 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -153,6 +153,88 @@ "![arch diagram](./images/music-rec.png)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Prereqs: Get Data \n", + "\n", + "##### [back to top](#00-nb)\n", + "\n", + "----\n", + "\n", + "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from demo_helpers import get_data, get_model, update_data_sources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create data folder\n", + "!mkdir data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# public S3 bucket that contains our music data\n", + "s3_bucket_music_data = \"s3://sagemaker-sample-files/datasets/tabular/synthetic-music\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_data_paths = get_data(s3_client, [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, prefix, sample_data=0.70)\n", + "new_data_path_new = get_data(s3_client, [f\"{s3_bucket_music_data}/tracks_new.csv\", f\"{s3_bucket_music_data}/ratings_new.csv\"], bucket, prefix, sample_data=0.70)\n", + "print(new_data_paths, new_data_path_new)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n", + "tracks_data_source = f's3://{bucket}/{prefix}/tracks.csv'\n", + "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# upload train and validation datasets as well\n", + "s3_client.upload_file('data/train_data.csv', bucket, f'{prefix}/data/train/train_data.csv')\n", + "s3_client.upload_file('data/val_data.csv', bucket, f'{prefix}/data/val/val_data.csv')\n", + "\n", + "\n", + "train_data_uri = f's3://{bucket}/{prefix}/data/train/train_data.csv'\n", + "val_data_uri = f's3://{bucket}/{prefix}/data/val/val_data.csv'\n", + "print (f\"Saving training data to {train_data_uri}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -210,8 +292,34 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Step 1: Data Wrangler Preprocessing Step\n", + "### Step 1: Data Wrangler Preprocessing Step" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Update the data source in the `.flow` file\n", + "The `01_music_datapred.flow` file is a JSON file containing instructions for where to find your data sources and how to transform the data. We'll be updating the object telling Data Wrangler where to find the input data on S3. We will set this to your default S3 bucket. With this update to the `.flow` file it now points to your new S3 bucket as the data source used by SageMaker Data Wrangler.\n", "\n", + "Make sure the `.flow` file is closed before running this next step or it won't update the new s3 file locations in the file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "update_data_sources('01_music_dataprep.flow', tracks_data_source, ratings_data_source)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "#### Upload flow to S3\n", "This will become an input to the first step and, as such, needs to be in S3." 
] @@ -954,12 +1062,13 @@ "metadata": {}, "outputs": [], "source": [ + "import demo_helpers\n", + "\n", "demo_helpers.delete_project_resources(\n", " sagemaker_boto_client=sagemaker_boto_client, \n", " sagemaker_session=sagemaker_session,\n", " endpoint_names=[pipeline_endpoint_name],\n", - " pipeline_names=[pipeline_name, dataprep_pipeline_name, train_deploy_pipeline_name], \n", - " mpg_name=mpg_name,\n", + " pipeline_names=[pipeline_name, dataprep_pipeline_name, train_deploy_pipeline_name],\n", " prefix=prefix,\n", " delete_s3_objects=True,\n", " bucket_name=bucket\n", @@ -968,7 +1077,6 @@ } ], "metadata": { - "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "conda_python3", "language": "python", From 8eb286e3289847394a7a1327731221de5c601c02 Mon Sep 17 00:00:00 2001 From: atqy Date: Sat, 7 May 2022 06:50:48 +0000 Subject: [PATCH 11/25] download more files --- .../end_to_end_pipeline.ipynb | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index a2a6c6d7e5..535e6f912e 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -219,6 +219,36 @@ "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files_to_download = [\n", + " f\"sample_tracks.csv\",\n", + " f\"sample_user.csv\",\n", + " f\"train_data_headers.csv\",\n", + " f\"train_data.zip\",\n", + " f\"val_data_headers.csv\",\n", + " f\"val_data.zip\",\n", + " \n", + "]\n", + "\n", + "for file in files_to_download:\n", + " s3_client.download_file(f\"sagemaker-sample-files\", f\"datasets/tabular/synthetic-music/{file}\", f\"./data/{file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! unzip -o './data/*.zip' -d './data'\n", + "! 
rm ./data/*.zip" + ] + }, { "cell_type": "code", "execution_count": null, From 8ef0d65e6e0966d4aa5743f40db1df43131def36 Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 9 May 2022 12:09:16 +0000 Subject: [PATCH 12/25] file download location changed --- .../end_to_end_pipeline.ipynb | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index 535e6f912e..60d9922ec6 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -204,8 +204,7 @@ "outputs": [], "source": [ "new_data_paths = get_data(s3_client, [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, prefix, sample_data=0.70)\n", - "new_data_path_new = get_data(s3_client, [f\"{s3_bucket_music_data}/tracks_new.csv\", f\"{s3_bucket_music_data}/ratings_new.csv\"], bucket, prefix, sample_data=0.70)\n", - "print(new_data_paths, new_data_path_new)" + "print(new_data_paths)" ] }, { @@ -232,6 +231,8 @@ " f\"train_data.zip\",\n", " f\"val_data_headers.csv\",\n", " f\"val_data.zip\",\n", + " f\"tracks_new.csv\",\n", + " f\"ratings_new.csv\",\n", " \n", "]\n", "\n", @@ -668,22 +669,6 @@ ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#TuningStep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, From 97ce6b4b5248d060f51eab6efc276b811e2456c3 Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 9 May 2022 12:13:13 +0000 Subject: [PATCH 13/25] file download location changed --- end_to_end/music_recommendation/end_to_end_pipeline.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index 60d9922ec6..56ffe43936 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -257,6 +257,8 @@ "outputs": [], "source": [ "# upload train and validation datasets as well\n", + "s3_client.upload_file('data/tracks_new.csv', bucket, f'{prefix}/data/tracks_new.csv')\n", + "s3_client.upload_file('data/ratings_new.csv', bucket, f'{prefix}/data/ratings_new.csv')\n", "s3_client.upload_file('data/train_data.csv', bucket, f'{prefix}/data/train/train_data.csv')\n", "s3_client.upload_file('data/val_data.csv', bucket, f'{prefix}/data/val/val_data.csv')\n", "\n", From b8e9fdfc2c93d3af5fcfb7067868ef37fd23617c Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 9 May 2022 16:46:59 +0000 Subject: [PATCH 14/25] markdown corrections --- .../01_data_exploration.ipynb | 2 +- .../02_export_feature_groups.ipynb | 2 +- ...eploy_debugger_explain_monitor_registry.ipynb | 4 ++-- .../end_to_end_pipeline.ipynb | 16 ++++++++++++++++ 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/end_to_end/music_recommendation/01_data_exploration.ipynb b/end_to_end/music_recommendation/01_data_exploration.ipynb index 307d294dcd..e6b11cf45c 100644 --- a/end_to_end/music_recommendation/01_data_exploration.ipynb +++ b/end_to_end/music_recommendation/01_data_exploration.ipynb @@ -13,7 +13,7 @@ "This notebook is part of a notebook series that goes through the ML Lifecycle and show how we can build a Music Recommender System using a combination of SageMaker Services and 
features. In this notebook, we will be focusing on exploring the data. It is the first notebook in a series of notebooks. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case implement of this sequence of notebooks. \n", "\n", "1. [Music Recommender Data Exploration](01_data_exploration.ipynb) (current notebook)\n", - "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_data_exploration.ipynb)\n", + "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb)\n", "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb)\n", "\n", "\n", diff --git a/end_to_end/music_recommendation/02_export_feature_groups.ipynb b/end_to_end/music_recommendation/02_export_feature_groups.ipynb index 8cd9cf45c1..93a1bb871d 100644 --- a/end_to_end/music_recommendation/02_export_feature_groups.ipynb +++ b/end_to_end/music_recommendation/02_export_feature_groups.ipynb @@ -17,7 +17,7 @@ "Processing Job and ingest processed data to Feature Store. It is the second notebook in the series. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case implement of this sequence of notebooks. \n", "\n", "1. [Music Recommender Data Exploration](01_data_exploration.ipynb)\n", - "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_data_exploration.ipynb) (current notebook)\n", + "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb) (current notebook)\n", "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb)\n", " \n", "----\n", diff --git a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb index 2dad2ab0f2..ccf3e4f24d 100644 --- a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb +++ b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb @@ -11,8 +11,8 @@ "This notebook is part of a notebook series that goes through the ML Lifecycle and show how we can build a Music Recommender System using a combination of SageMaker Services and features. This notebook will train our model using the data we prepped with SageMaker Data Wrangler and stored in our Feature Store, attaching [SageMaker Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html) to the model training so that we can capture training metrics/statistics about the model. Then, we will deploy the model and use SageMaker Explainability and Model Monitor to examine our deployed model. After that, we'll log more model artifacts using [SageMaker ML Lineage Tracking](https://docs.aws.amazon.com/sagemaker/latest/dg/lineage-tracking.html). Finally we'll register the model and save its version. It is one of two notebooks you choose to run as the third notebook in the series. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. 
Please see the [README.md](README.md) for more information about this use case of this sequence of notebooks.\n", "\n", "1. [Music Recommender Data Exploration](01_data_exploration.ipynb)\n", - "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_data_exploration.ipynb) (current notebook)\n", - "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03a_train_deploy_debugger_explain_monitor_registry.ipynb)\n", + "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb)\n", + "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb) (current notebook)\n", "\n", "\n", "
💡 Alert \n",
diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb
index 56ffe43936..c9c3d3897d 100644
--- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb
+++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb
@@ -11,6 +11,12 @@
    "\n",
    "In this notebook, we'll build an end-to-end pipeline to create a music recommender using [SageMaker Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/pipelines.html), which will automate the entire modeling process from the beginning of data ingestion to monitoring the model. SageMaker Pipelines is a tool for building machine learning pipelines that take advantage of direct SageMaker integration. Because of this integration, you can create a pipeline and set up SageMaker Projects for orchestration using a tool that handles much of the step creation and management for you.\n",
    "\n",
+    "If you want to learn more about each step of the pipeline, feel free to look at the series of notebooks listed below. They implement the same process as this notebook in a manual way, with more detailed descriptions of what each step does. Please see the [README.md](README.md) for more information about the use case implemented by this sequence of notebooks. \n",
+    "\n",
+    "1. [Music Recommender Data Exploration](01_data_exploration.ipynb)\n",
+    "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb)\n",
+    "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb)\n",
+    "\n",
    "----\n",
    "### Contents\n",
    "1. [Architecture: Create a SageMaker Pipeline to Automate All the Steps from Data Prep to Model Deployment](#Architecture:-Create-a-SageMaker-Pipeline-to-Automate-All-the-Steps-from-Data-Prep-to-Model-Deployment)\n",
    "1. [SageMaker Pipeline Overview](#SageMaker-Pipeline-Overview)\n",
    "1. [Clean Up](#Clean-Up)"
@@ -218,6 +224,16 @@
    "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For this example, we will provide the processed data you need to complete this task. 
But you are free to take a look at how we processed the data:\n", + "\n", + "* If you are curious as to how `tracks_new.csv` and `ratings_new.csv` are generated, see [Music Recommender Data Exploration](01_data_exploration.ipynb)\n", + "* If you are curious as to how the rest of the files are generated, see [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb)" + ] + }, { "cell_type": "code", "execution_count": null, From 8330b8d82d21f3bf43d87c0337c1bdce0d79ea6d Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 9 May 2022 18:00:34 +0000 Subject: [PATCH 15/25] change instance type --- end_to_end/music_recommendation/02_export_feature_groups.ipynb | 1 + end_to_end/music_recommendation/end_to_end_pipeline.ipynb | 1 + 2 files changed, 2 insertions(+) diff --git a/end_to_end/music_recommendation/02_export_feature_groups.ipynb b/end_to_end/music_recommendation/02_export_feature_groups.ipynb index 93a1bb871d..804d6669ff 100644 --- a/end_to_end/music_recommendation/02_export_feature_groups.ipynb +++ b/end_to_end/music_recommendation/02_export_feature_groups.ipynb @@ -1195,6 +1195,7 @@ } ], "metadata": { + "instance_type": "ml.m5.2xlarge", "kernelspec": { "display_name": "conda_python3", "language": "python", diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index c9c3d3897d..24e34f4cb5 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -1110,6 +1110,7 @@ } ], "metadata": { + "instance_type": "ml.m5.2xlarge", "kernelspec": { "display_name": "conda_python3", "language": "python", From 634b4cea06060d01a27e2c86098a9bdc03ec8429 Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 9 May 2022 19:22:43 +0000 Subject: [PATCH 16/25] change instence type --- end_to_end/music_recommendation/end_to_end_pipeline.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index 24e34f4cb5..a92b253ba7 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -1110,7 +1110,7 @@ } ], "metadata": { - "instance_type": "ml.m5.2xlarge", + "instance_type": "ml.m5.4xlarge", "kernelspec": { "display_name": "conda_python3", "language": "python", From fc5ebf4cde4a5773120395eba2d185e2d9b65233 Mon Sep 17 00:00:00 2001 From: atqy Date: Tue, 10 May 2022 00:46:04 +0000 Subject: [PATCH 17/25] change instance type --- end_to_end/music_recommendation/02_export_feature_groups.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/end_to_end/music_recommendation/02_export_feature_groups.ipynb b/end_to_end/music_recommendation/02_export_feature_groups.ipynb index 804d6669ff..5cdf448d6c 100644 --- a/end_to_end/music_recommendation/02_export_feature_groups.ipynb +++ b/end_to_end/music_recommendation/02_export_feature_groups.ipynb @@ -1195,7 +1195,7 @@ } ], "metadata": { - "instance_type": "ml.m5.2xlarge", + "instance_type": "ml.m5.4xlarge", "kernelspec": { "display_name": "conda_python3", "language": "python", From e8a5d8e4643240a9b1cdc2e68f25c59560d5b54e Mon Sep 17 00:00:00 2001 From: atqy Date: Tue, 10 May 2022 05:42:56 +0000 Subject: [PATCH 18/25] change names --- .../music_recommendation/end_to_end_pipeline.ipynb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) 
diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index a92b253ba7..bedba2113e 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -124,7 +124,7 @@ "\n", "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", - "prefix='music-recommendation'" + "prefix='music-recommendation-pipeline'" ] }, { @@ -600,7 +600,7 @@ " instance_type=\"ml.m5.4xlarge\",\n", " instance_count=2,\n", " volume_size_in_gb=100,\n", - " base_job_name='music-recommendation-split-data',\n", + " base_job_name='music-rec-pipeline-split-data',\n", " sagemaker_session=sagemaker_session)\n", "\n", "create_dataset_step = ProcessingStep(\n", @@ -660,7 +660,7 @@ " image_uri=sagemaker.image_uris.retrieve(\"xgboost\", region, \"0.90-2\"),\n", " hyperparameters=hyperparameters,\n", " output_path=f's3://{bucket}/{prefix}/training_jobs',\n", - " base_job_name='xgb-music-rec-model-pipeline',\n", + " base_job_name='xgb-music-rec-pipeline-model',\n", " max_run=1800\n", ")" ] @@ -701,7 +701,7 @@ "outputs": [], "source": [ "model = sagemaker.model.Model(\n", - " name='music-recommender-xgboost-model',\n", + " name='music-rec-pipeline-xgboost-model',\n", " image_uri=train_step.properties.AlgorithmSpecification.TrainingImage,\n", " model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n", " sagemaker_session=sagemaker_session,\n", @@ -764,7 +764,7 @@ "source": [ "s3_client.upload_file(Filename='./code/deploy_model.py', Bucket=bucket, Key=f'{prefix}/code/deploy_model.py')\n", "deploy_model_script_uri = f's3://{bucket}/{prefix}/code/deploy_model.py'\n", - "pipeline_endpoint_name = 'music-rec-model-endpoint'\n", + "pipeline_endpoint_name = 'music-rec-pipeline-endpoint'\n", "\n", "deploy_model_processor = SKLearnProcessor(\n", " framework_version='0.23-1',\n", @@ -802,7 +802,7 @@ "source": [ "s3_client.upload_file(Filename='./code/model_monitor.py', Bucket=bucket, Key=f'{prefix}/code/model_monitor.py')\n", "model_monitor_script_uri = f's3://{bucket}/{prefix}/code/model_monitor.py'\n", - "mon_schedule_name_base = 'music-recommender-daily-monitor'\n", + "mon_schedule_name_base = 'music-rec-pipeline-daily-monitor'\n", "\n", "\n", "model_monitor_processor = SKLearnProcessor(\n", @@ -811,7 +811,7 @@ " instance_type='ml.m5.xlarge',\n", " instance_count=1,\n", " volume_size_in_gb=60,\n", - " base_job_name='music-recommendation-model-monitor',\n", + " base_job_name='music-rec-pipeline-model-monitor',\n", " sagemaker_session=sagemaker_session)\n", "\n", "monitor_model_step = ProcessingStep(\n", From 23c8a672f9d499bfc8e3bb0c73edee18a5e6a86b Mon Sep 17 00:00:00 2001 From: atqy Date: Tue, 10 May 2022 08:20:30 +0000 Subject: [PATCH 19/25] increase max_attempts --- end_to_end/music_recommendation/end_to_end_pipeline.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index bedba2113e..a2df64e7ad 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -1069,7 +1069,7 @@ "start_response = pipeline.start(parameters=parameters)\n", "# start_response = pipeline_dataprep.start(parameters=parameters)\n", "# start_response = pipeline_train_deploy_monitor.start(parameters=parameters)\n", - "start_response.wait(delay=60, max_attempts=200)\n", + 
"start_response.wait(delay=60, max_attempts=1000)\n", "start_response.describe()" ] }, From 0f2f694d80c147db2d50d66258cfe5dd4ee01607 Mon Sep 17 00:00:00 2001 From: atqy Date: Tue, 10 May 2022 14:36:22 -0700 Subject: [PATCH 20/25] edit markdown --- .../01_data_exploration.ipynb | 10 +++--- .../02_export_feature_groups.ipynb | 24 ++++++-------- ...oy_debugger_explain_monitor_registry.ipynb | 31 +++---------------- end_to_end/music_recommendation/README.md | 2 +- .../end_to_end_pipeline.ipynb | 10 ++---- 5 files changed, 21 insertions(+), 56 deletions(-) diff --git a/end_to_end/music_recommendation/01_data_exploration.ipynb b/end_to_end/music_recommendation/01_data_exploration.ipynb index e6b11cf45c..edb0da7cd8 100644 --- a/end_to_end/music_recommendation/01_data_exploration.ipynb +++ b/end_to_end/music_recommendation/01_data_exploration.ipynb @@ -4,20 +4,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "# Music Recommender Data Exploration\n", + "----\n", "\n", "## Background\n", "\n", - "This notebook is part of a notebook series that goes through the ML Lifecycle and show how we can build a Music Recommender System using a combination of SageMaker Services and features. In this notebook, we will be focusing on exploring the data. It is the first notebook in a series of notebooks. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case implement of this sequence of notebooks. \n", + "This notebook is part of a notebook series that goes through the ML lifecycle and shows how we can build a Music Recommender System using a combination of SageMaker services and features. In this notebook, we will be focusing on exploring the data. It is the first notebook in a series of notebooks. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case implement of this sequence of notebooks. \n", "\n", "1. [Music Recommender Data Exploration](01_data_exploration.ipynb) (current notebook)\n", "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb)\n", "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb)\n", "\n", + "----\n", "\n", - "### Contents\n", + "## Contents\n", "1. [Prereqs: Get Data](#Prereqs:-Get-Data)\n", "1. [Update the Data Source in the .flow File](#Update-the-Data-Source-in-the-.flow-File)\n", "1. [Explore the Data](#Explore-the-Data)\n" @@ -142,7 +142,7 @@ "\n", "----\n", "\n", - "The `01_music_datapred.flow` file is a JSON file containing instructions for where to find your data sources and how to transform the data. We'll be updating the object telling Data Wrangler where to find the input data on S3. We will set this to your default S3 bucket. With this update to the `.flow` file it now points to your new S3 bucket as the data source used by SageMaker Data Wrangler.\n", + "The `01_music_dataprep.flow` file is a JSON file containing instructions for where to find your data sources and how to transform the data. We'll be updating the object telling Data Wrangler where to find the input data on S3. We will set this to your default S3 bucket. 
With this update to the `.flow` file it now points to your new S3 bucket as the data source used by SageMaker Data Wrangler.\n", "\n", "Make sure the `.flow` file is closed before running this next step or it won't update the new s3 file locations in the file" ] diff --git a/end_to_end/music_recommendation/02_export_feature_groups.ipynb b/end_to_end/music_recommendation/02_export_feature_groups.ipynb index 5cdf448d6c..49a9b5bdba 100644 --- a/end_to_end/music_recommendation/02_export_feature_groups.ipynb +++ b/end_to_end/music_recommendation/02_export_feature_groups.ipynb @@ -4,25 +4,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "# Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler\n", "\n", "----\n", "\n", "## Background\n", "\n", - "This notebook is part of a notebook series that goes through the ML Lifecycle and show how we can build a Music Recommender System using a combination of SageMaker Services and features. This notebook uses Amazon SageMaker Feature Store (Feature Store) to create a feature group, \n", + "This notebook is part of a notebook series that goes through the ML lifecycle and shows how we can build a Music Recommender System using a combination of SageMaker services and features. This notebook uses Amazon SageMaker Feature Store (Feature Store) to create a feature group, \n", "executes your Data Wrangler Flow `01_music_dataprep.flow` on the entire dataset using a SageMaker \n", "Processing Job and ingest processed data to Feature Store. It is the second notebook in the series. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case implement of this sequence of notebooks. \n", "\n", "1. [Music Recommender Data Exploration](01_data_exploration.ipynb)\n", "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb) (current notebook)\n", "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb)\n", - " \n", + "\n", "----\n", - "### Contents\n", - "1. [Define Feature Group](#Define-Feature-Group)\n", + "\n", + "## Contents\n", + "1. [Prereqs: Get Data](#Prereqs:-Get-Data)\n", + "1. [Update the Data Source in the .flow File](#Update-the-Data-Source-in-the-.flow-File)\n", + "1. [Create Feature Group](#Create-Feature-Group)\n", "1. [Configure Feature Group](#Configure-Feature-Group)\n", "1. [Initialize & Create Feature Group](#Initialize-&-Create-Feature-Group)\n", "1. [Inputs and Outputs](#Inputs-and-Outputs)\n", @@ -106,12 +107,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Prereqs: Get Data \n", "\n", - "##### [back to top](#00-nb)\n", - "\n", "----\n", "\n", "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. " @@ -175,8 +172,6 @@ "\n", "## Update the data source in the `.flow` file\n", "\n", - "##### [back to top](#00-nb)\n", - "\n", "----\n", "The `01_music_datapred.flow` file is a JSON file containing instructions for where to find your data sources and how to transform the data. We'll be updating the object telling Data Wrangler where to find the input data on S3. We will set this to your default S3 bucket. 
With this update to the `.flow` file it now points to your new S3 bucket as the data source used by SageMaker Data Wrangler.\n", "\n", @@ -199,6 +194,7 @@ "metadata": {}, "source": [ "## Create Feature Group\n", + "----\n", "\n", "[Amazon SageMaker Feature Store](https://www.youtube.com/watch?v=pEg5c6d4etI) is a fully managed, purpose-built repository to store, update, retrieve, and share machine learning (ML) features. Features are the attributes or properties models use during training and inference to make predictions. For example, in a ML application that recommends a music playlist, features could include song ratings, which songs were listened to previously, and how long songs were listened to. The accuracy of a ML model is based on a precise set and composition of features. Often, these features are used repeatedly by multiple teams training multiple models. And whichever feature set was used to train the model needs to be available to make real-time predictions (inference). Keeping a single source of features that is consistent and up-to-date across these different access patterns is a challenge as most organizations keep two different feature stores, one for training and one for inference.\n", "\n", @@ -218,7 +214,7 @@ "metadata": {}, "source": [ "### Define Feature Group \n", - "----\n", + "\n", "Select Record identifier and Event time feature name. These are required parameters for feature group\n", "creation.\n", "* **Record identifier name** is the name of the feature defined in the feature group's feature definitions \n", @@ -1024,8 +1020,6 @@ "\n", "## Fetch Data from Offline Feature Store\n", "\n", - "##### [back to top](#03-nb)\n", - "\n", "----\n", "There are 3 feature stores for the ratings, tracks, and user preferences data. We retrieve data from all 3 before joining them." ] diff --git a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb index ccf3e4f24d..3cb4d7df6c 100644 --- a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb +++ b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb @@ -8,7 +8,9 @@ "\n", "----\n", "\n", - "This notebook is part of a notebook series that goes through the ML Lifecycle and show how we can build a Music Recommender System using a combination of SageMaker Services and features. This notebook will train our model using the data we prepped with SageMaker Data Wrangler and stored in our Feature Store, attaching [SageMaker Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html) to the model training so that we can capture training metrics/statistics about the model. Then, we will deploy the model and use SageMaker Explainability and Model Monitor to examine our deployed model. After that, we'll log more model artifacts using [SageMaker ML Lineage Tracking](https://docs.aws.amazon.com/sagemaker/latest/dg/lineage-tracking.html). Finally we'll register the model and save its version. It is one of two notebooks you choose to run as the third notebook in the series. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. 
Please see the [README.md](README.md) for more information about this use case of this sequence of notebooks.\n",
+    "## Background\n",
+    "\n",
+    "This notebook is part of a notebook series that goes through the ML lifecycle and shows how we can build a Music Recommender System using a combination of SageMaker services and features. This notebook will train our model using the data we prepped with SageMaker Data Wrangler and stored in our Feature Store, attaching [SageMaker Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html) to the model training so that we can capture training metrics/statistics about the model. Then, we will deploy the model and use SageMaker Explainability and Model Monitor to examine our deployed model. After that, we'll log more model artifacts using [SageMaker ML Lineage Tracking](https://docs.aws.amazon.com/sagemaker/latest/dg/lineage-tracking.html). Finally we'll register the model and save its version. It is one of two notebooks you choose to run as the third notebook in the series. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about the use case implemented by this sequence of notebooks.\n",
    "\n",
    "1. [Music Recommender Data Exploration](01_data_exploration.ipynb)\n",
    "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb)\n",
    "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb) (current notebook)\n",
    "\n",
    "\n",
    "
\n", "\n", "----\n", - "### Contents\n", + "## Contents\n", "- [Overview](00_overview_arch_data.ipynb)\n", "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n", "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n", @@ -135,8 +137,6 @@ "\n", "## Prereqs: Get Data \n", "\n", - "##### [back to top](#00-nb)\n", - "\n", "----\n", "\n", "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. " @@ -264,8 +264,6 @@ "\n", "## Train Model \n", "\n", - "##### [back to top](#03-nb)\n", - "\n", "----" ] }, @@ -433,9 +431,6 @@ "\n", "\n", "## Deploy Model\n", - "\n", - "##### [back to top](#04-nb)\n", - "\n", "----" ] }, @@ -495,9 +490,6 @@ " \n", "\n", "## Create a predictor\n", - "\n", - "##### [back to top](#04-nb)\n", - "\n", "----" ] }, @@ -586,9 +578,6 @@ " \n", "\n", "## Infer (predict) new songs using model\n", - "\n", - "##### [back to top](#04-nb)\n", - "\n", "----" ] }, @@ -641,9 +630,6 @@ " \n", "\n", "## Explain model predictions\n", - "\n", - "##### [back to top](#04-nb)\n", - "\n", "----" ] }, @@ -994,9 +980,6 @@ "## Model Monitor\n", "\n", "## Step 1: Enable real-time inference data capture\n", - "\n", - "##### [back to top](#05-nb)\n", - "\n", "----" ] }, @@ -1068,9 +1051,6 @@ "\n", "\n", "## Step 2: Model Monitor - Baselining\n", - "\n", - "##### [back to top](#05-nb)\n", - "\n", "----" ] }, @@ -1277,9 +1257,6 @@ "\n", "\n", "## Step 3: Enable continous monitoring\n", - "\n", - "##### [back to top](#05-nb)\n", - "\n", "----\n", "\n", "We have collected the data above, here we proceed to analyze and monitor the data with MonitoringSchedules." diff --git a/end_to_end/music_recommendation/README.md b/end_to_end/music_recommendation/README.md index 84f6256819..3b1bce5dcc 100644 --- a/end_to_end/music_recommendation/README.md +++ b/end_to_end/music_recommendation/README.md @@ -72,4 +72,4 @@ In the following notebooks we'll take 2 different approaches with the same model # Clean Up -In order to prevent ongoing charges to your AWS account, clean up any resources we spun up during this tutorial. We've also included a notebook, `07_clean_up.ipynb`, to delete all resources spun up by this demo. +In order to prevent ongoing charges to your AWS account, clean up any resources we spun up during this tutorial at the end of notebooks [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb) and [Train, Deploy, and Monitor the Music Recommender Model using SageMaker Pipelines](end_to_end_pipeline.ipynb). diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index a2df64e7ad..b35337c1f9 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -18,7 +18,7 @@ "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb)\n", "\n", "----\n", - "### Contents\n", + "## Contents\n", "1. [Architecture: Create a SageMaker Pipeline to Automate All the Steps from Data Prep to Model Deployment](#Architecture:-Create-a-SageMaker-Pipeline-to-Automate-All-the-Steps-from-Data-Prep-to-Model-Deployment)\n", "1. [SageMaker Pipeline Overview](#SageMaker-Pipeline-Overview)\n", "1. 
[Clean Up](#Clean-Up)" @@ -166,9 +166,6 @@ "\n", "\n", "## Prereqs: Get Data \n", - "\n", - "##### [back to top](#00-nb)\n", - "\n", "----\n", "\n", "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. " @@ -724,10 +721,7 @@ "metadata": {}, "source": [ "### Step 5: Register Model\n", - "In this step you will use the ParameterString `model_approval_status` defined at the outset of the pipeline code.\n", - "\n", - "\n", - "[Pipeline Overview](#pipelines)" + "In this step you will use the ParameterString `model_approval_status` defined at the outset of the pipeline code." ] }, { From 5110c5d39c98a00adb6b58234f112c38055aeeb2 Mon Sep 17 00:00:00 2001 From: atqy Date: Tue, 10 May 2022 21:59:11 +0000 Subject: [PATCH 21/25] clean up table of contents --- .../02_export_feature_groups.ipynb | 19 ++-- ...oy_debugger_explain_monitor_registry.ipynb | 86 +++++++------------ .../end_to_end_pipeline.ipynb | 4 +- 3 files changed, 38 insertions(+), 71 deletions(-) diff --git a/end_to_end/music_recommendation/02_export_feature_groups.ipynb b/end_to_end/music_recommendation/02_export_feature_groups.ipynb index 49a9b5bdba..c30400e390 100644 --- a/end_to_end/music_recommendation/02_export_feature_groups.ipynb +++ b/end_to_end/music_recommendation/02_export_feature_groups.ipynb @@ -29,6 +29,7 @@ "1. [Inputs and Outputs](#Inputs-and-Outputs)\n", "1. [Upload Flow to S3](#Upload-Flow-to-S3)\n", "1. [Run Processing Job](#Run-Processing-Job)\n", + "1. [Fetch Data from Offline Feature Store](#Fetch-Data-from-Offline-Feature-Store)\n", "\n", "\n", "
💡 Quick Start \n", @@ -168,9 +169,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "## Update the data source in the `.flow` file\n", + "## Update the Data Source in the `.flow` File\n", "\n", "----\n", "The `01_music_datapred.flow` file is a JSON file containing instructions for where to find your data sources and how to transform the data. We'll be updating the object telling Data Wrangler where to find the input data on S3. We will set this to your default S3 bucket. With this update to the `.flow` file it now points to your new S3 bucket as the data source used by SageMaker Data Wrangler.\n", @@ -593,7 +592,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Initialize & Create Feature Group\n", + "## Initialize & Create Feature Group\n", "\n", "----" ] @@ -739,7 +738,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Input - S3 Source: tracks.csv" + "### Input - S3 Source: tracks.csv" ] }, { @@ -762,7 +761,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Input - S3 Source: ratings.csv" + "### Input - S3 Source: ratings.csv" ] }, { @@ -885,7 +884,7 @@ "## Run Processing Job\n", "\n", "----\n", - "## Job Configurations\n", + "### Job Configurations\n", "\n", "
💡 Configurable Settings \n", "\n", @@ -929,7 +928,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Create Processing Job\n", + "### Create Processing Job\n", "\n", "To launch a Processing Job, you will use the SageMaker Python SDK to create a Processor function." ] @@ -958,7 +957,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Job Status & S3 Output Location\n", + "### Job Status & S3 Output Location\n", "\n", "Below you wait for processing job to finish. If it finishes successfully, your feature group should be populated \n", "with transformed feature values. In addition the raw parameters used by the Processing Job will be printed." @@ -1016,8 +1015,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Fetch Data from Offline Feature Store\n", "\n", "----\n", diff --git a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb index 3cb4d7df6c..9788a44e8e 100644 --- a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb +++ b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb @@ -24,22 +24,16 @@ "\n", "----\n", "## Contents\n", - "- [Overview](00_overview_arch_data.ipynb)\n", - "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n", - "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n", - "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n", - "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n", - "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n", - " - [Fetch Data from Feature Store](#03-feature-store)\n", - " - [Split Data and Save to S3](#03-split)\n", - " - [Train Model](#03-train)\n", - " - [SageMaker Debugger Reports](#03-debugger)\n", - " - [Set Lineage Artifacts](#03-lineage)\n", - " - [Register Model](#03-register)\n", - "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_infer_explain.ipynb)\n", - "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n", - "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n", - "- [Part 7: Resource Cleanup](07_clean_up.ipynb)" + "1. [Prereqs: Get Data](#Prereqs:-Get-Data)\n", + "1. [Train Model](#Train-Model)\n", + "1. [Deploy Model](#Deploy-Model)\n", + "1. [Create a Predictor](#Create-a-Predictor)\n", + "1. [Infer New Songs using Model](#Infer-New-Songs-using-Model)\n", + "1. [Explain Model Predictions](#Explain-Model-Predictions)\n", + "1. [View SageMaker Debugger Reports](#View-SageMaker-Debugger-Reports)\n", + "1. [SageMaker Model Monitor](#SageMaker-Model-Monitor)\n", + "1. [Register Model with SageMaker Model Registry](#Register-Model-with-SageMaker-Model-Registry)\n", + "1. 
[Clean Up](#Clean-Up)" ] }, { @@ -133,8 +127,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Prereqs: Get Data \n", "\n", "----\n", @@ -226,7 +218,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Save data to S3" + "### Save data to S3" ] }, { @@ -260,8 +252,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Train Model \n", "\n", "----" @@ -428,8 +418,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Deploy Model\n", "----" ] @@ -487,9 +475,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - " \n", - "\n", - "## Create a predictor\n", + "## Create a Predictor\n", "----" ] }, @@ -575,9 +561,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - " \n", - "\n", - "## Infer (predict) new songs using model\n", + "## Infer New Songs using Model\n", "----" ] }, @@ -627,9 +611,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - " \n", - "\n", - "## Explain model predictions\n", + "## Explain Model Predictions\n", "----" ] }, @@ -749,8 +731,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## View SageMaker Debugger Reports\n", "\n", "----\n", @@ -974,12 +954,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## SageMaker Model Monitor\n", "\n", - "\n", - "\n", - "## Model Monitor\n", - "\n", - "## Step 1: Enable real-time inference data capture\n", + "### Step 1: Enable real-time inference data capture\n", "----" ] }, @@ -1040,7 +1017,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Before you proceed:\n", + "#### Before you proceed:\n", "Currently SageMaker supports monitoring Endpoints out of the box only for **tabular (csv, flat-json)** datasets. If your Endpoint uses some other datasets, these following steps will NOT work for you.\n" ] }, @@ -1048,9 +1025,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "## Step 2: Model Monitor - Baselining\n", + "### Step 2: Model Monitor - Baselining\n", "----" ] }, @@ -1067,7 +1042,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Constraint suggestion with baseline/training dataset" + "#### Constraint suggestion with baseline/training dataset" ] }, { @@ -1098,7 +1073,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Create a baselining job with the validation dataset" + "#### Create a baselining job with the validation dataset" ] }, { @@ -1158,7 +1133,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Exploratory Analysis of the Processing Jobs underlying SageMaker Monitor\n", + "#### Exploratory Analysis of the Processing Jobs underlying SageMaker Monitor\n", "In this short section [next few cells] we will be showing you how to further view the underlying jobs for the monitoring job" ] }, @@ -1217,7 +1192,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Explore the generated constraints and statistics" + "#### Explore the generated constraints and statistics" ] }, { @@ -1254,9 +1229,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "## Step 3: Enable continous monitoring\n", + "### Step 3: Enable continous monitoring\n", "----\n", "\n", "We have collected the data above, here we proceed to analyze and monitor the data with MonitoringSchedules." 
@@ -1266,7 +1239,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Create a schedule" + "#### Create a schedule" ] }, { @@ -1330,7 +1303,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### All set\n", + "#### All set\n", "Now that your monitoring schedule has been created. Please return to the Amazon SageMaker Studio to list the executions for this Schedule and observe the results going forward." ] }, @@ -1338,9 +1311,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "## Model Registry\n", + "## Register Model with SageMaker Model Registry\n", "\n", "Amazon SageMaker ML Lineage Tracking creates and stores information about the steps of a machine learning workflow from data preparation to model deployment. With the tracking information you can reproduce the workflow steps, track model and dataset lineage, and establish model governance and audit standards\n", "\n", @@ -1407,7 +1378,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Create Model from Estimator" + "### Create Model from Estimator" ] }, { @@ -1436,7 +1407,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Clean Up" + "## Clean Up\n", + "----" ] }, { diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index b35337c1f9..a1869d35fa 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -163,8 +163,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Prereqs: Get Data \n", "----\n", "\n", @@ -852,7 +850,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Option 1: The Entire Pipeline End to end" + "#### Option 1: The Entire Pipeline End to end" ] }, { From 6111cb99a6901d03c1505e8c1f0a7b71cac1f62d Mon Sep 17 00:00:00 2001 From: atqy Date: Tue, 10 May 2022 22:04:55 +0000 Subject: [PATCH 22/25] edit links --- end_to_end/music_recommendation/README.md | 13 ++++--------- end_to_end/music_recommendation/index.rst | 13 ++++--------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/end_to_end/music_recommendation/README.md b/end_to_end/music_recommendation/README.md index 3b1bce5dcc..21273be10a 100644 --- a/end_to_end/music_recommendation/README.md +++ b/end_to_end/music_recommendation/README.md @@ -54,17 +54,12 @@ For this tutorial, we'll be using our own generated track and user ratings data, # Approach In the following notebooks we'll take 2 different approaches with the same modeling solution to create our music recommender. -1. Run each notebook, 02a_ to 05_, to walkthrough each data prep and modeling step +1. 
Run the following notebooks in order to walk through each data prep and modeling step
    - 01_music_dataprep.flow: Flow file defining our data input and transformation steps; this file is created in the Sagemaker Data Wrangler GUI
-   - 02a_export_fs_tracks.ipynb: export our tracks data created in Data Wrangler to a feature store
-   - 02b_export_fs_5star_features.ipynb: export our 5-star rated tracks data created in Data Wrangler to a feature store
-   - 02c_fs_create_ratings.ipynb: export our user ratings data created in Data Wrangler to a feature store
-   - 03_train_model_lineage_registry_debugger.ipynb: train the model using xgboost to predict each song rating for each user
-   - 04_inference_explainability.ipynb: go over feature importances using SHAP values
-   - 05_model_monitor.ipynb: setup Sagemaker Model Monitor
+   - 02_export_feature_groups.ipynb: export our tracks data, 5-star rated tracks data, and user ratings data created in Data Wrangler to a feature store
+   - 03_train_deploy_debugger_explain_monitor_registry.ipynb: train and deploy the model using xgboost to predict each song rating for each user. We also go over feature importances using SHAP values and set up Sagemaker Model Monitor.
 1. Setup a Sagemaker Pipeline to do all the aformentioned steps in a single notebook so that it can be ran automatically over time
-   - 01_music_dataprep.flow: Flow file defining our data input and transformation steps; this file is created in the Sagemaker Data Wrangler GUI
-   - 06_pipeline.ipynb: setup each modeling step using sagemaker.workflow Pipeline object
+   - end_to_end_pipeline.ipynb: set up each modeling step using the sagemaker.workflow Pipeline object
 
 ### Solution Architecture
 ![architecture diagram](./images/music-rec-2c-all-mlops.png)
diff --git a/end_to_end/music_recommendation/index.rst b/end_to_end/music_recommendation/index.rst
index 9fd6d132b3..e8ddb460ee 100644
--- a/end_to_end/music_recommendation/index.rst
+++ b/end_to_end/music_recommendation/index.rst
@@ -9,12 +9,7 @@ Music Recommender System across the Entire ML-Lifecycle with Amazon SageMaker
 .. 
toctree:: :maxdepth: 1 - 00_overview_arch_data.ipynb - 02a_export_fg_tracks.ipynb - 02b_export_fg_5star_features.ipynb - 02c_export_fg_ratings.ipynb - 03_train_model_lineage_registry_debugger.ipynb - 04_deploy_infer_explain.ipynb - 05_model_monitor.ipynb - 06_pipeline.ipynb - 07_clean_up.ipynb + 01_data_exploration + 02_export_feature_groups + 03_train_deploy_debugger_explain_monitor_registry + end_to_end_pipeline \ No newline at end of file From 2bfe9e31c141776856f5bb9f2ab1b5ccb65b8ce0 Mon Sep 17 00:00:00 2001 From: atqy Date: Tue, 10 May 2022 15:10:34 -0700 Subject: [PATCH 23/25] reformat --- .../01_data_exploration.ipynb | 53 +- .../02_export_feature_groups.ipynb | 597 ++++++++---------- ...oy_debugger_explain_monitor_registry.ipynb | 368 +++++------ .../end_to_end_pipeline.ipynb | 458 ++++++++------ 4 files changed, 736 insertions(+), 740 deletions(-) diff --git a/end_to_end/music_recommendation/01_data_exploration.ipynb b/end_to_end/music_recommendation/01_data_exploration.ipynb index edb0da7cd8..5a22c220dd 100644 --- a/end_to_end/music_recommendation/01_data_exploration.ipynb +++ b/end_to_end/music_recommendation/01_data_exploration.ipynb @@ -31,7 +31,8 @@ "source": [ "import sys\n", "import pprint\n", - "sys.path.insert(1, './code')" + "\n", + "sys.path.insert(1, \"./code\")" ] }, { @@ -43,6 +44,7 @@ "# update pandas to avoid data type issues in older 1.0 version\n", "!pip install pandas --upgrade --quiet\n", "import pandas as pd\n", + "\n", "print(pd.__version__)" ] }, @@ -64,10 +66,11 @@ "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "\n", "import json\n", - "import sagemaker \n", + "import sagemaker\n", "import boto3\n", "import os\n", "\n", @@ -76,11 +79,11 @@ "# get session bucket name\n", "bucket = sess.default_bucket()\n", "# bucket prefix or the subfolder for everything we produce\n", - "prefix='music-recommendation'\n", + "prefix = \"music-recommendation\"\n", "# s3 client\n", "s3_client = boto3.client(\"s3\")\n", "\n", - "print(f\"this is your default SageMaker Studio bucket name: {bucket}\") \n" + "print(f\"this is your default SageMaker Studio bucket name: {bucket}\")" ] }, { @@ -119,7 +122,13 @@ "metadata": {}, "outputs": [], "source": [ - "new_data_paths = get_data(s3_client, [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, prefix, sample_data=0.70)\n", + "new_data_paths = get_data(\n", + " s3_client,\n", + " [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"],\n", + " bucket,\n", + " prefix,\n", + " sample_data=0.70,\n", + ")\n", "print(new_data_paths)" ] }, @@ -130,8 +139,8 @@ "outputs": [], "source": [ "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n", - "tracks_data_source = f's3://{bucket}/{prefix}/tracks.csv'\n", - "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'" + "tracks_data_source = f\"s3://{bucket}/{prefix}/tracks.csv\"\n", + "ratings_data_source = f\"s3://{bucket}/{prefix}/ratings.csv\"" ] }, { @@ -155,7 +164,7 @@ }, "outputs": [], "source": [ - "update_data_sources('01_music_dataprep.flow', tracks_data_source, ratings_data_source)" + "update_data_sources(\"01_music_dataprep.flow\", tracks_data_source, ratings_data_source)" ] }, { @@ -173,8 +182,8 @@ "metadata": {}, "outputs": [], "source": [ - "tracks = pd.read_csv('./data/tracks.csv')\n", - "ratings = pd.read_csv('./data/ratings.csv')" + "tracks = pd.read_csv(\"./data/tracks.csv\")\n", + "ratings = 
pd.read_csv(\"./data/ratings.csv\")" ] }, { @@ -201,9 +210,9 @@ "metadata": {}, "outputs": [], "source": [ - "print(\"{:,} different songs/tracks\".format(tracks['trackId'].nunique()))\n", - "print(\"{:,} users\".format(ratings['userId'].nunique()))\n", - "print(\"{:,} user rating events\".format(ratings['ratingEventId'].nunique()))" + "print(\"{:,} different songs/tracks\".format(tracks[\"trackId\"].nunique()))\n", + "print(\"{:,} users\".format(ratings[\"userId\"].nunique()))\n", + "print(\"{:,} user rating events\".format(ratings[\"ratingEventId\"].nunique()))" ] }, { @@ -212,7 +221,7 @@ "metadata": {}, "outputs": [], "source": [ - "tracks.groupby('genre')['genre'].count().plot.bar(title=\"Tracks by Genre\");" + "tracks.groupby(\"genre\")[\"genre\"].count().plot.bar(title=\"Tracks by Genre\");" ] }, { @@ -221,7 +230,9 @@ "metadata": {}, "outputs": [], "source": [ - "ratings[['ratingEventId','userId']].plot.hist(by='userId', bins=50, title=\"Distribution of # of Ratings by User\");" + "ratings[[\"ratingEventId\", \"userId\"]].plot.hist(\n", + " by=\"userId\", bins=50, title=\"Distribution of # of Ratings by User\"\n", + ");" ] }, { @@ -241,8 +252,8 @@ "ratings_new = ratings[:1000]\n", "\n", "# export dataframes to csv\n", - "tracks_new.to_csv('./data/tracks_new.csv', index=False)\n", - "ratings_new.to_csv('./data/ratings_new.csv', index=False)" + "tracks_new.to_csv(\"./data/tracks_new.csv\", index=False)\n", + "ratings_new.to_csv(\"./data/ratings_new.csv\", index=False)" ] }, { @@ -251,8 +262,12 @@ "metadata": {}, "outputs": [], "source": [ - "s3_client.upload_file(Filename=\"./data/tracks_new.csv\", Bucket=bucket, Key=f'{prefix}/data/tracks_new.csv')\n", - "s3_client.upload_file(Filename=\"./data/ratings_new.csv\", Bucket=bucket, Key=f'{prefix}/data/ratings_new.csv')" + "s3_client.upload_file(\n", + " Filename=\"./data/tracks_new.csv\", Bucket=bucket, Key=f\"{prefix}/data/tracks_new.csv\"\n", + ")\n", + "s3_client.upload_file(\n", + " Filename=\"./data/ratings_new.csv\", Bucket=bucket, Key=f\"{prefix}/data/ratings_new.csv\"\n", + ")" ] } ], diff --git a/end_to_end/music_recommendation/02_export_feature_groups.ipynb b/end_to_end/music_recommendation/02_export_feature_groups.ipynb index c30400e390..82a27c0598 100644 --- a/end_to_end/music_recommendation/02_export_feature_groups.ipynb +++ b/end_to_end/music_recommendation/02_export_feature_groups.ipynb @@ -49,7 +49,8 @@ "source": [ "import sys\n", "import pprint\n", - "sys.path.insert(1, './code')" + "\n", + "sys.path.insert(1, \"./code\")" ] }, { @@ -61,6 +62,7 @@ "# update pandas to avoid data type issues in older 1.0 version\n", "!pip install pandas --upgrade --quiet\n", "import pandas as pd\n", + "\n", "print(pd.__version__)" ] }, @@ -82,10 +84,11 @@ "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", + "\n", "%matplotlib inline\n", "\n", "import json\n", - "import sagemaker \n", + "import sagemaker\n", "import boto3\n", "import os\n", "from awscli.customizations.s3.utils import split_s3_bucket_key\n", @@ -95,11 +98,11 @@ "# get session bucket name\n", "bucket = sess.default_bucket()\n", "# bucket prefix or the subfolder for everything we produce\n", - "prefix='music-recommendation'\n", + "prefix = \"music-recommendation\"\n", "# s3 client\n", "s3_client = boto3.client(\"s3\")\n", "\n", - "print(f\"this is your default SageMaker Studio bucket name: {bucket}\") \n", + "print(f\"this is your default SageMaker Studio bucket name: {bucket}\")\n", "\n", "# ps.add({'bucket': bucket, 'prefix': prefix}, 
namespace='music-rec')" ] @@ -150,7 +153,13 @@ "metadata": {}, "outputs": [], "source": [ - "new_data_paths = get_data(s3_client, [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, prefix, sample_data=0.70)\n", + "new_data_paths = get_data(\n", + " s3_client,\n", + " [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"],\n", + " bucket,\n", + " prefix,\n", + " sample_data=0.70,\n", + ")\n", "print(new_data_paths)" ] }, @@ -161,8 +170,8 @@ "outputs": [], "source": [ "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n", - "tracks_data_source = f's3://{bucket}/{prefix}/tracks.csv'\n", - "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'" + "tracks_data_source = f\"s3://{bucket}/{prefix}/tracks.csv\"\n", + "ratings_data_source = f\"s3://{bucket}/{prefix}/ratings.csv\"" ] }, { @@ -185,7 +194,7 @@ }, "outputs": [], "source": [ - "update_data_sources('01_music_dataprep.flow', tracks_data_source, ratings_data_source)" + "update_data_sources(\"01_music_dataprep.flow\", tracks_data_source, ratings_data_source)" ] }, { @@ -235,13 +244,19 @@ "outputs": [], "source": [ "# feature group name, with flow_name and an unique id. You can give it a customized name\n", - "feature_group_names = ['track-features-music-rec', 'user-5star-track-features-music-rec', 'ratings-features-music-rec']\n", + "feature_group_names = [\n", + " \"track-features-music-rec\",\n", + " \"user-5star-track-features-music-rec\",\n", + " \"ratings-features-music-rec\",\n", + "]\n", "print(f\"Feature Group Name: {feature_group_names}\")\n", "\n", - "record_identifier_feature_names = {'track-features-music-rec': 'trackId', \n", - " 'user-5star-track-features-music-rec': 'userId', \n", - " 'ratings-features-music-rec': \"ratingEventId\"}\n", - "event_time_feature_name = 'EventTime'" + "record_identifier_feature_names = {\n", + " \"track-features-music-rec\": \"trackId\",\n", + " \"user-5star-track-features-music-rec\": \"userId\",\n", + " \"ratings-features-music-rec\": \"ratingEventId\",\n", + "}\n", + "event_time_feature_name = \"EventTime\"" ] }, { @@ -286,214 +301,67 @@ "outputs": [], "source": [ "track_column_schemas = [\n", - " {\n", - " \"name\": \"trackId\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"length\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"energy\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"acousticness\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"valence\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"speechiness\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"instrumentalness\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"liveness\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"tempo\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Folk\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Country\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Latin\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Jazz\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_RnB\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Reggae\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Rap\",\n", - " \"type\": \"float\"\n", - " 
},\n", - " {\n", - " \"name\": \"genre_Pop_Rock\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Electronic\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Blues\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"danceability\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"EventTime\",\n", - " \"type\": \"float\"\n", - " }\n", + " {\"name\": \"trackId\", \"type\": \"string\"},\n", + " {\"name\": \"length\", \"type\": \"float\"},\n", + " {\"name\": \"energy\", \"type\": \"float\"},\n", + " {\"name\": \"acousticness\", \"type\": \"float\"},\n", + " {\"name\": \"valence\", \"type\": \"float\"},\n", + " {\"name\": \"speechiness\", \"type\": \"float\"},\n", + " {\"name\": \"instrumentalness\", \"type\": \"float\"},\n", + " {\"name\": \"liveness\", \"type\": \"float\"},\n", + " {\"name\": \"tempo\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Folk\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Country\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Latin\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Jazz\", \"type\": \"float\"},\n", + " {\"name\": \"genre_RnB\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Reggae\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Rap\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Pop_Rock\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Electronic\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Blues\", \"type\": \"float\"},\n", + " {\"name\": \"danceability\", \"type\": \"float\"},\n", + " {\"name\": \"EventTime\", \"type\": \"float\"},\n", "]\n", "\n", "user_column_schemas = [\n", - " {\n", - " \"name\": \"userId\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"energy_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"acousticness_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"valence_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"speechiness_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"instrumentalness_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"liveness_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"tempo_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"danceability_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Latin_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Folk_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Blues_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Rap_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Reggae_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Jazz_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_RnB_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Country_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Electronic_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"genre_Pop_Rock_5star\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"EventTime\",\n", - " \"type\": \"float\"\n", - " }\n", + " {\"name\": \"userId\", \"type\": \"long\"},\n", + " {\"name\": \"energy_5star\", \"type\": 
\"float\"},\n", + " {\"name\": \"acousticness_5star\", \"type\": \"float\"},\n", + " {\"name\": \"valence_5star\", \"type\": \"float\"},\n", + " {\"name\": \"speechiness_5star\", \"type\": \"float\"},\n", + " {\"name\": \"instrumentalness_5star\", \"type\": \"float\"},\n", + " {\"name\": \"liveness_5star\", \"type\": \"float\"},\n", + " {\"name\": \"tempo_5star\", \"type\": \"float\"},\n", + " {\"name\": \"danceability_5star\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Latin_5star\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Folk_5star\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Blues_5star\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Rap_5star\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Reggae_5star\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Jazz_5star\", \"type\": \"float\"},\n", + " {\"name\": \"genre_RnB_5star\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Country_5star\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Electronic_5star\", \"type\": \"float\"},\n", + " {\"name\": \"genre_Pop_Rock_5star\", \"type\": \"float\"},\n", + " {\"name\": \"EventTime\", \"type\": \"float\"},\n", "]\n", "\n", "rating_column_schemas = [\n", - " {\n", - " \"name\": \"ratingEventId\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"ts\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"userId\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"trackId\",\n", - " \"type\": \"string\"\n", - " },\n", - " {\n", - " \"name\": \"sessionId\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"itemInSession\",\n", - " \"type\": \"long\"\n", - " },\n", - " {\n", - " \"name\": \"Rating\",\n", - " \"type\": \"float\"\n", - " },\n", - " {\n", - " \"name\": \"EventTime\",\n", - " \"type\": \"float\"\n", - " }\n", + " {\"name\": \"ratingEventId\", \"type\": \"string\"},\n", + " {\"name\": \"ts\", \"type\": \"long\"},\n", + " {\"name\": \"userId\", \"type\": \"long\"},\n", + " {\"name\": \"trackId\", \"type\": \"string\"},\n", + " {\"name\": \"sessionId\", \"type\": \"long\"},\n", + " {\"name\": \"itemInSession\", \"type\": \"long\"},\n", + " {\"name\": \"Rating\", \"type\": \"float\"},\n", + " {\"name\": \"EventTime\", \"type\": \"float\"},\n", "]\n", "\n", "column_schemas = {\n", - " 'track-features-music-rec': track_column_schemas, \n", - " 'user-5star-track-features-music-rec': user_column_schemas, \n", - " 'ratings-features-music-rec': rating_column_schemas,\n", + " \"track-features-music-rec\": track_column_schemas,\n", + " \"user-5star-track-features-music-rec\": user_column_schemas,\n", + " \"ratings-features-music-rec\": rating_column_schemas,\n", "}" ] }, @@ -517,16 +385,19 @@ "default_feature_type = FeatureTypeEnum.STRING\n", "column_to_feature_type_mapping = {\n", " \"float\": FeatureTypeEnum.FRACTIONAL,\n", - " \"long\": FeatureTypeEnum.INTEGRAL\n", + " \"long\": FeatureTypeEnum.INTEGRAL,\n", "}\n", "\n", "feature_definitions = {}\n", "for feature_group_name in feature_group_names:\n", " feature_definition = [\n", " FeatureDefinition(\n", - " feature_name=column_schema['name'], \n", - " feature_type=column_to_feature_type_mapping.get(column_schema['type'], default_feature_type)\n", - " ) for column_schema in column_schemas[feature_group_name]\n", + " feature_name=column_schema[\"name\"],\n", + " feature_type=column_to_feature_type_mapping.get(\n", + " column_schema[\"type\"], default_feature_type\n", + " ),\n", + " )\n", + " for column_schema in 
column_schemas[feature_group_name]\n", " ]\n", " feature_definitions[feature_group_name] = feature_definition" ] @@ -571,20 +442,21 @@ "flow_export_id = f\"{strftime('%d-%H-%M-%S', gmtime())}-{str(uuid.uuid4())[:8]}\"\n", "flow_export_name = f\"flow-{flow_export_id}\"\n", "\n", - "# SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a \n", + "# SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a\n", "# S3 location owned by you.\n", - "feature_store_offline_s3_uri = 's3://' + bucket\n", + "feature_store_offline_s3_uri = \"s3://\" + bucket\n", "\n", - "# controls if online store is enabled. Enabling the online store allows quick access to \n", + "# controls if online store is enabled. Enabling the online store allows quick access to\n", "# the latest value for a Record via the GetRecord API.\n", "enable_online_store = True\n", "fg_name_tracks = feature_group_name\n", "dw_ecrlist = {\n", - " 'region':{'us-west-2':'174368400705',\n", - " 'us-east-2':'415577184552',\n", - " 'us-west-1':'926135532090',\n", - " 'us-east-1':'663277389841'\n", - " }\n", + " \"region\": {\n", + " \"us-west-2\": \"174368400705\",\n", + " \"us-east-2\": \"415577184552\",\n", + " \"us-west-1\": \"926135532090\",\n", + " \"us-east-1\": \"663277389841\",\n", + " }\n", "}" ] }, @@ -610,13 +482,15 @@ "region = boto3.Session().region_name\n", "boto_session = boto3.Session(region_name=region)\n", "\n", - "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)\n", - "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n", + "sagemaker_client = boto_session.client(service_name=\"sagemaker\", region_name=region)\n", + "featurestore_runtime = boto_session.client(\n", + " service_name=\"sagemaker-featurestore-runtime\", region_name=region\n", + ")\n", "\n", "feature_store_session = Session(\n", " boto_session=boto_session,\n", " sagemaker_client=sagemaker_client,\n", - " sagemaker_featurestore_runtime_client=featurestore_runtime\n", + " sagemaker_featurestore_runtime_client=featurestore_runtime,\n", ")" ] }, @@ -629,6 +503,7 @@ "from sagemaker.feature_store.feature_group import FeatureGroup\n", "import time\n", "\n", + "\n", "def wait_for_feature_group_creation_complete(feature_group):\n", " \"\"\"Helper function to wait for the completions of creating a feature group\"\"\"\n", " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", @@ -643,33 +518,37 @@ "\n", "def create_feature_group(feature_group_name, feature_store_session, feature_definitions):\n", " feature_group = FeatureGroup(\n", - " name=feature_group_name, sagemaker_session=feature_store_session, feature_definitions=feature_definitions[feature_group_name])\n", + " name=feature_group_name,\n", + " sagemaker_session=feature_store_session,\n", + " feature_definitions=feature_definitions[feature_group_name],\n", + " )\n", "\n", " # only create feature group if it doesn't already exist\n", " try:\n", - " sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name, NextToken='string')\n", - " feature_group_exists=True\n", + " sagemaker_client.describe_feature_group(\n", + " FeatureGroupName=feature_group_name, NextToken=\"string\"\n", + " )\n", + " feature_group_exists = True\n", " print(\"Feature Group {0} already exists. 
Using {0}\".format(feature_group_name))\n", " except Exception as e:\n", - " error = e.response.get('Error').get('Code')\n", + " error = e.response.get(\"Error\").get(\"Code\")\n", " if error == \"ResourceNotFound\":\n", - " feature_group_exists=False\n", + " feature_group_exists = False\n", " print(\"Creating Feature Group {}\".format(feature_group_name))\n", " feature_group.create(\n", " s3_uri=feature_store_offline_s3_uri,\n", " record_identifier_name=record_identifier_feature_names[feature_group_name],\n", " event_time_feature_name=event_time_feature_name,\n", " role_arn=iam_role,\n", - " enable_online_store=enable_online_store\n", + " enable_online_store=enable_online_store,\n", " )\n", " # Invoke the Feature Store API to create the feature group and wait until it is ready\n", " wait_for_feature_group_creation_complete(feature_group=feature_group)\n", - " if error == 'ResourceInUse':\n", - " feature_group_exists=True\n", + " if error == \"ResourceInUse\":\n", + " feature_group_exists = True\n", " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))\n", - " \n", - " return feature_group_exists\n", - " \n" + "\n", + " return feature_group_exists" ] }, { @@ -687,7 +566,9 @@ "source": [ "feature_group_existence = {}\n", "for feature_group_name in feature_group_names:\n", - " feature_group_exists = create_feature_group(feature_group_name, feature_store_session, feature_definitions)\n", + " feature_group_exists = create_feature_group(\n", + " feature_group_name, feature_store_session, feature_definitions\n", + " )\n", " feature_group_existence[feature_group_name] = feature_group_exists" ] }, @@ -729,7 +610,11 @@ "outputs": [], "source": [ "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", - "from sagemaker.dataset_definition.inputs import AthenaDatasetDefinition, DatasetDefinition, RedshiftDatasetDefinition\n", + "from sagemaker.dataset_definition.inputs import (\n", + " AthenaDatasetDefinition,\n", + " DatasetDefinition,\n", + " RedshiftDatasetDefinition,\n", + ")\n", "\n", "data_sources = []" ] @@ -747,14 +632,16 @@ "metadata": {}, "outputs": [], "source": [ - "data_sources.append(ProcessingInput(\n", - " source=f\"{tracks_data_source}\", # You could override this to point to another dataset on S3\n", - " destination=\"/opt/ml/processing/tracks.csv\",\n", - " input_name=\"tracks.csv\",\n", - " s3_data_type=\"S3Prefix\",\n", - " s3_input_mode=\"File\",\n", - " s3_data_distribution_type=\"FullyReplicated\"\n", - "))" + "data_sources.append(\n", + " ProcessingInput(\n", + " source=f\"{tracks_data_source}\", # You could override this to point to another dataset on S3\n", + " destination=\"/opt/ml/processing/tracks.csv\",\n", + " input_name=\"tracks.csv\",\n", + " s3_data_type=\"S3Prefix\",\n", + " s3_input_mode=\"File\",\n", + " s3_data_distribution_type=\"FullyReplicated\",\n", + " )\n", + ")" ] }, { @@ -770,14 +657,16 @@ "metadata": {}, "outputs": [], "source": [ - "data_sources.append(ProcessingInput(\n", - " source=f\"{ratings_data_source}\", # You could override this to point to another dataset on S3\n", - " destination=\"/opt/ml/processing/ratings.csv\",\n", - " input_name=\"ratings.csv\",\n", - " s3_data_type=\"S3Prefix\",\n", - " s3_input_mode=\"File\",\n", - " s3_data_distribution_type=\"FullyReplicated\"\n", - "))" + "data_sources.append(\n", + " ProcessingInput(\n", + " source=f\"{ratings_data_source}\", # You could override this to point to another dataset on S3\n", + " destination=\"/opt/ml/processing/ratings.csv\",\n", + " 
input_name=\"ratings.csv\",\n", + " s3_data_type=\"S3Prefix\",\n", + " s3_input_mode=\"File\",\n", + " s3_data_distribution_type=\"FullyReplicated\",\n", + " )\n", + ")" ] }, { @@ -799,8 +688,8 @@ "\n", "# Output name is auto-generated from the select node's ID + output name from the .flow file\n", "output_names = {\n", - " \"track-features-music-rec\": \"19ad8e80-2002-4ee9-9753-fe9a384b1166.default\", \n", - " \"user-5star-track-features-music-rec\": \"7a6dad19-2c80-43e3-b03d-ec23c3842ae9.default\", \n", + " \"track-features-music-rec\": \"19ad8e80-2002-4ee9-9753-fe9a384b1166.default\",\n", + " \"user-5star-track-features-music-rec\": \"7a6dad19-2c80-43e3-b03d-ec23c3842ae9.default\",\n", " \"ratings-features-music-rec\": \"9a283380-91ca-478e-be99-6ba3bf57c680.default\",\n", "}\n", "\n", @@ -838,7 +727,7 @@ "# name of the flow file which should exist in the current notebook working directory\n", "flow_file_name = \"01_music_dataprep.flow\"\n", "\n", - "# Load .flow file from current notebook working directory \n", + "# Load .flow file from current notebook working directory\n", "!echo \"Loading flow file from current notebook working directory: $PWD\"\n", "\n", "with open(flow_file_name) as f:\n", @@ -846,7 +735,9 @@ "\n", "# Upload flow to S3\n", "s3_client = boto3.client(\"s3\")\n", - "s3_client.upload_file(flow_file_name, bucket, f\"{prefix}/data_wrangler_flows/{flow_export_name}.flow\")\n", + "s3_client.upload_file(\n", + " flow_file_name, bucket, f\"{prefix}/data_wrangler_flows/{flow_export_name}.flow\"\n", + ")\n", "\n", "flow_s3_uri = f\"s3://{bucket}/{prefix}/data_wrangler_flows/{flow_export_name}.flow\"\n", "\n", @@ -873,7 +764,7 @@ " input_name=\"flow\",\n", " s3_data_type=\"S3Prefix\",\n", " s3_input_mode=\"File\",\n", - " s3_data_distribution_type=\"FullyReplicated\"\n", + " s3_data_distribution_type=\"FullyReplicated\",\n", ")" ] }, @@ -921,7 +812,7 @@ "output_content_type = \"CSV\"\n", "\n", "# Network Isolation mode; default is off\n", - "enable_network_isolation = False\n" + "enable_network_isolation = False" ] }, { @@ -949,7 +840,7 @@ " instance_type=instance_type,\n", " volume_size_in_gb=volume_size_in_gb,\n", " network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),\n", - " sagemaker_session=sess\n", + " sagemaker_session=sess,\n", ")" ] }, @@ -975,29 +866,31 @@ "for feature_group_name in feature_group_names:\n", " print(f\"Processing {feature_group_name}\")\n", " # Unique processing job name. 
Give a unique name every time you re-execute processing jobs\n", - " processing_job_name = \"dw-flow-proc-music-rec-tracks-{}-{}\".format(flow_export_id, str(uuid.uuid4())[:8])\n", - " print (f\"{processing_job_name}\")\n", - " \n", - " # Output configuration used as processing job container arguments \n", - " output_config = {\n", - " output_names[feature_group_name]: {\n", - " \"content_type\": output_content_type\n", - " }\n", - " }\n", + " processing_job_name = \"dw-flow-proc-music-rec-tracks-{}-{}\".format(\n", + " flow_export_id, str(uuid.uuid4())[:8]\n", + " )\n", + " print(f\"{processing_job_name}\")\n", + "\n", + " # Output configuration used as processing job container arguments\n", + " output_config = {output_names[feature_group_name]: {\"content_type\": output_content_type}}\n", "\n", " # Run Processing Job if job not already previously ran\n", - " if feature_group_exists: #feature_group_existence[feature_group_name]\n", - " print(\"Feature Group {0} already exists therefore we will not run a processing job to create it again\".format(feature_group_name))\n", + " if feature_group_exists: # feature_group_existence[feature_group_name]\n", + " print(\n", + " \"Feature Group {0} already exists therefore we will not run a processing job to create it again\".format(\n", + " feature_group_name\n", + " )\n", + " )\n", " else:\n", " print(\"Creating Processing Job: {}\".format(feature_group_name))\n", " processor.run(\n", - " inputs=[flow_input] + data_sources, \n", + " inputs=[flow_input] + data_sources,\n", " outputs=[processing_job_outputs[feature_group_name]],\n", " arguments=[f\"--output-config '{json.dumps(output_config)}'\"],\n", " wait=False,\n", " logs=False,\n", - " job_name=processing_job_name\n", - " ) \n", + " job_name=processing_job_name,\n", + " )\n", "\n", " job_result = sess.wait_for_processing_job(processing_job_name)\n", " print(job_result)" @@ -1039,12 +932,12 @@ "metadata": {}, "outputs": [], "source": [ - "s3_client = boto3.client('s3')\n", - "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]\n", + "s3_client = boto3.client(\"s3\")\n", + "account_id = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n", "\n", "sagemaker_role = sagemaker.get_execution_role()\n", "\n", - "s3_output_path = 's3://' + bucket" + "s3_output_path = \"s3://\" + bucket" ] }, { @@ -1055,8 +948,12 @@ "source": [ "feature_group_s3_prefixes = []\n", "for feature_group in feature_groups:\n", - " feature_group_table_name = feature_group.describe().get(\"OfflineStoreConfig\").get(\"DataCatalogConfig\").get(\"TableName\")\n", - " feature_group_s3_prefix = f'{account_id}/sagemaker/{region}/offline-store/{feature_group_table_name}'\n", + " feature_group_table_name = (\n", + " feature_group.describe().get(\"OfflineStoreConfig\").get(\"DataCatalogConfig\").get(\"TableName\")\n", + " )\n", + " feature_group_s3_prefix = (\n", + " f\"{account_id}/sagemaker/{region}/offline-store/{feature_group_table_name}\"\n", + " )\n", " feature_group_s3_prefixes.append(feature_group_s3_prefix)" ] }, @@ -1072,15 +969,16 @@ "def wait_for_offline_store(feature_group_s3_prefix):\n", " print(feature_group_s3_prefix)\n", " offline_store_contents = None\n", - " while (offline_store_contents is None):\n", + " while offline_store_contents is None:\n", " objects_in_bucket = s3_client.list_objects(Bucket=bucket, Prefix=feature_group_s3_prefix)\n", - " if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):\n", - " offline_store_contents = objects_in_bucket['Contents']\n", + " if 
\"Contents\" in objects_in_bucket and len(objects_in_bucket[\"Contents\"]) > 1:\n", + " offline_store_contents = objects_in_bucket[\"Contents\"]\n", " else:\n", - " print('Waiting for data in offline store...')\n", + " print(\"Waiting for data in offline store...\")\n", " time.sleep(60)\n", - " print('Data available.')\n", - " \n", + " print(\"Data available.\")\n", + "\n", + "\n", "for s3_prefix in feature_group_s3_prefixes:\n", " wait_for_offline_store(s3_prefix)" ] @@ -1091,22 +989,58 @@ "metadata": {}, "outputs": [], "source": [ - "tables = { \n", - " 'ratings': {'feature_group': feature_groups[2],\n", - " 'cols': ['userId', 'trackid', 'rating']\n", - " },\n", - " 'tracks': {'feature_group': feature_groups[0],\n", - " 'cols': ['trackid', 'length', 'energy', 'acousticness', 'valence', 'speechiness', 'instrumentalness', \n", - " 'liveness', 'tempo', 'danceability', 'genre_latin', 'genre_folk', 'genre_blues', 'genre_rap', \n", - " 'genre_reggae', 'genre_jazz', 'genre_rnb', 'genre_country', 'genre_electronic', 'genre_pop_rock']\n", - " },\n", - " 'user_5star_features': {'feature_group': feature_groups[1],\n", - " 'cols': ['userId', 'energy_5star', 'acousticness_5star', 'valence_5star', 'speechiness_5star', 'instrumentalness_5star', \n", - " 'liveness_5star','tempo_5star', 'danceability_5star', 'genre_latin_5star', 'genre_folk_5star', 'genre_blues_5star', \n", - " 'genre_rap_5star','genre_reggae_5star', 'genre_jazz_5star', 'genre_rnb_5star', 'genre_country_5star', \n", - " 'genre_electronic_5star', 'genre_pop_rock_5star']\n", - " },\n", - " }" + "tables = {\n", + " \"ratings\": {\"feature_group\": feature_groups[2], \"cols\": [\"userId\", \"trackid\", \"rating\"]},\n", + " \"tracks\": {\n", + " \"feature_group\": feature_groups[0],\n", + " \"cols\": [\n", + " \"trackid\",\n", + " \"length\",\n", + " \"energy\",\n", + " \"acousticness\",\n", + " \"valence\",\n", + " \"speechiness\",\n", + " \"instrumentalness\",\n", + " \"liveness\",\n", + " \"tempo\",\n", + " \"danceability\",\n", + " \"genre_latin\",\n", + " \"genre_folk\",\n", + " \"genre_blues\",\n", + " \"genre_rap\",\n", + " \"genre_reggae\",\n", + " \"genre_jazz\",\n", + " \"genre_rnb\",\n", + " \"genre_country\",\n", + " \"genre_electronic\",\n", + " \"genre_pop_rock\",\n", + " ],\n", + " },\n", + " \"user_5star_features\": {\n", + " \"feature_group\": feature_groups[1],\n", + " \"cols\": [\n", + " \"userId\",\n", + " \"energy_5star\",\n", + " \"acousticness_5star\",\n", + " \"valence_5star\",\n", + " \"speechiness_5star\",\n", + " \"instrumentalness_5star\",\n", + " \"liveness_5star\",\n", + " \"tempo_5star\",\n", + " \"danceability_5star\",\n", + " \"genre_latin_5star\",\n", + " \"genre_folk_5star\",\n", + " \"genre_blues_5star\",\n", + " \"genre_rap_5star\",\n", + " \"genre_reggae_5star\",\n", + " \"genre_jazz_5star\",\n", + " \"genre_rnb_5star\",\n", + " \"genre_country_5star\",\n", + " \"genre_electronic_5star\",\n", + " \"genre_pop_rock_5star\",\n", + " ],\n", + " },\n", + "}" ] }, { @@ -1124,34 +1058,37 @@ "source": [ "def get_train_val():\n", " for k, v in tables.items():\n", - " query = v['feature_group'].athena_query()\n", - " joined_cols = \", \".join(v['cols'])\n", + " query = v[\"feature_group\"].athena_query()\n", + " joined_cols = \", \".join(v[\"cols\"])\n", " # limit number of datapoints for training time\n", - " query_string = \"SELECT {} FROM \\\"{}\\\" LIMIT 500000\".format(joined_cols, query.table_name)\n", - " print(query_string,'\\n')\n", + " query_string = 'SELECT {} FROM \"{}\" LIMIT 
500000'.format(joined_cols, query.table_name)\n", + " print(query_string, \"\\n\")\n", "\n", - " output_location = f's3://{bucket}/{prefix}/query_results/'\n", + " output_location = f\"s3://{bucket}/{prefix}/query_results/\"\n", " query.run(query_string=query_string, output_location=output_location)\n", " query.wait()\n", "\n", - " tables[k]['df'] = query.as_dataframe() \n", - " \n", - " ratings = tables['ratings']['df']\n", - " tracks = tables['tracks']['df']\n", - " user_prefs = tables['user_5star_features']['df']\n", - " \n", - " print('Merging datasets...')\n", - " print(f'Ratings: {ratings.shape}\\nTracks: {tracks.shape}\\nUser Prefs: {user_prefs.shape}\\n')\n", - " \n", - " dataset = pd.merge(ratings, tracks, on='trackid', how='inner')\n", - " dataset = pd.merge(dataset, user_prefs, on='userId', how='inner')\n", + " tables[k][\"df\"] = query.as_dataframe()\n", + "\n", + " ratings = tables[\"ratings\"][\"df\"]\n", + " tracks = tables[\"tracks\"][\"df\"]\n", + " user_prefs = tables[\"user_5star_features\"][\"df\"]\n", + "\n", + " print(\"Merging datasets...\")\n", + " print(f\"Ratings: {ratings.shape}\\nTracks: {tracks.shape}\\nUser Prefs: {user_prefs.shape}\\n\")\n", + "\n", + " dataset = pd.merge(ratings, tracks, on=\"trackid\", how=\"inner\")\n", + " dataset = pd.merge(dataset, user_prefs, on=\"userId\", how=\"inner\")\n", " dataset.drop_duplicates(inplace=True)\n", - " dataset.drop(['userId', 'trackid'], axis=1, inplace=True)\n", + " dataset.drop([\"userId\", \"trackid\"], axis=1, inplace=True)\n", "\n", " # split data\n", " from sklearn.model_selection import train_test_split\n", + "\n", " train, val = train_test_split(dataset, test_size=0.2, random_state=42)\n", - " print(\"Training dataset shape: {}\\nValidation dataset shape: {}\\n\".format(train.shape, val.shape))\n", + " print(\n", + " \"Training dataset shape: {}\\nValidation dataset shape: {}\\n\".format(train.shape, val.shape)\n", + " )\n", "\n", " return train, val" ] @@ -1167,14 +1104,18 @@ "import glob\n", "\n", "\n", - "print('Creating training and validation sets...\\n')\n", + "print(\"Creating training and validation sets...\\n\")\n", "train, val = get_train_val()\n", "# Write to csv in S3 without headers and index column\n", - "train.to_csv('./data/train_data.csv', header=False, index=False)\n", - "val.to_csv('./data/val_data.csv', header=False, index=False)\n", - "\n", - "pd.DataFrame({\"ColumnName\": train.columns}).to_csv(\"./data/train_data_headers.csv\", header=False, index=False)\n", - "pd.DataFrame({\"ColumnName\": val.columns}).to_csv(\"./data/val_data_headers.csv\", header=False, index=False)" + "train.to_csv(\"./data/train_data.csv\", header=False, index=False)\n", + "val.to_csv(\"./data/val_data.csv\", header=False, index=False)\n", + "\n", + "pd.DataFrame({\"ColumnName\": train.columns}).to_csv(\n", + " \"./data/train_data_headers.csv\", header=False, index=False\n", + ")\n", + "pd.DataFrame({\"ColumnName\": val.columns}).to_csv(\n", + " \"./data/val_data_headers.csv\", header=False, index=False\n", + ")" ] }, { diff --git a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb index 9788a44e8e..d6fc6172fb 100644 --- a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb +++ b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb @@ -72,7 +72,8 @@ "source": [ "import sys\n", "import pprint\n", - "sys.path.insert(1, 
'./code')" + "\n", + "sys.path.insert(1, \"./code\")" ] }, { @@ -99,7 +100,7 @@ "# get session bucket name\n", "bucket = sess.default_bucket()\n", "# bucket prefix or the subfolder for everything we produce\n", - "prefix='music-recommendation'\n", + "prefix = \"music-recommendation\"\n", "# get sagemaker role\n", "sagemaker_role = sagemaker.get_execution_role()\n", "# s3 client\n", @@ -109,17 +110,18 @@ "boto_session = boto3.Session(region_name=region)\n", "\n", "\n", - "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)\n", + "sagemaker_client = boto_session.client(service_name=\"sagemaker\", region_name=region)\n", "sagemaker_session = sagemaker.session.Session(\n", - " boto_session=boto_session,\n", - " sagemaker_client=sagemaker_client\n", + " boto_session=boto_session, sagemaker_client=sagemaker_client\n", + ")\n", + "featurestore_runtime = boto_session.client(\n", + " service_name=\"sagemaker-featurestore-runtime\", region_name=region\n", ")\n", - "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n", "\n", "feature_store_session = sagemaker.session.Session(\n", " boto_session=boto_session,\n", " sagemaker_client=sagemaker_client,\n", - " sagemaker_featurestore_runtime_client=featurestore_runtime\n", + " sagemaker_featurestore_runtime_client=featurestore_runtime,\n", ")" ] }, @@ -169,7 +171,13 @@ "metadata": {}, "outputs": [], "source": [ - "new_data_paths = get_data(s3_client, [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, prefix, sample_data=0.70)\n", + "new_data_paths = get_data(\n", + " s3_client,\n", + " [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"],\n", + " bucket,\n", + " prefix,\n", + " sample_data=0.70,\n", + ")\n", "print(new_data_paths)" ] }, @@ -186,11 +194,12 @@ " f\"train_data.zip\",\n", " f\"val_data_headers.csv\",\n", " f\"val_data.zip\",\n", - " \n", "]\n", "\n", "for file in files_to_download:\n", - " s3_client.download_file(f\"sagemaker-sample-files\", f\"datasets/tabular/synthetic-music/{file}\", f\"./data/{file}\")" + " s3_client.download_file(\n", + " f\"sagemaker-sample-files\", f\"datasets/tabular/synthetic-music/{file}\", f\"./data/{file}\"\n", + " )" ] }, { @@ -210,8 +219,8 @@ "outputs": [], "source": [ "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n", - "tracks_data_source = f's3://{bucket}/{prefix}/tracks.csv'\n", - "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'" + "tracks_data_source = f\"s3://{bucket}/{prefix}/tracks.csv\"\n", + "ratings_data_source = f\"s3://{bucket}/{prefix}/ratings.csv\"" ] }, { @@ -229,21 +238,22 @@ "source": [ "%%time\n", "\n", - "train_headers = pd.read_csv('data/train_data_headers.csv', header=None)[0].tolist()\n", - "val_headers = pd.read_csv('data/val_data_headers.csv', header=None)[0].tolist()\n", - "train = pd.read_csv('data/train_data.csv', names=train_headers)\n", - "val = pd.read_csv('data/val_data.csv', names=val_headers)\n", + "train_headers = pd.read_csv(\"data/train_data_headers.csv\", header=None)[0].tolist()\n", + "val_headers = pd.read_csv(\"data/val_data_headers.csv\", header=None)[0].tolist()\n", + "train = pd.read_csv(\"data/train_data.csv\", names=train_headers)\n", + "val = pd.read_csv(\"data/val_data.csv\", names=val_headers)\n", "\n", - "s3_client.upload_file('data/train_data.csv', bucket, f'{prefix}/data/train/train_data.csv')\n", - "s3_client.upload_file('data/val_data.csv', 
bucket, f'{prefix}/data/val/val_data.csv')\n", + "s3_client.upload_file(\"data/train_data.csv\", bucket, f\"{prefix}/data/train/train_data.csv\")\n", + "s3_client.upload_file(\"data/val_data.csv\", bucket, f\"{prefix}/data/val/val_data.csv\")\n", "\n", "\n", - "train_data_uri = f's3://{bucket}/{prefix}/data/train/train_data.csv'\n", - "val_data_uri = f's3://{bucket}/{prefix}/data/val/val_data.csv'\n", - "print (f\"Saving training data to {train_data_uri}\")\n", + "train_data_uri = f\"s3://{bucket}/{prefix}/data/train/train_data.csv\"\n", + "val_data_uri = f\"s3://{bucket}/{prefix}/data/val/val_data.csv\"\n", + "print(f\"Saving training data to {train_data_uri}\")\n", "\n", "# configure data inputs for SageMaker training\n", "from sagemaker.inputs import TrainingInput\n", + "\n", "train_input = TrainingInput(train_data_uri, content_type=\"text/csv\")\n", "val_input = TrainingInput(val_data_uri, content_type=\"text/csv\")" ] @@ -275,12 +285,12 @@ "outputs": [], "source": [ "# variables used for parameterizing the notebook run\n", - "estimator_output_path = f's3://{bucket}/{prefix}/training_jobs'\n", + "estimator_output_path = f\"s3://{bucket}/{prefix}/training_jobs\"\n", "train_instance_count = 2\n", - "train_instance_type = 'ml.m5.4xlarge'\n", + "train_instance_type = \"ml.m5.4xlarge\"\n", "save_interval = 2\n", "image = sagemaker.image_uris.retrieve(\"xgboost\", region, \"0.90-2\")\n", - "model_name = 'music-rec-model-{}'.format(datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n", + "model_name = \"music-rec-model-{}\".format(datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n", "\n", "hyperparameters = {\n", " \"max_depth\": \"4\",\n", @@ -302,39 +312,19 @@ " instance_type=train_instance_type,\n", " image_uri=image,\n", " hyperparameters=hyperparameters,\n", - "# base_job_name=model_name,\n", + " # base_job_name=model_name,\n", " output_path=estimator_output_path,\n", - " \n", " debugger_hook_config=DebuggerHookConfig(\n", - " s3_output_path=estimator_output_path+'/debugger', \n", + " s3_output_path=estimator_output_path + \"/debugger\",\n", " collection_configs=[\n", + " CollectionConfig(name=\"metrics\", parameters={\"save_interval\": str(save_interval)}),\n", " CollectionConfig(\n", - " name=\"metrics\",\n", - " parameters={\n", - " \"save_interval\": str(save_interval)\n", - " }\n", - " ),\n", - " CollectionConfig(\n", - " name=\"feature_importance\",\n", - " parameters={\n", - " \"save_interval\": str(save_interval)\n", - " }\n", - " ),\n", - " CollectionConfig(\n", - " name=\"full_shap\",\n", - " parameters={\n", - " \"save_interval\": str(save_interval)\n", - " }\n", - " ),\n", - " CollectionConfig(\n", - " name=\"average_shap\",\n", - " parameters={\n", - " \"save_interval\": str(save_interval)\n", - " }\n", + " name=\"feature_importance\", parameters={\"save_interval\": str(save_interval)}\n", " ),\n", + " CollectionConfig(name=\"full_shap\", parameters={\"save_interval\": str(save_interval)}),\n", + " CollectionConfig(name=\"average_shap\", parameters={\"save_interval\": str(save_interval)}),\n", " ],\n", " ),\n", - "\n", " rules=[\n", " Rule.sagemaker(\n", " rule_configs.loss_not_decreasing(),\n", @@ -362,10 +352,7 @@ "outputs": [], "source": [ "response = sagemaker_client.list_training_jobs(\n", - " NameContains = model_name,\n", - " StatusEquals = 'Completed',\n", - " SortBy='CreationTime',\n", - " SortOrder='Descending'\n", + " NameContains=model_name, StatusEquals=\"Completed\", SortBy=\"CreationTime\", SortOrder=\"Descending\"\n", ")" ] }, @@ -379,18 +366,18 @@ "source": [ 
"%%time\n", "\n", - "train_model = True # True if training a new model, False if wanting to use an existing estimator once you've already trained\n", + "train_model = True # True if training a new model, False if wanting to use an existing estimator once you've already trained\n", "\n", "if train_model:\n", - " print('Training the model')\n", - " xgb_estimator.fit(inputs = {'train': train_input, 'validation': val_input}, job_name=model_name)\n", + " print(\"Training the model\")\n", + " xgb_estimator.fit(inputs={\"train\": train_input, \"validation\": val_input}, job_name=model_name)\n", " s3_debugger_output_path = xgb_estimator.latest_job_debugger_artifacts_path()\n", - "elif len(response['TrainingJobSummaries']) > 0:\n", - " training_job_name = response['TrainingJobSummaries'][0]['TrainingJobName']\n", + "elif len(response[\"TrainingJobSummaries\"]) > 0:\n", + " training_job_name = response[\"TrainingJobSummaries\"][0][\"TrainingJobName\"]\n", " xgb_estimator = Estimator.attach(training_job_name)\n", " s3_debugger_output_path = xgb_estimator.latest_job_debugger_artifacts_path()\n", "else:\n", - " print(\"No existing estimator found. You'll need to run as train = True\")\n" + " print(\"No existing estimator found. You'll need to run as train = True\")" ] }, { @@ -410,6 +397,7 @@ "outputs": [], "source": [ "import pprint\n", + "\n", "training_job_info = sagemaker_client.describe_training_job(TrainingJobName=training_job_name)\n", "pprint.pprint(f\"{training_job_info}\")" ] @@ -428,7 +416,7 @@ "metadata": {}, "outputs": [], "source": [ - "endpoint_name = 'music-rec-endpoint-{}'.format(datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n", + "endpoint_name = \"music-rec-endpoint-{}\".format(datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n", "print(endpoint_name)" ] }, @@ -439,10 +427,10 @@ "outputs": [], "source": [ "endpoint_list = sagemaker_client.list_endpoints(\n", - " SortBy='CreationTime',\n", - " SortOrder='Descending',\n", + " SortBy=\"CreationTime\",\n", + " SortOrder=\"Descending\",\n", " NameContains=endpoint_name,\n", - " StatusEquals='InService'\n", + " StatusEquals=\"InService\",\n", ")\n", "endpoint_list" ] @@ -461,14 +449,13 @@ "outputs": [], "source": [ "%%time\n", - "if len(endpoint_list['Endpoints']) > 0:\n", + "if len(endpoint_list[\"Endpoints\"]) > 0:\n", " print(f\"Using existing endpoint: {endpoint_list['Endpoints'][0]['EndpointName']}\")\n", "else:\n", " # deploy endpoint for model if it doesn't already exist\n", - " xgb_estimator.deploy(initial_instance_count=1,\n", - " instance_type='ml.m4.xlarge',\n", - " endpoint_name=endpoint_name\n", - " )" + " xgb_estimator.deploy(\n", + " initial_instance_count=1, instance_type=\"ml.m4.xlarge\", endpoint_name=endpoint_name\n", + " )" ] }, { @@ -486,8 +473,8 @@ "outputs": [], "source": [ "predictor = sagemaker.predictor.Predictor(\n", - " endpoint_name=endpoint_name,\n", - " sagemaker_session=sagemaker_session)" + " endpoint_name=endpoint_name, sagemaker_session=sagemaker_session\n", + ")" ] }, { @@ -504,7 +491,7 @@ "outputs": [], "source": [ "df_user = pd.read_csv(\"./data/sample_user.csv\")\n", - "df_user = df_user.set_index('FeatureName')" + "df_user = df_user.set_index(\"FeatureName\")" ] }, { @@ -529,10 +516,10 @@ "metadata": {}, "outputs": [], "source": [ - "feature_names = pd.read_csv('data/train_data_headers.csv', header=None)[0].tolist()[1:]\n", + "feature_names = pd.read_csv(\"data/train_data_headers.csv\", header=None)[0].tolist()[1:]\n", "data = (\n", " df_tracks.assign(key=1)\n", - " 
.merge(pd.DataFrame(df_user['ValueAsString']).T.assign(key=1), on=\"key\")\n", + " .merge(pd.DataFrame(df_user[\"ValueAsString\"]).T.assign(key=1), on=\"key\")\n", " .drop(\"key\", axis=1)\n", ")\n", "data.columns = [c.lower() for c in data.columns]\n", @@ -554,7 +541,7 @@ "metadata": {}, "outputs": [], "source": [ - "data_inputs = [','.join([str(i) for i in row]) for row in inference_df.values]" + "data_inputs = [\",\".join([str(i) for i in row]) for row in inference_df.values]" ] }, { @@ -573,10 +560,10 @@ "source": [ "predictions = []\n", "for data_input in data_inputs:\n", - " results = predictor.predict(data_input, initial_args = {\"ContentType\": \"text/csv\"})\n", + " results = predictor.predict(data_input, initial_args={\"ContentType\": \"text/csv\"})\n", " prediction = json.loads(results)\n", " predictions.append(prediction)\n", - "print(f'Predicted rating for sample user:', prediction)" + "print(f\"Predicted rating for sample user:\", prediction)" ] }, { @@ -586,13 +573,13 @@ "outputs": [], "source": [ "# Write to csv in S3 without headers and index column.\n", - "inference_df['rating'] = predictions\n", - "inference_df = inference_df[['rating']+feature_names]\n", - "inference_df.to_csv('data/prediction_data.csv', header=False, index=False)\n", + "inference_df[\"rating\"] = predictions\n", + "inference_df = inference_df[[\"rating\"] + feature_names]\n", + "inference_df.to_csv(\"data/prediction_data.csv\", header=False, index=False)\n", "\n", - "s3_client.upload_file('data/prediction_data.csv', bucket, f'{prefix}/data/pred/prediction_data.csv')\n", + "s3_client.upload_file(\"data/prediction_data.csv\", bucket, f\"{prefix}/data/pred/prediction_data.csv\")\n", "\n", - "pred_data_uri = f's3://{bucket}/{prefix}/data/pred/prediction_data.csv'" + "pred_data_uri = f\"s3://{bucket}/{prefix}/data/pred/prediction_data.csv\"" ] }, { @@ -604,7 +591,7 @@ "s3_client.download_file(bucket, f\"{prefix}/data/train/train_data.csv\", f\"train_data.csv\")\n", "df_train = pd.read_csv(\"train_data.csv\")\n", "\n", - "label = 'rating'" + "label = \"rating\"" ] }, { @@ -632,7 +619,7 @@ "metadata": {}, "outputs": [], "source": [ - "explainability_output_path = f's3://{bucket}/{prefix}/clarify-output/explainability'" + "explainability_output_path = f\"s3://{bucket}/{prefix}/clarify-output/explainability\"" ] }, { @@ -644,26 +631,29 @@ "clarify_processor = sagemaker.clarify.SageMakerClarifyProcessor(\n", " role=sagemaker_role,\n", " instance_count=1,\n", - " instance_type='ml.c4.xlarge',\n", - " sagemaker_session=sagemaker_session)\n", + " instance_type=\"ml.c4.xlarge\",\n", + " sagemaker_session=sagemaker_session,\n", + ")\n", "\n", "model_config = sagemaker.clarify.ModelConfig(\n", - " model_name=model_name,\n", - " instance_type='ml.m4.xlarge',\n", - " instance_count=1,\n", - " accept_type='text/csv')\n", + " model_name=model_name, instance_type=\"ml.m4.xlarge\", instance_count=1, accept_type=\"text/csv\"\n", + ")\n", "\n", "shap_config = sagemaker.clarify.SHAPConfig(\n", - " baseline=[df_train.median().values[1:].tolist()], # ignore the first column since that is that target\n", + " baseline=[\n", + " df_train.median().values[1:].tolist()\n", + " ], # ignore the first column since that is that target\n", " num_samples=100,\n", - " agg_method='mean_abs')\n", + " agg_method=\"mean_abs\",\n", + ")\n", "\n", "explainability_data_config = sagemaker.clarify.DataConfig(\n", " s3_data_input_path=pred_data_uri,\n", " s3_output_path=explainability_output_path,\n", " label=label,\n", - " 
headers=[label]+feature_names,\n", - " dataset_type='text/csv')\n" + " headers=[label] + feature_names,\n", + " dataset_type=\"text/csv\",\n", + ")" ] }, { @@ -677,19 +667,20 @@ "%%time\n", "try:\n", " s3_client.download_file(\n", - " Bucket = bucket, \n", - " Key = f'{prefix}/clarify-output/explainability/explanations_shap/out.csv', \n", - " Filename = 'data/shap_output.csv'\n", + " Bucket=bucket,\n", + " Key=f\"{prefix}/clarify-output/explainability/explanations_shap/out.csv\",\n", + " Filename=\"data/shap_output.csv\",\n", " )\n", - " print('Downloaded output from previous explainability job')\n", + " print(\"Downloaded output from previous explainability job\")\n", "except Exception as e:\n", - " error = e.response.get('Error').get('Code')\n", - " if error == '404':\n", - " print('Running explainability job')\n", + " error = e.response.get(\"Error\").get(\"Code\")\n", + " if error == \"404\":\n", + " print(\"Running explainability job\")\n", " clarify_processor.run_explainability(\n", " data_config=explainability_data_config,\n", " model_config=model_config,\n", - " explainability_config=shap_config)" + " explainability_config=shap_config,\n", + " )" ] }, { @@ -698,7 +689,7 @@ "metadata": {}, "outputs": [], "source": [ - "inference_df['trackid'] = data['trackid']" + "inference_df[\"trackid\"] = data[\"trackid\"]" ] }, { @@ -708,8 +699,8 @@ "outputs": [], "source": [ "playlist_length = 10 # number of songs to recommend in playlist\n", - "playlist = inference_df.sort_values(by='rating', ascending=False).head(playlist_length)\n", - "print('Curated Playlist:\\n', playlist['trackid'])" + "playlist = inference_df.sort_values(by=\"rating\", ascending=False).head(playlist_length)\n", + "print(\"Curated Playlist:\\n\", playlist[\"trackid\"])" ] }, { @@ -718,13 +709,15 @@ "metadata": {}, "outputs": [], "source": [ - "s3_client.download_file(bucket, f\"{prefix}/clarify-output/explainability/explanations_shap/out.csv\", f\"out.csv\")\n", - "local_explanations_out = pd.read_csv('out.csv')\n", + "s3_client.download_file(\n", + " bucket, f\"{prefix}/clarify-output/explainability/explanations_shap/out.csv\", f\"out.csv\"\n", + ")\n", + "local_explanations_out = pd.read_csv(\"out.csv\")\n", "local_explanations_out.columns = feature_names\n", "\n", "print(\"Model prediction:\", playlist.iloc[0, 0])\n", - "plt.figure(figsize=(12,6))\n", - "local_explanations_out.iloc[0].sort_values().plot.barh(title='Local explanation for prediction')" + "plt.figure(figsize=(12, 6))\n", + "local_explanations_out.iloc[0].sort_values().plot.barh(title=\"Local explanation for prediction\")" ] }, { @@ -759,8 +752,8 @@ "try:\n", " trial = create_trial(s3_debugger_output_path)\n", "except:\n", - " parameters = ps.read('music-rec')\n", - " s3_debugger_output_path = parameters['s3_debugger_output_path']\n", + " parameters = ps.read(\"music-rec\")\n", + " s3_debugger_output_path = parameters[\"s3_debugger_output_path\"]\n", " trial = create_trial(s3_debugger_output_path)" ] }, @@ -770,7 +763,7 @@ "metadata": {}, "outputs": [], "source": [ - "feature_names = list(train.drop('rating', axis=1).columns)\n", + "feature_names = list(train.drop(\"rating\", axis=1).columns)\n", "print(feature_names)" ] }, @@ -786,6 +779,7 @@ "\n", "MAX_PLOTS = 35\n", "\n", + "\n", "def get_data(trial, tname):\n", " \"\"\"\n", " For the given tensor name, walks though all the iterations\n", @@ -797,27 +791,29 @@ " vals = [tensor.value(s) for s in steps]\n", " return steps, vals\n", "\n", + "\n", "def match_tensor_name_with_feature_name(tensor_name, 
feature_names=feature_names):\n", " feature_tag = tensor_name.split(\"/\")\n", " for ifeat, feature_name in enumerate(feature_names):\n", - " if feature_tag[-1]==\"f{}\".format(str(ifeat)): return feature_name\n", + " if feature_tag[-1] == \"f{}\".format(str(ifeat)):\n", + " return feature_name\n", " return tensor_name\n", "\n", "\n", - "def plot_collection(trial, collection_name, regex='.*', figsize=(8, 6)):\n", + "def plot_collection(trial, collection_name, regex=\".*\", figsize=(8, 6)):\n", " \"\"\"\n", - " Takes a `trial` and a collection name, and \n", + " Takes a `trial` and a collection name, and\n", " plots all tensors that match the given regex.\n", " \"\"\"\n", " fig, ax = plt.subplots(figsize=figsize)\n", - " tensors = (trial.collection(collection_name).tensor_names)\n", + " tensors = trial.collection(collection_name).tensor_names\n", " matched_tensors = [t for t in tensors if re.match(regex, t)]\n", " for tensor_name in islice(matched_tensors, MAX_PLOTS):\n", " steps, data = get_data(trial, tensor_name)\n", " ax.plot(steps, data, label=match_tensor_name_with_feature_name(tensor_name))\n", "\n", - " ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))\n", - " ax.set_xlabel('Iteration')" + " ax.legend(loc=\"center left\", bbox_to_anchor=(1, 0.5))\n", + " ax.set_xlabel(\"Iteration\")" ] }, { @@ -836,15 +832,10 @@ "outputs": [], "source": [ "def plot_feature_importance(trial, importance_type=\"weight\"):\n", - " SUPPORTED_IMPORTANCE_TYPES = [\n", - " \"weight\", \"gain\", \"cover\", \"total_gain\", \"total_cover\"]\n", + " SUPPORTED_IMPORTANCE_TYPES = [\"weight\", \"gain\", \"cover\", \"total_gain\", \"total_cover\"]\n", " if importance_type not in SUPPORTED_IMPORTANCE_TYPES:\n", - " raise ValueError(\n", - " f\"{importance_type} is not one of the supported importance types.\")\n", - " plot_collection(\n", - " trial,\n", - " \"feature_importance\",\n", - " regex=f\"feature_importance/{importance_type}/.*\")" + " raise ValueError(f\"{importance_type} is not one of the supported importance types.\")\n", + " plot_collection(trial, \"feature_importance\", regex=f\"feature_importance/{importance_type}/.*\")" ] }, { @@ -912,7 +903,7 @@ "shap_values = trial.tensor(\"full_shap/f0\").value(trial.last_complete_step)\n", "shap_no_base = shap_values[:, :-1]\n", "shap_base_value = shap_values[0, -1]\n", - "shap.summary_plot(shap_no_base, plot_type='bar', feature_names=feature_names)" + "shap.summary_plot(shap_no_base, plot_type=\"bar\", feature_names=feature_names)" ] }, { @@ -976,15 +967,15 @@ "from sagemaker.model_monitor import DataCaptureConfig\n", "\n", "# Please fill in the following for enabling data capture\n", - "s3_capture_upload_path = f's3://{bucket}/{prefix}/endpoint-data-capture/' #example: s3://bucket-name/path/to/endpoint-data-capture/\n", + "s3_capture_upload_path = f\"s3://{bucket}/{prefix}/endpoint-data-capture/\" # example: s3://bucket-name/path/to/endpoint-data-capture/\n", "\n", - "##### \n", + "#####\n", "## IMPORTANT\n", "##\n", - "## Please make sure to add the \"s3:PutObject\" permission to the \"role' you provided in the SageMaker Model \n", + "## Please make sure to add the \"s3:PutObject\" permission to the \"role' you provided in the SageMaker Model\n", "## behind this Endpoint. 
Otherwise, Endpoint data capture will not work.\n", - "## \n", - "##### " + "##\n", + "#####" ] }, { @@ -996,16 +987,16 @@ "outputs": [], "source": [ "%%time\n", - "# Change parameters as you would like - adjust sampling percentage, \n", + "# Change parameters as you would like - adjust sampling percentage,\n", "# chose to capture request or response or both\n", "data_capture_config = DataCaptureConfig(\n", - " enable_capture = True,\n", + " enable_capture=True,\n", " sampling_percentage=25,\n", " destination_s3_uri=s3_capture_upload_path,\n", " kms_key_id=None,\n", " capture_options=[\"REQUEST\", \"RESPONSE\"],\n", " csv_content_types=[\"text/csv\"],\n", - " json_content_types=[\"application/json\"]\n", + " json_content_types=[\"application/json\"],\n", ")\n", "\n", "# Now it is time to apply the new configuration and wait for it to be applied\n", @@ -1061,12 +1052,12 @@ "outputs": [], "source": [ "##'s3://bucketname/path/to/baseline/data' - Where your validation data is\n", - "baseline_data_uri = val_data_uri \n", + "baseline_data_uri = val_data_uri\n", "##'s3://bucketname/path/to/baseline/data' - Where the results are to be stored in\n", - "baseline_results_uri = f's3://{bucket}/{prefix}/baseline/results' \n", + "baseline_results_uri = f\"s3://{bucket}/{prefix}/baseline/results\"\n", "\n", - "print('Baseline data uri: {}'.format(baseline_data_uri))\n", - "print('Baseline results uri: {}'.format(baseline_results_uri))" + "print(\"Baseline data uri: {}\".format(baseline_data_uri))\n", + "print(\"Baseline results uri: {}\".format(baseline_results_uri))" ] }, { @@ -1103,10 +1094,10 @@ "my_default_monitor = DefaultModelMonitor(\n", " role=role,\n", " instance_count=2,\n", - " instance_type='ml.m5.xlarge',\n", + " instance_type=\"ml.m5.xlarge\",\n", " volume_size_in_gb=20,\n", " max_runtime_in_seconds=1800,\n", - " base_job_name=f\"{prefix}-monitor-{datetime_stamp}\"\n", + " base_job_name=f\"{prefix}-monitor-{datetime_stamp}\",\n", ")" ] }, @@ -1125,7 +1116,7 @@ " dataset_format=DatasetFormat.csv(header=False),\n", " output_s3_uri=baseline_results_uri,\n", " job_name=f\"{prefix}-monitor-baseline-{datetime_stamp}\",\n", - " wait=True\n", + " wait=True,\n", ")" ] }, @@ -1146,19 +1137,20 @@ "from time import gmtime, strftime\n", "import boto3\n", "\n", - "client = boto3.client('sagemaker')\n", + "client = boto3.client(\"sagemaker\")\n", + "\n", "\n", "def get_last_processing_job():\n", - " \n", + "\n", " response = client.list_processing_jobs(\n", " NameContains=f\"{prefix}-monitor-baseline-{datetime_stamp}\",\n", - " StatusEquals='Completed',\n", - " SortBy='CreationTime',\n", - " SortOrder='Descending',\n", - " MaxResults=20\n", + " StatusEquals=\"Completed\",\n", + " SortBy=\"CreationTime\",\n", + " SortOrder=\"Descending\",\n", + " MaxResults=20,\n", " )\n", - " pprint.pprint(response['ProcessingJobSummaries'][0])\n", - " return response['ProcessingJobSummaries'][0]['ProcessingJobName']" + " pprint.pprint(response[\"ProcessingJobSummaries\"][0])\n", + " return response[\"ProcessingJobSummaries\"][0][\"ProcessingJobName\"]" ] }, { @@ -1167,11 +1159,11 @@ "metadata": {}, "outputs": [], "source": [ - "from sagemaker.processing import ProcessingJob \n", + "from sagemaker.processing import ProcessingJob\n", "from sagemaker.estimator import Estimator\n", "from sagemaker.model_monitor.model_monitoring import ModelMonitor\n", "\n", - "my_default_monitor_name = get_last_processing_job()\n" + "my_default_monitor_name = get_last_processing_job()" ] }, { @@ -1182,9 +1174,7 @@ "source": [ 
"my_default_monitor_reload = ProcessingJob.from_processing_name(sess, my_default_monitor_name)\n", "\n", - "response = client.describe_processing_job(\n", - " ProcessingJobName=my_default_monitor_name\n", - ")\n", + "response = client.describe_processing_job(ProcessingJobName=my_default_monitor_name)\n", "pprint.pprint(response)" ] }, @@ -1214,7 +1204,9 @@ "metadata": {}, "outputs": [], "source": [ - "constraints_df = pd.io.json.json_normalize(baseline_job.suggested_constraints().body_dict[\"features\"])\n", + "constraints_df = pd.io.json.json_normalize(\n", + " baseline_job.suggested_constraints().body_dict[\"features\"]\n", + ")\n", "constraints_df.head(10)" ] }, @@ -1260,8 +1252,10 @@ "from time import gmtime, strftime\n", "\n", "\n", - "mon_schedule_name = 'music-rec-monitor-schedule-{}'.format(datetime.datetime.now().strftime(\"%Y-%m-%d-%H%M%S\"))\n", - "s3_report_path = f's3://{bucket}/{prefix}/monitor/report'\n", + "mon_schedule_name = \"music-rec-monitor-schedule-{}\".format(\n", + " datetime.datetime.now().strftime(\"%Y-%m-%d-%H%M%S\")\n", + ")\n", + "s3_report_path = f\"s3://{bucket}/{prefix}/monitor/report\"\n", "\n", "try:\n", " my_default_monitor.create_monitoring_schedule(\n", @@ -1292,11 +1286,11 @@ "import time\n", "\n", "desc_schedule_result = my_default_monitor.describe_schedule()\n", - "while desc_schedule_result['MonitoringScheduleStatus'] != 'Scheduled':\n", - " print('Schedule status: {}'.format(desc_schedule_result['MonitoringScheduleStatus']))\n", + "while desc_schedule_result[\"MonitoringScheduleStatus\"] != \"Scheduled\":\n", + " print(\"Schedule status: {}\".format(desc_schedule_result[\"MonitoringScheduleStatus\"]))\n", " desc_schedule_result = my_default_monitor.describe_schedule()\n", " time.sleep(30)\n", - "print('Schedule status: {}'.format(desc_schedule_result['MonitoringScheduleStatus']))" + "print(\"Schedule status: {}\".format(desc_schedule_result[\"MonitoringScheduleStatus\"]))" ] }, { @@ -1324,20 +1318,22 @@ "metadata": {}, "outputs": [], "source": [ - "mpg_name = prefix+'-notebooks'\n", + "mpg_name = prefix + \"-notebooks\"\n", "\n", - "model_packages = sagemaker_client.list_model_packages(ModelPackageGroupName=mpg_name)['ModelPackageSummaryList']\n", + "model_packages = sagemaker_client.list_model_packages(ModelPackageGroupName=mpg_name)[\n", + " \"ModelPackageSummaryList\"\n", + "]\n", "\n", "if model_packages:\n", - " print(f'Using existing Model Package Group: {mpg_name}')\n", + " print(f\"Using existing Model Package Group: {mpg_name}\")\n", "else:\n", " mpg_input_dict = {\n", - " 'ModelPackageGroupName': mpg_name,\n", - " 'ModelPackageGroupDescription': 'Music Recommendation Models'\n", + " \"ModelPackageGroupName\": mpg_name,\n", + " \"ModelPackageGroupDescription\": \"Music Recommendation Models\",\n", " }\n", "\n", " mpg_response = sagemaker_client.create_model_package_group(**mpg_input_dict)\n", - " print(f'Create Model Package Group {mpg_name}: SUCCESSFUL')" + " print(f\"Create Model Package Group {mpg_name}: SUCCESSFUL\")" ] }, { @@ -1348,29 +1344,32 @@ }, "outputs": [], "source": [ - "sys.path.insert(1, './code')\n", + "sys.path.insert(1, \"./code\")\n", "from inference_specification import InferenceSpecification\n", "\n", - "model_uri = training_job_info.get('ModelArtifacts', {}).get('S3ModelArtifacts')\n", - "training_image = training_job_info['AlgorithmSpecification']['TrainingImage']\n", + "model_uri = training_job_info.get(\"ModelArtifacts\", {}).get(\"S3ModelArtifacts\")\n", + "training_image = 
training_job_info[\"AlgorithmSpecification\"][\"TrainingImage\"]\n", "\n", "mp_inference_spec = InferenceSpecification().get_inference_specification_dict(\n", " ecr_image=training_image,\n", " supports_gpu=False,\n", - " supported_content_types=['text/csv'],\n", - " supported_mime_types=['text/csv'])\n", + " supported_content_types=[\"text/csv\"],\n", + " supported_mime_types=[\"text/csv\"],\n", + ")\n", "\n", - "mp_inference_spec['InferenceSpecification']['Containers'][0]['ModelDataUrl'] = model_uri\n", + "mp_inference_spec[\"InferenceSpecification\"][\"Containers\"][0][\"ModelDataUrl\"] = model_uri\n", "mp_input_dict = {\n", - " 'ModelPackageGroupName': mpg_name,\n", - " 'ModelPackageDescription': 'SageMaker Music Recommender',\n", - " 'ModelApprovalStatus': 'PendingManualApproval'\n", + " \"ModelPackageGroupName\": mpg_name,\n", + " \"ModelPackageDescription\": \"SageMaker Music Recommender\",\n", + " \"ModelApprovalStatus\": \"PendingManualApproval\",\n", "}\n", "\n", "mp_input_dict.update(mp_inference_spec)\n", "mp_response = sagemaker_client.create_model_package(**mp_input_dict)\n", - " \n", - "model_packages = sagemaker_client.list_model_packages(ModelPackageGroupName=mpg_name)['ModelPackageSummaryList']\n", + "\n", + "model_packages = sagemaker_client.list_model_packages(ModelPackageGroupName=mpg_name)[\n", + " \"ModelPackageSummaryList\"\n", + "]\n", "model_packages" ] }, @@ -1387,17 +1386,17 @@ "metadata": {}, "outputs": [], "source": [ - "model_matches = sagemaker_client.list_models(NameContains=model_name)['Models']\n", + "model_matches = sagemaker_client.list_models(NameContains=model_name)[\"Models\"]\n", "\n", "for model_name_match in model_matches:\n", - " sagemaker_session.delete_model(model_name_match['ModelName'])\n", + " sagemaker_session.delete_model(model_name_match[\"ModelName\"])\n", " print(f\"Deleted existing model: {model_name_match['ModelName']}\")\n", - " \n", + "\n", "model = sagemaker_session.create_model_from_job(\n", " name=model_name,\n", " training_job_name=training_job_name,\n", " role=sagemaker_role,\n", - " image_uri=training_job_info['AlgorithmSpecification']['TrainingImage']\n", + " image_uri=training_job_info[\"AlgorithmSpecification\"][\"TrainingImage\"],\n", ")\n", "\n", "print(f\"Created new model: {model_name}\")" @@ -1418,14 +1417,15 @@ "outputs": [], "source": [ "import demo_helpers # our custom set of functions\n", + "\n", "demo_helpers.delete_project_resources(\n", - " sagemaker_boto_client=sagemaker_client, \n", + " sagemaker_boto_client=sagemaker_client,\n", " sagemaker_session=sagemaker_session,\n", " endpoint_names=[endpoint_name],\n", " mpg_name=mpg_name,\n", " prefix=prefix,\n", " delete_s3_objects=True,\n", - " bucket_name=bucket\n", + " bucket_name=bucket,\n", ")" ] }, diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index a1869d35fa..cf98d853cb 100644 --- a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -99,7 +99,8 @@ "source": [ "import sys\n", "import pprint\n", - "sys.path.insert(1, './code')" + "\n", + "sys.path.insert(1, \"./code\")" ] }, { @@ -112,19 +113,19 @@ "boto3.setup_default_session(region_name=region)\n", "boto_session = boto3.Session(region_name=region)\n", "\n", - "s3_client = boto3.client('s3', region_name=region)\n", + "s3_client = boto3.client(\"s3\", region_name=region)\n", "\n", - "sagemaker_boto_client = boto_session.client('sagemaker')\n", + 
"sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", "sagemaker_session = sagemaker.session.Session(\n", - " boto_session=boto_session,\n", - " sagemaker_client=sagemaker_boto_client)\n", + " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", + ")\n", "sagemaker_role = sagemaker.get_execution_role()\n", "\n", - "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]\n", + "account_id = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n", "\n", "sess = sagemaker.Session()\n", "bucket = sess.default_bucket()\n", - "prefix='music-recommendation-pipeline'" + "prefix = \"music-recommendation-pipeline\"" ] }, { @@ -135,13 +136,15 @@ "source": [ "processing_dir = \"/opt/ml/processing\"\n", "\n", - "# Output name is auto-generated from the select node's ID + output name from the flow file. \n", + "# Output name is auto-generated from the select node's ID + output name from the flow file.\n", "# You can change to a different node ID to export a different step in the flow file\n", - "output_name_tracks = \"19ad8e80-2002-4ee9-9753-fe9a384b1166.default\" # tracks node in flow file\n", - "output_name_user_preferences = \"7a6dad19-2c80-43e3-b03d-ec23c3842ae9.default\" # joined node in flow file\"\n", - "output_name_ratings = \"9a283380-91ca-478e-be99-6ba3bf57c680.default\" # ratings node in flow file\n", + "output_name_tracks = \"19ad8e80-2002-4ee9-9753-fe9a384b1166.default\" # tracks node in flow file\n", + "output_name_user_preferences = (\n", + " \"7a6dad19-2c80-43e3-b03d-ec23c3842ae9.default\" # joined node in flow file\"\n", + ")\n", + "output_name_ratings = \"9a283380-91ca-478e-be99-6ba3bf57c680.default\" # ratings node in flow file\n", "\n", - "#======> variables used for parameterizing the notebook run\n", + "# ======> variables used for parameterizing the notebook run\n", "flow_instance_count = 1\n", "flow_instance_type = \"ml.m5.4xlarge\"\n", "\n", @@ -204,7 +207,13 @@ "metadata": {}, "outputs": [], "source": [ - "new_data_paths = get_data(s3_client, [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, prefix, sample_data=0.70)\n", + "new_data_paths = get_data(\n", + " s3_client,\n", + " [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"],\n", + " bucket,\n", + " prefix,\n", + " sample_data=0.70,\n", + ")\n", "print(new_data_paths)" ] }, @@ -215,8 +224,8 @@ "outputs": [], "source": [ "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n", - "tracks_data_source = f's3://{bucket}/{prefix}/tracks.csv'\n", - "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'" + "tracks_data_source = f\"s3://{bucket}/{prefix}/tracks.csv\"\n", + "ratings_data_source = f\"s3://{bucket}/{prefix}/ratings.csv\"" ] }, { @@ -244,11 +253,12 @@ " f\"val_data.zip\",\n", " f\"tracks_new.csv\",\n", " f\"ratings_new.csv\",\n", - " \n", "]\n", "\n", "for file in files_to_download:\n", - " s3_client.download_file(f\"sagemaker-sample-files\", f\"datasets/tabular/synthetic-music/{file}\", f\"./data/{file}\")" + " s3_client.download_file(\n", + " f\"sagemaker-sample-files\", f\"datasets/tabular/synthetic-music/{file}\", f\"./data/{file}\"\n", + " )" ] }, { @@ -268,15 +278,15 @@ "outputs": [], "source": [ "# upload train and validation datasets as well\n", - "s3_client.upload_file('data/tracks_new.csv', bucket, f'{prefix}/data/tracks_new.csv')\n", - "s3_client.upload_file('data/ratings_new.csv', bucket, f'{prefix}/data/ratings_new.csv')\n", - 
"s3_client.upload_file('data/train_data.csv', bucket, f'{prefix}/data/train/train_data.csv')\n", - "s3_client.upload_file('data/val_data.csv', bucket, f'{prefix}/data/val/val_data.csv')\n", + "s3_client.upload_file(\"data/tracks_new.csv\", bucket, f\"{prefix}/data/tracks_new.csv\")\n", + "s3_client.upload_file(\"data/ratings_new.csv\", bucket, f\"{prefix}/data/ratings_new.csv\")\n", + "s3_client.upload_file(\"data/train_data.csv\", bucket, f\"{prefix}/data/train/train_data.csv\")\n", + "s3_client.upload_file(\"data/val_data.csv\", bucket, f\"{prefix}/data/val/val_data.csv\")\n", "\n", "\n", - "train_data_uri = f's3://{bucket}/{prefix}/data/train/train_data.csv'\n", - "val_data_uri = f's3://{bucket}/{prefix}/data/val/val_data.csv'\n", - "print (f\"Saving training data to {train_data_uri}\")" + "train_data_uri = f\"s3://{bucket}/{prefix}/data/train/train_data.csv\"\n", + "val_data_uri = f\"s3://{bucket}/{prefix}/data/val/val_data.csv\"\n", + "print(f\"Saving training data to {train_data_uri}\")" ] }, { @@ -327,8 +337,7 @@ ")\n", "\n", "model_approval_status = ParameterString(\n", - " name=\"ModelApprovalStatus\",\n", - " default_value=\"PendingManualApproval\"\n", + " name=\"ModelApprovalStatus\", default_value=\"PendingManualApproval\"\n", ")" ] }, @@ -357,7 +366,7 @@ }, "outputs": [], "source": [ - "update_data_sources('01_music_dataprep.flow', tracks_data_source, ratings_data_source)" + "update_data_sources(\"01_music_dataprep.flow\", tracks_data_source, ratings_data_source)" ] }, { @@ -377,8 +386,10 @@ "# name of the flow file which should exist in the current notebook working directory\n", "flow_file_name = \"01_music_dataprep.flow\"\n", "\n", - "s3_client.upload_file(Filename=flow_file_name, Bucket=bucket, Key=f'{prefix}/dataprep-notebooks/music_dataprep.flow')\n", - "flow_s3_uri = f's3://{bucket}/{prefix}/dataprep-notebooks/music_dataprep.flow'\n", + "s3_client.upload_file(\n", + " Filename=flow_file_name, Bucket=bucket, Key=f\"{prefix}/dataprep-notebooks/music_dataprep.flow\"\n", + ")\n", + "flow_s3_uri = f\"s3://{bucket}/{prefix}/dataprep-notebooks/music_dataprep.flow\"\n", "\n", "print(f\"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}\")" ] @@ -400,24 +411,28 @@ "data_sources = []\n", "\n", "## Input - S3 Source: tracks.csv\n", - "data_sources.append(ProcessingInput(\n", - " source=f\"s3://{bucket}/{prefix}/data/tracks_new.csv\", # You can override this to point to another dataset on S3\n", - " destination=f\"{processing_dir}/data/tracks_new.csv\",\n", - " input_name=\"tracks_new.csv\",\n", - " s3_data_type=\"S3Prefix\",\n", - " s3_input_mode=\"File\",\n", - " s3_data_distribution_type=\"FullyReplicated\"\n", - "))\n", + "data_sources.append(\n", + " ProcessingInput(\n", + " source=f\"s3://{bucket}/{prefix}/data/tracks_new.csv\", # You can override this to point to another dataset on S3\n", + " destination=f\"{processing_dir}/data/tracks_new.csv\",\n", + " input_name=\"tracks_new.csv\",\n", + " s3_data_type=\"S3Prefix\",\n", + " s3_input_mode=\"File\",\n", + " s3_data_distribution_type=\"FullyReplicated\",\n", + " )\n", + ")\n", "\n", "## Input - S3 Source: ratings.csv\n", - "data_sources.append(ProcessingInput(\n", - " source=f\"s3://{bucket}/{prefix}/data/ratings_new.csv\", # You can override this to point to another dataset on S3\n", - " destination=f\"{processing_dir}/data/ratings_new.csv\",\n", - " input_name=\"ratings_new.csv\",\n", - " s3_data_type=\"S3Prefix\",\n", - " s3_input_mode=\"File\",\n", - " s3_data_distribution_type=\"FullyReplicated\"\n", - "))\n", 
+ "data_sources.append(\n", + " ProcessingInput(\n", + " source=f\"s3://{bucket}/{prefix}/data/ratings_new.csv\", # You can override this to point to another dataset on S3\n", + " destination=f\"{processing_dir}/data/ratings_new.csv\",\n", + " input_name=\"ratings_new.csv\",\n", + " s3_data_type=\"S3Prefix\",\n", + " s3_input_mode=\"File\",\n", + " s3_data_distribution_type=\"FullyReplicated\",\n", + " )\n", + ")\n", "\n", "## Input - Flow: 01_music_dataprep.flow\n", "flow_input = ProcessingInput(\n", @@ -426,7 +441,7 @@ " input_name=\"flow\",\n", " s3_data_type=\"S3Prefix\",\n", " s3_input_mode=\"File\",\n", - " s3_data_distribution_type=\"FullyReplicated\"\n", + " s3_data_distribution_type=\"FullyReplicated\",\n", ")" ] }, @@ -444,15 +459,16 @@ "outputs": [], "source": [ "# Define feature group names we previously created in notebooks 02a-c\n", - "fg_name_tracks = 'track-features-music-rec'\n", - "fg_name_ratings = 'ratings-features-music-rec'\n", - "fg_name_user_preferences = 'user-5star-track-features-music-rec'\n", + "fg_name_tracks = \"track-features-music-rec\"\n", + "fg_name_ratings = \"ratings-features-music-rec\"\n", + "fg_name_user_preferences = \"user-5star-track-features-music-rec\"\n", "dw_ecrlist = {\n", - " 'region':{'us-west-2':'174368400705',\n", - " 'us-east-2':'415577184552',\n", - " 'us-west-1':'926135532090',\n", - " 'us-east-1':'663277389841'\n", - " }\n", + " \"region\": {\n", + " \"us-west-2\": \"174368400705\",\n", + " \"us-east-2\": \"415577184552\",\n", + " \"us-west-1\": \"926135532090\",\n", + " \"us-east-1\": \"663277389841\",\n", + " }\n", "}" ] }, @@ -465,23 +481,24 @@ "flow_output_tracks = sagemaker.processing.ProcessingOutput(\n", " output_name=output_name_tracks,\n", " app_managed=True,\n", - " feature_store_output=sagemaker.processing.FeatureStoreOutput(\n", - " feature_group_name=fg_name_tracks)\n", - " )\n", + " feature_store_output=sagemaker.processing.FeatureStoreOutput(feature_group_name=fg_name_tracks),\n", + ")\n", "\n", "flow_output_user_preferences = sagemaker.processing.ProcessingOutput(\n", " output_name=output_name_user_preferences,\n", " app_managed=True,\n", " feature_store_output=sagemaker.processing.FeatureStoreOutput(\n", - " feature_group_name=fg_name_user_preferences)\n", - " )\n", + " feature_group_name=fg_name_user_preferences\n", + " ),\n", + ")\n", "\n", "flow_output_ratings = sagemaker.processing.ProcessingOutput(\n", " output_name=output_name_ratings,\n", " app_managed=True,\n", " feature_store_output=sagemaker.processing.FeatureStoreOutput(\n", - " feature_group_name=fg_name_ratings)\n", - " )" + " feature_group_name=fg_name_ratings\n", + " ),\n", + ")" ] }, { @@ -490,24 +507,12 @@ "metadata": {}, "outputs": [], "source": [ - "# Output configuration used as processing job container arguments \n", - "output_config_tracks = {\n", - " output_name_tracks: {\n", - " \"content_type\": \"CSV\"\n", - " }\n", - "}\n", + "# Output configuration used as processing job container arguments\n", + "output_config_tracks = {output_name_tracks: {\"content_type\": \"CSV\"}}\n", "\n", - "output_config_user_preferences = {\n", - " output_name_user_preferences: {\n", - " \"content_type\": \"CSV\"\n", - " }\n", - "}\n", + "output_config_user_preferences = {output_name_user_preferences: {\"content_type\": \"CSV\"}}\n", "\n", - "output_config_ratings = {\n", - " output_name_ratings: {\n", - " \"content_type\": \"CSV\"\n", - " }\n", - "}" + "output_config_ratings = {output_name_ratings: {\"content_type\": \"CSV\"}}" ] }, { @@ -539,37 +544,37 @@ "\n", 
"\n", "flow_processor = sagemaker.processing.Processor(\n", - " role=sagemaker_role, \n", - " image_uri=container_uri, \n", - " instance_count=flow_instance_count, \n", - " instance_type=flow_instance_type, \n", + " role=sagemaker_role,\n", + " image_uri=container_uri,\n", + " instance_count=flow_instance_count,\n", + " instance_type=flow_instance_type,\n", " volume_size_in_gb=30,\n", " network_config=NetworkConfig(enable_network_isolation=False),\n", - " sagemaker_session=sagemaker_session\n", + " sagemaker_session=sagemaker_session,\n", ")\n", "\n", "flow_step_tracks = ProcessingStep(\n", - " name='DataWranglerStepTracks', \n", - " processor=flow_processor, \n", - " inputs=[flow_input] + data_sources, \n", + " name=\"DataWranglerStepTracks\",\n", + " processor=flow_processor,\n", + " inputs=[flow_input] + data_sources,\n", " outputs=[flow_output_tracks],\n", " job_arguments=[f\"--output-config '{json.dumps(output_config_tracks)}'\"],\n", ")\n", "\n", "flow_step_ratings = ProcessingStep(\n", - " name='DataWranglerStepRatings', \n", - " processor=flow_processor, \n", - " inputs=[flow_input] + data_sources, \n", + " name=\"DataWranglerStepRatings\",\n", + " processor=flow_processor,\n", + " inputs=[flow_input] + data_sources,\n", " outputs=[flow_output_ratings],\n", - " job_arguments=[f\"--output-config '{json.dumps(output_config_ratings)}'\"]\n", + " job_arguments=[f\"--output-config '{json.dumps(output_config_ratings)}'\"],\n", ")\n", "\n", "flow_step_user_preferences = ProcessingStep(\n", - " name='DataWranglerStepUserPref', \n", - " processor=flow_processor, \n", - " inputs=[flow_input] + data_sources, \n", + " name=\"DataWranglerStepUserPref\",\n", + " processor=flow_processor,\n", + " inputs=[flow_input] + data_sources,\n", " outputs=[flow_output_user_preferences],\n", - " job_arguments=[f\"--output-config '{json.dumps(output_config_user_preferences)}'\"]\n", + " job_arguments=[f\"--output-config '{json.dumps(output_config_user_preferences)}'\"],\n", ")" ] }, @@ -586,35 +591,49 @@ "metadata": {}, "outputs": [], "source": [ - "s3_client.upload_file(Filename='./code/create_datasets.py', Bucket=bucket, Key=f'{prefix}/code/create_datasets.py')\n", - "create_dataset_script_uri = f's3://{bucket}/{prefix}/code/create_datasets.py'\n", + "s3_client.upload_file(\n", + " Filename=\"./code/create_datasets.py\", Bucket=bucket, Key=f\"{prefix}/code/create_datasets.py\"\n", + ")\n", + "create_dataset_script_uri = f\"s3://{bucket}/{prefix}/code/create_datasets.py\"\n", "\n", "create_dataset_processor = SKLearnProcessor(\n", - " framework_version='0.23-1',\n", + " framework_version=\"0.23-1\",\n", " role=sagemaker_role,\n", " instance_type=\"ml.m5.4xlarge\",\n", " instance_count=2,\n", " volume_size_in_gb=100,\n", - " base_job_name='music-rec-pipeline-split-data',\n", - " sagemaker_session=sagemaker_session)\n", + " base_job_name=\"music-rec-pipeline-split-data\",\n", + " sagemaker_session=sagemaker_session,\n", + ")\n", "\n", "create_dataset_step = ProcessingStep(\n", - " name='SplitData',\n", + " name=\"SplitData\",\n", " processor=create_dataset_processor,\n", - " outputs = [\n", - " sagemaker.processing.ProcessingOutput(output_name='train_data', source=f'{processing_dir}/output/train'),\n", - " sagemaker.processing.ProcessingOutput(output_name='test_data', source=f'{processing_dir}/output/test')\n", + " outputs=[\n", + " sagemaker.processing.ProcessingOutput(\n", + " output_name=\"train_data\", source=f\"{processing_dir}/output/train\"\n", + " ),\n", + " sagemaker.processing.ProcessingOutput(\n", 
+ " output_name=\"test_data\", source=f\"{processing_dir}/output/test\"\n", + " ),\n", + " ],\n", + " job_arguments=[\n", + " \"--feature-group-name-tracks\",\n", + " fg_name_tracks,\n", + " \"--feature-group-name-ratings\",\n", + " fg_name_ratings,\n", + " \"--feature-group-name-user-preferences\",\n", + " fg_name_user_preferences,\n", + " \"--bucket-name\",\n", + " bucket,\n", + " \"--bucket-prefix\",\n", + " prefix,\n", + " \"--region\",\n", + " region,\n", " ],\n", - " job_arguments=[\"--feature-group-name-tracks\", fg_name_tracks,\n", - " \"--feature-group-name-ratings\", fg_name_ratings,\n", - " \"--feature-group-name-user-preferences\", fg_name_user_preferences,\n", - " \"--bucket-name\", bucket,\n", - " \"--bucket-prefix\", prefix,\n", - " \"--region\", region\n", - " ],\n", " code=create_dataset_script_uri,\n", - " depends_on=[flow_step_tracks.name, flow_step_ratings.name, flow_step_user_preferences.name]\n", - ")\n" + " depends_on=[flow_step_tracks.name, flow_step_ratings.name, flow_step_user_preferences.name],\n", + ")" ] }, { @@ -635,7 +654,7 @@ " \"max_depth\": \"4\",\n", " \"eta\": \"0.2\",\n", " \"objective\": \"reg:squarederror\",\n", - " \"num_round\": \"100\"\n", + " \"num_round\": \"100\",\n", "}\n", "\n", "save_interval = 5" @@ -650,13 +669,13 @@ "xgb_estimator = Estimator(\n", " role=sagemaker_role,\n", " instance_count=2,\n", - " instance_type='ml.m5.4xlarge',\n", + " instance_type=\"ml.m5.4xlarge\",\n", " volume_size=60,\n", " image_uri=sagemaker.image_uris.retrieve(\"xgboost\", region, \"0.90-2\"),\n", " hyperparameters=hyperparameters,\n", - " output_path=f's3://{bucket}/{prefix}/training_jobs',\n", - " base_job_name='xgb-music-rec-pipeline-model',\n", - " max_run=1800\n", + " output_path=f\"s3://{bucket}/{prefix}/training_jobs\",\n", + " base_job_name=\"xgb-music-rec-pipeline-model\",\n", + " max_run=1800,\n", ")" ] }, @@ -667,18 +686,22 @@ "outputs": [], "source": [ "train_step = TrainingStep(\n", - " name='TrainStep',\n", + " name=\"TrainStep\",\n", " estimator=xgb_estimator,\n", " inputs={\n", - " 'train': sagemaker.inputs.TrainingInput(\n", - " s3_data=create_dataset_step.properties.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri,\n", - " content_type=\"text/csv\"\n", + " \"train\": sagemaker.inputs.TrainingInput(\n", + " s3_data=create_dataset_step.properties.ProcessingOutputConfig.Outputs[\n", + " \"train_data\"\n", + " ].S3Output.S3Uri,\n", + " content_type=\"text/csv\",\n", " ),\n", - " 'validation': sagemaker.inputs.TrainingInput(\n", - " s3_data=create_dataset_step.properties.ProcessingOutputConfig.Outputs['test_data'].S3Output.S3Uri,\n", - " content_type=\"text/csv\"\n", - " )\n", - " }\n", + " \"validation\": sagemaker.inputs.TrainingInput(\n", + " s3_data=create_dataset_step.properties.ProcessingOutputConfig.Outputs[\n", + " \"test_data\"\n", + " ].S3Output.S3Uri,\n", + " content_type=\"text/csv\",\n", + " ),\n", + " },\n", ")" ] }, @@ -696,22 +719,16 @@ "outputs": [], "source": [ "model = sagemaker.model.Model(\n", - " name='music-rec-pipeline-xgboost-model',\n", + " name=\"music-rec-pipeline-xgboost-model\",\n", " image_uri=train_step.properties.AlgorithmSpecification.TrainingImage,\n", " model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n", " sagemaker_session=sagemaker_session,\n", - " role=sagemaker_role\n", + " role=sagemaker_role,\n", ")\n", "\n", - "inputs = sagemaker.inputs.CreateModelInput(\n", - " instance_type=\"ml.m4.xlarge\"\n", - ")\n", + "inputs = 
sagemaker.inputs.CreateModelInput(instance_type=\"ml.m4.xlarge\")\n", "\n", - "create_model_step = CreateModelStep(\n", - " name=\"CreateModel\",\n", - " model=model,\n", - " inputs=inputs\n", - ")" + "create_model_step = CreateModelStep(name=\"CreateModel\", model=model, inputs=inputs)" ] }, { @@ -754,29 +771,37 @@ "metadata": {}, "outputs": [], "source": [ - "s3_client.upload_file(Filename='./code/deploy_model.py', Bucket=bucket, Key=f'{prefix}/code/deploy_model.py')\n", - "deploy_model_script_uri = f's3://{bucket}/{prefix}/code/deploy_model.py'\n", - "pipeline_endpoint_name = 'music-rec-pipeline-endpoint'\n", + "s3_client.upload_file(\n", + " Filename=\"./code/deploy_model.py\", Bucket=bucket, Key=f\"{prefix}/code/deploy_model.py\"\n", + ")\n", + "deploy_model_script_uri = f\"s3://{bucket}/{prefix}/code/deploy_model.py\"\n", + "pipeline_endpoint_name = \"music-rec-pipeline-endpoint\"\n", "\n", "deploy_model_processor = SKLearnProcessor(\n", - " framework_version='0.23-1',\n", + " framework_version=\"0.23-1\",\n", " role=sagemaker_role,\n", - " instance_type='ml.m5.xlarge',\n", + " instance_type=\"ml.m5.xlarge\",\n", " instance_count=1,\n", " volume_size_in_gb=60,\n", - " base_job_name='music-recommender-deploy-model',\n", - " sagemaker_session=sagemaker_session)\n", + " base_job_name=\"music-recommender-deploy-model\",\n", + " sagemaker_session=sagemaker_session,\n", + ")\n", "\n", "deploy_step = ProcessingStep(\n", - " name='DeployModel',\n", + " name=\"DeployModel\",\n", " processor=deploy_model_processor,\n", " job_arguments=[\n", - " \"--model-name\", create_model_step.properties.ModelName, \n", - " \"--region\", region,\n", - " \"--endpoint-instance-type\", deploy_model_instance_type,\n", - " \"--endpoint-name\", pipeline_endpoint_name\n", + " \"--model-name\",\n", + " create_model_step.properties.ModelName,\n", + " \"--region\",\n", + " region,\n", + " \"--endpoint-instance-type\",\n", + " deploy_model_instance_type,\n", + " \"--endpoint-name\",\n", + " pipeline_endpoint_name,\n", " ],\n", - " code=deploy_model_script_uri)" + " code=deploy_model_script_uri,\n", + ")" ] }, { @@ -792,35 +817,47 @@ "metadata": {}, "outputs": [], "source": [ - "s3_client.upload_file(Filename='./code/model_monitor.py', Bucket=bucket, Key=f'{prefix}/code/model_monitor.py')\n", - "model_monitor_script_uri = f's3://{bucket}/{prefix}/code/model_monitor.py'\n", - "mon_schedule_name_base = 'music-rec-pipeline-daily-monitor'\n", + "s3_client.upload_file(\n", + " Filename=\"./code/model_monitor.py\", Bucket=bucket, Key=f\"{prefix}/code/model_monitor.py\"\n", + ")\n", + "model_monitor_script_uri = f\"s3://{bucket}/{prefix}/code/model_monitor.py\"\n", + "mon_schedule_name_base = \"music-rec-pipeline-daily-monitor\"\n", "\n", "\n", "model_monitor_processor = SKLearnProcessor(\n", - " framework_version='0.23-1',\n", + " framework_version=\"0.23-1\",\n", " role=sagemaker_role,\n", - " instance_type='ml.m5.xlarge',\n", + " instance_type=\"ml.m5.xlarge\",\n", " instance_count=1,\n", " volume_size_in_gb=60,\n", - " base_job_name='music-rec-pipeline-model-monitor',\n", - " sagemaker_session=sagemaker_session)\n", + " base_job_name=\"music-rec-pipeline-model-monitor\",\n", + " sagemaker_session=sagemaker_session,\n", + ")\n", "\n", "monitor_model_step = ProcessingStep(\n", - " name='ModelMonitor',\n", + " name=\"ModelMonitor\",\n", " processor=model_monitor_processor,\n", - " outputs = [\n", - " sagemaker.processing.ProcessingOutput(output_name='model_baseline', source=f'{processing_dir}/output/baselineresults')\n", + " 
outputs=[\n", + " sagemaker.processing.ProcessingOutput(\n", + " output_name=\"model_baseline\", source=f\"{processing_dir}/output/baselineresults\"\n", + " )\n", + " ],\n", + " job_arguments=[\n", + " \"--baseline-data-uri\",\n", + " val_data_uri,\n", + " \"--bucket-name\",\n", + " bucket,\n", + " \"--bucket-prefix\",\n", + " prefix,\n", + " \"--endpoint\",\n", + " pipeline_endpoint_name,\n", + " \"--region\",\n", + " region,\n", + " \"--schedule-name\",\n", + " mon_schedule_name_base,\n", " ],\n", - " job_arguments=[\"--baseline-data-uri\", val_data_uri,\n", - " \"--bucket-name\", bucket,\n", - " \"--bucket-prefix\", prefix,\n", - " \"--endpoint\", pipeline_endpoint_name,\n", - " \"--region\", region,\n", - " \"--schedule-name\", mon_schedule_name_base\n", - " ],\n", " code=model_monitor_script_uri,\n", - " depends_on=[deploy_step.name]\n", + " depends_on=[deploy_step.name],\n", ")" ] }, @@ -841,9 +878,9 @@ }, "outputs": [], "source": [ - "pipeline_name = f'MusicRecommendationPipeline'\n", - "dataprep_pipeline_name = f'MusicRecommendationDataPrepPipeline'\n", - "train_deploy_pipeline_name = f'MusicRecommendationTrainDeployPipeline'" + "pipeline_name = f\"MusicRecommendationPipeline\"\n", + "dataprep_pipeline_name = f\"MusicRecommendationDataPrepPipeline\"\n", + "train_deploy_pipeline_name = f\"MusicRecommendationTrainDeployPipeline\"" ] }, { @@ -859,7 +896,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_name = f'MusicRecommendationPipeline'" + "pipeline_name = f\"MusicRecommendationPipeline\"" ] }, { @@ -870,20 +907,19 @@ "source": [ "pipeline = Pipeline(\n", " name=pipeline_name,\n", - " parameters=[\n", - " train_instance_param, \n", - " model_approval_status],\n", + " parameters=[train_instance_param, model_approval_status],\n", " steps=[\n", " flow_step_tracks,\n", " flow_step_user_preferences,\n", " flow_step_ratings,\n", " create_dataset_step,\n", - " train_step, \n", - " create_model_step, \n", + " train_step,\n", + " create_model_step,\n", " register_step,\n", " deploy_step,\n", - " monitor_model_step \n", - " ])" + " monitor_model_step,\n", + " ],\n", + ")" ] }, { @@ -916,15 +952,9 @@ "source": [ "pipeline_dataprep = Pipeline(\n", " name=dataprep_pipeline_name,\n", - " parameters=[\n", - " train_instance_param, \n", - " model_approval_status],\n", - " steps=[\n", - " flow_step_tracks,\n", - " flow_step_user_preferences,\n", - " flow_step_ratings,\n", - " create_dataset_step\n", - " ])" + " parameters=[train_instance_param, model_approval_status],\n", + " steps=[flow_step_tracks, flow_step_user_preferences, flow_step_ratings, create_dataset_step],\n", + ")" ] }, { @@ -941,20 +971,31 @@ "outputs": [], "source": [ "create_dataset_step_no_depend = ProcessingStep(\n", - " name='SplitData',\n", + " name=\"SplitData\",\n", " processor=create_dataset_processor,\n", - " outputs = [\n", - " sagemaker.processing.ProcessingOutput(output_name='train_data', source=f'{processing_dir}/output/train'),\n", - " sagemaker.processing.ProcessingOutput(output_name='test_data', source=f'{processing_dir}/output/test')\n", + " outputs=[\n", + " sagemaker.processing.ProcessingOutput(\n", + " output_name=\"train_data\", source=f\"{processing_dir}/output/train\"\n", + " ),\n", + " sagemaker.processing.ProcessingOutput(\n", + " output_name=\"test_data\", source=f\"{processing_dir}/output/test\"\n", + " ),\n", + " ],\n", + " job_arguments=[\n", + " \"--feature-group-name-tracks\",\n", + " fg_name_tracks,\n", + " \"--feature-group-name-ratings\",\n", + " fg_name_ratings,\n", + " 
\"--feature-group-name-user-preferences\",\n", + " fg_name_user_preferences,\n", + " \"--bucket-name\",\n", + " bucket,\n", + " \"--bucket-prefix\",\n", + " prefix,\n", + " \"--region\",\n", + " region,\n", " ],\n", - " job_arguments=[\"--feature-group-name-tracks\", fg_name_tracks,\n", - " \"--feature-group-name-ratings\", fg_name_ratings,\n", - " \"--feature-group-name-user-preferences\", fg_name_user_preferences,\n", - " \"--bucket-name\", bucket,\n", - " \"--bucket-prefix\", prefix,\n", - " \"--region\", region\n", - " ],\n", - " code=create_dataset_script_uri\n", + " code=create_dataset_script_uri,\n", ")" ] }, @@ -966,17 +1007,16 @@ "source": [ "pipeline_train_deploy_monitor = Pipeline(\n", " name=train_deploy_pipeline_name,\n", - " parameters=[\n", - " train_instance_param, \n", - " model_approval_status],\n", + " parameters=[train_instance_param, model_approval_status],\n", " steps=[\n", " create_dataset_step_no_depend,\n", - " train_step, \n", - " create_model_step, \n", + " train_step,\n", + " create_model_step,\n", " register_step,\n", " deploy_step,\n", - " monitor_model_step \n", - " ])" + " monitor_model_step,\n", + " ],\n", + ")" ] }, { @@ -1021,7 +1061,7 @@ }, "outputs": [], "source": [ - "#json.loads(pipeline.describe()['PipelineDefinition'])" + "# json.loads(pipeline.describe()['PipelineDefinition'])" ] }, { @@ -1041,7 +1081,7 @@ "outputs": [], "source": [ "# Special pipeline parameters can be defined or changed here\n", - "parameters = {'TrainingInstance': 'ml.m5.4xlarge'}" + "parameters = {\"TrainingInstance\": \"ml.m5.4xlarge\"}" ] }, { @@ -1090,13 +1130,13 @@ "import demo_helpers\n", "\n", "demo_helpers.delete_project_resources(\n", - " sagemaker_boto_client=sagemaker_boto_client, \n", + " sagemaker_boto_client=sagemaker_boto_client,\n", " sagemaker_session=sagemaker_session,\n", " endpoint_names=[pipeline_endpoint_name],\n", " pipeline_names=[pipeline_name, dataprep_pipeline_name, train_deploy_pipeline_name],\n", " prefix=prefix,\n", " delete_s3_objects=True,\n", - " bucket_name=bucket\n", + " bucket_name=bucket,\n", ")" ] } From 73a327d4a12c6b6621b9cd0c8b66bffdf2570c37 Mon Sep 17 00:00:00 2001 From: atqy Date: Tue, 10 May 2022 15:29:51 -0700 Subject: [PATCH 24/25] fix title --- end_to_end/music_recommendation/01_data_exploration.ipynb | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/end_to_end/music_recommendation/01_data_exploration.ipynb b/end_to_end/music_recommendation/01_data_exploration.ipynb index 5a22c220dd..2adec62e42 100644 --- a/end_to_end/music_recommendation/01_data_exploration.ipynb +++ b/end_to_end/music_recommendation/01_data_exploration.ipynb @@ -4,7 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Music Recommender Data Exploration\n", + "# Music Recommender Data Exploration" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "----\n", "\n", "## Background\n", From e3d8d8fab05a2d0cf8429261a1a399fac039644a Mon Sep 17 00:00:00 2001 From: atqy Date: Wed, 11 May 2022 10:37:27 -0700 Subject: [PATCH 25/25] PR edits --- .../01_data_exploration.ipynb | 2 +- .../02_export_feature_groups.ipynb | 20 +++++-------------- ...oy_debugger_explain_monitor_registry.ipynb | 3 +-- .../end_to_end_pipeline.ipynb | 12 ++--------- 4 files changed, 9 insertions(+), 28 deletions(-) diff --git a/end_to_end/music_recommendation/01_data_exploration.ipynb b/end_to_end/music_recommendation/01_data_exploration.ipynb index 2adec62e42..9d85607008 100644 --- a/end_to_end/music_recommendation/01_data_exploration.ipynb +++ 
b/end_to_end/music_recommendation/01_data_exploration.ipynb @@ -238,7 +238,7 @@ "source": [ "ratings[[\"ratingEventId\", \"userId\"]].plot.hist(\n", " by=\"userId\", bins=50, title=\"Distribution of # of Ratings by User\"\n", - ");" + ")" ] }, { diff --git a/end_to_end/music_recommendation/02_export_feature_groups.ipynb b/end_to_end/music_recommendation/02_export_feature_groups.ipynb index 82a27c0598..b53507c28d 100644 --- a/end_to_end/music_recommendation/02_export_feature_groups.ipynb +++ b/end_to_end/music_recommendation/02_export_feature_groups.ipynb @@ -93,7 +93,7 @@ "import os\n", "from awscli.customizations.s3.utils import split_s3_bucket_key\n", "\n", - "# Sagemaker session\n", + "# SageMaker session\n", "sess = sagemaker.Session()\n", "# get session bucket name\n", "bucket = sess.default_bucket()\n", @@ -102,9 +102,7 @@ "# s3 client\n", "s3_client = boto3.client(\"s3\")\n", "\n", - "print(f\"this is your default SageMaker Studio bucket name: {bucket}\")\n", - "\n", - "# ps.add({'bucket': bucket, 'prefix': prefix}, namespace='music-rec')" + "print(f\"this is your default SageMaker Studio bucket name: {bucket}\")" ] }, { @@ -449,15 +447,7 @@ "# controls if online store is enabled. Enabling the online store allows quick access to\n", "# the latest value for a Record via the GetRecord API.\n", "enable_online_store = True\n", - "fg_name_tracks = feature_group_name\n", - "dw_ecrlist = {\n", - " \"region\": {\n", - " \"us-west-2\": \"174368400705\",\n", - " \"us-east-2\": \"415577184552\",\n", - " \"us-west-1\": \"926135532090\",\n", - " \"us-east-1\": \"663277389841\",\n", - " }\n", - "}" + "fg_name_tracks = feature_group_name" ] }, { @@ -799,7 +789,7 @@ "outputs": [], "source": [ "# Data Wrangler Container URL.\n", - "container_uri = f\"{dw_ecrlist['region'][region]}.dkr.ecr.{region}.amazonaws.com/sagemaker-data-wrangler-container:1.x\"\n", + "container_uri = sagemaker.image_uris.retrieve(framework=\"data-wrangler\", region=region)\n", "\n", "# Processing Job Instance count and instance type.\n", "instance_count = 2\n", @@ -1047,7 +1037,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "check if the athena queres have been done and the data sets exist, then just do train test split or just proceed to training" + "Check if the Athena queries have been done and the data sets exist, then just do train test split or just proceed to training" ] }, { diff --git a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb index d6fc6172fb..4463aad1c6 100644 --- a/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb +++ b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb @@ -95,7 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Sagemaker session\n", + "# SageMaker session\n", "sess = sagemaker.Session()\n", "# get session bucket name\n", "bucket = sess.default_bucket()\n", @@ -312,7 +312,6 @@ " instance_type=train_instance_type,\n", " image_uri=image,\n", " hyperparameters=hyperparameters,\n", - " # base_job_name=model_name,\n", " output_path=estimator_output_path,\n", " debugger_hook_config=DebuggerHookConfig(\n", " s3_output_path=estimator_output_path + \"/debugger\",\n", diff --git a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb index cf98d853cb..7fb39c5676 100644 --- 
a/end_to_end/music_recommendation/end_to_end_pipeline.ipynb +++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb @@ -461,15 +461,7 @@ "# Define feature group names we previously created in notebooks 02a-c\n", "fg_name_tracks = \"track-features-music-rec\"\n", "fg_name_ratings = \"ratings-features-music-rec\"\n", - "fg_name_user_preferences = \"user-5star-track-features-music-rec\"\n", - "dw_ecrlist = {\n", - " \"region\": {\n", - " \"us-west-2\": \"174368400705\",\n", - " \"us-east-2\": \"415577184552\",\n", - " \"us-west-1\": \"926135532090\",\n", - " \"us-east-1\": \"663277389841\",\n", - " }\n", - "}" + "fg_name_user_preferences = \"user-5star-track-features-music-rec\"" ] }, { @@ -540,7 +532,7 @@ "# Data Wrangler Container URL\n", "# You can also find the proper container uri by exporting your Data Wrangler flow to a pipeline notebook\n", "\n", - "container_uri = f\"{dw_ecrlist['region'][region]}.dkr.ecr.{region}.amazonaws.com/sagemaker-data-wrangler-container:1.x\"\n", + "container_uri = sagemaker.image_uris.retrieve(framework=\"data-wrangler\", region=region)\n", "\n", "\n", "flow_processor = sagemaker.processing.Processor(\n",