diff --git a/lineage/lineage_auto_tracking_example.ipynb b/lineage/lineage_auto_tracking_example.ipynb deleted file mode 100644 index 194966bf9d..0000000000 --- a/lineage/lineage_auto_tracking_example.ipynb +++ /dev/null @@ -1,670 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Lineage Auto Tracking with MNIST Handwritten Digits Example\n", - "\n", - "This demo shows how SageMaker Lineage metadata is auto generated during training.\n", - "\n", - "1. Setup the beta SDK. \n", - "1. Download and prepare the MNIST dataset.\n", - "1. Train a Convolutional Neural Network (CNN) Model.\n", - "1. Traverse the auto generated lineage entities.\n", - "\n", - "Make sure you selected `conda_mxnet_p36` kernel." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure:\n", - "* your account has been whitelisted\n", - "* your execution role has the appropriate trusts" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "!{sys.executable} -m pip install -q -U pip\n", - "!{sys.executable} -m pip install -q sagemaker-2.6.1.dev0.tar.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import IPython\n", - "#may need to restart the kernel after initial install of beta sdk\n", - "#IPython.Application.instance().kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Install Python SDKs" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: sagemaker-experiments in /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (0.1.24)\n", - 
"Requirement already satisfied: boto3>=1.12.8 in /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from sagemaker-experiments) (1.14.60)\n", - "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from boto3>=1.12.8->sagemaker-experiments) (0.9.4)\n", - "Requirement already satisfied: botocore<1.18.0,>=1.17.60 in /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from boto3>=1.12.8->sagemaker-experiments) (1.17.60)\n", - "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from boto3>=1.12.8->sagemaker-experiments) (0.3.3)\n", - "Requirement already satisfied: docutils<0.16,>=0.10 in /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from botocore<1.18.0,>=1.17.60->boto3>=1.12.8->sagemaker-experiments) (0.15.2)\n", - "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from botocore<1.18.0,>=1.17.60->boto3>=1.12.8->sagemaker-experiments) (2.8.1)\n", - "Requirement already satisfied: urllib3<1.26,>=1.20; python_version != \"3.4\" in /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from botocore<1.18.0,>=1.17.60->boto3>=1.12.8->sagemaker-experiments) (1.25.8)\n", - "Requirement already satisfied: six>=1.5 in /home/ec2-user/anaconda3/envs/mxnet_p36/lib/python3.6/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.18.0,>=1.17.60->boto3>=1.12.8->sagemaker-experiments) (1.14.0)\n" - ] - } - ], - "source": [ - "!{sys.executable} -m pip install sagemaker-experiments" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "\n", - "import boto3\n", - "import numpy as np\n", - "import pandas as pd\n", - "%config 
InlineBackend.figure_format = 'retina'\n", - "from matplotlib import pyplot as plt\n", - "\n", - "import sagemaker\n", - "from sagemaker import get_execution_role\n", - "from sagemaker.session import Session\n", - "from sagemaker.analytics import ExperimentAnalytics\n", - "\n", - "from smexperiments.experiment import Experiment\n", - "from smexperiments.trial import Trial\n", - "from smexperiments.trial_component import TrialComponent\n", - "from smexperiments.tracker import Tracker" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# lineage beta only available in CMH\n", - "region = 'us-east-2'\n", - "\n", - "sess = boto3.Session(region_name=region)\n", - "sm = sess.client('sagemaker')\n", - "role = get_execution_role()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a S3 bucket to hold data" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.\n" - ] - } - ], - "source": [ - "# create a s3 bucket to hold data, note that your account might already created a bucket with the same name\n", - "account_id = sess.client('sts').get_caller_identity()[\"Account\"]\n", - "bucket = 'sagemaker-experiments-{}-{}'.format(sess.region_name, account_id)\n", - "prefix = 'mnist'\n", - "\n", - "try:\n", - " if sess.region_name == \"us-east-1\":\n", - " sess.client('s3').create_bucket(Bucket=bucket)\n", - " else:\n", - " sess.client('s3').create_bucket(Bucket=bucket, \n", - " CreateBucketConfiguration={'LocationConstraint': sess.region_name})\n", - "except Exception as e:\n", - " print(e)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create an Experiment" - ] - }, - { - "cell_type": 
"code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Experiment(sagemaker_boto_client=,experiment_name='mnist-hand-written-digits-classification-1600448285',description='Classification of mnist hand-written digits',tags=None,experiment_arn='arn:aws:sagemaker:us-east-2:707662012936:experiment/mnist-hand-written-digits-classification-1600448285',response_metadata={'RequestId': 'eaf6dbea-6ecb-4781-9167-53ead98746b7', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'eaf6dbea-6ecb-4781-9167-53ead98746b7', 'content-type': 'application/x-amz-json-1.1', 'content-length': '123', 'date': 'Fri, 18 Sep 2020 16:58:05 GMT'}, 'RetryAttempts': 0})\n" - ] - } - ], - "source": [ - "mnist_experiment = Experiment.create(\n", - " experiment_name=f\"mnist-hand-written-digits-classification-{int(time.time())}\", \n", - " description=\"Classification of mnist hand-written digits\", \n", - " sagemaker_boto_client=sm)\n", - "print(mnist_experiment)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a Trial For Each Training Run\n", - "\n", - "Note the execution of the following code takes a while." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.mxnet import MXNet\n", - "from sagemaker import get_execution_role\n", - "\n", - "# Bucket location where results of model training are saved.\n", - "model_artifacts_location = 's3://{}/mxnet-mnist-example/artifacts'.format(bucket)\n", - "custom_code_upload_location = 's3://{}/mxnet-mnist-example/code'.format(bucket)\n", - "train_data_location = 's3://sagemaker-sample-data-{}/mxnet/mnist/train'.format(region)\n", - "test_data_location = 's3://sagemaker-sample-data-{}/mxnet/mnist/test'.format(region)\n", - "# IAM execution role that gives SageMaker access to resources in your AWS account.\n", - "# We can use the SageMaker Python SDK to get the role from our notebook environment. \n", - "role = get_execution_role()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to run the following training jobs asynchronously, you may need to increase your resource limit. Otherwise, you can run them sequentially." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# create trial\n", - "trial_name = f\"cnn-training-job-{int(time.time())}\"\n", - "cnn_trial = Trial.create(\n", - " trial_name=trial_name, \n", - " experiment_name=mnist_experiment.experiment_name,\n", - " sagemaker_boto_client=sm,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker:Creating training-job with name: cnn-training-job-1600448298\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2020-09-18 16:58:19 Starting - Starting the training job...\n", - "2020-09-18 16:58:21 Starting - Launching requested ML instances......\n", - "2020-09-18 16:59:24 Starting - Preparing the instances for training...\n", - "2020-09-18 17:00:14 Downloading - Downloading input data...\n", - "2020-09-18 17:00:38 Training - Downloading the training image..\n", - "2020-09-18 17:00:57 Training - Training image download completed. 
Training in progress.\u001b[34m2020-09-18 17:00:58,918 sagemaker-containers INFO Imported framework sagemaker_mxnet_container.training\u001b[0m\n", - "\u001b[34m2020-09-18 17:00:58,922 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m2020-09-18 17:00:58,936 sagemaker_mxnet_container.training INFO MXNet training environment: {'SM_HOSTS': '[\"algo-1\"]', 'SM_NETWORK_INTERFACE_NAME': 'eth0', 'SM_HPS': '{\"learning-rate\":0.1}', 'SM_USER_ENTRY_POINT': 'mnist.py', 'SM_FRAMEWORK_PARAMS': '{}', 'SM_RESOURCE_CONFIG': '{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}', 'SM_INPUT_DATA_CONFIG': '{\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}', 'SM_OUTPUT_DATA_DIR': '/opt/ml/output/data', 'SM_CHANNELS': '[\"test\",\"train\"]', 'SM_CURRENT_HOST': 'algo-1', 'SM_MODULE_NAME': 'mnist', 'SM_LOG_LEVEL': '20', 'SM_FRAMEWORK_MODULE': 'sagemaker_mxnet_container.training:main', 'SM_INPUT_DIR': '/opt/ml/input', 'SM_INPUT_CONFIG_DIR': '/opt/ml/input/config', 'SM_OUTPUT_DIR': '/opt/ml/output', 'SM_NUM_CPUS': '4', 'SM_NUM_GPUS': '0', 'SM_MODEL_DIR': '/opt/ml/model', 'SM_MODULE_DIR': 's3://sagemaker-experiments-us-east-2-707662012936/mxnet-mnist-example/code/cnn-training-job-1600448298/source/sourcedir.tar.gz', 'SM_TRAINING_ENV': 
'{\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"test\":\"/opt/ml/input/data/test\",\"train\":\"/opt/ml/input/data/train\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_mxnet_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"learning-rate\":0.1},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"cnn-training-job-1600448298\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-experiments-us-east-2-707662012936/mxnet-mnist-example/code/cnn-training-job-1600448298/source/sourcedir.tar.gz\",\"module_name\":\"mnist\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"mnist.py\"}', 'SM_USER_ARGS': '[\"--learning-rate\",\"0.1\"]', 'SM_OUTPUT_INTERMEDIATE_DIR': '/opt/ml/output/intermediate', 'SM_CHANNEL_TEST': '/opt/ml/input/data/test', 'SM_CHANNEL_TRAIN': '/opt/ml/input/data/train', 'SM_HP_LEARNING-RATE': '0.1'}\u001b[0m\n", - "\u001b[34m2020-09-18 17:00:59,276 sagemaker-containers INFO Module mnist does not provide a setup.py. 
\u001b[0m\n", - "\u001b[34mGenerating setup.py\u001b[0m\n", - "\u001b[34m2020-09-18 17:00:59,277 sagemaker-containers INFO Generating setup.cfg\u001b[0m\n", - "\u001b[34m2020-09-18 17:00:59,277 sagemaker-containers INFO Generating MANIFEST.in\u001b[0m\n", - "\u001b[34m2020-09-18 17:00:59,277 sagemaker-containers INFO Installing module with the following command:\u001b[0m\n", - "\u001b[34m/usr/local/bin/python3.6 -m pip install -U . \u001b[0m\n", - "\u001b[34mProcessing /opt/ml/code\u001b[0m\n", - "\u001b[34mInstalling collected packages: mnist\n", - " Running setup.py install for mnist: started\n", - " Running setup.py install for mnist: finished with status 'done'\u001b[0m\n", - "\u001b[34mSuccessfully installed mnist-1.0.0\u001b[0m\n", - "\u001b[34mWARNING: You are using pip version 19.1.1, however version 20.2.3 is available.\u001b[0m\n", - "\u001b[34mYou should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", - "\u001b[34m2020-09-18 17:01:01,093 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m2020-09-18 17:01:01,109 sagemaker-containers INFO Invoking user script\n", - "\u001b[0m\n", - "\u001b[34mTraining Env:\n", - "\u001b[0m\n", - "\u001b[34m{\n", - " \"additional_framework_parameters\": {},\n", - " \"channel_input_dirs\": {\n", - " \"test\": \"/opt/ml/input/data/test\",\n", - " \"train\": \"/opt/ml/input/data/train\"\n", - " },\n", - " \"current_host\": \"algo-1\",\n", - " \"framework_module\": \"sagemaker_mxnet_container.training:main\",\n", - " \"hosts\": [\n", - " \"algo-1\"\n", - " ],\n", - " \"hyperparameters\": {\n", - " \"learning-rate\": 0.1\n", - " },\n", - " \"input_config_dir\": \"/opt/ml/input/config\",\n", - " \"input_data_config\": {\n", - " \"test\": {\n", - " \"TrainingInputMode\": \"File\",\n", - " \"S3DistributionType\": \"FullyReplicated\",\n", - " \"RecordWrapperType\": \"None\"\n", - " },\n", - " \"train\": {\n", - " \"TrainingInputMode\": \"File\",\n", - " 
\"S3DistributionType\": \"FullyReplicated\",\n", - " \"RecordWrapperType\": \"None\"\n", - " }\n", - " },\n", - " \"input_dir\": \"/opt/ml/input\",\n", - " \"is_master\": true,\n", - " \"job_name\": \"cnn-training-job-1600448298\",\n", - " \"log_level\": 20,\n", - " \"master_hostname\": \"algo-1\",\n", - " \"model_dir\": \"/opt/ml/model\",\n", - " \"module_dir\": \"s3://sagemaker-experiments-us-east-2-707662012936/mxnet-mnist-example/code/cnn-training-job-1600448298/source/sourcedir.tar.gz\",\n", - " \"module_name\": \"mnist\",\n", - " \"network_interface_name\": \"eth0\",\n", - " \"num_cpus\": 4,\n", - " \"num_gpus\": 0,\n", - " \"output_data_dir\": \"/opt/ml/output/data\",\n", - " \"output_dir\": \"/opt/ml/output\",\n", - " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", - " \"resource_config\": {\n", - " \"current_host\": \"algo-1\",\n", - " \"hosts\": [\n", - " \"algo-1\"\n", - " ],\n", - " \"network_interface_name\": \"eth0\"\n", - " },\n", - " \"user_entry_point\": \"mnist.py\"\u001b[0m\n", - "\u001b[34m}\n", - "\u001b[0m\n", - "\u001b[34mEnvironment variables:\n", - "\u001b[0m\n", - "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", - "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", - "\u001b[34mSM_HPS={\"learning-rate\":0.1}\u001b[0m\n", - "\u001b[34mSM_USER_ENTRY_POINT=mnist.py\u001b[0m\n", - "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", - "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", - "\u001b[34mSM_INPUT_DATA_CONFIG={\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", - "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", - "\u001b[34mSM_CHANNELS=[\"test\",\"train\"]\u001b[0m\n", - "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", - 
"\u001b[34mSM_MODULE_NAME=mnist\u001b[0m\n", - "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", - "\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_mxnet_container.training:main\u001b[0m\n", - "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", - "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", - "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", - "\u001b[34mSM_NUM_CPUS=4\u001b[0m\n", - "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", - "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", - "\u001b[34mSM_MODULE_DIR=s3://sagemaker-experiments-us-east-2-707662012936/mxnet-mnist-example/code/cnn-training-job-1600448298/source/sourcedir.tar.gz\u001b[0m\n", - "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"test\":\"/opt/ml/input/data/test\",\"train\":\"/opt/ml/input/data/train\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_mxnet_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"learning-rate\":0.1},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"cnn-training-job-1600448298\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-experiments-us-east-2-707662012936/mxnet-mnist-example/code/cnn-training-job-1600448298/source/sourcedir.tar.gz\",\"module_name\":\"mnist\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"mnist.py\"}\u001b[0m\n", - 
"\u001b[34mSM_USER_ARGS=[\"--learning-rate\",\"0.1\"]\u001b[0m\n", - "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", - "\u001b[34mSM_CHANNEL_TEST=/opt/ml/input/data/test\u001b[0m\n", - "\u001b[34mSM_CHANNEL_TRAIN=/opt/ml/input/data/train\u001b[0m\n", - "\u001b[34mSM_HP_LEARNING-RATE=0.1\u001b[0m\n", - "\u001b[34mPYTHONPATH=/usr/local/bin:/usr/local/lib/python36.zip:/usr/local/lib/python3.6:/usr/local/lib/python3.6/lib-dynload:/usr/local/lib/python3.6/site-packages\n", - "\u001b[0m\n", - "\u001b[34mInvoking script with the following command:\n", - "\u001b[0m\n", - "\u001b[34m/usr/local/bin/python3.6 -m mnist --learning-rate 0.1\n", - "\n", - "\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Batch [0-100]#011Speed: 50205.03 samples/sec#011accuracy=0.109109\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Batch [100-200]#011Speed: 56347.94 samples/sec#011accuracy=0.112500\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Batch [200-300]#011Speed: 56731.99 samples/sec#011accuracy=0.114400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Batch [300-400]#011Speed: 55335.14 samples/sec#011accuracy=0.112100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Batch [400-500]#011Speed: 54007.85 samples/sec#011accuracy=0.111500\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Train-accuracy=0.131767\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Time cost=1.168\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Validation-accuracy=0.361700\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Batch [0-100]#011Speed: 46970.64 samples/sec#011accuracy=0.485149\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Batch [100-200]#011Speed: 51159.84 samples/sec#011accuracy=0.671800\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Batch [200-300]#011Speed: 51225.45 samples/sec#011accuracy=0.772000\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Batch [300-400]#011Speed: 49382.28 samples/sec#011accuracy=0.802200\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Batch [400-500]#011Speed: 54041.46 
samples/sec#011accuracy=0.821100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Train-accuracy=0.732617\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Time cost=1.161\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Validation-accuracy=0.841000\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Batch [0-100]#011Speed: 47934.41 samples/sec#011accuracy=0.855743\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Batch [100-200]#011Speed: 43980.94 samples/sec#011accuracy=0.873200\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Batch [200-300]#011Speed: 46539.72 samples/sec#011accuracy=0.890400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Batch [300-400]#011Speed: 56658.80 samples/sec#011accuracy=0.901300\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Batch [400-500]#011Speed: 60301.89 samples/sec#011accuracy=0.904900\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Train-accuracy=0.889467\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Time cost=1.177\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Validation-accuracy=0.919500\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Batch [0-100]#011Speed: 44645.67 samples/sec#011accuracy=0.921584\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Batch [100-200]#011Speed: 50707.90 samples/sec#011accuracy=0.926800\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Batch [200-300]#011Speed: 55945.46 samples/sec#011accuracy=0.929800\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Batch [300-400]#011Speed: 53771.68 samples/sec#011accuracy=0.931100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Batch [400-500]#011Speed: 58883.54 samples/sec#011accuracy=0.932000\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Train-accuracy=0.929450\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Time cost=1.144\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Validation-accuracy=0.939000\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Batch [0-100]#011Speed: 49187.93 samples/sec#011accuracy=0.942376\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Batch [100-200]#011Speed: 61187.84 
samples/sec#011accuracy=0.943200\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Batch [200-300]#011Speed: 56989.14 samples/sec#011accuracy=0.945400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Batch [300-400]#011Speed: 55916.23 samples/sec#011accuracy=0.947000\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Batch [400-500]#011Speed: 52112.93 samples/sec#011accuracy=0.948400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Train-accuracy=0.946367\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Time cost=1.107\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Validation-accuracy=0.953300\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Batch [0-100]#011Speed: 46316.68 samples/sec#011accuracy=0.958812\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Batch [100-200]#011Speed: 57687.36 samples/sec#011accuracy=0.956300\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Batch [200-300]#011Speed: 60239.45 samples/sec#011accuracy=0.955200\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Batch [300-400]#011Speed: 42946.92 samples/sec#011accuracy=0.956400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Batch [400-500]#011Speed: 58556.36 samples/sec#011accuracy=0.957900\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Train-accuracy=0.957400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Time cost=1.144\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Validation-accuracy=0.956300\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Batch [0-100]#011Speed: 43963.93 samples/sec#011accuracy=0.963663\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Batch [100-200]#011Speed: 59093.51 samples/sec#011accuracy=0.964100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Batch [200-300]#011Speed: 55414.68 samples/sec#011accuracy=0.965300\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Batch [300-400]#011Speed: 57791.73 samples/sec#011accuracy=0.964200\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Batch [400-500]#011Speed: 56011.66 samples/sec#011accuracy=0.963900\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Train-accuracy=0.964483\u001b[0m\n", - 
"\u001b[34mINFO:root:Epoch[6] Time cost=1.125\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Validation-accuracy=0.962200\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Batch [0-100]#011Speed: 49319.63 samples/sec#011accuracy=0.970396\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Batch [100-200]#011Speed: 49068.18 samples/sec#011accuracy=0.972700\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Batch [200-300]#011Speed: 50908.98 samples/sec#011accuracy=0.968400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Batch [300-400]#011Speed: 57289.14 samples/sec#011accuracy=0.968700\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Batch [400-500]#011Speed: 50406.79 samples/sec#011accuracy=0.969500\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Train-accuracy=0.969983\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Time cost=1.167\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Validation-accuracy=0.967600\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Batch [0-100]#011Speed: 49308.79 samples/sec#011accuracy=0.974158\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Batch [100-200]#011Speed: 55016.65 samples/sec#011accuracy=0.972100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Batch [200-300]#011Speed: 56157.89 samples/sec#011accuracy=0.974800\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Batch [300-400]#011Speed: 56100.66 samples/sec#011accuracy=0.974600\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Batch [400-500]#011Speed: 55336.53 samples/sec#011accuracy=0.975700\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Train-accuracy=0.973583\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Time cost=1.153\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Validation-accuracy=0.969100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Batch [0-100]#011Speed: 49495.16 samples/sec#011accuracy=0.976931\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Batch [100-200]#011Speed: 56176.17 samples/sec#011accuracy=0.978100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Batch [200-300]#011Speed: 54348.38 
samples/sec#011accuracy=0.977700\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Batch [300-400]#011Speed: 55923.31 samples/sec#011accuracy=0.974100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Batch [400-500]#011Speed: 55544.87 samples/sec#011accuracy=0.976800\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Train-accuracy=0.976783\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Time cost=1.097\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Validation-accuracy=0.971200\u001b[0m\n", - "\u001b[34m2020-09-18 17:01:20,526 sagemaker-containers INFO Reporting training SUCCESS\u001b[0m\n", - "\n", - "2020-09-18 17:01:30 Uploading - Uploading generated training model\n", - "2020-09-18 17:01:30 Completed - Training job completed\n", - "Training seconds: 76\n", - "Billable seconds: 76\n" - ] - } - ], - "source": [ - "# all input configurations, parameters, and metrics specified in estimator \n", - "# definition are automatically tracked\n", - "from sagemaker.mxnet import MXNet\n", - "\n", - "estimator = MXNet(entry_point='mnist.py',\n", - " role=role,\n", - " output_path=model_artifacts_location,\n", - " code_location=custom_code_upload_location,\n", - " instance_count=1,\n", - " instance_type='ml.m4.xlarge',\n", - " framework_version='1.4.1',\n", - " py_version='py3',\n", - " #distributions={'parameter_server': {'enabled': True}},\n", - " hyperparameters={'learning-rate': 0.1})\n", - "\n", - "cnn_training_job_name = \"cnn-training-job-{}\".format(int(time.time()))\n", - "\n", - "# Now associate the estimator with the Experiment and Trial\n", - "estimator.fit(\n", - " inputs={'train': train_data_location, 'test': test_data_location},\n", - " job_name=cnn_training_job_name,\n", - " experiment_config={\n", - " \"TrialName\": cnn_trial.trial_name,\n", - " \"TrialComponentDisplayName\": \"Training\",\n", - " },\n", - " wait=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### View the Lineage Data\n", - "\n", - "Now we will traverse the lineage 
metadata auto generated by SageMaker for the previously created training job." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.lineage.association import Association" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Lineage entities upstream from arn:aws:sagemaker:us-east-2:707662012936:experiment-trial-component/cnn-training-job-1600448298-aws-training-job:\n", - "arn:aws:sagemaker:us-east-2:707662012936:artifact/670e8b0c6229188481e498f852cad181\n", - "arn:aws:sagemaker:us-east-2:707662012936:artifact/9f4e6150b0cb6b66b70038d62c60c287\n", - "arn:aws:sagemaker:us-east-2:707662012936:artifact/faa0f168c72092323e13042098253e44\n", - "\n", - "Lineage entities downstream from arn:aws:sagemaker:us-east-2:707662012936:experiment-trial-component/cnn-training-job-1600448298-aws-training-job:\n" - ] - } - ], - "source": [ - "trial_component_name = cnn_training_job_name + '-aws-training-job'\n", - "trial_component = TrialComponent.load(trial_component_name=trial_component_name, sagemaker_boto_client=sm)\n", - "tc_arn = trial_component.trial_component_arn\n", - "\n", - "# Incoming Associations\n", - "incoming_associations = Association.list(destination_arn=tc_arn, sagemaker_boto_client=sm)\n", - "\n", - "print(f'\\nLineage entities upstream from {tc_arn}:')\n", - "for association in incoming_associations:\n", - " print(association.source_arn)\n", - "\n", - "# Outgoing Assocaitions\n", - "outgoing_associations = Association.list(source_arn=tc_arn, sagemaker_boto_client=sm)\n", - "print(f'\\nLineage entities downstream from {tc_arn}:')\n", - "for association in outgoing_associations:\n", - " print(association.destination_arn)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "image/png": { - "height": 573, - "width": 572 - } - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "image/png": { - "height": 573, - "width": 572 - } - }, - "output_type": "display_data" - } - ], - "source": [ - "%run lineage_visualizer.py\n", - "\n", - "import lineage_visualizer\n", - "\n", - "# plot a rough visualization of the linaege artifacts\n", - "vis = LineageVisualizer(sm)\n", - "vis.upstream(tc_arn)\n", - "\n", - "vis.downstream(tc_arn)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "!!python/object:networkx.classes.digraph.DiGraph\n", - "_adj: &id001 {}\n", - "_node: &id003 {}\n", - "_pred: {}\n", - "_succ: *id001\n", - "adjlist_inner_dict_factory: &id002 !!python/name:builtins.dict ''\n", - "adjlist_outer_dict_factory: *id002\n", - "edge_attr_dict_factory: *id002\n", - "graph: {}\n", - "graph_attr_dict_factory: *id002\n", - "node_attr_dict_factory: *id002\n", - "node_dict_factory: *id002\n", - "nodes: !!python/object:networkx.classes.reportviews.NodeView\n", - " _nodes: *id003\n", - "\n" - ] - } - ], - "source": [ - "# represent lineage as yaml\n", - "file_name = vis.write_yaml()\n", - "f = open(file_name, \"r\")\n", - "print(f.read())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "jupytext": { - "formats": "ipynb,md" - }, - "kernelspec": { - "display_name": "conda_mxnet_p36", - "language": "python", - "name": "conda_mxnet_p36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/lineage/lineage_tracking_example.ipynb b/lineage/lineage_tracking_example.ipynb deleted file mode 100644 index f594997150..0000000000 --- 
a/lineage/lineage_tracking_example.ipynb +++ /dev/null @@ -1,611 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Lineage Tracking and Traversal Example\n", - "\n", - "SageMaker Lineage makes it easy to track all the artifacts created in a machine learning workflow\n", - " from start to finish.\n", - "\n", - "The [SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable) is an SDK to train and \n", - "deploy Apache MXNet models. In this example, we train a simple neural network using the Apache MXNet [Module API](https://mxnet.apache.org/api/python/module/module.html) and the MNIST dataset. \n", - "\n", - "\n", - "Make sure you selected `conda_mxnet_p36` kernel." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure:\n", - "* your account has been whitelisted\n", - "* your execution role has the appropriate trusts" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "# Import Private Beta SDK.\n", - "!{sys.executable} -m pip install -q -U pip\n", - "!{sys.executable} -m pip install -q sagemaker-2.6.1.dev0.tar.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import IPython\n", - "#may need to restart the kernel after initial install of beta sdk\n", - "#IPython.Application.instance().kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker import get_execution_role\n", - "from sagemaker.session import Session\n", - "from sagemaker.lineage import context, artifact, association, action\n", - "import boto3\n", - "from datetime import datetime\n", - "import logging\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# lineage beta only available in CMH\n", - "region = 
'us-east-2'\n", - "\n", - "# S3 bucket for saving code and model artifacts.\n", - "# Feel free to specify a different bucket here if you wish.\n", - "bucket = Session().default_bucket()\n", - "boto_session = boto3.Session(region_name=region)\n", - "sagemaker_client = boto_session.client(\"sagemaker\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# Bucket location where your custom code will be saved in the tar.gz format.\n", - "custom_code_upload_location = 's3://{}/mxnet-mnist-example/code'.format(bucket)\n", - "list_response = list(artifact.Artifact.list(source_uri=custom_code_upload_location, sagemaker_boto_client=sagemaker_client))\n", - "\n", - "if len(list_response):\n", - " code_artifact_arn = list_response[0].artifact_arn\n", - "else:\n", - " code_artifact_arn = artifact.Artifact.create(\n", - " artifact_name='SourceCodeLocation',\n", - " source_uri=custom_code_upload_location,\n", - " artifact_type='codelocation',\n", - " sagemaker_boto_client=sagemaker_client\n", - " ).artifact_arn\n", - "\n", - "# Bucket location where results of model training are saved.\n", - "model_artifacts_location = 's3://{}/mxnet-mnist-example/artifacts'.format(bucket)\n", - "list_response = list(artifact.Artifact.list(source_uri=model_artifacts_location, sagemaker_boto_client=sagemaker_client))\n", - "if len(list_response):\n", - " model_location_artifact_arn = list_response[0].artifact_arn\n", - "else:\n", - " model_location_artifact_arn = artifact.Artifact.create(\n", - " artifact_name='model-artifacts-location',\n", - " source_uri=model_artifacts_location,\n", - " artifact_type='model-artifacts-location',\n", - " sagemaker_boto_client=sagemaker_client,\n", - " ).artifact_arn\n", - "\n", - "# IAM execution role that gives SageMaker access to resources in your AWS account.\n", - "# We can use the SageMaker 
Python SDK to get the role from our notebook environment. \n", - "role = get_execution_role()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### The training script\n", - "\n", - "The `mnist.py` script provides all the code we need for training and hosting a SageMaker model. The script also checkpoints the model at the end of every epoch and saves the model graph, params and optimizer state in the folder `/opt/ml/checkpoints`. If the folder path does not exist then it skips checkpointing. The script we use is adaptated from Apache MXNet [MNIST tutorial](https://mxnet.incubator.apache.org/tutorials/python/mnist.html).\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### SageMaker's MXNet estimator class" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.mxnet import MXNet\n", - "\n", - "mnist_estimator = MXNet(entry_point='mnist.py',\n", - " role=role,\n", - " output_path=model_artifacts_location,\n", - " code_location=custom_code_upload_location,\n", - " instance_count=1,\n", - " instance_type='ml.m4.xlarge',\n", - " framework_version='1.4.1',\n", - " py_version='py3',\n", - " #distributions={'parameter_server': {'enabled': True}},\n", - " hyperparameters={'learning-rate': 0.1})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Running the Training Job" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After we've constructed our MXNet object, we can fit it using data stored in S3. Below we run SageMaker training on two input channels: **train** and **test**.\n", - "\n", - "During training, SageMaker makes this data stored in S3 available in the local filesystem where the mnist script is running. The ```mnist.py``` script simply loads the train and test data from disk." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2020-09-18 19:11:17 Starting - Starting the training job...\n", - "2020-09-18 19:11:19 Starting - Launching requested ML instances......\n", - "2020-09-18 19:12:24 Starting - Preparing the instances for training......\n", - "2020-09-18 19:13:39 Downloading - Downloading input data\n", - "2020-09-18 19:13:39 Training - Downloading the training image...\n", - "2020-09-18 19:13:59 Training - Training image download completed. Training in progress.\u001b[34m2020-09-18 19:14:00,197 sagemaker-containers INFO Imported framework sagemaker_mxnet_container.training\u001b[0m\n", - "\u001b[34m2020-09-18 19:14:00,201 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m2020-09-18 19:14:00,215 sagemaker_mxnet_container.training INFO MXNet training environment: {'SM_HOSTS': '[\"algo-1\"]', 'SM_NETWORK_INTERFACE_NAME': 'eth0', 'SM_HPS': '{\"learning-rate\":0.1}', 'SM_USER_ENTRY_POINT': 'mnist.py', 'SM_FRAMEWORK_PARAMS': '{}', 'SM_RESOURCE_CONFIG': '{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}', 'SM_INPUT_DATA_CONFIG': '{\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}', 'SM_OUTPUT_DATA_DIR': '/opt/ml/output/data', 'SM_CHANNELS': '[\"test\",\"train\"]', 'SM_CURRENT_HOST': 'algo-1', 'SM_MODULE_NAME': 'mnist', 'SM_LOG_LEVEL': '20', 'SM_FRAMEWORK_MODULE': 'sagemaker_mxnet_container.training:main', 'SM_INPUT_DIR': '/opt/ml/input', 'SM_INPUT_CONFIG_DIR': '/opt/ml/input/config', 'SM_OUTPUT_DIR': '/opt/ml/output', 'SM_NUM_CPUS': '4', 'SM_NUM_GPUS': '0', 'SM_MODEL_DIR': '/opt/ml/model', 'SM_MODULE_DIR': 
's3://sagemaker-us-east-2-707662012936/mxnet-mnist-example/code/mxnet-training-2020-09-18-19-11-17-138/source/sourcedir.tar.gz', 'SM_TRAINING_ENV': '{\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"test\":\"/opt/ml/input/data/test\",\"train\":\"/opt/ml/input/data/train\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_mxnet_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"learning-rate\":0.1},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"mxnet-training-2020-09-18-19-11-17-138\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-us-east-2-707662012936/mxnet-mnist-example/code/mxnet-training-2020-09-18-19-11-17-138/source/sourcedir.tar.gz\",\"module_name\":\"mnist\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"mnist.py\"}', 'SM_USER_ARGS': '[\"--learning-rate\",\"0.1\"]', 'SM_OUTPUT_INTERMEDIATE_DIR': '/opt/ml/output/intermediate', 'SM_CHANNEL_TEST': '/opt/ml/input/data/test', 'SM_CHANNEL_TRAIN': '/opt/ml/input/data/train', 'SM_HP_LEARNING-RATE': '0.1'}\u001b[0m\n", - "\u001b[34m2020-09-18 19:14:00,481 sagemaker-containers INFO Module mnist does not provide a setup.py. 
\u001b[0m\n", - "\u001b[34mGenerating setup.py\u001b[0m\n", - "\u001b[34m2020-09-18 19:14:00,481 sagemaker-containers INFO Generating setup.cfg\u001b[0m\n", - "\u001b[34m2020-09-18 19:14:00,481 sagemaker-containers INFO Generating MANIFEST.in\u001b[0m\n", - "\u001b[34m2020-09-18 19:14:00,481 sagemaker-containers INFO Installing module with the following command:\u001b[0m\n", - "\u001b[34m/usr/local/bin/python3.6 -m pip install -U . \u001b[0m\n", - "\u001b[34mProcessing /opt/ml/code\u001b[0m\n", - "\u001b[34mInstalling collected packages: mnist\n", - " Running setup.py install for mnist: started\n", - " Running setup.py install for mnist: finished with status 'done'\u001b[0m\n", - "\u001b[34mSuccessfully installed mnist-1.0.0\u001b[0m\n", - "\u001b[34mWARNING: You are using pip version 19.1.1, however version 20.2.3 is available.\u001b[0m\n", - "\u001b[34mYou should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", - "\u001b[34m2020-09-18 19:14:02,203 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m2020-09-18 19:14:02,220 sagemaker-containers INFO Invoking user script\n", - "\u001b[0m\n", - "\u001b[34mTraining Env:\n", - "\u001b[0m\n", - "\u001b[34m{\n", - " \"additional_framework_parameters\": {},\n", - " \"channel_input_dirs\": {\n", - " \"test\": \"/opt/ml/input/data/test\",\n", - " \"train\": \"/opt/ml/input/data/train\"\n", - " },\n", - " \"current_host\": \"algo-1\",\n", - " \"framework_module\": \"sagemaker_mxnet_container.training:main\",\n", - " \"hosts\": [\n", - " \"algo-1\"\n", - " ],\n", - " \"hyperparameters\": {\n", - " \"learning-rate\": 0.1\n", - " },\n", - " \"input_config_dir\": \"/opt/ml/input/config\",\n", - " \"input_data_config\": {\n", - " \"test\": {\n", - " \"TrainingInputMode\": \"File\",\n", - " \"S3DistributionType\": \"FullyReplicated\",\n", - " \"RecordWrapperType\": \"None\"\n", - " },\n", - " \"train\": {\n", - " \"TrainingInputMode\": \"File\",\n", - " 
\"S3DistributionType\": \"FullyReplicated\",\n", - " \"RecordWrapperType\": \"None\"\n", - " }\n", - " },\n", - " \"input_dir\": \"/opt/ml/input\",\n", - " \"is_master\": true,\n", - " \"job_name\": \"mxnet-training-2020-09-18-19-11-17-138\",\n", - " \"log_level\": 20,\n", - " \"master_hostname\": \"algo-1\",\n", - " \"model_dir\": \"/opt/ml/model\",\n", - " \"module_dir\": \"s3://sagemaker-us-east-2-707662012936/mxnet-mnist-example/code/mxnet-training-2020-09-18-19-11-17-138/source/sourcedir.tar.gz\",\n", - " \"module_name\": \"mnist\",\n", - " \"network_interface_name\": \"eth0\",\n", - " \"num_cpus\": 4,\n", - " \"num_gpus\": 0,\n", - " \"output_data_dir\": \"/opt/ml/output/data\",\n", - " \"output_dir\": \"/opt/ml/output\",\n", - " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", - " \"resource_config\": {\n", - " \"current_host\": \"algo-1\",\n", - " \"hosts\": [\n", - " \"algo-1\"\n", - " ],\n", - " \"network_interface_name\": \"eth0\"\n", - " },\n", - " \"user_entry_point\": \"mnist.py\"\u001b[0m\n", - "\u001b[34m}\n", - "\u001b[0m\n", - "\u001b[34mEnvironment variables:\n", - "\u001b[0m\n", - "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", - "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", - "\u001b[34mSM_HPS={\"learning-rate\":0.1}\u001b[0m\n", - "\u001b[34mSM_USER_ENTRY_POINT=mnist.py\u001b[0m\n", - "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", - "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", - "\u001b[34mSM_INPUT_DATA_CONFIG={\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", - "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", - "\u001b[34mSM_CHANNELS=[\"test\",\"train\"]\u001b[0m\n", - "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", - 
"\u001b[34mSM_MODULE_NAME=mnist\u001b[0m\n", - "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", - "\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_mxnet_container.training:main\u001b[0m\n", - "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", - "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", - "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", - "\u001b[34mSM_NUM_CPUS=4\u001b[0m\n", - "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", - "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", - "\u001b[34mSM_MODULE_DIR=s3://sagemaker-us-east-2-707662012936/mxnet-mnist-example/code/mxnet-training-2020-09-18-19-11-17-138/source/sourcedir.tar.gz\u001b[0m\n", - "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"test\":\"/opt/ml/input/data/test\",\"train\":\"/opt/ml/input/data/train\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_mxnet_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"learning-rate\":0.1},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"test\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"mxnet-training-2020-09-18-19-11-17-138\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-us-east-2-707662012936/mxnet-mnist-example/code/mxnet-training-2020-09-18-19-11-17-138/source/sourcedir.tar.gz\",\"module_name\":\"mnist\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"mnist.py\"}\u001b[0m\n", 
- "\u001b[34mSM_USER_ARGS=[\"--learning-rate\",\"0.1\"]\u001b[0m\n", - "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", - "\u001b[34mSM_CHANNEL_TEST=/opt/ml/input/data/test\u001b[0m\n", - "\u001b[34mSM_CHANNEL_TRAIN=/opt/ml/input/data/train\u001b[0m\n", - "\u001b[34mSM_HP_LEARNING-RATE=0.1\u001b[0m\n", - "\u001b[34mPYTHONPATH=/usr/local/bin:/usr/local/lib/python36.zip:/usr/local/lib/python3.6:/usr/local/lib/python3.6/lib-dynload:/usr/local/lib/python3.6/site-packages\n", - "\u001b[0m\n", - "\u001b[34mInvoking script with the following command:\n", - "\u001b[0m\n", - "\u001b[34m/usr/local/bin/python3.6 -m mnist --learning-rate 0.1\n", - "\n", - "\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Batch [0-100]#011Speed: 48537.71 samples/sec#011accuracy=0.105248\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Batch [100-200]#011Speed: 52232.08 samples/sec#011accuracy=0.117400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Batch [200-300]#011Speed: 52729.16 samples/sec#011accuracy=0.112400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Batch [300-400]#011Speed: 58271.46 samples/sec#011accuracy=0.111800\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Batch [400-500]#011Speed: 51936.64 samples/sec#011accuracy=0.111900\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Train-accuracy=0.132550\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Time cost=1.159\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[0] Validation-accuracy=0.350200\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Batch [0-100]#011Speed: 46740.22 samples/sec#011accuracy=0.473168\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Batch [100-200]#011Speed: 59956.92 samples/sec#011accuracy=0.666000\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Batch [200-300]#011Speed: 59902.97 samples/sec#011accuracy=0.763500\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Batch [300-400]#011Speed: 57584.56 samples/sec#011accuracy=0.799100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Batch [400-500]#011Speed: 59448.98 
samples/sec#011accuracy=0.829800\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Train-accuracy=0.727917\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Time cost=1.065\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[1] Validation-accuracy=0.859700\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Batch [0-100]#011Speed: 47147.88 samples/sec#011accuracy=0.857723\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Batch [100-200]#011Speed: 48038.83 samples/sec#011accuracy=0.868600\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Batch [200-300]#011Speed: 51587.54 samples/sec#011accuracy=0.889400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Batch [300-400]#011Speed: 54230.68 samples/sec#011accuracy=0.899400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Batch [400-500]#011Speed: 49632.21 samples/sec#011accuracy=0.905200\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Train-accuracy=0.888900\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Time cost=1.195\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[2] Validation-accuracy=0.920000\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Batch [0-100]#011Speed: 39163.75 samples/sec#011accuracy=0.920594\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Batch [100-200]#011Speed: 55325.51 samples/sec#011accuracy=0.920200\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Batch [200-300]#011Speed: 58630.19 samples/sec#011accuracy=0.928000\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Batch [300-400]#011Speed: 57232.00 samples/sec#011accuracy=0.929900\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Batch [400-500]#011Speed: 58584.25 samples/sec#011accuracy=0.930200\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Train-accuracy=0.927933\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Time cost=1.152\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[3] Validation-accuracy=0.937500\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Batch [0-100]#011Speed: 47603.31 samples/sec#011accuracy=0.943762\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Batch [100-200]#011Speed: 54678.19 
samples/sec#011accuracy=0.944800\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Batch [200-300]#011Speed: 55427.86 samples/sec#011accuracy=0.943600\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Batch [300-400]#011Speed: 52224.08 samples/sec#011accuracy=0.947500\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Batch [400-500]#011Speed: 59256.14 samples/sec#011accuracy=0.949900\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Train-accuracy=0.947067\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Time cost=1.113\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[4] Validation-accuracy=0.951700\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Batch [0-100]#011Speed: 48969.19 samples/sec#011accuracy=0.956238\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Batch [100-200]#011Speed: 61169.63 samples/sec#011accuracy=0.955600\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Batch [200-300]#011Speed: 51660.99 samples/sec#011accuracy=0.955900\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Batch [300-400]#011Speed: 50272.49 samples/sec#011accuracy=0.959100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Batch [400-500]#011Speed: 56537.68 samples/sec#011accuracy=0.959400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Train-accuracy=0.957833\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Time cost=1.135\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[5] Validation-accuracy=0.957800\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Batch [0-100]#011Speed: 48367.56 samples/sec#011accuracy=0.965644\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Batch [100-200]#011Speed: 52162.12 samples/sec#011accuracy=0.964000\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Batch [200-300]#011Speed: 42420.57 samples/sec#011accuracy=0.965700\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Batch [300-400]#011Speed: 58030.24 samples/sec#011accuracy=0.963800\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Batch [400-500]#011Speed: 59894.67 samples/sec#011accuracy=0.967100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Train-accuracy=0.965133\u001b[0m\n", - 
"\u001b[34mINFO:root:Epoch[6] Time cost=1.186\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[6] Validation-accuracy=0.964100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Batch [0-100]#011Speed: 45140.09 samples/sec#011accuracy=0.971188\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Batch [100-200]#011Speed: 52699.15 samples/sec#011accuracy=0.966600\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Batch [200-300]#011Speed: 49742.93 samples/sec#011accuracy=0.969700\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Batch [300-400]#011Speed: 55433.43 samples/sec#011accuracy=0.968400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Batch [400-500]#011Speed: 49421.79 samples/sec#011accuracy=0.968600\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Train-accuracy=0.969383\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Time cost=1.210\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[7] Validation-accuracy=0.965900\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Batch [0-100]#011Speed: 46661.86 samples/sec#011accuracy=0.973168\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Batch [100-200]#011Speed: 55705.76 samples/sec#011accuracy=0.971900\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Batch [200-300]#011Speed: 57230.98 samples/sec#011accuracy=0.974000\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Batch [300-400]#011Speed: 60181.19 samples/sec#011accuracy=0.974400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Batch [400-500]#011Speed: 59841.18 samples/sec#011accuracy=0.971400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Train-accuracy=0.973050\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Time cost=1.087\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[8] Validation-accuracy=0.969000\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Batch [0-100]#011Speed: 37346.64 samples/sec#011accuracy=0.977228\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Batch [100-200]#011Speed: 60791.24 samples/sec#011accuracy=0.976100\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Batch [200-300]#011Speed: 56732.68 
samples/sec#011accuracy=0.975800\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Batch [300-400]#011Speed: 56863.21 samples/sec#011accuracy=0.976400\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Batch [400-500]#011Speed: 43948.87 samples/sec#011accuracy=0.976000\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Train-accuracy=0.976433\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Time cost=1.205\u001b[0m\n", - "\u001b[34mINFO:root:Epoch[9] Validation-accuracy=0.969100\u001b[0m\n", - "\u001b[34m2020-09-18 19:14:21,054 sagemaker-containers INFO Reporting training SUCCESS\u001b[0m\n", - "\n", - "2020-09-18 19:14:32 Uploading - Uploading generated training model\n", - "2020-09-18 19:14:32 Completed - Training job completed\n", - "Training seconds: 71\n", - "Billable seconds: 71\n", - "CPU times: user 534 ms, sys: 33.9 ms, total: 567 ms\n", - "Wall time: 3min 41s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "train_data_location = 's3://sagemaker-sample-data-{}/mxnet/mnist/train'.format(region)\n", - "test_data_location = 's3://sagemaker-sample-data-{}/mxnet/mnist/test'.format(region)\n", - "\n", - "mnist_estimator.fit({'train': train_data_location, 'test': test_data_location})" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "list_response = list(artifact.Artifact.list(source_uri=train_data_location, sagemaker_boto_client=sagemaker_client))\n", - "if len(list_response):\n", - " train_data_location_artifact_arn = list_response[0].artifact_arn\n", - "else:\n", - " train_data_location_artifact_arn = artifact.Artifact.create(\n", - " artifact_name='train-data',\n", - " artifact_type='TrainingData',\n", - " source_uri=train_data_location,\n", - " sagemaker_boto_client=sagemaker_client,\n", - " ).artifact_arn\n", - "\n", - "list_response = list(artifact.Artifact.list(source_uri=test_data_location, sagemaker_boto_client=sagemaker_client))\n", - "if len(list_response):\n", - " test_data_location_artifact_arn = 
list_response[0].artifact_arn\n", - "else:\n", - " test_data_location_artifact_arn = artifact.Artifact.create(\n", - " artifact_name='test-data',\n", - " artifact_type='TestData',\n", - " source_uri=test_data_location,\n", - " sagemaker_boto_client=sagemaker_client,\n", - " ).artifact_arn" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# associate the artifacts\n", - "\n", - "training_job_name = mnist_estimator.latest_training_job.job_name\n", - "\n", - "trial_component = sagemaker_client.describe_trial_component(TrialComponentName=training_job_name + '-aws-training-job')\n", - "trial_component_arn=trial_component['TrialComponentArn']\n", - "\n", - "input_artifacts = [code_artifact_arn, train_data_location_artifact_arn, test_data_location_artifact_arn]\n", - "for artifact_arn in input_artifacts:\n", - " try:\n", - " association.Association.create(\n", - " source_arn=artifact_arn,\n", - " destination_arn=trial_component_arn,\n", - " association_type='ContributedTo',\n", - " sagemaker_boto_client=sagemaker_client,\n", - " )\n", - " except:\n", - " logging.info('association between {} and {} already exists', artifact_arn, trial_component_arn)\n", - "\n", - "output_artifacts = [model_location_artifact_arn]\n", - "for artifact_arn in output_artifacts:\n", - " try:\n", - " association.Association.create(\n", - " source_arn=trial_component_arn,\n", - " destination_arn=artifact_arn,\n", - " association_type='Produced',\n", - " sagemaker_boto_client=sagemaker_client,\n", - " )\n", - " except:\n", - " logging.info('association between {} and {} already exists', artifact_arn, trial_component_arn)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Creating an inference Endpoint\n", - "\n", - "After training, we use the ``MXNet estimator`` object to build and deploy an 
``MXNetPredictor``. This creates a Sagemaker **Endpoint** -- a hosted prediction service that we can use to perform inference. \n", - "\n", - "The arguments to the ``deploy`` function allow us to set the number and type of instances that will be used for the Endpoint. These do not need to be the same as the values we used for the training job. For example, you can train a model on a set of GPU-based instances, and then deploy the Endpoint to a fleet of CPU-based instances. Here we will deploy the model to a single ``ml.m4.xlarge`` instance." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-------------!CPU times: user 368 ms, sys: 12.5 ms, total: 380 ms\n", - "Wall time: 6min 32s\n" - ] - } - ], - "source": [ - "%%time\n", - "\n", - "predictor = mnist_estimator.deploy(initial_instance_count=1,\n", - " instance_type='ml.m4.xlarge')" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Association(sagemaker_boto_client=,source_arn='arn:aws:sagemaker:us-east-2:707662012936:experiment-trial-component/mxnet-training-2020-09-18-19-07-09-802-aws-training-job',destination_arn='arn:aws:sagemaker:us-east-2:707662012936:context/mxnet-training-2020-09-18-19-21-54-609',association_type=None,response_metadata={'RequestId': 'd6391fc9-edd2-43b9-8338-d996287f1140', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'd6391fc9-edd2-43b9-8338-d996287f1140', 'content-type': 'application/x-amz-json-1.1', 'content-length': '246', 'date': 'Fri, 18 Sep 2020 19:28:54 GMT'}, 'RetryAttempts': 0})" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sagemaker.lineage import context\n", - "\n", - "endpoint = sagemaker_client.describe_endpoint(EndpointName=predictor.endpoint_name)\n", - "endpoint_arn = endpoint['EndpointArn']\n", - "\n", 
- "list_response = list(context.Context.list(source_uri=endpoint_arn, sagemaker_boto_client=sagemaker_client))\n", - "if len(list_response):\n", - " endpoint_context_arn = list_response[0].context_arn\n", - "else:\n", - " endpoint_context_arn = context.Context.create(\n", - " context_name=predictor.endpoint_name,\n", - " context_type='Endpoint',\n", - " source_uri=endpoint_arn,\n", - " sagemaker_boto_client=sagemaker_client, \n", - " ).context_arn\n", - "\n", - "association.Association.create(\n", - " source_arn=trial_component_arn,\n", - " destination_arn=endpoint_context_arn,\n", - " sagemaker_boto_client=sagemaker_client,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "predictor.delete_endpoint()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%run lineage_visualizer.py\n", - "\n", - "import lineage_visualizer\n", - "\n", - "vis = LineageVisualizer(sagemaker_client)\n", - "vis.both(endpoint_context_arn)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "file_name = vis.write_yaml()\n", - "f = open(file_name, \"r\")\n", - "print(f.read())" - ] - } - ], - "metadata": { - "jupytext": { - "formats": "ipynb,md" - }, - "kernelspec": { - "display_name": "conda_mxnet_p36", - "language": "python", - "name": "conda_mxnet_p36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" - }, - "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. 
class LineageVisualizer(object):
    """Collect SageMaker lineage associations around an entity and plot them.

    The visualizer walks ``Association`` records outward from a starting ARN
    (upstream, downstream, or both), turns every association into a directed
    edge of a ``networkx.DiGraph`` and draws the graph with matplotlib.  The
    most recently plotted graph is kept so it can be dumped by ``write_yaml``.
    """

    def __init__(self, sagemaker_client):
        """Store the boto SageMaker client passed to every ``Association.list`` call."""
        self._sm_client = sagemaker_client
        # Graph built by the latest _plot_lineage call; None until something is plotted.
        self._g = None

    def upstream(self, start_arn):
        """Plot every association that (transitively) leads into ``start_arn``."""
        self._plot_lineage(self._get_upstream_lineage(start_arn))

    def downstream(self, start_arn):
        """Plot every association that (transitively) leads out of ``start_arn``."""
        self._plot_lineage(self._get_downstream_lineage(start_arn))

    def both(self, start_arn):
        """Plot the upstream and downstream associations of ``start_arn`` together."""
        combined = []
        combined.extend(self._get_upstream_lineage(start_arn))
        combined.extend(self._get_downstream_lineage(start_arn))
        self._plot_lineage(combined)

    def write_yaml(self):
        """Write the last plotted graph to ``graph_<timestamp>.yaml``.

        Returns:
            The name of the file that was written (relative to the current
            working directory).

        Raises:
            RuntimeError: if no graph has been plotted yet.
        """
        if self._g is None:
            raise RuntimeError(
                "No lineage graph to write; call upstream(), downstream() or both() first."
            )
        file_name = f"graph_{datetime.now().timestamp()}.yaml"
        # NOTE(review): nx.write_yaml appears to have been deprecated and later
        # removed in newer NetworkX releases -- confirm against the pinned version.
        nx.write_yaml(self._g, file_name)
        return file_name

    def _get_upstream_lineage(self, start_arn):
        """Return the associations reachable by following edges backwards.

        Associations are looked up by ``destination_arn`` and the walk then
        continues from each association's ``source_arn``.
        """
        return self._collect_lineage(start_arn, "destination_arn", "source_arn")

    def _get_downstream_lineage(self, start_arn):
        """Return the associations reachable by following edges forwards.

        Associations are looked up by ``source_arn`` and the walk then
        continues from each association's ``destination_arn``.
        """
        return self._collect_lineage(start_arn, "source_arn", "destination_arn")

    def _list_associations(self, **filters):
        """List associations matching ``filters`` using the stored client."""
        return Association.list(sagemaker_boto_client=self._sm_client, **filters)

    def _collect_lineage(self, start_arn, filter_field, next_field):
        """Walk the association graph from ``start_arn`` in one direction.

        Args:
            start_arn: ARN the traversal starts from.
            filter_field: ``Association.list`` keyword the current ARN is
                matched against (``"source_arn"`` or ``"destination_arn"``).
            next_field: attribute of each returned association that holds the
                ARN to explore next.

        Returns:
            A list with every association found, each one exactly once.
        """
        explored = set()
        pending = [start_arn]
        found = []
        while pending:
            arn = pending.pop()
            # Each ARN is queried once: avoids repeated API calls for shared
            # ancestors and guarantees termination if the data contains a cycle.
            if arn in explored:
                continue
            explored.add(arn)
            for association in self._list_associations(**{filter_field: arn}):
                found.append(association)
                pending.append(getattr(association, next_field))
        return found

    @staticmethod
    def _edge_labels(edge):
        """Return the ``(source, destination)`` node labels for an association.

        A label is ``"<type>-(<name>)"`` where ``<name>`` is the second
        ``/``-separated component of the ARN.
        """
        source_name = edge.source_arn.split("/")[1]
        dest_name = edge.destination_arn.split("/")[1]
        return (
            f"{edge.source_type}-({source_name})",
            f"{edge.destination_type}-({dest_name})",
        )

    def _plot_lineage(self, edges):
        """Build a directed graph from ``edges``, remember it, and draw it."""
        graph = nx.DiGraph()
        for edge in edges:
            source_label, dest_label = self._edge_labels(edge)
            # add_edge creates both nodes when they are missing.
            graph.add_edge(source_label, dest_label)
        self._g = graph

        pos = nx.layout.spring_layout(graph)
        nx.draw_networkx_nodes(graph, pos, node_size=500)
        nx.draw_networkx_labels(graph, pos)
        nx.draw_networkx_edges(graph, pos, arrowstyle="->", arrowsize=30, width=1, arrows=True)

        ax = plt.gca()
        ax.patch.set_facecolor("white")
        ax.figure.set_size_inches(10, 10)
        # NOTE(review): "foo" looks like a placeholder title; kept as-is.
        plt.title("foo")
        plt.show()
def load_data(path):
    """Load the gzip-compressed label and image files found under *path*.

    ``labels.gz`` and ``images.gz`` are located anywhere below *path* with
    :func:`find_file`.  The label file starts with two big-endian 32-bit
    header fields followed by one byte per label; the image file starts with
    four big-endian 32-bit header fields (the last two are rows and columns)
    followed by one byte per pixel.

    Returns:
        ``(labels, images)`` where ``labels`` is an ``int8`` array and
        ``images`` is a ``float32`` array of shape ``(n, 1, rows, cols)``
        scaled to ``[0, 1]``.

    Raises:
        FileNotFoundError: if either file cannot be found below *path*.
    """
    labels_path = find_file(path, "labels.gz")
    images_path = find_file(path, "images.gz")
    for wanted, located in (("labels.gz", labels_path), ("images.gz", images_path)):
        if located is None:
            raise FileNotFoundError("Could not find {} under {}".format(wanted, path))

    with gzip.open(labels_path) as flbl:
        struct.unpack(">II", flbl.read(8))  # consume the header; values are not needed
        # frombuffer replaces the deprecated binary mode of np.fromstring;
        # copy() keeps the result writable like the old call did.
        labels = np.frombuffer(flbl.read(), dtype=np.int8).copy()
    with gzip.open(images_path) as fimg:
        _, _, rows, cols = struct.unpack(">IIII", fimg.read(16))
        pixels = np.frombuffer(fimg.read(), dtype=np.uint8)
        images = pixels.reshape(len(labels), 1, rows, cols).astype(np.float32) / 255
    return labels, images


def find_file(root_path, file_name):
    """Return the path of the first *file_name* found below *root_path*.

    Returns ``None`` when no directory under *root_path* contains the file.
    """
    for root, _dirs, files in os.walk(root_path):
        if file_name in files:
            return os.path.join(root, file_name)
    return None


def build_graph():
    """Build the MLP symbol: flatten -> FC(128)+ReLU -> FC(64)+ReLU -> FC(10) -> softmax."""
    data = mx.sym.var("data")
    data = mx.sym.flatten(data=data)
    fc1 = mx.sym.FullyConnected(data=data, num_hidden=128)
    act1 = mx.sym.Activation(data=fc1, act_type="relu")
    fc2 = mx.sym.FullyConnected(data=act1, num_hidden=64)
    act2 = mx.sym.Activation(data=fc2, act_type="relu")
    fc3 = mx.sym.FullyConnected(data=act2, num_hidden=10)
    return mx.sym.SoftmaxOutput(data=fc3, name="softmax")


def get_training_context(num_gpus):
    """Return one GPU context per GPU, or the CPU context when *num_gpus* is 0."""
    if num_gpus:
        return [mx.gpu(i) for i in range(num_gpus)]
    return mx.cpu()


def _shard_bounds(num_samples, hosts, current_host):
    """Return the ``(start, end)`` slice of the training data owned by *current_host*.

    Every host gets ``num_samples // len(hosts)`` consecutive samples, ordered
    by the host's position in *hosts*.

    Raises:
        ValueError: if *current_host* is not listed in *hosts*.
    """
    if current_host not in hosts:
        raise ValueError("current_host {!r} is not in hosts {!r}".format(current_host, hosts))
    shard_size = num_samples // len(hosts)
    start = shard_size * hosts.index(current_host)
    return start, start + shard_size


def train(
    batch_size,
    epochs,
    learning_rate,
    num_gpus,
    training_channel,
    testing_channel,
    hosts,
    current_host,
    model_dir,
):
    """Train the MLP on this host's shard of the data and save the model.

    Args:
        batch_size: mini-batch size for the training and validation iterators.
        epochs: number of epochs passed to ``Module.fit``.
        learning_rate: SGD learning rate.
        num_gpus: number of GPUs to train on; 0 selects the CPU.
        training_channel: directory containing the training files.
        testing_channel: directory containing the validation files.
        hosts: list of all host names taking part in training.
        current_host: name of the host running this process.
        model_dir: directory the trained model is written to.

    Raises:
        ValueError: if *current_host* is not listed in *hosts*.
    """
    checkpoints_dir = "/opt/ml/checkpoints"
    checkpoints_enabled = os.path.exists(checkpoints_dir)

    (train_labels, train_images) = load_data(training_channel)
    (test_labels, test_images) = load_data(testing_channel)

    # Data parallel training - shard the data so each host
    # only trains on a subset of the total data.
    start, end = _shard_bounds(len(train_images), hosts, current_host)

    train_iter = mx.io.NDArrayIter(
        train_images[start:end], train_labels[start:end], batch_size, shuffle=True
    )
    val_iter = mx.io.NDArrayIter(test_images, test_labels, batch_size)

    logging.getLogger().setLevel(logging.DEBUG)

    # A single host trains locally; several hosts synchronise through the kvstore.
    kvstore = "local" if len(hosts) == 1 else "dist_sync"

    mlp_model = mx.mod.Module(symbol=build_graph(), context=get_training_context(num_gpus))

    checkpoint_callback = None
    if checkpoints_enabled:
        # Create a checkpoint callback that checkpoints the model params and
        # the optimizer state at the given path after every epoch.
        checkpoint_callback = mx.callback.module_checkpoint(
            mlp_model, os.path.join(checkpoints_dir, "mnist"), period=1, save_optimizer_states=True
        )
    mlp_model.fit(
        train_iter,
        eval_data=val_iter,
        kvstore=kvstore,
        optimizer="sgd",
        optimizer_params={"learning_rate": learning_rate},
        eval_metric="acc",
        epoch_end_callback=checkpoint_callback,
        batch_end_callback=mx.callback.Speedometer(batch_size, 100),
        num_epoch=epochs,
    )

    # Only the first host writes the model so the artifacts are saved once.
    if current_host == hosts[0]:
        save(model_dir, mlp_model)


def save(model_dir, model):
    """Write the model symbol, parameters and input shapes into *model_dir*.

    Produces ``model-symbol.json``, ``model-0000.params`` and
    ``model-shapes.json`` (a JSON list of ``{"name", "shape"}`` entries, one
    per entry of ``model.data_shapes``).
    """
    model.symbol.save(os.path.join(model_dir, "model-symbol.json"))
    model.save_params(os.path.join(model_dir, "model-0000.params"))

    signature = [
        {"name": data_desc.name, "shape": list(data_desc.shape)}
        for data_desc in model.data_shapes
    ]
    with open(os.path.join(model_dir, "model-shapes.json"), "w") as f:
        json.dump(signature, f)


def parse_args(argv=None):
    """Parse the training options.

    Defaults for the directory and host options are read from the
    ``SM_MODEL_DIR``, ``SM_CHANNEL_TRAIN``, ``SM_CHANNEL_TEST``,
    ``SM_CURRENT_HOST`` and ``SM_HOSTS`` environment variables, so those
    variables must be set when this function is called.

    Args:
        argv: optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--batch-size", type=int, default=100)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--learning-rate", type=float, default=0.1)

    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--train", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test", type=str, default=os.environ["SM_CHANNEL_TEST"])

    parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"])
    # type=list would split a command-line value into single characters;
    # parse it as JSON, the same format SM_HOSTS uses for the default.
    parser.add_argument("--hosts", type=json.loads, default=json.loads(os.environ["SM_HOSTS"]))

    return parser.parse_args(argv)


### NOTE: this function cannot use MXNet
def neo_preprocess(payload, content_type):
    """Decode a request payload holding a serialized NumPy array.

    Raises:
        RuntimeError: if *content_type* is not
            ``application/vnd+python.numpy+binary``.
    """
    # Imports are kept local to the function, as in the original.
    import logging
    import numpy as np
    import io

    logging.info("Invoking user-defined pre-processing function")

    if content_type != "application/vnd+python.numpy+binary":
        raise RuntimeError("Content type must be application/vnd+python.numpy+binary")

    f = io.BytesIO(payload)
    return np.load(f)


### NOTE: this function cannot use MXNet
def neo_postprocess(result):
    """Apply a softmax to *result* and return it as a JSON response.

    Returns:
        ``(response_body, content_type)`` where ``response_body`` is the JSON
        encoded list of probabilities and ``content_type`` is
        ``"application/json"``.
    """
    # Imports are kept local to the function, as in the original.
    import logging
    import numpy as np
    import json

    logging.info("Invoking user-defined post-processing function")

    # Softmax (assumes batch size 1)
    result = np.squeeze(result)
    # Subtracting the max keeps exp() from overflowing without changing the result.
    result_exp = np.exp(result - np.max(result))
    result = result_exp / np.sum(result_exp)

    response_body = json.dumps(result.tolist())
    content_type = "application/json"

    return response_body, content_type


if __name__ == "__main__":
    args = parse_args()
    num_gpus = int(os.environ["SM_NUM_GPUS"])

    train(
        args.batch_size,
        args.epochs,
        args.learning_rate,
        num_gpus,
        args.train,
        args.test,
        args.hosts,
        args.current_host,
        args.model_dir,
    )