diff --git a/sagemaker_processing/basic_sagemaker_data_processing/basic_sagemaker_processing_outputs.ipynb b/sagemaker_processing/basic_sagemaker_data_processing/basic_sagemaker_processing_outputs.ipynb deleted file mode 100644 index 3859d094e4..0000000000 --- a/sagemaker_processing/basic_sagemaker_data_processing/basic_sagemaker_processing_outputs.ipynb +++ /dev/null @@ -1,711 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a8f18b23", - "metadata": { - "papermill": { - "duration": 0.006395, - "end_time": "2022-04-18T00:08:55.010149", - "exception": false, - "start_time": "2022-04-18T00:08:55.003754", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "# Get started with SageMaker Processing\n", - "\n", - "This notebook corresponds to the section \"Preprocessing Data With The Built-In Scikit-Learn Container\" in the blog post [Amazon SageMaker Processing – Fully Managed Data Processing and Model Evaluation](https://aws.amazon.com/blogs/aws/amazon-sagemaker-processing-fully-managed-data-processing-and-model-evaluation/). \n", - "It shows a lightweight example of using SageMaker Processing to split a dataset into train, test, and validation sets, which are then written back to S3.\n", - "\n", - "## Runtime\n", - "\n", - "This notebook takes approximately 5 minutes to run.\n", - "\n", - "## Contents\n", - "\n", - "1. [Prepare resources](#Prepare-resources)\n", - "1. [Download data](#Download-data)\n", - "1. [Prepare Processing script](#Prepare-Processing-script)\n", - "1. [Run Processing job](#Run-Processing-job)\n", - "1. 
[Conclusion](#Conclusion)" - ] - }, - { - "cell_type": "markdown", - "id": "3cf7028a", - "metadata": { - "papermill": { - "duration": 0.006333, - "end_time": "2022-04-18T00:08:55.022942", - "exception": false, - "start_time": "2022-04-18T00:08:55.016609", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Prepare resources\n", - "\n", - "First, let’s create an SKLearnProcessor object, passing the scikit-learn version we want to use, as well as our managed infrastructure requirements." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "862f8d1f", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-18T00:08:55.039310Z", - "iopub.status.busy": "2022-04-18T00:08:55.038857Z", - "iopub.status.idle": "2022-04-18T00:08:56.057474Z", - "shell.execute_reply": "2022-04-18T00:08:56.057892Z" - }, - "papermill": { - "duration": 1.028712, - "end_time": "2022-04-18T00:08:56.058050", - "exception": false, - "start_time": "2022-04-18T00:08:55.029338", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "import boto3\n", - "import sagemaker\n", - "from sagemaker import get_execution_role\n", - "from sagemaker.sklearn.processing import SKLearnProcessor\n", - "\n", - "# Region of the current SageMaker session, and the execution role handed to the processor.\n", - "region = sagemaker.Session().boto_region_name\n", - "role = get_execution_role()\n", - "\n", - "# scikit-learn 1.0-1 processor running on a single ml.m5.xlarge instance.\n", - "sklearn_processor = SKLearnProcessor(\n", - "    framework_version=\"1.0-1\",\n", - "    role=role,\n", - "    instance_type=\"ml.m5.xlarge\",\n", - "    instance_count=1,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "b35ea4ea", - "metadata": { - "papermill": { - "duration": 0.006588, - "end_time": "2022-04-18T00:08:56.071404", - "exception": false, - "start_time": "2022-04-18T00:08:56.064816", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Download data\n", - "\n", - "Read in the raw data from a public S3 bucket. 
This example uses the [Census-Income (KDD) Dataset](https://archive.ics.uci.edu/ml/datasets/Census-Income+%28KDD%29) from the UCI Machine Learning Repository.\n", - "\n", - "> Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "6eaf6050", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-18T00:08:56.096003Z", - "iopub.status.busy": "2022-04-18T00:08:56.095500Z", - "iopub.status.idle": "2022-04-18T00:09:00.816015Z", - "shell.execute_reply": "2022-04-18T00:09:00.815586Z" - }, - "papermill": { - "duration": 4.738175, - "end_time": "2022-04-18T00:09:00.816126", - "exception": false, - "start_time": "2022-04-18T00:08:56.077951", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ageclass of workerdetailed industry recodedetailed occupation recodeeducationwage per hourenroll in edu inst last wkmarital statmajor industry codemajor occupation code...country of birth fathercountry of birth mothercountry of birth selfcitizenshipown business or self employedfill inc questionnaire for veteran's adminveterans benefitsweeks worked in yearyearincome
073Not in universe00High school graduate0Not in universeWidowedNot in universe or childrenNot in universe...United-StatesUnited-StatesUnited-StatesNative- Born in the United States0Not in universe2095- 50000.
158Self-employed-not incorporated434Some college but no degree0Not in universeDivorcedConstructionPrecision production craft & repair...United-StatesUnited-StatesUnited-StatesNative- Born in the United States0Not in universe25294- 50000.
218Not in universe0010th grade0High schoolNever marriedNot in universe or childrenNot in universe...VietnamVietnamVietnamForeign born- Not a citizen of U S0Not in universe2095- 50000.
39Not in universe00Children0Not in universeNever marriedNot in universe or childrenNot in universe...United-StatesUnited-StatesUnited-StatesNative- Born in the United States0Not in universe0094- 50000.
410Not in universe00Children0Not in universeNever marriedNot in universe or childrenNot in universe...United-StatesUnited-StatesUnited-StatesNative- Born in the United States0Not in universe0094- 50000.
\n", - "

5 rows × 42 columns

\n", - "
" - ], - "text/plain": [ - " age class of worker detailed industry recode \\\n", - "0 73 Not in universe 0 \n", - "1 58 Self-employed-not incorporated 4 \n", - "2 18 Not in universe 0 \n", - "3 9 Not in universe 0 \n", - "4 10 Not in universe 0 \n", - "\n", - " detailed occupation recode education wage per hour \\\n", - "0 0 High school graduate 0 \n", - "1 34 Some college but no degree 0 \n", - "2 0 10th grade 0 \n", - "3 0 Children 0 \n", - "4 0 Children 0 \n", - "\n", - " enroll in edu inst last wk marital stat major industry code \\\n", - "0 Not in universe Widowed Not in universe or children \n", - "1 Not in universe Divorced Construction \n", - "2 High school Never married Not in universe or children \n", - "3 Not in universe Never married Not in universe or children \n", - "4 Not in universe Never married Not in universe or children \n", - "\n", - " major occupation code ... country of birth father \\\n", - "0 Not in universe ... United-States \n", - "1 Precision production craft & repair ... United-States \n", - "2 Not in universe ... Vietnam \n", - "3 Not in universe ... United-States \n", - "4 Not in universe ... United-States \n", - "\n", - " country of birth mother country of birth self \\\n", - "0 United-States United-States \n", - "1 United-States United-States \n", - "2 Vietnam Vietnam \n", - "3 United-States United-States \n", - "4 United-States United-States \n", - "\n", - " citizenship own business or self employed \\\n", - "0 Native- Born in the United States 0 \n", - "1 Native- Born in the United States 0 \n", - "2 Foreign born- Not a citizen of U S 0 \n", - "3 Native- Born in the United States 0 \n", - "4 Native- Born in the United States 0 \n", - "\n", - " fill inc questionnaire for veteran's admin veterans benefits \\\n", - "0 Not in universe 2 \n", - "1 Not in universe 2 \n", - "2 Not in universe 2 \n", - "3 Not in universe 0 \n", - "4 Not in universe 0 \n", - "\n", - " weeks worked in year year income \n", - "0 0 95 - 50000. 
\n", - "1 52 94 - 50000. \n", - "2 0 95 - 50000. \n", - "3 0 94 - 50000. \n", - "4 0 94 - 50000. \n", - "\n", - "[5 rows x 42 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "s3 = boto3.client(\"s3\")\n", - "s3.download_file(\n", - "    \"sagemaker-sample-data-{}\".format(region),\n", - "    \"processing/census/census-income.csv\",\n", - "    \"census-income.csv\",\n", - ")\n", - "df = pd.read_csv(\"census-income.csv\")\n", - "# index=False keeps the pandas row index out of the file, so the Processing job reads the\n", - "# same 42 columns shown below instead of 43 (an extra unnamed index column).\n", - "df.to_csv(\"dataset.csv\", index=False)\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "240ac1a3", - "metadata": { - "papermill": { - "duration": 0.007158, - "end_time": "2022-04-18T00:09:00.830803", - "exception": false, - "start_time": "2022-04-18T00:09:00.823645", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Prepare Processing script\n", - "\n", - "Write the Python script that will be run by SageMaker Processing. This script reads the single data file from S3; splits the rows into train, test, and validation sets; and then writes the three output files to S3." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "64b8e90c", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-18T00:09:00.849237Z", - "iopub.status.busy": "2022-04-18T00:09:00.848512Z", - "iopub.status.idle": "2022-04-18T00:09:00.851175Z", - "shell.execute_reply": "2022-04-18T00:09:00.851554Z" - }, - "papermill": { - "duration": 0.013691, - "end_time": "2022-04-18T00:09:00.851669", - "exception": false, - "start_time": "2022-04-18T00:09:00.837978", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Writing preprocessing.py\n" - ] - } - ], - "source": [ - "%%writefile preprocessing.py\n", - "import os\n", - "\n", - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "\n", - "RANDOM_STATE = 42  # fixed seed so the splits are reproducible across runs\n", - "OUTPUT_DIR = \"/opt/ml/processing/output\"\n", - "\n", - "input_data_path = os.path.join(\"/opt/ml/processing/input\", \"dataset.csv\")\n", - "df = pd.read_csv(input_data_path)\n", - "print(\"Shape of data is:\", df.shape)\n", - "\n", - "# 80/20 train/test split, then 80/20 train/validation split of the remaining rows.\n", - "train, test = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)\n", - "train, validation = train_test_split(train, test_size=0.2, random_state=RANDOM_STATE)\n", - "\n", - "# exist_ok=True: the output directories can already exist when the script starts\n", - "# (the previous run logged \"[Errno 17] File exists\"), and creating them one at a time\n", - "# means an existing first directory no longer skips the other two.\n", - "# Errors are deliberately not caught here: a failed write must fail the job instead of\n", - "# letting it report success with missing outputs.\n", - "for name, split in ((\"train\", train), (\"validation\", validation), (\"test\", test)):\n", - "    split_dir = os.path.join(OUTPUT_DIR, name)\n", - "    os.makedirs(split_dir, exist_ok=True)\n", - "    split.to_csv(os.path.join(split_dir, name + \".csv\"))\n", - "print(\"Wrote files successfully\")\n", - "\n", - "print(\"Completed running the processing job\")" - ] - }, - { - "cell_type": "markdown", - "id": "1bab3ff2", - "metadata": { - "papermill": { - "duration": 0.007373, - "end_time": "2022-04-18T00:09:00.866414", - "exception": false, - "start_time": "2022-04-18T00:09:00.859041", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Run Processing job" - ] - }, - { - "cell_type": "markdown", - "id": "68190117", - "metadata": { - "papermill": { - "duration": 0.007318, - "end_time": "2022-04-18T00:09:00.881109", - "exception": false, - "start_time": "2022-04-18T00:09:00.873791", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "Run the Processing job, specifying the script name, input file, and output files." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "450368db", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-18T00:09:00.901375Z", - "iopub.status.busy": "2022-04-18T00:09:00.900644Z", - "iopub.status.idle": "2022-04-18T00:13:44.601792Z", - "shell.execute_reply": "2022-04-18T00:13:44.602212Z" - }, - "papermill": { - "duration": 283.713812, - "end_time": "2022-04-18T00:13:44.602351", - "exception": false, - "start_time": "2022-04-18T00:09:00.888539", - "status": "completed" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "%%capture output\n", - "\n", - "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", - "\n", - "sklearn_processor.run(\n", - " code=\"preprocessing.py\",\n", - " # arguments = [\"arg1\", \"arg2\"], # Arguments can optionally be specified here\n", - " inputs=[ProcessingInput(source=\"dataset.csv\", destination=\"/opt/ml/processing/input\")],\n", - " outputs=[\n", - " ProcessingOutput(source=\"/opt/ml/processing/output/train\"),\n", - " ProcessingOutput(source=\"/opt/ml/processing/output/validation\"),\n", - " ProcessingOutput(source=\"/opt/ml/processing/output/test\"),\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "135c2776", - "metadata": { - 
"papermill": { - "duration": 0.007543, - "end_time": "2022-04-18T00:13:44.617780", - "exception": false, - "start_time": "2022-04-18T00:13:44.610237", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "Get the Processing job logs and retrieve the job name." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "8f3e9edf", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-18T00:13:44.636660Z", - "iopub.status.busy": "2022-04-18T00:13:44.636183Z", - "iopub.status.idle": "2022-04-18T00:13:44.638287Z", - "shell.execute_reply": "2022-04-18T00:13:44.638643Z" - }, - "papermill": { - "duration": 0.013467, - "end_time": "2022-04-18T00:13:44.638753", - "exception": false, - "start_time": "2022-04-18T00:13:44.625286", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Job Name: sagemaker-scikit-learn-2022-04-18-00-09-00-899\n", - "Inputs: [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-000000000000/sagemaker-scikit-learn-2022-04-18-00-09-00-899/input/input-1/dataset.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-000000000000/sagemaker-scikit-learn-2022-04-18-00-09-00-899/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", - "Outputs: [{'OutputName': 'output-1', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-000000000000/sagemaker-scikit-learn-2022-04-18-00-09-00-899/output/output-1', 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'output-2', 'AppManaged': False, 'S3Output': 
{'S3Uri': 's3://sagemaker-us-west-2-000000000000/sagemaker-scikit-learn-2022-04-18-00-09-00-899/output/output-2', 'LocalPath': '/opt/ml/processing/output/validation', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'output-3', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-000000000000/sagemaker-scikit-learn-2022-04-18-00-09-00-899/output/output-3', 'LocalPath': '/opt/ml/processing/output/test', 'S3UploadMode': 'EndOfJob'}}]\n", - "...........................\n", - "\u001b[34mShape of data is: (199523, 43)\u001b[0m\n", - "\u001b[34m[Errno 17] File exists: '/opt/ml/processing/output/train'\u001b[0m\n", - "\u001b[34mCould not make directories\u001b[0m\n", - "\u001b[34mWrote files successfully\u001b[0m\n", - "\u001b[34mCompleted running the processing job\u001b[0m\n", - "\n" - ] - } - ], - "source": [ - "print(output)\n", - "# Ask the processor for the job it just launched instead of scraping the job name\n", - "# out of the captured log text, whose layout is not a stable interface.\n", - "job_name = sklearn_processor.latest_job.job_name" - ] - }, - { - "cell_type": "markdown", - "id": "386f656a", - "metadata": { - "papermill": { - "duration": 0.007802, - "end_time": "2022-04-18T00:13:44.654395", - "exception": false, - "start_time": "2022-04-18T00:13:44.646593", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "Confirm that the output dataset files were written to S3." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "9b885d73", - "metadata": { - "execution": { - "iopub.execute_input": "2022-04-18T00:13:44.677694Z", - "iopub.status.busy": "2022-04-18T00:13:44.677253Z", - "iopub.status.idle": "2022-04-18T00:13:45.043939Z", - "shell.execute_reply": "2022-04-18T00:13:45.044361Z" - }, - "papermill": { - "duration": 0.382286, - "end_time": "2022-04-18T00:13:45.044503", - "exception": false, - "start_time": "2022-04-18T00:13:44.662217", - "status": "completed" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "s3://sagemaker-us-west-2-000000000000/sagemaker-scikit-learn-2022-04-18-00-09-00-899/output/output-1/train.csv\n", - "s3://sagemaker-us-west-2-000000000000/sagemaker-scikit-learn-2022-04-18-00-09-00-899/output/output-2/validation.csv\n", - "s3://sagemaker-us-west-2-000000000000/sagemaker-scikit-learn-2022-04-18-00-09-00-899/output/output-3/test.csv\n" - ] - } - ], - "source": [ - "import boto3\n", - "\n", - "s3_client = boto3.client(\"s3\")\n", - "default_bucket = sagemaker.Session().default_bucket()\n", - "\n", - "# The outputs were not given explicit names, so they are stored under output-1..output-3\n", - "# (see the OutputName values in the job log above).\n", - "for i in range(1, 4):\n", - "    output_prefix = job_name + \"/output/output-\" + str(i) + \"/\"\n", - "    response = s3_client.list_objects_v2(Bucket=default_bucket, Prefix=output_prefix)\n", - "    # \"Contents\" is absent when nothing matches the prefix; fail with a clear message\n", - "    # rather than a bare KeyError.\n", - "    if not response.get(\"Contents\"):\n", - "        raise RuntimeError(\"No objects found under s3://\" + default_bucket + \"/\" + output_prefix)\n", - "    first_key = response[\"Contents\"][0][\"Key\"]\n", - "    print(\"s3://\" + default_bucket + \"/\" + first_key)" - ] - }, - { - "cell_type": "markdown", - "id": "bd191e62", - "metadata": { - "papermill": { - "duration": 0.008184, - "end_time": "2022-04-18T00:13:45.060991", - "exception": false, - "start_time": "2022-04-18T00:13:45.052807", - "status": "completed" - }, - "tags": [] - }, - "source": [ - "## Conclusion\n", - "\n", - "In this notebook, we read a dataset from S3 and processed it into train, test, and validation sets using a SageMaker Processing job. You can extend this example for preprocessing your own datasets in preparation for machine learning or other applications." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - }, - "papermill": { - "default_parameters": {}, - "duration": 291.085709, - "end_time": "2022-04-18T00:13:45.485219", - "environment_variables": {}, - "exception": null, - "input_path": "basic_sagemaker_processing.ipynb", - "output_path": "/opt/ml/processing/output/basic_sagemaker_processing-2022-04-18-00-04-13.ipynb", - "parameters": { - "kms_key": "arn:aws:kms:us-west-2:000000000000:1234abcd-12ab-34cd-56ef-1234567890ab" - }, - "start_time": "2022-04-18T00:08:54.399510", - "version": "2.3.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file