From 1e7751ceba00f13635b3264252f5352f9f6e53d2 Mon Sep 17 00:00:00 2001
From: atqy
Date: Fri, 29 Jul 2022 04:34:37 -0700
Subject: [PATCH] delete notebook

---
 .../basic_sagemaker_processing.ipynb | 242 ------------------
 1 file changed, 242 deletions(-)
 delete mode 100644 sagemaker_processing/basic_sagemaker_data_processing/basic_sagemaker_processing.ipynb

diff --git a/sagemaker_processing/basic_sagemaker_data_processing/basic_sagemaker_processing.ipynb b/sagemaker_processing/basic_sagemaker_data_processing/basic_sagemaker_processing.ipynb
deleted file mode 100644
index a35ad37da7..0000000000
--- a/sagemaker_processing/basic_sagemaker_data_processing/basic_sagemaker_processing.ipynb
+++ /dev/null
@@ -1,242 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Get started with SageMaker Processing\n",
-    "\n",
-    "This notebook corresponds to the section \"Preprocessing Data With The Built-In Scikit-Learn Container\" in the blog post [Amazon SageMaker Processing – Fully Managed Data Processing and Model Evaluation](https://aws.amazon.com/blogs/aws/amazon-sagemaker-processing-fully-managed-data-processing-and-model-evaluation/).\n",
-    "It shows a lightweight example of using SageMaker Processing to create train, test, and validation datasets, which are then written back to S3.\n",
-    "\n",
-    "## Runtime\n",
-    "\n",
-    "This notebook takes approximately 5 minutes to run.\n",
-    "\n",
-    "## Contents\n",
-    "\n",
-    "1. [Prepare resources](#Prepare-resources)\n",
-    "1. [Download data](#Download-data)\n",
-    "1. [Prepare Processing script](#Prepare-Processing-script)\n",
-    "1. [Run Processing job](#Run-Processing-job)\n",
-    "1. [Conclusion](#Conclusion)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Prepare resources\n",
-    "\n",
-    "First, let’s create an SKLearnProcessor object, passing the scikit-learn version we want to use, as well as our managed infrastructure requirements."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import boto3\n",
-    "import sagemaker\n",
-    "from sagemaker import get_execution_role\n",
-    "from sagemaker.sklearn.processing import SKLearnProcessor\n",
-    "\n",
-    "region = sagemaker.Session().boto_region_name\n",
-    "role = get_execution_role()\n",
-    "sklearn_processor = SKLearnProcessor(\n",
-    "    framework_version=\"1.0-1\", role=role, instance_type=\"ml.m5.xlarge\", instance_count=1\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Download data\n",
-    "\n",
-    "Read in the raw data from a public S3 bucket. This example uses the [Census-Income (KDD) Dataset](https://archive.ics.uci.edu/ml/datasets/Census-Income+%28KDD%29) from the UCI Machine Learning Repository.\n",
-    "\n",
-    "> Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "\n",
-    "s3 = boto3.client(\"s3\")\n",
-    "s3.download_file(\n",
-    "    \"sagemaker-sample-data-{}\".format(region),\n",
-    "    \"processing/census/census-income.csv\",\n",
-    "    \"census-income.csv\",\n",
-    ")\n",
-    "df = pd.read_csv(\"census-income.csv\")\n",
-    "df.to_csv(\"dataset.csv\")\n",
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Prepare Processing script\n",
-    "\n",
-    "Write the Python script that will be run by SageMaker Processing. This script reads the dataset from the container's local input path (where SageMaker Processing stages the file from S3), splits the rows into train, validation, and test sets, and writes the three output files to local output paths that SageMaker Processing uploads back to S3."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%writefile preprocessing.py\n",
-    "import pandas as pd\n",
-    "import os\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "\n",
-    "input_data_path = os.path.join(\"/opt/ml/processing/input\", \"dataset.csv\")\n",
-    "df = pd.read_csv(input_data_path)\n",
-    "print(\"Shape of data is:\", df.shape)\n",
-    "\n",
-    "# Hold out 20% of the rows for the test set, then 20% of the remaining rows for validation\n",
-    "train, test = train_test_split(df, test_size=0.2)\n",
-    "train, validation = train_test_split(train, test_size=0.2)\n",
-    "\n",
-    "try:\n",
-    "    os.makedirs(\"/opt/ml/processing/output/train\")\n",
-    "    os.makedirs(\"/opt/ml/processing/output/validation\")\n",
-    "    os.makedirs(\"/opt/ml/processing/output/test\")\n",
-    "    print(\"Successfully created directories\")\n",
-    "except Exception as e:\n",
-    "    # if the Processing call already creates these directories (or directory otherwise cannot be created)\n",
-    "    print(e)\n",
-    "    print(\"Could not make directories\")\n",
-    "    pass\n",
-    "\n",
-    "try:\n",
-    "    train.to_csv(\"/opt/ml/processing/output/train/train.csv\")\n",
-    "    validation.to_csv(\"/opt/ml/processing/output/validation/validation.csv\")\n",
-    "    test.to_csv(\"/opt/ml/processing/output/test/test.csv\")\n",
-    "    print(\"Wrote files successfully\")\n",
-    "except Exception as e:\n",
-    "    print(\"Failed to write the files\")\n",
-    "    print(e)\n",
-    "    pass\n",
-    "\n",
-    "print(\"Completed running the processing job\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Run Processing job"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Run the Processing job, specifying the script name, input file, and output files."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%capture output\n",
-    "\n",
-    "from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
-    "\n",
-    "sklearn_processor.run(\n",
-    "    code=\"preprocessing.py\",\n",
-    "    # arguments = [\"arg1\", \"arg2\"], # Arguments can optionally be specified here\n",
-    "    inputs=[ProcessingInput(source=\"dataset.csv\", destination=\"/opt/ml/processing/input\")],\n",
-    "    outputs=[\n",
-    "        ProcessingOutput(source=\"/opt/ml/processing/output/train\"),\n",
-    "        ProcessingOutput(source=\"/opt/ml/processing/output/validation\"),\n",
-    "        ProcessingOutput(source=\"/opt/ml/processing/output/test\"),\n",
-    "    ],\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Print the captured Processing job logs and parse the job name from them."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(output)\n",
-    "job_name = str(output).split(\"\\n\")[1].split(\" \")[-1]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Confirm that the output dataset files were written to S3."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import boto3\n",
-    "\n",
-    "s3_client = boto3.client(\"s3\")\n",
-    "default_bucket = sagemaker.Session().default_bucket()\n",
-    "for i in range(1, 4):\n",
-    "    prefix = s3_client.list_objects(\n",
-    "        Bucket=default_bucket, Prefix=job_name + \"/output/output-\" + str(i) + \"/\"\n",
-    "    )[\"Contents\"][0][\"Key\"]\n",
-    "    print(\"s3://\" + default_bucket + \"/\" + prefix)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Conclusion\n",
-    "\n",
-    "In this notebook, we read a dataset from S3 and processed it into train, test, and validation sets using a SageMaker Processing job. You can extend this example for preprocessing your own datasets in preparation for machine learning or other applications."
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
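
For reference: the notebook removed by this patch recovers the Processing job name by string-parsing the %%capture output, which is brittle. A minimal sketch of an alternative that asks the SageMaker API directly is below. It is illustrative only and not part of the deleted file; it assumes the job was just started in the same account and region, and that the caller's credentials allow ListProcessingJobs and DescribeProcessingJob.

import boto3

sm_client = boto3.client("sagemaker")

# Assume the most recently created Processing job is the one started by the notebook.
latest = sm_client.list_processing_jobs(
    SortBy="CreationTime", SortOrder="Descending", MaxResults=1
)["ProcessingJobSummaries"][0]
job_name = latest["ProcessingJobName"]
print("Processing job:", job_name)

# Print the S3 location configured for each of the job's outputs.
description = sm_client.describe_processing_job(ProcessingJobName=job_name)
for out in description["ProcessingOutputConfig"]["Outputs"]:
    print(out["OutputName"], "->", out["S3Output"]["S3Uri"])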