From 1e7751ceba00f13635b3264252f5352f9f6e53d2 Mon Sep 17 00:00:00 2001
From: atqy
Date: Fri, 29 Jul 2022 04:34:37 -0700
Subject: [PATCH] delete notebook

---
 .../basic_sagemaker_processing.ipynb | 242 ------------------
 1 file changed, 242 deletions(-)
 delete mode 100644 sagemaker_processing/basic_sagemaker_data_processing/basic_sagemaker_processing.ipynb

diff --git a/sagemaker_processing/basic_sagemaker_data_processing/basic_sagemaker_processing.ipynb b/sagemaker_processing/basic_sagemaker_data_processing/basic_sagemaker_processing.ipynb
deleted file mode 100644
index a35ad37da7..0000000000
--- a/sagemaker_processing/basic_sagemaker_data_processing/basic_sagemaker_processing.ipynb
+++ /dev/null
@@ -1,242 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Get started with SageMaker Processing\n",
-    "\n",
-    "This notebook corresponds to the section \"Preprocessing Data With The Built-In Scikit-Learn Container\" in the blog post [Amazon SageMaker Processing – Fully Managed Data Processing and Model Evaluation](https://aws.amazon.com/blogs/aws/amazon-sagemaker-processing-fully-managed-data-processing-and-model-evaluation/).\n",
-    "It shows a lightweight example of using SageMaker Processing to create train, test, and validation datasets, which are then written back to S3.\n",
-    "\n",
-    "## Runtime\n",
-    "\n",
-    "This notebook takes approximately 5 minutes to run.\n",
-    "\n",
-    "## Contents\n",
-    "\n",
-    "1. [Prepare resources](#Prepare-resources)\n",
-    "1. [Download data](#Download-data)\n",
-    "1. [Prepare Processing script](#Prepare-Processing-script)\n",
-    "1. [Run Processing job](#Run-Processing-job)\n",
-    "1. [Conclusion](#Conclusion)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Prepare resources\n",
-    "\n",
-    "First, let’s create an SKLearnProcessor object, passing the scikit-learn version we want to use, as well as our managed infrastructure requirements."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import boto3\n",
-    "import sagemaker\n",
-    "from sagemaker import get_execution_role\n",
-    "from sagemaker.sklearn.processing import SKLearnProcessor\n",
-    "\n",
-    "region = sagemaker.Session().boto_region_name\n",
-    "role = get_execution_role()\n",
-    "sklearn_processor = SKLearnProcessor(\n",
-    "    framework_version=\"1.0-1\", role=role, instance_type=\"ml.m5.xlarge\", instance_count=1\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Download data\n",
-    "\n",
-    "Read in the raw data from a public S3 bucket. This example uses the [Census-Income (KDD) Dataset](https://archive.ics.uci.edu/ml/datasets/Census-Income+%28KDD%29) from the UCI Machine Learning Repository.\n",
-    "\n",
-    "> Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "\n",
-    "s3 = boto3.client(\"s3\")\n",
-    "s3.download_file(\n",
-    "    \"sagemaker-sample-data-{}\".format(region),\n",
-    "    \"processing/census/census-income.csv\",\n",
-    "    \"census-income.csv\",\n",
-    ")\n",
-    "df = pd.read_csv(\"census-income.csv\")\n",
-    "df.to_csv(\"dataset.csv\")\n",
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Prepare Processing script\n",
-    "\n",
-    "Write the Python script that will be run by SageMaker Processing. This script reads the dataset from the container's local input path (where SageMaker Processing stages the file from S3), splits the rows into train, validation, and test sets, and writes the three output files to local output paths that SageMaker Processing uploads back to S3."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%writefile preprocessing.py\n",
-    "import pandas as pd\n",
-    "import os\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "\n",
-    "input_data_path = os.path.join(\"/opt/ml/processing/input\", \"dataset.csv\")\n",
-    "df = pd.read_csv(input_data_path)\n",
-    "print(\"Shape of data is:\", df.shape)\n",
-    "\n",
-    "# Hold out 20% of the rows for the test set, then 20% of the remaining rows for validation\n",
-    "train, test = train_test_split(df, test_size=0.2)\n",
-    "train, validation = train_test_split(train, test_size=0.2)\n",
-    "\n",
-    "try:\n",
-    "    os.makedirs(\"/opt/ml/processing/output/train\")\n",
-    "    os.makedirs(\"/opt/ml/processing/output/validation\")\n",
-    "    os.makedirs(\"/opt/ml/processing/output/test\")\n",
-    "    print(\"Successfully created directories\")\n",
-    "except Exception as e:\n",
-    "    # if the Processing call already creates these directories (or directory otherwise cannot be created)\n",
-    "    print(e)\n",
-    "    print(\"Could not make directories\")\n",
-    "    pass\n",
-    "\n",
-    "try:\n",
-    "    train.to_csv(\"/opt/ml/processing/output/train/train.csv\")\n",
-    "    validation.to_csv(\"/opt/ml/processing/output/validation/validation.csv\")\n",
-    "    test.to_csv(\"/opt/ml/processing/output/test/test.csv\")\n",
-    "    print(\"Wrote files successfully\")\n",
-    "except Exception as e:\n",
-    "    print(\"Failed to write the files\")\n",
-    "    print(e)\n",
-    "    pass\n",
-    "\n",
-    "print(\"Completed running the processing job\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Run Processing job"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Run the Processing job, specifying the script name, input file, and output files."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%capture output\n",
-    "\n",
-    "from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
-    "\n",
-    "sklearn_processor.run(\n",
-    "    code=\"preprocessing.py\",\n",
-    "    # arguments = [\"arg1\", \"arg2\"], # Arguments can optionally be specified here\n",
-    "    inputs=[ProcessingInput(source=\"dataset.csv\", destination=\"/opt/ml/processing/input\")],\n",
-    "    outputs=[\n",
-    "        ProcessingOutput(source=\"/opt/ml/processing/output/train\"),\n",
-    "        ProcessingOutput(source=\"/opt/ml/processing/output/validation\"),\n",
-    "        ProcessingOutput(source=\"/opt/ml/processing/output/test\"),\n",
-    "    ],\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Print the captured Processing job logs and parse the job name from them."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(output)\n",
-    "job_name = str(output).split(\"\\n\")[1].split(\" \")[-1]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Confirm that the output dataset files were written to S3."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import boto3\n",
-    "\n",
-    "s3_client = boto3.client(\"s3\")\n",
-    "default_bucket = sagemaker.Session().default_bucket()\n",
-    "for i in range(1, 4):\n",
-    "    prefix = s3_client.list_objects(\n",
-    "        Bucket=default_bucket, Prefix=job_name + \"/output/output-\" + str(i) + \"/\"\n",
-    "    )[\"Contents\"][0][\"Key\"]\n",
-    "    print(\"s3://\" + default_bucket + \"/\" + prefix)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Conclusion\n",
-    "\n",
-    "In this notebook, we read a dataset from S3 and processed it into train, test, and validation sets using a SageMaker Processing job. You can extend this example for preprocessing your own datasets in preparation for machine learning or other applications."
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
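
For reference: the notebook removed by this patch recovers the Processing job name by string-parsing the %%capture output, which is brittle. A minimal sketch of an alternative that asks the SageMaker API directly is below. It is illustrative only and not part of the deleted file; it assumes the job was just started in the same account and region, and that the caller's credentials allow ListProcessingJobs and DescribeProcessingJob.

import boto3

sm_client = boto3.client("sagemaker")

# Assume the most recently created Processing job is the one started by the notebook.
latest = sm_client.list_processing_jobs(
    SortBy="CreationTime", SortOrder="Descending", MaxResults=1
)["ProcessingJobSummaries"][0]
job_name = latest["ProcessingJobName"]
print("Processing job:", job_name)

# Print the S3 location configured for each of the job's outputs.
description = sm_client.describe_processing_job(ProcessingJobName=job_name)
for out in description["ProcessingOutputConfig"]["Outputs"]:
    print(out["OutputName"], "->", out["S3Output"]["S3Uri"])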