diff --git a/.ipynb_checkpoints/new baseline model 29Nov17-checkpoint.ipynb b/.ipynb_checkpoints/new baseline model 29Nov17-checkpoint.ipynb
new file mode 100644
index 0000000..c090793
--- /dev/null
+++ b/.ipynb_checkpoints/new baseline model 29Nov17-checkpoint.ipynb
@@ -0,0 +1,5969 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/vnd.plotly.v1+html": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline\n",
+ "import seaborn as sns\n",
+ "import plotly.offline as py\n",
+ "import plotly.graph_objs as go\n",
+ "py.init_notebook_mode()\n",
+ "import gc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seed = 46\n",
+ "np.random.seed(seed)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def load_data(data_path):\n",
+ "    \"\"\"Read train.csv, test.csv and items.csv from the directory `data_path`.\n",
+ "\n",
+ "    Every file is read with `low_memory=False`, so pandas infers each column's\n",
+ "    dtype from the whole file instead of chunk by chunk. The chunked parser is\n",
+ "    what produced the 'Columns (5) have mixed types' DtypeWarning on load.\n",
+ "    NOTE(review): this trades the warning for a higher peak memory use while\n",
+ "    parsing; pass an explicit dtype instead if memory is tight.\n",
+ "\n",
+ "    Returns:\n",
+ "        (train, test, items) DataFrames; `date` is parsed as datetime in train\n",
+ "        and test.\n",
+ "    \"\"\"\n",
+ "    def read(file_name, **kwargs):\n",
+ "        # Same '%s/<file>' path construction for all three files.\n",
+ "        return pd.read_csv('%s/%s' % (data_path, file_name), low_memory=False, **kwargs)\n",
+ "\n",
+ "    train = read('train.csv', parse_dates=['date'])\n",
+ "    test = read('test.csv', parse_dates=['date'])\n",
+ "    items = read('items.csv')\n",
+ "    return train, test, items"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2821: DtypeWarning:\n",
+ "\n",
+ "Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "data_path = './data'\n",
+ "train, test, items = load_data(data_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def print_summary_train_test(train, test):\n",
+ "    \"\"\"Print the date range, day span and row count of two frames.\n",
+ "\n",
+ "    Both frames must have a datetime `date` column. The day span is inclusive\n",
+ "    of both end dates (max - min + 1). The labels are fixed text: the second\n",
+ "    frame is called 'Test' on some lines and 'validation' on another.\n",
+ "    \"\"\"\n",
+ "    train_first, train_last = train['date'].min(), train['date'].max()\n",
+ "    test_first, test_last = test['date'].min(), test['date'].max()\n",
+ "    print('Train min/max date: %s / %s' % (train_first, train_last))\n",
+ "    print('Test min/max date: %s / %s' % (test_first, test_last))\n",
+ "    print('')\n",
+ "    train_span = train_last - train_first\n",
+ "    print('Number of days in train: %d' % (train_span.days + 1))\n",
+ "    test_span = test_last - test_first\n",
+ "    print('Number of days in validation: %d' % (test_span.days + 1))\n",
+ "    print('')\n",
+ "    n_train_rows, n_test_rows = train.shape[0], test.shape[0]\n",
+ "    print('Train shape: %d rows' % n_train_rows)\n",
+ "    print('Test shape: %d rows' % n_test_rows)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train min/max date: 2013-01-01 00:00:00 / 2017-08-15 00:00:00\n",
+ "Test min/max date: 2017-08-16 00:00:00 / 2017-08-31 00:00:00\n",
+ "\n",
+ "Number of days in train: 1688\n",
+ "Number of days in validation: 16\n",
+ "\n",
+ "Train shape: 125497040 rows\n",
+ "Test shape: 3370464 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "print_summary_train_test(train, test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Divide Train data into Validation (a 16-day window near the end of the train data) and Training (everything before it)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import splitter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_last_date = train['date'].max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2017-07-26 00:00:00 2017-08-10 00:00:00\n"
+ ]
+ }
+ ],
+ "source": [
+ "begin_of_validation, end_of_validation = splitter.get_validation_period(train_last_date)\n",
+ "print(begin_of_validation, end_of_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_train, train_validation = splitter.split_validation_train_by_validation_period(train, begin_of_validation, end_of_validation)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train min/max date: 2013-01-01 00:00:00 / 2017-07-25 00:00:00\n",
+ "Test min/max date: 2017-07-26 00:00:00 / 2017-08-10 00:00:00\n",
+ "\n",
+ "Number of days in train: 1667\n",
+ "Number of days in validation: 16\n",
+ "\n",
+ "Train shape: 123296175 rows\n",
+ "Test shape: 1679408 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "print_summary_train_test(train_train, train_validation)\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation Metric"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import evaluation\n",
+ "from sklearn.metrics import mean_squared_error"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How many items in Test data set are not seen in Train data set \n",
+ "## vs. how many items in Validation are not seen in Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_unseen_item_percentage(train, test):\n",
+ "    \"\"\"Print and return the fraction of distinct test items absent from train.\n",
+ "\n",
+ "    Items are compared by their `item_nbr` value. The returned number is a\n",
+ "    fraction in [0, 1]; the printed message shows it as a percentage.\n",
+ "    When `test` has no items the fraction is reported as 0.0 instead of\n",
+ "    raising ZeroDivisionError.\n",
+ "    \"\"\"\n",
+ "    test_items = test['item_nbr'].unique()\n",
+ "    if len(test_items) == 0:\n",
+ "        unseen_percentage = 0.0\n",
+ "    else:\n",
+ "        # Distinct test items that never occur in train.\n",
+ "        unseen_items = np.setdiff1d(test_items, train['item_nbr'].unique())\n",
+ "        unseen_percentage = len(unseen_items) / len(test_items)\n",
+ "    print('{:.2f}% of items in the test data set are not seen in the train data set'.format(unseen_percentage*100))\n",
+ "    return unseen_percentage"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1.54% of items in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.015380671622660855"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_percentage(train, test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.55% of items in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.005454545454545455"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_percentage(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How many (item, store) in Test data set are not seen in Train data set\n",
+ "## vs. how many (item, store) in Validation are not seen in Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_unseen_item_store_pair_percentage(train, test):\n",
+ "    \"\"\"Print and return the fraction of test rows whose (item, store) is new.\n",
+ "\n",
+ "    A test row counts as unseen when its (`item_nbr`, `store_nbr`) combination\n",
+ "    does not occur in any train row. The denominator is the number of test\n",
+ "    rows, not the number of distinct pairs.\n",
+ "\n",
+ "    Membership is decided with an indicator merge against the distinct train\n",
+ "    pairs, so it does not depend on the train `unit_sales` values (a pair whose\n",
+ "    train sales are all NaN is still seen). An empty `test` yields 0.0.\n",
+ "    \"\"\"\n",
+ "    cols_item_store = ['item_nbr', 'store_nbr']\n",
+ "    n_test_rows = test.shape[0]\n",
+ "    if n_test_rows == 0:\n",
+ "        unseen_percentage = 0.0\n",
+ "    else:\n",
+ "        train_pairs = train[cols_item_store].drop_duplicates()\n",
+ "        # train_pairs is unique per key, so the left merge keeps one row per test row.\n",
+ "        joined = pd.merge(test[cols_item_store], train_pairs, on=cols_item_store, how='left', indicator=True)\n",
+ "        n_unseen = int((joined['_merge'] == 'left_only').sum())\n",
+ "        unseen_percentage = n_unseen / n_test_rows\n",
+ "    print('{:.2f}% of (item,store) pairs in the test data set are not seen in the train data set'.format(unseen_percentage*100))\n",
+ "    return unseen_percentage"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "21.10% of (item,store) pairs in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.21096679863662687"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_store_pair_percentage(train, test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.13% of (item,store) pairs in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.0013326124443851642"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_store_pair_percentage(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using constant prediction"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.563926854265649"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_train.unit_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.554865268437672"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.unit_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 6)"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_validation_prediction = pd.DataFrame({'id': train_validation.loc[:, 'id'], 'prediction_sales': train_train.unit_sales.mean()})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 2)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_prediction.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_prediction = pd.DataFrame({'id': test.loc[:, 'id'], 'prediction_sales': train.unit_sales.mean()})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 123296175 | \n",
+ " 123296175 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ " 123296176 | \n",
+ " 123296176 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ " 123296177 | \n",
+ " 123296177 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ " 123296178 | \n",
+ " 123296178 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ " 123296179 | \n",
+ " 123296179 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id prediction_sales\n",
+ "123296175 123296175 8.563927\n",
+ "123296176 123296176 8.563927\n",
+ "123296177 123296177 8.563927\n",
+ "123296178 123296178 8.563927\n",
+ "123296179 123296179 8.563927"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_prediction.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497042 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497043 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497044 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id prediction_sales\n",
+ "0 125497040 8.554865\n",
+ "1 125497041 8.554865\n",
+ "2 125497042 8.554865\n",
+ "3 125497043 8.554865\n",
+ "4 125497044 8.554865"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_prediction.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_prediction_constant(train_train):\n",
+ "    \"\"\"Return the mean of the `unit_sales` column, used as a constant prediction.\"\"\"\n",
+ "    mean_unit_sales = train_train['unit_sales'].mean()\n",
+ "    return mean_unit_sales"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_clean_prediction(train_train, train_validation):\n",
+ "    \"\"\"Build a constant-prediction frame for every validation row.\n",
+ "\n",
+ "    Returns a DataFrame with the validation `id` values (validation index kept)\n",
+ "    and a `prediction_sales` column filled with the value returned by\n",
+ "    get_prediction_constant(train_train).\n",
+ "    \"\"\"\n",
+ "    constant = get_prediction_constant(train_train)\n",
+ "    validation_ids = train_validation.loc[:, 'id']\n",
+ "    return pd.DataFrame({'id': validation_ids, 'prediction_sales': constant})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_item_expanded_df(test, items):\n",
+ "    \"\"\"Left-join the columns of `items` onto `test` using `item_nbr` as the key.\n",
+ "\n",
+ "    Rows of `test` with no matching item are kept, with NaN in the item columns.\n",
+ "    \"\"\"\n",
+ "    expanded = pd.merge(left=test, right=items, how='left', on='item_nbr')\n",
+ "    return expanded"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_evaluation_using_constant_baseline(train_train, train_validation):\n",
+ "    \"\"\"Score the constant (train mean) prediction on the validation rows.\n",
+ "\n",
+ "    Reads the module-level `items` frame to look up the `perishable` flag of\n",
+ "    each validation row, and passes `perishable * 0.25 + 1` as the weights\n",
+ "    argument of nwrmsle. Returns whatever nwrmsle returns.\n",
+ "    \"\"\"\n",
+ "    constant_prediction = get_clean_prediction(train_train, train_validation)['prediction_sales']\n",
+ "    perishable = get_item_expanded_df(train_validation, items)['perishable']\n",
+ "    row_weights = perishable*0.25+1\n",
+ "    return nwrmsle(constant_prediction, train_validation['unit_sales'], row_weights)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_validation_expanded = get_item_expanded_df(train_validation, items)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_validation_weights = train_validation_expanded ['perishable']*0.25+1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 1.00\n",
+ "1 1.25\n",
+ "2 1.00\n",
+ "3 1.00\n",
+ "4 1.00\n",
+ "Name: perishable, dtype: float64"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_weights.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def nwrmsle(predictions, targets, weights):\n",
+ "    \"\"\"Normalized weighted root mean squared logarithmic error.\n",
+ "\n",
+ "    Computes sqrt(sum(w * (log(p + 1) - log(t + 1))**2) / sum(w)).\n",
+ "\n",
+ "    Args:\n",
+ "        predictions: 1-D Series or array of predicted values.\n",
+ "        targets: 1-D Series or array of true values, same length. Negative\n",
+ "            targets are treated as 0; this is done on a copy, so the caller's\n",
+ "            data is left untouched.\n",
+ "        weights: 1-D Series or array of final per-row weights, same length.\n",
+ "            They are used exactly as given. The callers in this notebook\n",
+ "            already pass `perishable * 0.25 + 1`, so no further transform is\n",
+ "            applied here.\n",
+ "\n",
+ "    The three inputs are combined by position, not by index label.\n",
+ "    \"\"\"\n",
+ "    predicted = np.asarray(predictions, dtype=float)\n",
+ "    # np.clip returns a new array, so `targets` itself is never modified.\n",
+ "    actual = np.clip(np.asarray(targets, dtype=float), 0, None)\n",
+ "    row_weights = np.asarray(weights, dtype=float)\n",
+ "    log_square_errors = (np.log1p(predicted) - np.log1p(actual)) ** 2\n",
+ "    return np.sqrt(np.sum(row_weights * log_square_errors) / np.sum(row_weights))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408,)"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_prediction['prediction_sales'].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 2)"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_prediction.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408,)"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation['unit_sales'].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 6)"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1679408,)\n",
+ "(1679408,) (1679408,) (1679408,)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py:5088: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_validation_metric = nwrmsle(train_validation_prediction['prediction_sales'], train_validation['unit_sales'], train_validation_weights)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.0371859208825527"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_metric"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Leaderboard score: 1.710 (validation score above: 1.037)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How to improve the similarity between validation performance and test performance?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Strategy 1: Remove Items from Training data set"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " onpromotion | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 96995 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 99197 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497042 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103501 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497043 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497044 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr onpromotion\n",
+ "0 125497040 2017-08-16 1 96995 False\n",
+ "1 125497041 2017-08-16 1 99197 False\n",
+ "2 125497042 2017-08-16 1 103501 False\n",
+ "3 125497043 2017-08-16 1 103520 False\n",
+ "4 125497044 2017-08-16 1 103665 False"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(3901,)"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.item_nbr.unique().shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.007690335811330428"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "30/3901"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def move_items_from_train_to_validation(train, validation, items_to_remove):\n",
+ "    \"\"\"Move every train row of the given items into the validation frame.\n",
+ "\n",
+ "    Args:\n",
+ "        train: frame with an `item_nbr` column.\n",
+ "        validation: frame the moved rows are appended to.\n",
+ "        items_to_remove: collection of `item_nbr` values to move.\n",
+ "\n",
+ "    Returns:\n",
+ "        (train2, validation2): train without the selected items, and\n",
+ "        validation followed by the removed train rows (original index labels\n",
+ "        kept). Neither input frame is modified.\n",
+ "    \"\"\"\n",
+ "    is_moved = train['item_nbr'].isin(items_to_remove)\n",
+ "    train2 = train[~is_moved]\n",
+ "    # pd.concat replaces DataFrame.append, which newer pandas no longer provides.\n",
+ "    validation2 = pd.concat([validation, train[is_moved]])\n",
+ "    return train2, validation2\n",
+ "\n",
+ "\n",
+ "def move_random_items_from_train_to_validation(train, validation, num_items_to_remove):\n",
+ "    \"\"\"Move all rows of randomly chosen items from train to validation.\n",
+ "\n",
+ "    Exactly `num_items_to_remove` distinct `item_nbr` values are drawn from the\n",
+ "    items present in `train` (sampling without replacement, using NumPy's\n",
+ "    global random state), so the count in the printed message matches what was\n",
+ "    actually moved.\n",
+ "\n",
+ "    Returns:\n",
+ "        (train2, validation2) as produced by move_items_from_train_to_validation.\n",
+ "\n",
+ "    Raises:\n",
+ "        ValueError: from np.random.choice when `num_items_to_remove` exceeds the\n",
+ "            number of distinct items in `train`.\n",
+ "    \"\"\"\n",
+ "    train_items = train['item_nbr'].unique()\n",
+ "    # replace=False: sampling with replacement could pick the same item twice.\n",
+ "    items_to_remove = np.random.choice(train_items, num_items_to_remove, replace=False)\n",
+ "    train2, validation2 = move_items_from_train_to_validation(train, validation, items_to_remove)\n",
+ "    print('Moved {} items from train data to test data'.format(num_items_to_remove))\n",
+ "    print('train data: {} -> {} rows'.format(train.shape[0], train2.shape[0]))\n",
+ "    print('validation data: {} -> {} rows'.format(validation.shape[0], validation2.shape[0]))\n",
+ "    return train2, validation2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Moved 30 items from train data to test data\n",
+ "train data: 123296175 -> 122487773 rows\n",
+ "validation data: 1679408 -> 2487810 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_items_to_move = 30\n",
+ "train_train2, train_validation2 = move_random_items_from_train_to_validation(train_train, train_validation, num_items_to_move)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def move_random_items_from_train_to_validation(train, validation, num_items_to_remove):\n",
+ "    \"\"\"Move all rows of randomly chosen items from train to validation.\n",
+ "\n",
+ "    Exactly `num_items_to_remove` distinct `item_nbr` values are drawn from the\n",
+ "    items present in `train` (sampling without replacement, using NumPy's\n",
+ "    global random state), so the count in the printed message matches what was\n",
+ "    actually moved.\n",
+ "\n",
+ "    Returns:\n",
+ "        (train2, validation2) as produced by move_items_from_train_to_validation.\n",
+ "\n",
+ "    Raises:\n",
+ "        ValueError: from np.random.choice when `num_items_to_remove` exceeds the\n",
+ "            number of distinct items in `train`.\n",
+ "    \"\"\"\n",
+ "    train_items = train['item_nbr'].unique()\n",
+ "    # replace=False: sampling with replacement could pick the same item twice.\n",
+ "    items_to_remove = np.random.choice(train_items, num_items_to_remove, replace=False)\n",
+ "    train2, validation2 = move_items_from_train_to_validation(train, validation, items_to_remove)\n",
+ "    print('Moved {} items from train data to test data'.format(num_items_to_remove))\n",
+ "    print('train data: {} -> {} rows'.format(train.shape[0], train2.shape[0]))\n",
+ "    print('validation data: {} -> {} rows'.format(validation.shape[0], validation2.shape[0]))\n",
+ "    return train2, validation2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Moved 10 items from train data to test data\n",
+ "train data: 123296175 -> 122972539 rows\n",
+ "validation data: 1679408 -> 2003044 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_items_to_remove = 10\n",
+ "train_train2, train_validation2 = move_random_items_from_train_to_validation(train_train, train_validation, num_items_to_remove)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Clear memory for previous train_train and train_validation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "288"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_train = train_train2\n",
+ "train_validation = train_validation2\n",
+ "del train_train2\n",
+ "del train_validation2\n",
+ "gc.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.80% of items in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.00804985717995326"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_percentage(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "16.51% of (item,store) pairs in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.16514714604372147"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_store_pair_percentage(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now many more items and (item, store) pairs in the validation data are unseen in the training data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(2003044,)\n",
+ "(2003044,) (2003044,) (2003044,)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_validation_metric = get_evaluation_using_constant_baseline(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.033043851782858"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_metric"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Leaderboard score: 1.710 (validation score above: 1.033)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Removing items hardly changed the validation score, which implies that we are doing a worse job of predicting seen items"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analyze the performance on each group: seen (item, store) pairs, unseen pairs whose class is seen, and unseen pairs whose class is unseen"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Group test data into the three groups"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def group_test_data(test, train):\n",
+ "    \"\"\"Split the rows of `test` into three groups by what `train` has seen.\n",
+ "\n",
+ "    Reads the module-level `items` frame (columns `item_nbr` and `class` are\n",
+ "    used here).\n",
+ "\n",
+ "    Returns:\n",
+ "        test_seen: rows whose (item_nbr, store_nbr) has a non-null mean\n",
+ "            `unit_sales` in train; that mean is attached as `unit_sales`\n",
+ "            (any `unit_sales` column of `test` itself is dropped first).\n",
+ "        test_unseen_class_seen: the remaining rows, joined with `items`, whose\n",
+ "            `class` belongs to at least one item present in train.\n",
+ "        test_unseen_class_unseen: the remaining rows, joined with `items`,\n",
+ "            whose `class` does not (rows with a missing class land here).\n",
+ "\n",
+ "    The two unseen groups are taken from one boolean mask, so every unseen row\n",
+ "    is in exactly one of them.\n",
+ "    \"\"\"\n",
+ "    cols_item_store = ['item_nbr', 'store_nbr']\n",
+ "    train_item_store_grouped = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()\n",
+ "    cols_to_use = test.columns.drop('unit_sales') if 'unit_sales' in test.columns else test.columns\n",
+ "    test_joined = pd.merge(test[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n",
+ "    is_seen = test_joined['unit_sales'].notnull()\n",
+ "    test_seen = test_joined[is_seen]\n",
+ "    test_unseen_expanded = get_item_expanded_df(test_joined[~is_seen], items)\n",
+ "    # Classes of the items sold in train, looked up in `items` directly instead\n",
+ "    # of merging the full train frame with `items` just to list its classes.\n",
+ "    train_item_nbrs = train['item_nbr'].unique()\n",
+ "    train_class = items.loc[items['item_nbr'].isin(train_item_nbrs), 'class'].unique()\n",
+ "    class_is_seen = test_unseen_expanded['class'].isin(train_class)\n",
+ "    test_unseen_class_seen = test_unseen_expanded[class_is_seen]\n",
+ "    test_unseen_class_unseen = test_unseen_expanded[~class_is_seen]\n",
+ "    return test_seen, test_unseen_class_seen, test_unseen_class_unseen"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_seen, test_unseen_class_seen,test_unseen_class_unseen = group_test_data(test, train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "True\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(test.shape[0]== test_seen.shape[0]+ test_unseen_class_seen.shape[0]+test_unseen_class_unseen.shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_baseline_prediction_test_seen(test_seen_expanded, train):\n",
+ "    \"\"\"Predict each seen test row with the train mean of its (item, store).\n",
+ "\n",
+ "    Any `unit_sales` column of `test_seen_expanded` is ignored; the train mean\n",
+ "    per (item_nbr, store_nbr) is joined on instead. Rows without a train mean\n",
+ "    are dropped.\n",
+ "\n",
+ "    Returns a frame with the columns id, prediction_sales, perishable.\n",
+ "    \"\"\"\n",
+ "    key_cols = ['item_nbr', 'store_nbr']\n",
+ "    pair_means = train.groupby(key_cols)['unit_sales'].mean().reset_index()\n",
+ "    kept_cols = [col for col in test_seen_expanded.columns if col != 'unit_sales']\n",
+ "    joined = pd.merge(test_seen_expanded[kept_cols], pair_means, on=key_cols, how='left')\n",
+ "    has_mean = joined['unit_sales'].notnull().values\n",
+ "    seen_rows = joined[has_mean]\n",
+ "    prediction = seen_rows[['id', 'unit_sales', 'perishable']]\n",
+ "    return prediction.rename(columns={'unit_sales': 'prediction_sales'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_baseline_prediction_test_unseen_class_seen(test_unseen_class_seen, train_expanded):\n",
+ "    \"\"\"Predict each row with the train mean `unit_sales` of its item class.\n",
+ "\n",
+ "    Args:\n",
+ "        test_unseen_class_seen: rows to predict; needs a `class` column. A\n",
+ "            `unit_sales` column, if present, is dropped from the result.\n",
+ "        train_expanded: train rows with `class` and `unit_sales` columns.\n",
+ "\n",
+ "    Returns:\n",
+ "        The input rows (without `unit_sales`) plus a `prediction_sales` column\n",
+ "        as the last column; it is NaN for classes that do not occur in\n",
+ "        `train_expanded`.\n",
+ "    \"\"\"\n",
+ "    wanted_classes = test_unseen_class_seen['class'].unique()\n",
+ "    train_sub = train_expanded[train_expanded['class'].isin(wanted_classes)]\n",
+ "    # Average only unit_sales: averaging every column is wasted work and fails\n",
+ "    # on non-numeric columns in current pandas.\n",
+ "    class_means = train_sub.groupby('class')['unit_sales'].mean().reset_index()\n",
+ "    class_means = class_means.rename(columns={'unit_sales': 'prediction_sales'})\n",
+ "    rows_to_predict = test_unseen_class_seen.drop('unit_sales', axis=1, errors='ignore')\n",
+ "    return pd.merge(rows_to_predict, class_means, on='class', how='left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_baseline_prediction_test_unseen_class_unseen(test_unseen_class_unseen, train_expanded):\n",
+ "    \"\"\"Predict each row with the train mean `unit_sales` of its item family.\n",
+ "\n",
+ "    Args:\n",
+ "        test_unseen_class_unseen: rows to predict; needs a `family` column. A\n",
+ "            `unit_sales` column, if present, is dropped from the result.\n",
+ "        train_expanded: train rows with `family` and `unit_sales` columns.\n",
+ "\n",
+ "    Returns:\n",
+ "        The input rows (without `unit_sales`) plus a `prediction_sales` column\n",
+ "        as the last column; it is NaN for families that do not occur in\n",
+ "        `train_expanded`.\n",
+ "    \"\"\"\n",
+ "    wanted_families = test_unseen_class_unseen['family'].unique()\n",
+ "    train_sub = train_expanded[train_expanded['family'].isin(wanted_families)]\n",
+ "    # Average only unit_sales: averaging every column is wasted work and fails\n",
+ "    # on non-numeric columns in current pandas.\n",
+ "    family_means = train_sub.groupby('family')['unit_sales'].mean().reset_index()\n",
+ "    family_means = family_means.rename(columns={'unit_sales': 'prediction_sales'})\n",
+ "    rows_to_predict = test_unseen_class_unseen.drop('unit_sales', axis=1, errors='ignore')\n",
+ "    return pd.merge(rows_to_predict, family_means, on='family', how='left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean_predictions(predictions):\n",
+ "    \"\"\"Return predictions floored at 0, rounded, and cast to int.\n",
+ "\n",
+ "    Works on a clipped copy, so the caller's Series/array is not modified (the\n",
+ "    in-place write is what raised SettingWithCopyWarning on column slices).\n",
+ "    NaN values are not handled: the cast to int raises on them.\n",
+ "    \"\"\"\n",
+ "    # Positional (lower, upper) bounds work for both pandas and NumPy clip.\n",
+ "    non_negative = predictions.clip(0, None)\n",
+ "    return non_negative.round().astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean_targets(targets):\n",
+ "    \"\"\"Return a copy of `targets` with negative values replaced by 0.\n",
+ "\n",
+ "    The input is left untouched; use the returned value.\n",
+ "    \"\"\"\n",
+ "    # Positional (lower, upper) bounds work for both pandas and NumPy clip.\n",
+ "    return targets.clip(0, None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_baseline_predictions(test, train):\n",
+ "    \"\"\"Build the grouped-mean baseline prediction for every row of `test`.\n",
+ "\n",
+ "    The rows are split with group_test_data and each group is predicted by its\n",
+ "    own helper (per (item, store) mean, per class mean, per family mean). The\n",
+ "    three results are stacked in that order and `prediction_sales` is passed\n",
+ "    through clean_predictions. Reads the module-level `items` frame.\n",
+ "\n",
+ "    Returns a frame with the columns id, perishable, prediction_sales. The\n",
+ "    index labels of the three parts are kept, so they can repeat.\n",
+ "    \"\"\"\n",
+ "    cols_to_use = ['id', 'perishable', 'prediction_sales']\n",
+ "    test_seen, test_unseen_class_seen, test_unseen_class_unseen = group_test_data(test, train)\n",
+ "    train_expanded = get_item_expanded_df(train, items)\n",
+ "    test_seen_expanded = get_item_expanded_df(test_seen, items)\n",
+ "    parts = [\n",
+ "        get_baseline_prediction_test_seen(test_seen_expanded, train),\n",
+ "        get_baseline_prediction_test_unseen_class_seen(test_unseen_class_seen, train_expanded),\n",
+ "        get_baseline_prediction_test_unseen_class_unseen(test_unseen_class_unseen, train_expanded),\n",
+ "    ]\n",
+ "    # pd.concat replaces the chained DataFrame.append calls, which newer pandas\n",
+ "    # no longer provides.\n",
+ "    baseline_predictions = pd.concat([part[cols_to_use] for part in parts])\n",
+ "    cleaned_predictions = clean_predictions(baseline_predictions['prediction_sales'])\n",
+ "    # Assign by position: the stacked index can hold repeated labels.\n",
+ "    baseline_predictions['prediction_sales'] = np.asarray(cleaned_predictions)\n",
+ "    return baseline_predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# prediction_test_seen, prediction_test_unseen_class_seen, prediction_test_unseen_class_unseen=get_baseline_predictions(test, train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation = get_baseline_predictions(train_validation, train_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 123296175 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 123296176 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 123296177 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 123296178 | \n",
+ " 0 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 123296179 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 123296175 0 3\n",
+ "1 123296176 1 4\n",
+ "2 123296177 0 5\n",
+ "3 123296178 0 10\n",
+ "4 123296179 0 1"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "targets_validation = pd.merge(baseline_predictions_validation, train_validation, on='id', how='left')['unit_sales']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_evaluation(baseline_predictions_validation, targets_validation):\n",
+ "    \"\"\"Score the baseline predictions against the validation targets with `nwrmsle`.\n",
+ "\n",
+ "    `targets_validation` is passed through `clean_targets` before scoring.\n",
+ "    \"\"\"\n",
+ "    predicted_sales = baseline_predictions_validation['prediction_sales']\n",
+ "    non_negative_targets = clean_targets(targets_validation)\n",
+ "    # NOTE(review): the raw `perishable` column is handed over as the weights;\n",
+ "    # confirm that `nwrmsle` turns it into the intended per-item weights.\n",
+ "    item_weights = baseline_predictions_validation['perishable']\n",
+ "    return nwrmsle(predicted_sales, non_negative_targets, item_weights)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(2003044,)\n",
+ "(2003044,) (2003044,) (2003044,)\n"
+ ]
+ }
+ ],
+ "source": [
+ "validation_metric = get_evaluation(baseline_predictions_validation, targets_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.67828104376604859"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_metric"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Validation NWRMSLE: 0.67828\n",
+ "Validation MSE: 353.981\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Clean the targets here as well, so this cell does not depend on get_evaluation\n",
+ "# having already zeroed the negative targets in place. Arguments follow the\n",
+ "# (y_true, y_pred) order; the MSE value itself is symmetric in the two.\n",
+ "mse = mean_squared_error(clean_targets(targets_validation), baseline_predictions_validation['prediction_sales'])\n",
+ "print('Validation NWRMSLE: %.5f' % (validation_metric))\n",
+ "print('Validation MSE: %.3f' % (mse))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Leaderboard score for this baseline: 1.369 (validation NWRMSLE above: 0.678)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.67828104376604859"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_metric"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Empirical baseline model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Timestamp('2017-07-25 00:00:00')"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_train.date.max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2017-07-26 00:00:00 2017-08-10 00:00:00\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(train_validation.date.min(), train_validation.date.max())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "begin = pd.to_datetime('2017-07-05')\n",
+ "end = pd.to_datetime('2017-07-20')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "period_to_use_validation = train_train[(train_train.date>=begin) & (train_train.date<=end)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1673324, 6)"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "period_to_use_validation.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " unit_sales | \n",
+ " onpromotion | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 121094288 | \n",
+ " 121094288 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 121094289 | \n",
+ " 121094289 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " 4.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 121094290 | \n",
+ " 121094290 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " 7.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 121094291 | \n",
+ " 121094291 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " 15.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 121094292 | \n",
+ " 121094292 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 105577 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr unit_sales onpromotion\n",
+ "121094288 121094288 2017-07-05 1 103520 1.0 False\n",
+ "121094289 121094289 2017-07-05 1 103665 4.0 False\n",
+ "121094290 121094290 2017-07-05 1 105574 7.0 False\n",
+ "121094291 121094291 2017-07-05 1 105575 15.0 False\n",
+ "121094292 121094292 2017-07-05 1 105577 1.0 False"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "period_to_use_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2017-07-05 00:00:00 2017-07-20 00:00:00\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(period_to_use_validation.date.min(), period_to_use_validation.date.max())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.99% of items in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.009870129870129871"
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_percentage(period_to_use_validation, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1.23% of (item,store) pairs in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.012323390147004183"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_store_pair_percentage(period_to_use_validation, train_validation )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2.13% of items in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.02127659574468085"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_percentage(train_validation, test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "30.47% of (item,store) pairs in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.3047319300844038"
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_store_pair_percentage(train_validation, test )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "21.10% of (item,store) pairs in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.21096679863662687"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_store_pair_percentage(train, test )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Group data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_item_expanded_df(test, items):\n",
+ " return pd.merge(test, items, on='item_nbr', how='left')\n",
+ "def group_test_data(test, train):\n",
+ "    \"\"\"Split the rows of `test` by how much of them `train` has already seen.\n",
+ "\n",
+ "    Returns a 3-tuple of DataFrames:\n",
+ "      * test_seen: rows whose (item_nbr, store_nbr) pair occurs in `train`, with\n",
+ "        the pair's mean train `unit_sales` attached as `unit_sales`;\n",
+ "      * test_unseen_class_seen: the remaining rows, joined with the notebook-level\n",
+ "        `items` table, whose item class occurs in `train`;\n",
+ "      * test_unseen_class_unseen: the remaining rows whose class does not.\n",
+ "    \"\"\"\n",
+ "    cols_item_store = ['item_nbr', 'store_nbr']\n",
+ "    train_item_store_grouped = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()\n",
+ "    # Leave out any unit_sales already present in test so that the merged column\n",
+ "    # holds the train mean only.\n",
+ "    cols_to_use = test.columns.drop('unit_sales') if 'unit_sales' in test.columns else test.columns\n",
+ "    test_join_train_item_store_grouped = pd.merge(test[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n",
+ "    # A missing mean marks an (item, store) pair without any train history.\n",
+ "    has_history = test_join_train_item_store_grouped['unit_sales'].notnull()\n",
+ "    test_seen = test_join_train_item_store_grouped[has_history]\n",
+ "    test_unseen = test_join_train_item_store_grouped[~has_history]\n",
+ "    test_unseen_expanded = get_item_expanded_df(test_unseen, items)\n",
+ "    test_unseen_class = test_unseen_expanded['class'].unique()\n",
+ "    # Classes present in train: join only the distinct item numbers rather than\n",
+ "    # every train row; the resulting set of classes is the same.\n",
+ "    train_item_numbers = train[['item_nbr']].drop_duplicates()\n",
+ "    train_class = get_item_expanded_df(train_item_numbers, items)['class'].unique()\n",
+ "    test_unseen_class_diff = set(test_unseen_class) - set(train_class)\n",
+ "    test_unseen_class_same = set(test_unseen_class) - test_unseen_class_diff\n",
+ "    test_unseen_class_seen = test_unseen_expanded[test_unseen_expanded['class'].isin(test_unseen_class_same)]\n",
+ "    test_unseen_class_unseen = test_unseen_expanded[test_unseen_expanded['class'].isin(test_unseen_class_diff)]\n",
+ "    return test_seen, test_unseen_class_seen, test_unseen_class_unseen"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_seen, validation_unseen_class_seen,validation_unseen_class_unseen = group_test_data(train_validation, period_to_use_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 6)"
+ ]
+ },
+ "execution_count": 75,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1658712, 6)"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(20691, 9)"
+ ]
+ },
+ "execution_count": 72,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_unseen_class_seen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(5, 9)"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_unseen_class_unseen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.012323390147004183"
+ ]
+ },
+ "execution_count": 76,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Share of validation rows whose (item, store) pair has no history in the\n",
+ "# reference period; derived from the frames instead of hard-coded row counts.\n",
+ "(train_validation.shape[0]-validation_seen.shape[0])/train_validation.shape[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_seen, test_unseen_class_seen,test_unseen_class_unseen = group_test_data(test, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(3370464, 5)"
+ ]
+ },
+ "execution_count": 78,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2343376, 6)"
+ ]
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1019312, 9)"
+ ]
+ },
+ "execution_count": 80,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_unseen_class_seen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(7776, 9)"
+ ]
+ },
+ "execution_count": 81,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_unseen_class_unseen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.3047319300844038"
+ ]
+ },
+ "execution_count": 82,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Share of test rows whose (item, store) pair does not occur in train_validation;\n",
+ "# derived from the frames instead of hard-coded row counts.\n",
+ "(test.shape[0]-test_seen.shape[0])/test.shape[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['2017-07-05T00:00:00.000000000', '2017-07-06T00:00:00.000000000',\n",
+ " '2017-07-07T00:00:00.000000000', '2017-07-08T00:00:00.000000000',\n",
+ " '2017-07-09T00:00:00.000000000', '2017-07-10T00:00:00.000000000',\n",
+ " '2017-07-11T00:00:00.000000000', '2017-07-12T00:00:00.000000000',\n",
+ " '2017-07-13T00:00:00.000000000', '2017-07-14T00:00:00.000000000',\n",
+ " '2017-07-15T00:00:00.000000000', '2017-07-16T00:00:00.000000000',\n",
+ " '2017-07-17T00:00:00.000000000', '2017-07-18T00:00:00.000000000',\n",
+ " '2017-07-19T00:00:00.000000000', '2017-07-20T00:00:00.000000000'], dtype='datetime64[ns]')"
+ ]
+ },
+ "execution_count": 83,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "period_to_use_validation.date.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['2017-07-26T00:00:00.000000000', '2017-07-27T00:00:00.000000000',\n",
+ " '2017-07-28T00:00:00.000000000', '2017-07-29T00:00:00.000000000',\n",
+ " '2017-07-30T00:00:00.000000000', '2017-07-31T00:00:00.000000000',\n",
+ " '2017-08-01T00:00:00.000000000', '2017-08-02T00:00:00.000000000',\n",
+ " '2017-08-03T00:00:00.000000000', '2017-08-04T00:00:00.000000000',\n",
+ " '2017-08-05T00:00:00.000000000', '2017-08-06T00:00:00.000000000',\n",
+ " '2017-08-07T00:00:00.000000000', '2017-08-08T00:00:00.000000000',\n",
+ " '2017-08-09T00:00:00.000000000', '2017-08-10T00:00:00.000000000'], dtype='datetime64[ns]')"
+ ]
+ },
+ "execution_count": 84,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.date.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Timestamp('2017-07-05 00:00:00')"
+ ]
+ },
+ "execution_count": 87,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.date.min() - pd.DateOffset(days=21)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " unit_sales | \n",
+ " onpromotion | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 121094288 | \n",
+ " 121094288 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 121094289 | \n",
+ " 121094289 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " 4.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 121094290 | \n",
+ " 121094290 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " 7.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 121094291 | \n",
+ " 121094291 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " 15.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 121094292 | \n",
+ " 121094292 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 105577 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr unit_sales onpromotion\n",
+ "121094288 121094288 2017-07-05 1 103520 1.0 False\n",
+ "121094289 121094289 2017-07-05 1 103665 4.0 False\n",
+ "121094290 121094290 2017-07-05 1 105574 7.0 False\n",
+ "121094291 121094291 2017-07-05 1 105575 15.0 False\n",
+ "121094292 121094292 2017-07-05 1 105577 1.0 False"
+ ]
+ },
+ "execution_count": 89,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "period_to_use_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "end_of_validation_period - pd.DateOffset(days=15)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Shift every reference-period date 21 days forward so that it lines up with a\n",
+ "# validation date. assign() builds a new frame rather than writing into a slice\n",
+ "# of train_train, which is what raised SettingWithCopyWarning.\n",
+ "period_to_use_validation = period_to_use_validation.assign(\n",
+ "    prediction_date=period_to_use_validation['date'] + pd.DateOffset(days=21)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " unit_sales | \n",
+ " onpromotion | \n",
+ " prediction_date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 121094288 | \n",
+ " 121094288 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " 2017-07-26 | \n",
+ "
\n",
+ " \n",
+ " 121094289 | \n",
+ " 121094289 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " 4.0 | \n",
+ " False | \n",
+ " 2017-07-26 | \n",
+ "
\n",
+ " \n",
+ " 121094290 | \n",
+ " 121094290 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " 7.0 | \n",
+ " False | \n",
+ " 2017-07-26 | \n",
+ "
\n",
+ " \n",
+ " 121094291 | \n",
+ " 121094291 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " 15.0 | \n",
+ " False | \n",
+ " 2017-07-26 | \n",
+ "
\n",
+ " \n",
+ " 121094292 | \n",
+ " 121094292 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 105577 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " 2017-07-26 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr unit_sales onpromotion \\\n",
+ "121094288 121094288 2017-07-05 1 103520 1.0 False \n",
+ "121094289 121094289 2017-07-05 1 103665 4.0 False \n",
+ "121094290 121094290 2017-07-05 1 105574 7.0 False \n",
+ "121094291 121094291 2017-07-05 1 105575 15.0 False \n",
+ "121094292 121094292 2017-07-05 1 105577 1.0 False \n",
+ "\n",
+ " prediction_date \n",
+ "121094288 2017-07-26 \n",
+ "121094289 2017-07-26 \n",
+ "121094290 2017-07-26 \n",
+ "121094291 2017-07-26 \n",
+ "121094292 2017-07-26 "
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "period_to_use_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "period_to_use_validation = period_to_use_validation.rename(columns={'date':'date_old', 'prediction_date':'date'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date_old | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " unit_sales | \n",
+ " onpromotion | \n",
+ " date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 121094288 | \n",
+ " 121094288 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " 2017-07-26 | \n",
+ "
\n",
+ " \n",
+ " 121094289 | \n",
+ " 121094289 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " 4.0 | \n",
+ " False | \n",
+ " 2017-07-26 | \n",
+ "
\n",
+ " \n",
+ " 121094290 | \n",
+ " 121094290 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " 7.0 | \n",
+ " False | \n",
+ " 2017-07-26 | \n",
+ "
\n",
+ " \n",
+ " 121094291 | \n",
+ " 121094291 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " 15.0 | \n",
+ " False | \n",
+ " 2017-07-26 | \n",
+ "
\n",
+ " \n",
+ " 121094292 | \n",
+ " 121094292 | \n",
+ " 2017-07-05 | \n",
+ " 1 | \n",
+ " 105577 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " 2017-07-26 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date_old store_nbr item_nbr unit_sales onpromotion \\\n",
+ "121094288 121094288 2017-07-05 1 103520 1.0 False \n",
+ "121094289 121094289 2017-07-05 1 103665 4.0 False \n",
+ "121094290 121094290 2017-07-05 1 105574 7.0 False \n",
+ "121094291 121094291 2017-07-05 1 105575 15.0 False \n",
+ "121094292 121094292 2017-07-05 1 105577 1.0 False \n",
+ "\n",
+ " date \n",
+ "121094288 2017-07-26 \n",
+ "121094289 2017-07-26 \n",
+ "121094290 2017-07-26 \n",
+ "121094291 2017-07-26 \n",
+ "121094292 2017-07-26 "
+ ]
+ },
+ "execution_count": 96,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "period_to_use_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " unit_sales | \n",
+ " onpromotion | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 123296175 | \n",
+ " 123296175 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 123296176 | \n",
+ " 123296176 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " 4.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 123296177 | \n",
+ " 123296177 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 123296178 | \n",
+ " 123296178 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " 6.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 123296179 | \n",
+ " 123296179 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105693 | \n",
+ " 2.0 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr unit_sales onpromotion\n",
+ "123296175 123296175 2017-07-26 1 103520 1.0 False\n",
+ "123296176 123296176 2017-07-26 1 103665 4.0 False\n",
+ "123296177 123296177 2017-07-26 1 105574 9.0 False\n",
+ "123296178 123296178 2017-07-26 1 105575 6.0 False\n",
+ "123296179 123296179 2017-07-26 1 105693 2.0 True"
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cols_to_use=['date', 'store_nbr', 'item_nbr', 'unit_sales']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(142663, 3)"
+ ]
+ },
+ "execution_count": 102,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen.groupby(['store_nbr', 'item_nbr'])['unit_sales'].mean().reset_index().shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(147405, 3)"
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "period_to_use_validation.groupby(['store_nbr', 'item_nbr'])['unit_sales'].mean().reset_index().shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 117,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_join_period = pd.merge(train_validation, period_to_use_validation[cols_to_use].rename(columns={'unit_sales':'prediction_sales'}), on=['date', 'store_nbr', 'item_nbr'], how='left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 118,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " unit_sales | \n",
+ " onpromotion | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 123296175 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 123296176 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " 4.0 | \n",
+ " False | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 123296177 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " 7.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 123296178 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " 6.0 | \n",
+ " False | \n",
+ " 15.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 123296179 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105693 | \n",
+ " 2.0 | \n",
+ " True | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr unit_sales onpromotion \\\n",
+ "0 123296175 2017-07-26 1 103520 1.0 False \n",
+ "1 123296176 2017-07-26 1 103665 4.0 False \n",
+ "2 123296177 2017-07-26 1 105574 9.0 False \n",
+ "3 123296178 2017-07-26 1 105575 6.0 False \n",
+ "4 123296179 2017-07-26 1 105693 2.0 True \n",
+ "\n",
+ " prediction_sales \n",
+ "0 1.0 \n",
+ "1 4.0 \n",
+ "2 7.0 \n",
+ "3 15.0 \n",
+ "4 NaN "
+ ]
+ },
+ "execution_count": 118,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_join_period.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 121,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_seen_prediction = validation_join_period[validation_join_period.prediction_sales.notnull()]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 122,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " unit_sales | \n",
+ " onpromotion | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 123296175 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 123296176 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " 4.0 | \n",
+ " False | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 123296177 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " 7.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 123296178 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " 6.0 | \n",
+ " False | \n",
+ " 15.0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 123296181 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105857 | \n",
+ " 10.0 | \n",
+ " False | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr unit_sales onpromotion \\\n",
+ "0 123296175 2017-07-26 1 103520 1.0 False \n",
+ "1 123296176 2017-07-26 1 103665 4.0 False \n",
+ "2 123296177 2017-07-26 1 105574 9.0 False \n",
+ "3 123296178 2017-07-26 1 105575 6.0 False \n",
+ "6 123296181 2017-07-26 1 105857 10.0 False \n",
+ "\n",
+ " prediction_sales \n",
+ "0 1.0 \n",
+ "1 4.0 \n",
+ "2 7.0 \n",
+ "3 15.0 \n",
+ "6 5.0 "
+ ]
+ },
+ "execution_count": 122,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen_prediction.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 123,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 123,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen_prediction.prediction_sales.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 124,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.7997026333088803"
+ ]
+ },
+ "execution_count": 124,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen_prediction.shape[0]/train_validation.shape[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 133,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_seen_prediction_expanded = pd.merge(validation_seen_prediction, items, on='item_nbr', how='left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 125,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " unit_sales | \n",
+ " onpromotion | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 123296175 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 123296176 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " 4.0 | \n",
+ " False | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 123296177 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " 7.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 123296178 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " 6.0 | \n",
+ " False | \n",
+ " 15.0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 123296181 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105857 | \n",
+ " 10.0 | \n",
+ " False | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr unit_sales onpromotion \\\n",
+ "0 123296175 2017-07-26 1 103520 1.0 False \n",
+ "1 123296176 2017-07-26 1 103665 4.0 False \n",
+ "2 123296177 2017-07-26 1 105574 9.0 False \n",
+ "3 123296178 2017-07-26 1 105575 6.0 False \n",
+ "6 123296181 2017-07-26 1 105857 10.0 False \n",
+ "\n",
+ " prediction_sales \n",
+ "0 1.0 \n",
+ "1 4.0 \n",
+ "2 7.0 \n",
+ "3 15.0 \n",
+ "6 5.0 "
+ ]
+ },
+ "execution_count": 125,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen_prediction.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 136,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean_targets(targets):\n",
+ " targets[targets<0]=0\n",
+ " return targets\n",
+ "def clean_predictions(predictions):\n",
+ " predictions[predictions<0]=0\n",
+ " return predictions.round().astype(int)\n",
+ "def nwrmsle(predictions, targets, weights):\n",
+ " print(targets.shape)\n",
+ " targets[targets<0]=0\n",
+ " weights = 1 + 0.25 * weights\n",
+ " print(predictions.shape, targets.shape, weights.shape)\n",
+ " log_square_errors = (np.log(predictions.values + 1) - np.log(targets.values + 1)) ** 2\n",
+ " return(np.sqrt(np.sum(weights.values * log_square_errors) / np.sum(weights)))\n",
+ "def get_evaluation(baseline_predictions_validation, targets_validation):\n",
+ "    \"\"\"Score a prediction frame against the observed targets.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    baseline_predictions_validation : pandas.DataFrame\n",
+ "        Must expose ``prediction_sales`` (the predictions) and ``perishable``\n",
+ "        (passed to ``nwrmsle`` as the weight term).\n",
+ "    targets_validation : pandas.Series\n",
+ "        Observed values to compare against.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    The value returned by ``nwrmsle`` for the cleaned predictions and targets.\n",
+ "    \"\"\"\n",
+ "    perishable_flags = baseline_predictions_validation.perishable\n",
+ "    rounded_predictions = clean_predictions(baseline_predictions_validation.prediction_sales)\n",
+ "    non_negative_targets = clean_targets(targets_validation)\n",
+ "    return nwrmsle(rounded_predictions, non_negative_targets, perishable_flags)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 137,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 137,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.any(validation_seen_prediction_expanded.prediction_sales<0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 138,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:9: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1343027,)\n",
+ "(1343027,) (1343027,) (1343027,)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.71883112494880497"
+ ]
+ },
+ "execution_count": 138,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_evaluation(validation_seen_prediction_expanded, validation_seen_prediction_expanded['unit_sales'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Comparison: the old baseline model scores 1.1433622314160428 on the same subset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 140,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1343027,)"
+ ]
+ },
+ "execution_count": 140,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen_prediction_expanded.id.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 142,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_seen_prediction_expanded['id'].to_csv('seen_empirical', index=False, header=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## For Test data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 143,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['2017-07-26T00:00:00.000000000', '2017-07-27T00:00:00.000000000',\n",
+ " '2017-07-28T00:00:00.000000000', '2017-07-29T00:00:00.000000000',\n",
+ " '2017-07-30T00:00:00.000000000', '2017-07-31T00:00:00.000000000',\n",
+ " '2017-08-01T00:00:00.000000000', '2017-08-02T00:00:00.000000000',\n",
+ " '2017-08-03T00:00:00.000000000', '2017-08-04T00:00:00.000000000',\n",
+ " '2017-08-05T00:00:00.000000000', '2017-08-06T00:00:00.000000000',\n",
+ " '2017-08-07T00:00:00.000000000', '2017-08-08T00:00:00.000000000',\n",
+ " '2017-08-09T00:00:00.000000000', '2017-08-10T00:00:00.000000000'], dtype='datetime64[ns]')"
+ ]
+ },
+ "execution_count": 143,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.date.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 144,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['2017-08-16T00:00:00.000000000', '2017-08-17T00:00:00.000000000',\n",
+ " '2017-08-18T00:00:00.000000000', '2017-08-19T00:00:00.000000000',\n",
+ " '2017-08-20T00:00:00.000000000', '2017-08-21T00:00:00.000000000',\n",
+ " '2017-08-22T00:00:00.000000000', '2017-08-23T00:00:00.000000000',\n",
+ " '2017-08-24T00:00:00.000000000', '2017-08-25T00:00:00.000000000',\n",
+ " '2017-08-26T00:00:00.000000000', '2017-08-27T00:00:00.000000000',\n",
+ " '2017-08-28T00:00:00.000000000', '2017-08-29T00:00:00.000000000',\n",
+ " '2017-08-30T00:00:00.000000000', '2017-08-31T00:00:00.000000000'], dtype='datetime64[ns]')"
+ ]
+ },
+ "execution_count": 144,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.date.unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 145,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_validation['prediction_date']=train_validation['date']+pd.DateOffset(days=21)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 146,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_validation = train_validation.rename(columns={'date':'date_old', 'prediction_date':'date'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 147,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date_old | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " unit_sales | \n",
+ " onpromotion | \n",
+ " date | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 123296175 | \n",
+ " 123296175 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ " 2017-08-16 | \n",
+ "
\n",
+ " \n",
+ " 123296176 | \n",
+ " 123296176 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " 4.0 | \n",
+ " False | \n",
+ " 2017-08-16 | \n",
+ "
\n",
+ " \n",
+ " 123296177 | \n",
+ " 123296177 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " 2017-08-16 | \n",
+ "
\n",
+ " \n",
+ " 123296178 | \n",
+ " 123296178 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " 6.0 | \n",
+ " False | \n",
+ " 2017-08-16 | \n",
+ "
\n",
+ " \n",
+ " 123296179 | \n",
+ " 123296179 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105693 | \n",
+ " 2.0 | \n",
+ " True | \n",
+ " 2017-08-16 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date_old store_nbr item_nbr unit_sales onpromotion \\\n",
+ "123296175 123296175 2017-07-26 1 103520 1.0 False \n",
+ "123296176 123296176 2017-07-26 1 103665 4.0 False \n",
+ "123296177 123296177 2017-07-26 1 105574 9.0 False \n",
+ "123296178 123296178 2017-07-26 1 105575 6.0 False \n",
+ "123296179 123296179 2017-07-26 1 105693 2.0 True \n",
+ "\n",
+ " date \n",
+ "123296175 2017-08-16 \n",
+ "123296176 2017-08-16 \n",
+ "123296177 2017-08-16 \n",
+ "123296178 2017-08-16 \n",
+ "123296179 2017-08-16 "
+ ]
+ },
+ "execution_count": 147,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 148,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cols_to_use=['date', 'store_nbr', 'item_nbr', 'unit_sales']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 149,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_join_period = pd.merge(test, train_validation[cols_to_use].rename(columns={'unit_sales':'prediction_sales'}), on=['date', 'store_nbr', 'item_nbr'], how='left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 150,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " onpromotion | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 96995 | \n",
+ " False | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 99197 | \n",
+ " False | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497042 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103501 | \n",
+ " False | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497043 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " False | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497044 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " False | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr onpromotion prediction_sales\n",
+ "0 125497040 2017-08-16 1 96995 False NaN\n",
+ "1 125497041 2017-08-16 1 99197 False NaN\n",
+ "2 125497042 2017-08-16 1 103501 False NaN\n",
+ "3 125497043 2017-08-16 1 103520 False 1.0\n",
+ "4 125497044 2017-08-16 1 103665 False 4.0"
+ ]
+ },
+ "execution_count": 150,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_join_period.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 151,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copy the filtered rows so later column assignments act on an independent frame\n",
+ "# rather than on a slice of test_join_period (avoids SettingWithCopyWarning).\n",
+ "test_seen_prediction = test_join_period[test_join_period.prediction_sales.notnull()].copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 152,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " onpromotion | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3 | \n",
+ " 125497043 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " False | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497044 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " False | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 125497045 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " False | \n",
+ " 9.0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 125497046 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " False | \n",
+ " 6.0 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 125497049 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 105693 | \n",
+ " False | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr onpromotion prediction_sales\n",
+ "3 125497043 2017-08-16 1 103520 False 1.0\n",
+ "4 125497044 2017-08-16 1 103665 False 4.0\n",
+ "5 125497045 2017-08-16 1 105574 False 9.0\n",
+ "6 125497046 2017-08-16 1 105575 False 6.0\n",
+ "9 125497049 2017-08-16 1 105693 False 2.0"
+ ]
+ },
+ "execution_count": 152,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen_prediction.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 153,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1677035, 6)"
+ ]
+ },
+ "execution_count": 153,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen_prediction.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 155,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(3370464, 5)"
+ ]
+ },
+ "execution_count": 155,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 154,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.4975679906386776"
+ ]
+ },
+ "execution_count": 154,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen_prediction.shape[0]/test.shape[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 157,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_seen_prediction_expanded = pd.merge(test_seen_prediction, items, on='item_nbr', how='left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 160,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " onpromotion | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3 | \n",
+ " 125497043 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " False | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497044 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " False | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 125497045 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " False | \n",
+ " 9.0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 125497046 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " False | \n",
+ " 6.0 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 125497049 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 105693 | \n",
+ " False | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr onpromotion prediction_sales\n",
+ "3 125497043 2017-08-16 1 103520 False 1.0\n",
+ "4 125497044 2017-08-16 1 103665 False 4.0\n",
+ "5 125497045 2017-08-16 1 105574 False 9.0\n",
+ "6 125497046 2017-08-16 1 105575 False 6.0\n",
+ "9 125497049 2017-08-16 1 105693 False 2.0"
+ ]
+ },
+ "execution_count": 160,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen_prediction.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 161,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 161,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.all(test_seen_prediction.prediction_sales>=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 162,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py:5088: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# assign() returns a new frame instead of writing a column into the filtered\n",
+ "# frame in place, which is what raised SettingWithCopyWarning here.\n",
+ "test_seen_prediction = test_seen_prediction.assign(\n",
+ "    prediction_sales=clean_predictions(test_seen_prediction.prediction_sales)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 163,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 163,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.all(test_seen_prediction.prediction_sales>=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 164,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "date datetime64[ns]\n",
+ "store_nbr int64\n",
+ "item_nbr int64\n",
+ "onpromotion bool\n",
+ "prediction_sales int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 164,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen_prediction.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 169,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1677035, 6)"
+ ]
+ },
+ "execution_count": 169,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen_prediction.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 165,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "baseline_predictions_test = pd.read_csv('baseline_predictions_test')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 166,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497043 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497044 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497045 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 125497040 0 2\n",
+ "1 125497041 0 3\n",
+ "2 125497043 0 3\n",
+ "3 125497044 1 4\n",
+ "4 125497045 0 5"
+ ]
+ },
+ "execution_count": 166,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 167,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_predictions_subset = baseline_predictions_test[baseline_predictions_test.id.isin(test_seen_prediction.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 168,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1677035, 3)"
+ ]
+ },
+ "execution_count": 168,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_predictions_subset.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 170,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_predictions_subset2 = baseline_predictions_test[~baseline_predictions_test.id.isin(test_seen_prediction.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 171,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1693429, 3)"
+ ]
+ },
+ "execution_count": 171,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_predictions_subset2.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 172,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 172,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_test.shape[0]==test_predictions_subset.shape[0]+test_predictions_subset2.shape[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 173,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2 | \n",
+ " 125497043 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497044 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497045 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 125497046 | \n",
+ " 0 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 125497049 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "2 125497043 0 3\n",
+ "3 125497044 1 4\n",
+ "4 125497045 0 5\n",
+ "5 125497046 0 10\n",
+ "7 125497049 0 1"
+ ]
+ },
+ "execution_count": 173,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_predictions_subset.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 174,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " onpromotion | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3 | \n",
+ " 125497043 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " False | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497044 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " False | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 125497045 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " False | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 125497046 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " False | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 125497049 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 105693 | \n",
+ " False | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr onpromotion prediction_sales\n",
+ "3 125497043 2017-08-16 1 103520 False 1\n",
+ "4 125497044 2017-08-16 1 103665 False 4\n",
+ "5 125497045 2017-08-16 1 105574 False 9\n",
+ "6 125497046 2017-08-16 1 105575 False 6\n",
+ "9 125497049 2017-08-16 1 105693 False 2"
+ ]
+ },
+ "execution_count": 174,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen_prediction.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 182,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/indexing.py:601: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Look the values up by id. The two frames carry unrelated row labels, so a plain\n",
+ "# column assignment aligned on the index and produced NaN / shifted values.\n",
+ "# assumes id is unique in test_seen_prediction - TODO confirm\n",
+ "seen_sales_by_id = test_seen_prediction.set_index('id')['prediction_sales']\n",
+ "test_predictions_subset = test_predictions_subset.assign(\n",
+ "    prediction_sales=test_predictions_subset['id'].map(seen_sales_by_id)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 183,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2 | \n",
+ " 125497043 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497044 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497045 | \n",
+ " 0 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 125497046 | \n",
+ " 0 | \n",
+ " 9.0 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 125497049 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "2 125497043 0 NaN\n",
+ "3 125497044 1 1.0\n",
+ "4 125497045 0 4.0\n",
+ "5 125497046 0 9.0\n",
+ "7 125497049 0 NaN"
+ ]
+ },
+ "execution_count": 183,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_predictions_subset.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 184,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cols_to_use=['id', 'prediction_sales']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 188,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# pd.concat stacks the two frames exactly as DataFrame.append did (row labels are kept);\n",
+ "# DataFrame.append itself was removed in pandas 2.0.\n",
+ "new_test_predictions = pd.concat([test_seen_prediction[cols_to_use], test_predictions_subset2[cols_to_use]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 189,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(3370464, 2)"
+ ]
+ },
+ "execution_count": 189,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "new_test_predictions .shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 190,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3 | \n",
+ " 125497043 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497044 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 125497045 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 125497046 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 125497049 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id prediction_sales\n",
+ "3 125497043 1\n",
+ "4 125497044 4\n",
+ "5 125497045 9\n",
+ "6 125497046 6\n",
+ "9 125497049 2"
+ ]
+ },
+ "execution_count": 190,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "new_test_predictions.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 195,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "submission = new_test_predictions.rename( columns = {'prediction_sales':'unit_sales'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 196,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " unit_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3 | \n",
+ " 125497043 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497044 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 125497045 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 125497046 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 125497049 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id unit_sales\n",
+ "3 125497043 1\n",
+ "4 125497044 4\n",
+ "5 125497045 9\n",
+ "6 125497046 6\n",
+ "9 125497049 2"
+ ]
+ },
+ "execution_count": 196,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "submission.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 197,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "submission.to_csv('empirical_29nov17.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'end_of_validation_period' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mend_of_validation_period\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDateOffset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdays\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m15\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m: name 'end_of_validation_period' is not defined"
+ ]
+ }
+ ],
+ "source": [
+ "cols_item_store = ['item_nbr', 'store_nbr']\n",
+ "cols_test_expanded = test_seen_expanded.columns\n",
+ "cols_prediction = ['id', 'unit_sales', 'perishable']\n",
+ "cols_to_use = cols_test_expanded.drop('unit_sales') if 'unit_sales' in cols_test_expanded else cols_test_expanded\n",
+ "test_join_train = pd.merge(test_seen_expanded[cols_to_use], train, on=cols_item_store, how='left')\n",
+ "is_data_seen = test_join_train_item_store_grouped['unit_sales'].notnull()\n",
+ "join_seen = test_join_train_item_store_grouped[is_data_seen.values]\n",
+ "# return join_seen[cols_prediction].rename(columns={'unit_sales':'prediction_sales'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_seen_expanded = get_item_expanded_df(validation_seen, items)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_empirical_baseline_prediction_test_seen(test_seen_expanded, train):\n",
+ " cols_item_store = ['item_nbr', 'store_nbr']\n",
+ " cols_test_expanded = test_seen_expanded.columns\n",
+ " cols_prediction = ['id', 'unit_sales', 'perishable']\n",
+ " train_item_store_grouped = train.\n",
+ " cols_to_use = cols_test_expanded.drop('unit_sales') if 'unit_sales' in cols_test_expanded else cols_test_expanded\n",
+ " test_join_train_item_store_grouped = pd.merge(test_seen_expanded[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n",
+ " is_data_seen = test_join_train_item_store_grouped['unit_sales'].notnull()\n",
+ " join_seen = test_join_train_item_store_grouped[is_data_seen.values]\n",
+ " return join_seen[cols_prediction].rename(columns={'unit_sales':'prediction_sales'})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Submission"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "baseline_predictions_test = get_baseline_predictions(test, train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497043 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497044 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497045 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 125497040 0 2\n",
+ "1 125497041 0 3\n",
+ "2 125497043 0 3\n",
+ "3 125497044 1 4\n",
+ "4 125497045 0 5"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_submission=baseline_predictions_test[['id', 'prediction_sales']].rename(columns={'prediction_sales':'unit_sales'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " unit_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497043 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497044 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497045 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id unit_sales\n",
+ "0 125497040 2\n",
+ "1 125497041 3\n",
+ "2 125497043 3\n",
+ "3 125497044 4\n",
+ "4 125497045 5"
+ ]
+ },
+ "execution_count": 75,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_submission.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 76,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.all(test_submission.unit_sales>=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "unit_sales int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_submission.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# test_submission.to_csv('baseline_submission_20171127.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation of each group"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_seen, validation_unseen_class_seen,validation_unseen_class_unseen = group_test_data(train_validation, train_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')"
+ ]
+ },
+ "execution_count": 80,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_seen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unseen_class_seen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_unseen_class_seen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unseen_class_unseen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_unseen_class_unseen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.406943132552255"
+ ]
+ },
+ "execution_count": 84,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.53563678093009"
+ ]
+ },
+ "execution_count": 85,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_baseline_predictions_validation.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "7.756378630680307"
+ ]
+ },
+ "execution_count": 86,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unseen_class_seen_baseline_predictions_validation.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "7.2"
+ ]
+ },
+ "execution_count": 87,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unseen_class_unseen_baseline_predictions_validation.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_seen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unseen_class_seen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_unseen_class_seen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unseen_class_unseen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_unseen_class_unseen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "7.350565382095759"
+ ]
+ },
+ "execution_count": 91,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_test.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "6.949336092844724"
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_baseline_predictions_test.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.855705681997609"
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unseen_class_seen_baseline_predictions_test.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "7.0"
+ ]
+ },
+ "execution_count": 94,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unseen_class_unseen_baseline_predictions_test.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The mean predicted sales differ between the validation and test sets."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.573084378706207"
+ ]
+ },
+ "execution_count": 95,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_train.unit_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.554865268437672"
+ ]
+ },
+ "execution_count": 96,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.unit_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Why do seen items have a lower mean prediction in the test data than in the validation data?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'perishable', 'prediction_sales'], dtype='object')"
+ ]
+ },
+ "execution_count": 97,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_baseline_predictions_test.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')"
+ ]
+ },
+ "execution_count": 98,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_seen_mean = test_seen.groupby(['item_nbr','store_nbr'])['unit_sales'].mean().reset_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.000000 1063\n",
+ "2.000000 387\n",
+ "1.500000 269\n",
+ "1.333333 211\n",
+ "3.000000 169\n",
+ "1.200000 142\n",
+ "1.666667 132\n",
+ "4.000000 110\n",
+ "1.250000 105\n",
+ "1.600000 90\n",
+ "1.400000 87\n",
+ "2.500000 83\n",
+ "1.166667 75\n",
+ "1.750000 74\n",
+ "2.333333 72\n",
+ "1.142857 70\n",
+ "1.800000 70\n",
+ "5.000000 69\n",
+ "1.714286 66\n",
+ "1.833333 65\n",
+ "1.428571 63\n",
+ "2.400000 61\n",
+ "2.666667 59\n",
+ "1.285714 58\n",
+ "1.571429 56\n",
+ "1.222222 55\n",
+ "6.000000 54\n",
+ "1.375000 53\n",
+ "3.333333 52\n",
+ "3.500000 52\n",
+ " ... \n",
+ "5.931522 1\n",
+ "5.983607 1\n",
+ "10.427236 1\n",
+ "6.328264 1\n",
+ "2.569665 1\n",
+ "4.409535 1\n",
+ "35.648855 1\n",
+ "4.074074 1\n",
+ "14.786531 1\n",
+ "1.416867 1\n",
+ "2.202128 1\n",
+ "89.903846 1\n",
+ "4.600505 1\n",
+ "2.375946 1\n",
+ "20.287293 1\n",
+ "4.557390 1\n",
+ "2.670120 1\n",
+ "2.405458 1\n",
+ "8.606928 1\n",
+ "2.559347 1\n",
+ "11.537549 1\n",
+ "4.129799 1\n",
+ "5.267884 1\n",
+ "12.886924 1\n",
+ "5.454833 1\n",
+ "15.486842 1\n",
+ "5.792963 1\n",
+ "4.558394 1\n",
+ "5.431433 1\n",
+ "3.750600 1\n",
+ "Name: unit_sales, Length: 139461, dtype: int64"
+ ]
+ },
+ "execution_count": 100,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen_mean.unit_sales.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(3370464, 5)"
+ ]
+ },
+ "execution_count": 101,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2003044, 6)"
+ ]
+ },
+ "execution_count": 102,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2659408, 6)"
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1672247, 6)"
+ ]
+ },
+ "execution_count": 104,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')"
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_seen_mean = validation_seen.groupby(['item_nbr', 'store_nbr'])['unit_sales'].mean().reset_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.000000 196\n",
+ "2.000000 183\n",
+ "1.500000 148\n",
+ "1.333333 109\n",
+ "2.500000 71\n",
+ "1.250000 70\n",
+ "4.000000 67\n",
+ "3.000000 66\n",
+ "1.750000 57\n",
+ "1.200000 47\n",
+ "1.666667 47\n",
+ "5.000000 43\n",
+ "1.428571 41\n",
+ "1.571429 40\n",
+ "1.375000 39\n",
+ "1.166667 37\n",
+ "1.222222 34\n",
+ "2.250000 34\n",
+ "6.000000 34\n",
+ "1.666667 31\n",
+ "1.125000 31\n",
+ "1.285714 30\n",
+ "2.750000 30\n",
+ "2.333333 30\n",
+ "1.142857 30\n",
+ "1.444444 29\n",
+ "3.500000 28\n",
+ "2.666667 27\n",
+ "1.300000 27\n",
+ "1.625000 26\n",
+ " ... \n",
+ "5.414894 1\n",
+ "5.689526 1\n",
+ "10.992721 1\n",
+ "4.405322 1\n",
+ "2.751479 1\n",
+ "3.373206 1\n",
+ "1.844920 1\n",
+ "3.732102 1\n",
+ "1.887872 1\n",
+ "3.881868 1\n",
+ "6.143824 1\n",
+ "4.322581 1\n",
+ "3.486506 1\n",
+ "4.022444 1\n",
+ "1.949593 1\n",
+ "3.965174 1\n",
+ "6.439222 1\n",
+ "9.427083 1\n",
+ "5.120801 1\n",
+ "1.376404 1\n",
+ "2.192547 1\n",
+ "11.182663 1\n",
+ "6.961364 1\n",
+ "7.380435 1\n",
+ "17.839545 1\n",
+ "1.691099 1\n",
+ "9.129005 1\n",
+ "1.983607 1\n",
+ "25.861189 1\n",
+ "9.211574 1\n",
+ "Name: unit_sales, Length: 132023, dtype: int64"
+ ]
+ },
+ "execution_count": 107,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen_mean.unit_sales.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(146060, 3)"
+ ]
+ },
+ "execution_count": 108,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen_mean.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "baseline_predictions_validation.to_csv('baseline_predictions_validation', index=False)\n",
+ "baseline_predictions_test.to_csv('baseline_predictions_test', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/.ipynb_checkpoints/validation_strategy_20171127-checkpoint.ipynb b/.ipynb_checkpoints/validation_strategy_20171127-checkpoint.ipynb
new file mode 100644
index 0000000..1e60e29
--- /dev/null
+++ b/.ipynb_checkpoints/validation_strategy_20171127-checkpoint.ipynb
@@ -0,0 +1,3325 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/vnd.plotly.v1+html": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline\n",
+ "import seaborn as sns\n",
+ "import plotly.offline as py\n",
+ "import plotly.graph_objs as go\n",
+ "py.init_notebook_mode()\n",
+ "import gc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seed = 46\n",
+ "np.random.seed(seed)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def load_data(data_path):\n",
+ "    \"\"\"Read train.csv, test.csv and items.csv from the directory `data_path`.\n",
+ "\n",
+ "    The 'date' column of train and test is parsed as datetime.\n",
+ "    Returns the tuple (train, test, items).\n",
+ "    \"\"\"\n",
+ "    csv_path = '{}/{}.csv'.format\n",
+ "    train = pd.read_csv(csv_path(data_path, 'train'), parse_dates=['date'])\n",
+ "    test = pd.read_csv(csv_path(data_path, 'test'), parse_dates=['date'])\n",
+ "    items = pd.read_csv(csv_path(data_path, 'items'))\n",
+ "    return train, test, items"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2821: DtypeWarning:\n",
+ "\n",
+ "Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "data_path = './data'\n",
+ "train, test, items = load_data(data_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def print_summary_train_test(train, test):\n",
+ "    \"\"\"Print the date range, span in days and row count of a train/test pair.\n",
+ "\n",
+ "    Both frames must have a datetime 'date' column. Every line about the\n",
+ "    second frame is labelled 'Test'/'test', whichever split is passed in.\n",
+ "    \"\"\"\n",
+ "    # Compute each min/max once; the frames can be very large.\n",
+ "    train_start, train_end = train['date'].min(), train['date'].max()\n",
+ "    test_start, test_end = test['date'].min(), test['date'].max()\n",
+ "    print('Train min/max date: %s / %s' % (train_start, train_end))\n",
+ "    print('Test min/max date: %s / %s' % (test_start, test_end))\n",
+ "    print('')\n",
+ "    # +1 because both end points of the range are counted.\n",
+ "    print('Number of days in train: %d' % ((train_end - train_start).days + 1))\n",
+ "    print('Number of days in test: %d' % ((test_end - test_start).days + 1))\n",
+ "    print('')\n",
+ "    print('Train shape: %d rows' % train.shape[0])\n",
+ "    print('Test shape: %d rows' % test.shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train min/max date: 2013-01-01 00:00:00 / 2017-08-15 00:00:00\n",
+ "Test min/max date: 2017-08-16 00:00:00 / 2017-08-31 00:00:00\n",
+ "\n",
+ "Number of days in train: 1688\n",
+ "Number of days in validation: 16\n",
+ "\n",
+ "Train shape: 125497040 rows\n",
+ "Test shape: 3370464 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "print_summary_train_test(train, test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Divide Train data into Validation(last two weeks of train data) and Training(the rest)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import splitter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_last_date = train['date'].max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2017-07-26 00:00:00 2017-08-10 00:00:00\n"
+ ]
+ }
+ ],
+ "source": [
+ "begin_of_validation, end_of_validation = splitter.get_validation_period(train_last_date)\n",
+ "print(begin_of_validation, end_of_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_train, train_validation = splitter.split_validation_train_by_validation_period(train, begin_of_validation, end_of_validation)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train min/max date: 2013-01-01 00:00:00 / 2017-07-25 00:00:00\n",
+ "Test min/max date: 2017-07-26 00:00:00 / 2017-08-10 00:00:00\n",
+ "\n",
+ "Number of days in train: 1667\n",
+ "Number of days in validation: 16\n",
+ "\n",
+ "Train shape: 123296175 rows\n",
+ "Test shape: 1679408 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "print_summary_train_test(train_train, train_validation)\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation Metric"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import evaluation\n",
+ "from sklearn.metrics import mean_squared_error"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How many items in Test data set are not seen in Train data set \n",
+ "## vs. how many items in Validation are not seen in Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_unseen_item_percentage(train, test):\n",
+ "    \"\"\"Report the share of distinct test items that never occur in train.\n",
+ "\n",
+ "    Prints the share as a percentage and returns it as a fraction in [0, 1].\n",
+ "    \"\"\"\n",
+ "    items_in_test = set(test['item_nbr'].unique())\n",
+ "    items_never_trained_on = items_in_test.difference(train['item_nbr'].unique())\n",
+ "    unseen_percentage = len(items_never_trained_on) / len(items_in_test)\n",
+ "    print(\"{:.2f}% of items in the test data set are not seen in the train data set\".format(unseen_percentage*100))\n",
+ "    return unseen_percentage"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1.54% of items in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.015380671622660855"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_percentage(train, test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.55% of items in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.005454545454545455"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_percentage(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How many (item, store) in Test data set are not seen in Train data set\n",
+ "## vs. how many (item, store) in Validation are not seen in Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_unseen_item_store_pair_percentage(train, test):\n",
+ "    \"\"\"Report the share of test rows whose (item, store) pair is absent from train.\n",
+ "\n",
+ "    The ratio is taken over test rows, not over distinct pairs. It is printed\n",
+ "    as a percentage and returned as a fraction in [0, 1].\n",
+ "    \"\"\"\n",
+ "    pair_keys = ['item_nbr', 'store_nbr']\n",
+ "    train_pair_means = train.groupby(pair_keys)['unit_sales'].mean().reset_index()\n",
+ "    # Leave out the test target (if any) so 'unit_sales' after the join is the train aggregate.\n",
+ "    feature_cols = [col for col in test.columns if col != 'unit_sales']\n",
+ "    joined = test[feature_cols].merge(train_pair_means, on=pair_keys, how='left')\n",
+ "    # A null aggregate after the left join marks a row with no match in train.\n",
+ "    unseen_row_count = int(joined['unit_sales'].isnull().sum())\n",
+ "    unseen_percentage = unseen_row_count / test.shape[0]\n",
+ "    print(\"{:.2f}% of (item,store) pairs in the test data set are not seen in the train data set\".format(unseen_percentage*100))\n",
+ "    return unseen_percentage"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "21.10% of (item,store) pairs in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.21096679863662687"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_store_pair_percentage(train, test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.13% of (item,store) pairs in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.0013326124443851642"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_store_pair_percentage(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using constant prediction"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.563926854265649"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_train.unit_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.554865268437672"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.unit_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 6)"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_validation_prediction = pd.DataFrame({'id': train_validation.loc[:, 'id'], 'prediction_sales': train_train.unit_sales.mean()})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 2)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_prediction.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_prediction = pd.DataFrame({'id': test.loc[:, 'id'], 'prediction_sales': train.unit_sales.mean()})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 123296175 | \n",
+ " 123296175 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ " 123296176 | \n",
+ " 123296176 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ " 123296177 | \n",
+ " 123296177 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ " 123296178 | \n",
+ " 123296178 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ " 123296179 | \n",
+ " 123296179 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id prediction_sales\n",
+ "123296175 123296175 8.563927\n",
+ "123296176 123296176 8.563927\n",
+ "123296177 123296177 8.563927\n",
+ "123296178 123296178 8.563927\n",
+ "123296179 123296179 8.563927"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_prediction.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497042 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497043 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497044 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id prediction_sales\n",
+ "0 125497040 8.554865\n",
+ "1 125497041 8.554865\n",
+ "2 125497042 8.554865\n",
+ "3 125497043 8.554865\n",
+ "4 125497044 8.554865"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_prediction.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_prediction_constant(train_train):\n",
+ "    \"\"\"Return the mean of the 'unit_sales' column of `train_train`.\"\"\"\n",
+ "    sales = train_train['unit_sales']\n",
+ "    return sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_clean_prediction(train_train, train_validation):\n",
+ "    \"\"\"Build a constant prediction for every row of `train_validation`.\n",
+ "\n",
+ "    Returns a DataFrame with the validation 'id' column and a\n",
+ "    'prediction_sales' column holding the constant from `train_train`.\n",
+ "    \"\"\"\n",
+ "    constant = get_prediction_constant(train_train)\n",
+ "    return train_validation[['id']].assign(prediction_sales=constant)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_item_expanded_df(test, items):\n",
+ "    \"\"\"Left-join the columns of `items` onto `test` by 'item_nbr'.\"\"\"\n",
+ "    return test.merge(items, how='left', on='item_nbr')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_evaluation_using_constant_baseline(train_train, train_validation):\n",
+ "    \"\"\"Score the constant baseline on `train_validation` with `nwrmsle`.\n",
+ "\n",
+ "    The constant comes from `train_train`. Row weights are derived from the\n",
+ "    'perishable' flag, looked up in the notebook-level `items` frame.\n",
+ "    \"\"\"\n",
+ "    predicted_sales = get_clean_prediction(train_train, train_validation)['prediction_sales']\n",
+ "    # NOTE: `items` is not a parameter; it must already exist in the notebook namespace.\n",
+ "    perishable_flag = get_item_expanded_df(train_validation, items)['perishable']\n",
+ "    row_weights = perishable_flag * 0.25 + 1\n",
+ "    actual_sales = train_validation['unit_sales']\n",
+ "    return nwrmsle(predicted_sales, actual_sales, row_weights)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_validation_expanded = get_item_expanded_df(train_validation, items)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_validation_weights = train_validation_expanded ['perishable']*0.25+1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 1.00\n",
+ "1 1.25\n",
+ "2 1.00\n",
+ "3 1.00\n",
+ "4 1.00\n",
+ "Name: perishable, dtype: float64"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_weights.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def nwrmsle(predictions, targets, weights):\n",
+ "    \"\"\"Normalized weighted root mean squared logarithmic error.\n",
+ "\n",
+ "    Args:\n",
+ "        predictions: predicted unit sales (Series or array-like).\n",
+ "        targets: actual unit sales; negative values are scored as 0.\n",
+ "        weights: final per-row weights, used exactly as given.\n",
+ "\n",
+ "    The three inputs are matched by position, not by index label.\n",
+ "\n",
+ "    Returns:\n",
+ "        sqrt(sum(w * (log(pred + 1) - log(target + 1))**2) / sum(w)).\n",
+ "\n",
+ "    Raises:\n",
+ "        ValueError: if the inputs do not have the same shape.\n",
+ "    \"\"\"\n",
+ "    predicted = np.asarray(predictions, dtype=float)\n",
+ "    # np.clip returns a new array, so the caller's targets are left untouched\n",
+ "    # (the previous in-place assignment modified the caller's data).\n",
+ "    actual = np.clip(np.asarray(targets, dtype=float), 0, None)\n",
+ "    # Weights are not transformed again here; re-applying 1 + 0.25 * w to\n",
+ "    # already-computed weights would distort the perishable weighting.\n",
+ "    row_weights = np.asarray(weights, dtype=float)\n",
+ "    if not (predicted.shape == actual.shape == row_weights.shape):\n",
+ "        raise ValueError('predictions, targets and weights must have the same shape')\n",
+ "    log_square_errors = (np.log1p(predicted) - np.log1p(actual)) ** 2\n",
+ "    return np.sqrt(np.sum(row_weights * log_square_errors) / np.sum(row_weights))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408,)"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_prediction['prediction_sales'].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 2)"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_prediction.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408,)"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation['unit_sales'].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 6)"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1679408,)\n",
+ "(1679408,) (1679408,) (1679408,)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py:5088: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_validation_metric = nwrmsle(train_validation_prediction['prediction_sales'], train_validation['unit_sales'], train_validation_weights)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.0371859208825527"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_metric"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "On the leaderboard: 1.710"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How to improve the similarity between validation performance and test performance?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Strategy 1: Remove Items from Training data set"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " onpromotion | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 96995 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 99197 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497042 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103501 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497043 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497044 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr onpromotion\n",
+ "0 125497040 2017-08-16 1 96995 False\n",
+ "1 125497041 2017-08-16 1 99197 False\n",
+ "2 125497042 2017-08-16 1 103501 False\n",
+ "3 125497043 2017-08-16 1 103520 False\n",
+ "4 125497044 2017-08-16 1 103665 False"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(3901,)"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.item_nbr.unique().shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.007690335811330428"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "30/3901"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def move_items_from_train_to_validation(train, validation, items_to_remove):\n",
+ " train2 = train[~train.item_nbr.isin(items_to_remove)]\n",
+ " validation_to_add = train[train.item_nbr.isin(items_to_remove)]\n",
+ " validation2 = validation.append(validation_to_add)\n",
+ " return train2, validation2\n",
+ "\n",
+ "\n",
+ "def move_random_items_from_train_to_validation(train, validation, num_items_to_remove):\n",
+ " train_items = train['item_nbr'].unique()\n",
+ " items_to_remove = np.random.choice(train_items, num_items_to_remove)\n",
+ " train2, validation2 = move_items_from_train_to_validation(train, validation, items_to_remove)\n",
+ " print(\"Moved {} items from train data to test data\".format(num_items_to_remove))\n",
+ " print(\"train data: {} -> {} rows\".format(train.shape[0], train2.shape[0]))\n",
+ " print(\"validation data: {} -> {} rows\".format(validation.shape[0], validation2.shape[0]))\n",
+ " return train2, validation2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Moved 30 items from train data to test data\n",
+ "train data: 123296175 -> 122487773 rows\n",
+ "validation data: 1679408 -> 2487810 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_items_to_move = 30\n",
+ "train_train2, train_validation2 = move_random_items_from_train_to_validation(train_train, train_validation, num_items_to_move)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def move_random_items_from_train_to_validation(train, validation, num_items_to_remove):\n",
+ "    \"\"\"Move randomly chosen distinct items from train to validation.\n",
+ "\n",
+ "    Args:\n",
+ "        train: DataFrame with an 'item_nbr' column.\n",
+ "        validation: DataFrame that receives the moved rows.\n",
+ "        num_items_to_remove: number of distinct items to move; capped at the\n",
+ "            number of distinct items present in train.\n",
+ "\n",
+ "    Returns:\n",
+ "        (train2, validation2) as returned by move_items_from_train_to_validation.\n",
+ "    \"\"\"\n",
+ "    train_items = train['item_nbr'].unique()\n",
+ "    num_items = min(num_items_to_remove, len(train_items))\n",
+ "    # Sample without replacement: the default (replace=True) can draw the same\n",
+ "    # item twice and silently move fewer distinct items than requested.\n",
+ "    items_to_remove = np.random.choice(train_items, num_items, replace=False)\n",
+ "    train2, validation2 = move_items_from_train_to_validation(train, validation, items_to_remove)\n",
+ "    print(\"Moved {} items from train data to validation data\".format(len(items_to_remove)))\n",
+ "    print(\"train data: {} -> {} rows\".format(train.shape[0], train2.shape[0]))\n",
+ "    print(\"validation data: {} -> {} rows\".format(validation.shape[0], validation2.shape[0]))\n",
+ "    return train2, validation2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Moved 10 items from train data to test data\n",
+ "train data: 123296175 -> 122972539 rows\n",
+ "validation data: 1679408 -> 2003044 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_items_to_remove = 10\n",
+ "train_train2, train_validation2 = move_random_items_from_train_to_validation(train_train, train_validation, num_items_to_remove)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Clear memory for previous train_train and train_validation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "288"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_train = train_train2\n",
+ "train_validation = train_validation2\n",
+ "del train_train2\n",
+ "del train_validation2\n",
+ "gc.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.80% of items in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.00804985717995326"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_percentage(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "16.51% of (item,store) pairs in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.16514714604372147"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_store_pair_percentage(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now many more items in the validation data are unseen in the training data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(2003044,)\n",
+ "(2003044,) (2003044,) (2003044,)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_validation_metric = get_evaluation_using_constant_baseline(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.033043851782858"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_metric"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "On the leaderboard: 1.710"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Removing items hardly changed the validation score, which implies that we are doing a worse job of predicting seen items"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analyze the performance on each group: seen (item, store), seen class, unseen class"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Group test data into the three groups"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def group_test_data(test, train):\n",
+ "    \"\"\"Split test rows into three groups by what the training data has seen.\n",
+ "\n",
+ "    Args:\n",
+ "        test: DataFrame with 'item_nbr' and 'store_nbr' columns; an existing\n",
+ "            'unit_sales' column is ignored.\n",
+ "        train: DataFrame with 'item_nbr', 'store_nbr' and 'unit_sales' columns.\n",
+ "\n",
+ "    Returns:\n",
+ "        (test_seen, test_unseen_class_seen, test_unseen_class_unseen):\n",
+ "        rows whose (item, store) pair occurs in train; rows with an unseen pair\n",
+ "        whose class occurs in train; rows with an unseen pair whose class does\n",
+ "        not. The 'unit_sales' column of every group comes from the training\n",
+ "        mean of the pair (NaN for unseen pairs), not from test.\n",
+ "    \"\"\"\n",
+ "    key_cols = ['item_nbr', 'store_nbr']\n",
+ "    pair_means = train.groupby(key_cols)['unit_sales'].mean().reset_index()\n",
+ "    feature_cols = test.columns\n",
+ "    if 'unit_sales' in feature_cols:\n",
+ "        feature_cols = feature_cols.drop('unit_sales')\n",
+ "    joined = pd.merge(test[feature_cols], pair_means, on=key_cols, how='left')\n",
+ "    # A pair is seen exactly when the left join found a training mean for it.\n",
+ "    has_train_mean = joined['unit_sales'].notnull()\n",
+ "    test_seen = joined[has_train_mean]\n",
+ "    # get_item_expanded_df and items come from earlier cells; the result is\n",
+ "    # expected to carry a 'class' column (presumably joined from items).\n",
+ "    unseen_expanded = get_item_expanded_df(joined[~has_train_mean], items)\n",
+ "    unseen_classes = set(unseen_expanded['class'].unique())\n",
+ "    train_classes = set(get_item_expanded_df(train, items)['class'].unique())\n",
+ "    classes_missing_from_train = unseen_classes - train_classes\n",
+ "    classes_in_train = unseen_classes - classes_missing_from_train\n",
+ "    in_seen_class = unseen_expanded['class'].isin(classes_in_train)\n",
+ "    in_unseen_class = unseen_expanded['class'].isin(classes_missing_from_train)\n",
+ "    return test_seen, unseen_expanded[in_seen_class], unseen_expanded[in_unseen_class]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_seen, test_unseen_class_seen,test_unseen_class_unseen = group_test_data(test, train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "True\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(test.shape[0]== test_seen.shape[0]+ test_unseen_class_seen.shape[0]+test_unseen_class_unseen.shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_baseline_prediction_test_seen(test_seen_expanded, train):\n",
+ " cols_item_store = ['item_nbr', 'store_nbr']\n",
+ " cols_test_expanded = test_seen_expanded.columns\n",
+ " cols_prediction = ['id', 'unit_sales', 'perishable']\n",
+ " train_item_store_grouped = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()\n",
+ " cols_to_use = cols_test_expanded.drop('unit_sales') if 'unit_sales' in cols_test_expanded else cols_test_expanded\n",
+ " test_join_train_item_store_grouped = pd.merge(test_seen_expanded[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n",
+ " is_data_seen = test_join_train_item_store_grouped['unit_sales'].notnull()\n",
+ " join_seen = test_join_train_item_store_grouped[is_data_seen.values]\n",
+ " return join_seen[cols_prediction].rename(columns={'unit_sales':'prediction_sales'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_baseline_prediction_test_unseen_class_seen(test_unseen_class_seen, train_expanded):\n",
+ " test_unseen_class_seen_class = test_unseen_class_seen['class'].unique()\n",
+ " train_sub = train_expanded[train_expanded['class'].isin(test_unseen_class_seen_class)]\n",
+ " train_sub_class_grouped = train_sub.groupby('class').mean().reset_index()\n",
+ " train_sub_class_grouped = train_sub_class_grouped[['class', 'unit_sales']]\n",
+ " test_unseen_class_seen_join_train_sub_class_grouped = pd.merge(test_unseen_class_seen, train_sub_class_grouped, on='class', how='left')\n",
+ " return test_unseen_class_seen_join_train_sub_class_grouped.drop('unit_sales_x', axis=1)\\\n",
+ " .rename(columns={'unit_sales_y':'prediction_sales'})\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_baseline_prediction_test_unseen_class_unseen(test_unseen_class_unseen, train_expanded):\n",
+ " test_unseen_class_unseen_family = test_unseen_class_unseen['family'].unique()\n",
+ " train_sub = train_expanded[train_expanded['family'].isin(test_unseen_class_unseen_family)]\n",
+ " train_sub_family_grouped = train_sub.groupby('family').mean().reset_index()\n",
+ " train_sub_family_grouped = train_sub_family_grouped[['family', 'unit_sales']]\n",
+ " test_unseen_class_unseen_join_train_sub_family_grouped = pd.merge(test_unseen_class_unseen, train_sub_family_grouped, on='family', how='left')\n",
+ " return test_unseen_class_unseen_join_train_sub_family_grouped.drop('unit_sales_x', axis=1)\\\n",
+ " .rename(columns={'unit_sales_y':'prediction_sales'})\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean_predictions(predictions):\n",
+ " predictions[predictions<0]=0\n",
+ " return predictions.round().astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean_targets(targets):\n",
+ " targets[targets<0]=0\n",
+ " return targets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_baseline_predictions(test, train):\n",
+ "    \"\"\"Build baseline predictions for every test row.\n",
+ "\n",
+ "    Rows are split with group_test_data and each group is predicted by its own\n",
+ "    helper: seen (item, store) pairs, unseen pairs of a seen class, and unseen\n",
+ "    pairs of an unseen class. Relies on get_item_expanded_df and items from\n",
+ "    earlier cells.\n",
+ "\n",
+ "    Args:\n",
+ "        test: DataFrame of rows to predict.\n",
+ "        train: DataFrame of training rows.\n",
+ "\n",
+ "    Returns:\n",
+ "        DataFrame with columns 'id', 'perishable', 'prediction_sales', where\n",
+ "        'prediction_sales' has been passed through clean_predictions.\n",
+ "    \"\"\"\n",
+ "    cols_to_use = ['id', 'perishable', 'prediction_sales']\n",
+ "    test_seen, test_unseen_class_seen, test_unseen_class_unseen = group_test_data(test, train)\n",
+ "    train_expanded = get_item_expanded_df(train, items)\n",
+ "    test_seen_expanded = get_item_expanded_df(test_seen, items)\n",
+ "    prediction_parts = [\n",
+ "        get_baseline_prediction_test_seen(test_seen_expanded, train)[cols_to_use],\n",
+ "        get_baseline_prediction_test_unseen_class_seen(test_unseen_class_seen, train_expanded)[cols_to_use],\n",
+ "        get_baseline_prediction_test_unseen_class_unseen(test_unseen_class_unseen, train_expanded)[cols_to_use],\n",
+ "    ]\n",
+ "    # pd.concat replaces the chained DataFrame.append, removed in pandas 2.0.\n",
+ "    baseline_predictions = pd.concat(prediction_parts)\n",
+ "    cleaned_predictions = clean_predictions(baseline_predictions['prediction_sales'])\n",
+ "    # Replace the whole column positionally: the concatenated index repeats\n",
+ "    # labels, and .loc[:, col] = ... may keep the old float dtype on newer pandas.\n",
+ "    baseline_predictions['prediction_sales'] = cleaned_predictions.values\n",
+ "    return baseline_predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# prediction_test_seen, prediction_test_unseen_class_seen, prediction_test_unseen_class_unseen=get_baseline_predictions(test, train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation = get_baseline_predictions(train_validation, train_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 123296175 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 123296176 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 123296177 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 123296178 | \n",
+ " 0 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 123296179 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 123296175 0 3\n",
+ "1 123296176 1 4\n",
+ "2 123296177 0 5\n",
+ "3 123296178 0 10\n",
+ "4 123296179 0 1"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "targets_validation = pd.merge(baseline_predictions_validation, train_validation, on='id', how='left')['unit_sales']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_evaluation(baseline_predictions_validation, targets_validation):\n",
+ "    \"\"\"Score baseline predictions against targets with NWRMSLE.\n",
+ "\n",
+ "    Args:\n",
+ "        baseline_predictions_validation: DataFrame with 'prediction_sales' and\n",
+ "            a 0/1 'perishable' column.\n",
+ "        targets_validation: true unit_sales in the same row order; negative\n",
+ "            values are clamped by clean_targets before scoring.\n",
+ "\n",
+ "    Returns:\n",
+ "        The value returned by nwrmsle for these predictions, targets and weights.\n",
+ "    \"\"\"\n",
+ "    predictions = baseline_predictions_validation.prediction_sales\n",
+ "    cleaned_targets = clean_targets(targets_validation)\n",
+ "    # The competition metric weights perishable items 1.25 and all others 1.00.\n",
+ "    # Passing the raw 0/1 flag would give non-perishable rows zero weight and\n",
+ "    # score only the perishable ones.\n",
+ "    weights = 1 + 0.25 * baseline_predictions_validation.perishable\n",
+ "    return nwrmsle(predictions, cleaned_targets, weights)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(2003044,)\n",
+ "(2003044,) (2003044,) (2003044,)\n"
+ ]
+ }
+ ],
+ "source": [
+ "validation_metric = get_evaluation(baseline_predictions_validation, targets_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.67828104376604859"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_metric"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Validation NWRMSLE: 0.67828\n",
+ "Validation MSE: 353.981\n"
+ ]
+ }
+ ],
+ "source": [
+ "mse = mean_squared_error(baseline_predictions_validation['prediction_sales'], targets_validation)\n",
+ "print('Validation NWRMSLE: %.5f' % (validation_metric))\n",
+ "print('Validation MSE: %.3f' % (mse))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "On the leaderboard: 1.369"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.67828104376604859"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_metric"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Submission"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "baseline_predictions_test = get_baseline_predictions(test, train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497043 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497044 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497045 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 125497040 0 2\n",
+ "1 125497041 0 3\n",
+ "2 125497043 0 3\n",
+ "3 125497044 1 4\n",
+ "4 125497045 0 5"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_submission=baseline_predictions_test[['id', 'prediction_sales']].rename(columns={'prediction_sales':'unit_sales'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " unit_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497043 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497044 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497045 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id unit_sales\n",
+ "0 125497040 2\n",
+ "1 125497041 3\n",
+ "2 125497043 3\n",
+ "3 125497044 4\n",
+ "4 125497045 5"
+ ]
+ },
+ "execution_count": 75,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_submission.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 76,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.all(test_submission.unit_sales>=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "unit_sales int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_submission.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# test_submission.to_csv('baseline_submission_20171127.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation of each group"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_seen, validation_unseen_class_seen,validation_unseen_class_unseen = group_test_data(train_validation, train_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')"
+ ]
+ },
+ "execution_count": 80,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_seen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unseen_class_seen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_unseen_class_seen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unseen_class_unseen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_unseen_class_unseen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.406943132552255"
+ ]
+ },
+ "execution_count": 84,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.53563678093009"
+ ]
+ },
+ "execution_count": 85,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_baseline_predictions_validation.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "7.756378630680307"
+ ]
+ },
+ "execution_count": 86,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unseen_class_seen_baseline_predictions_validation.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "7.2"
+ ]
+ },
+ "execution_count": 87,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unseen_class_unseen_baseline_predictions_validation.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_seen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unseen_class_seen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_unseen_class_seen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unseen_class_unseen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_unseen_class_unseen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "7.350565382095759"
+ ]
+ },
+ "execution_count": 91,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_test.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "6.949336092844724"
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_baseline_predictions_test.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.855705681997609"
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unseen_class_seen_baseline_predictions_test.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "7.0"
+ ]
+ },
+ "execution_count": 94,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unseen_class_unseen_baseline_predictions_test.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Validation and test look different in the means of their predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.573084378706207"
+ ]
+ },
+ "execution_count": 95,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_train.unit_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.554865268437672"
+ ]
+ },
+ "execution_count": 96,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.unit_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Why do seen items have a low mean in the test data?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'perishable', 'prediction_sales'], dtype='object')"
+ ]
+ },
+ "execution_count": 97,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_baseline_predictions_test.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')"
+ ]
+ },
+ "execution_count": 98,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_seen_mean = test_seen.groupby(['item_nbr','store_nbr'])['unit_sales'].mean().reset_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.000000 1063\n",
+ "2.000000 387\n",
+ "1.500000 269\n",
+ "1.333333 211\n",
+ "3.000000 169\n",
+ "1.200000 142\n",
+ "1.666667 132\n",
+ "4.000000 110\n",
+ "1.250000 105\n",
+ "1.600000 90\n",
+ "1.400000 87\n",
+ "2.500000 83\n",
+ "1.166667 75\n",
+ "1.750000 74\n",
+ "2.333333 72\n",
+ "1.142857 70\n",
+ "1.800000 70\n",
+ "5.000000 69\n",
+ "1.714286 66\n",
+ "1.833333 65\n",
+ "1.428571 63\n",
+ "2.400000 61\n",
+ "2.666667 59\n",
+ "1.285714 58\n",
+ "1.571429 56\n",
+ "1.222222 55\n",
+ "6.000000 54\n",
+ "1.375000 53\n",
+ "3.333333 52\n",
+ "3.500000 52\n",
+ " ... \n",
+ "5.931522 1\n",
+ "5.983607 1\n",
+ "10.427236 1\n",
+ "6.328264 1\n",
+ "2.569665 1\n",
+ "4.409535 1\n",
+ "35.648855 1\n",
+ "4.074074 1\n",
+ "14.786531 1\n",
+ "1.416867 1\n",
+ "2.202128 1\n",
+ "89.903846 1\n",
+ "4.600505 1\n",
+ "2.375946 1\n",
+ "20.287293 1\n",
+ "4.557390 1\n",
+ "2.670120 1\n",
+ "2.405458 1\n",
+ "8.606928 1\n",
+ "2.559347 1\n",
+ "11.537549 1\n",
+ "4.129799 1\n",
+ "5.267884 1\n",
+ "12.886924 1\n",
+ "5.454833 1\n",
+ "15.486842 1\n",
+ "5.792963 1\n",
+ "4.558394 1\n",
+ "5.431433 1\n",
+ "3.750600 1\n",
+ "Name: unit_sales, Length: 139461, dtype: int64"
+ ]
+ },
+ "execution_count": 100,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen_mean.unit_sales.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(3370464, 5)"
+ ]
+ },
+ "execution_count": 101,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2003044, 6)"
+ ]
+ },
+ "execution_count": 102,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2659408, 6)"
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1672247, 6)"
+ ]
+ },
+ "execution_count": 104,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')"
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_seen_mean = validation_seen.groupby(['item_nbr', 'store_nbr'])['unit_sales'].mean().reset_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.000000 196\n",
+ "2.000000 183\n",
+ "1.500000 148\n",
+ "1.333333 109\n",
+ "2.500000 71\n",
+ "1.250000 70\n",
+ "4.000000 67\n",
+ "3.000000 66\n",
+ "1.750000 57\n",
+ "1.200000 47\n",
+ "1.666667 47\n",
+ "5.000000 43\n",
+ "1.428571 41\n",
+ "1.571429 40\n",
+ "1.375000 39\n",
+ "1.166667 37\n",
+ "1.222222 34\n",
+ "2.250000 34\n",
+ "6.000000 34\n",
+ "1.666667 31\n",
+ "1.125000 31\n",
+ "1.285714 30\n",
+ "2.750000 30\n",
+ "2.333333 30\n",
+ "1.142857 30\n",
+ "1.444444 29\n",
+ "3.500000 28\n",
+ "2.666667 27\n",
+ "1.300000 27\n",
+ "1.625000 26\n",
+ " ... \n",
+ "5.414894 1\n",
+ "5.689526 1\n",
+ "10.992721 1\n",
+ "4.405322 1\n",
+ "2.751479 1\n",
+ "3.373206 1\n",
+ "1.844920 1\n",
+ "3.732102 1\n",
+ "1.887872 1\n",
+ "3.881868 1\n",
+ "6.143824 1\n",
+ "4.322581 1\n",
+ "3.486506 1\n",
+ "4.022444 1\n",
+ "1.949593 1\n",
+ "3.965174 1\n",
+ "6.439222 1\n",
+ "9.427083 1\n",
+ "5.120801 1\n",
+ "1.376404 1\n",
+ "2.192547 1\n",
+ "11.182663 1\n",
+ "6.961364 1\n",
+ "7.380435 1\n",
+ "17.839545 1\n",
+ "1.691099 1\n",
+ "9.129005 1\n",
+ "1.983607 1\n",
+ "25.861189 1\n",
+ "9.211574 1\n",
+ "Name: unit_sales, Length: 132023, dtype: int64"
+ ]
+ },
+ "execution_count": 107,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen_mean.unit_sales.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(146060, 3)"
+ ]
+ },
+ "execution_count": 108,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen_mean.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "baseline_predictions_validation.to_csv('baseline_predictions_validation', index=False)\n",
+ "baseline_predictions_test.to_csv('baseline_predictions_test', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 123296175 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 123296176 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 123296177 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 123296178 | \n",
+ " 0 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 123296179 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 123296175 0 3\n",
+ "1 123296176 1 4\n",
+ "2 123296177 0 5\n",
+ "3 123296178 0 10\n",
+ "4 123296179 0 1"
+ ]
+ },
+ "execution_count": 110,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_baseline_predictions_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " unit_sales | \n",
+ " onpromotion | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 123296175 | \n",
+ " 123296175 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 123296176 | \n",
+ " 123296176 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " 4.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 123296177 | \n",
+ " 123296177 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 123296178 | \n",
+ " 123296178 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " 6.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 123296179 | \n",
+ " 123296179 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105693 | \n",
+ " 2.0 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr unit_sales onpromotion\n",
+ "123296175 123296175 2017-07-26 1 103520 1.0 False\n",
+ "123296176 123296176 2017-07-26 1 103665 4.0 False\n",
+ "123296177 123296177 2017-07-26 1 105574 9.0 False\n",
+ "123296178 123296178 2017-07-26 1 105575 6.0 False\n",
+ "123296179 123296179 2017-07-26 1 105693 2.0 True"
+ ]
+ },
+ "execution_count": 112,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seen_empirical = pd.read_csv('seen_empirical', header=None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 123296175 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 123296176 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 123296177 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 123296178 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 123296181 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0\n",
+ "0 123296175\n",
+ "1 123296176\n",
+ "2 123296177\n",
+ "3 123296178\n",
+ "4 123296181"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_empirical.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "baseline_predictions_validation = pd.read_csv('baseline_predictions_validation')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 123296175 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 123296176 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 123296177 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 123296178 | \n",
+ " 0 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 123296179 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 123296175 0 3\n",
+ "1 123296176 1 4\n",
+ "2 123296177 0 5\n",
+ "3 123296178 0 10\n",
+ "4 123296179 0 1"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only the baseline predictions whose id is listed in column 0 of seen_empirical.\n",
+ "baseline_predictions_validation_subset = baseline_predictions_validation[baseline_predictions_validation['id'].isin(seen_empirical[0])]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1343027, 3)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation_subset.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Keep only the train rows whose id is listed in column 0 of seen_empirical.\n",
+ "train_subset = train[train['id'].isin(seen_empirical[0])]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py:5088: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1343027,)\n",
+ "(1343027,) (1343027,) (1343027,)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "1.1433622314160428"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_evaluation(baseline_predictions_validation_subset, train_subset['unit_sales'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "baseline_predictions_test = pd.read_csv('baseline_predictions_test')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497043 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497044 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497045 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 125497040 0 2\n",
+ "1 125497041 0 3\n",
+ "2 125497043 0 3\n",
+ "3 125497044 1 4\n",
+ "4 125497045 0 5"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebook b/notebook
new file mode 100644
index 0000000..1e60e29
--- /dev/null
+++ b/notebook
@@ -0,0 +1,3325 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/vnd.plotly.v1+html": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline\n",
+ "import seaborn as sns\n",
+ "import plotly.offline as py\n",
+ "import plotly.graph_objs as go\n",
+ "py.init_notebook_mode()\n",
+ "import gc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seed = 46\n",
+ "np.random.seed(seed)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def load_data(data_path):\n",
+ " train = pd.read_csv('%s/train.csv' % data_path, parse_dates=['date'])\n",
+ " test = pd.read_csv('%s/test.csv' % data_path, parse_dates=['date'])\n",
+ " items = pd.read_csv('%s/items.csv' % data_path)\n",
+ " return train, test, items"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2821: DtypeWarning:\n",
+ "\n",
+ "Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "data_path = './data'\n",
+ "train, test, items = load_data(data_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def print_summary_train_test(train, test):\n",
+ "    \"\"\"Print the date span, inclusive day count and row count of two frames.\n",
+ "\n",
+ "    Both frames need a ``date`` column whose max - min difference exposes ``.days``.\n",
+ "    The second frame is labelled 'Test' in the range and shape lines and\n",
+ "    'validation' in the day-count line.\n",
+ "    \"\"\"\n",
+ "    train_first, train_last = train['date'].min(), train['date'].max()\n",
+ "    test_first, test_last = test['date'].min(), test['date'].max()\n",
+ "    # Both end dates count as a day, hence the + 1.\n",
+ "    train_days = (train_last - train_first).days + 1\n",
+ "    test_days = (test_last - test_first).days + 1\n",
+ "\n",
+ "    print('Train min/max date: %s / %s' % (train_first, train_last))\n",
+ "    print('Test min/max date: %s / %s' % (test_first, test_last))\n",
+ "    print('')\n",
+ "    print('Number of days in train: %d' % train_days)\n",
+ "    print('Number of days in validation: %d' % test_days)\n",
+ "    print('')\n",
+ "    print('Train shape: %d rows' % train.shape[0])\n",
+ "    print('Test shape: %d rows' % test.shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train min/max date: 2013-01-01 00:00:00 / 2017-08-15 00:00:00\n",
+ "Test min/max date: 2017-08-16 00:00:00 / 2017-08-31 00:00:00\n",
+ "\n",
+ "Number of days in train: 1688\n",
+ "Number of days in validation: 16\n",
+ "\n",
+ "Train shape: 125497040 rows\n",
+ "Test shape: 3370464 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "print_summary_train_test(train, test)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Divide Train data into Validation (a 16-day window near the end of the train data) and Training (everything before it)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import splitter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_last_date = train['date'].max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2017-07-26 00:00:00 2017-08-10 00:00:00\n"
+ ]
+ }
+ ],
+ "source": [
+ "begin_of_validation, end_of_validation = splitter.get_validation_period(train_last_date)\n",
+ "print(begin_of_validation, end_of_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_train, train_validation = splitter.split_validation_train_by_validation_period(train, begin_of_validation, end_of_validation)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Train min/max date: 2013-01-01 00:00:00 / 2017-07-25 00:00:00\n",
+ "Test min/max date: 2017-07-26 00:00:00 / 2017-08-10 00:00:00\n",
+ "\n",
+ "Number of days in train: 1667\n",
+ "Number of days in validation: 16\n",
+ "\n",
+ "Train shape: 123296175 rows\n",
+ "Test shape: 1679408 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "print_summary_train_test(train_train, train_validation)\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation Metric"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import evaluation\n",
+ "from sklearn.metrics import mean_squared_error"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How many items in Test data set are not seen in Train data set \n",
+ "## vs. how many items in Validation are not seen in Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_unseen_item_percentage(train, test):\n",
+ "    \"\"\"Return the fraction of distinct test items that never occur in train.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    train, test : DataFrame\n",
+ "        Frames with an ``item_nbr`` column.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    float\n",
+ "        Unseen distinct items / distinct test items, in [0, 1]. Returns 0.0\n",
+ "        when ``test`` has no items instead of raising ZeroDivisionError.\n",
+ "\n",
+ "    The fraction is also printed as a percentage.\n",
+ "    \"\"\"\n",
+ "    test_items = set(test['item_nbr'].unique())\n",
+ "    train_items = set(train['item_nbr'].unique())\n",
+ "    test_items_unseen_in_train = test_items - train_items\n",
+ "    # Guard the empty case: there is nothing unseen when there is nothing to predict.\n",
+ "    unseen_percentage = len(test_items_unseen_in_train) / len(test_items) if test_items else 0.0\n",
+ "    print(\"{:.2f}% of items in the test data set are not seen in the train data set\".format(unseen_percentage * 100))\n",
+ "    return unseen_percentage"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1.54% of items in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.015380671622660855"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_percentage(train, test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.55% of items in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.005454545454545455"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_percentage(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How many (item, store) pairs in Test data set are not seen in Train data set\n",
+ "## vs. how many (item, store) pairs in Validation are not seen in Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_unseen_item_store_pair_percentage(train, test):\n",
+ "    \"\"\"Return the fraction of test rows whose (item_nbr, store_nbr) pair never occurs in train.\n",
+ "\n",
+ "    The result is row-weighted: a pair that appears on several test rows is\n",
+ "    counted once per row.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    train, test : DataFrame\n",
+ "        Frames with ``item_nbr`` and ``store_nbr`` columns.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    float\n",
+ "        Unseen test rows / test rows, in [0, 1]. Returns 0.0 for an empty\n",
+ "        ``test`` instead of raising ZeroDivisionError.\n",
+ "\n",
+ "    The fraction is also printed as a percentage.\n",
+ "    \"\"\"\n",
+ "    cols_item_store = ['item_nbr', 'store_nbr']\n",
+ "    num_test_rows = test.shape[0]\n",
+ "    if num_test_rows == 0:\n",
+ "        unseen_percentage = 0.0\n",
+ "    else:\n",
+ "        # Membership only needs the distinct key pairs. Testing for a null\n",
+ "        # aggregated unit_sales mean would misreport a pair whose train sales\n",
+ "        # are all NaN as unseen, so use the merge indicator instead.\n",
+ "        train_pairs = train[cols_item_store].drop_duplicates()\n",
+ "        test_join_train_pairs = pd.merge(test[cols_item_store], train_pairs, on=cols_item_store, how='left', indicator=True)\n",
+ "        num_unseen_rows = int((test_join_train_pairs['_merge'] == 'left_only').sum())\n",
+ "        unseen_percentage = num_unseen_rows / num_test_rows\n",
+ "    print(\"{:.2f}% of (item,store) pairs in the test data set are not seen in the train data set\".format(unseen_percentage*100))\n",
+ "    return unseen_percentage"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "21.10% of (item,store) pairs in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.21096679863662687"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_store_pair_percentage(train, test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.13% of (item,store) pairs in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.0013326124443851642"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_store_pair_percentage(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using constant prediction"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.563926854265649"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_train.unit_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.554865268437672"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.unit_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 6)"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_validation_prediction = pd.DataFrame({'id': train_validation.loc[:, 'id'], 'prediction_sales': train_train.unit_sales.mean()})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 2)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_prediction.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_prediction = pd.DataFrame({'id': test.loc[:, 'id'], 'prediction_sales': train.unit_sales.mean()})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 123296175 | \n",
+ " 123296175 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ " 123296176 | \n",
+ " 123296176 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ " 123296177 | \n",
+ " 123296177 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ " 123296178 | \n",
+ " 123296178 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ " 123296179 | \n",
+ " 123296179 | \n",
+ " 8.563927 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id prediction_sales\n",
+ "123296175 123296175 8.563927\n",
+ "123296176 123296176 8.563927\n",
+ "123296177 123296177 8.563927\n",
+ "123296178 123296178 8.563927\n",
+ "123296179 123296179 8.563927"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_prediction.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497042 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497043 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497044 | \n",
+ " 8.554865 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id prediction_sales\n",
+ "0 125497040 8.554865\n",
+ "1 125497041 8.554865\n",
+ "2 125497042 8.554865\n",
+ "3 125497043 8.554865\n",
+ "4 125497044 8.554865"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_prediction.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_prediction_constant(train_train):\n",
+ "    \"\"\"Return the mean of the ``unit_sales`` column, used as the constant prediction.\"\"\"\n",
+ "    mean_unit_sales = train_train['unit_sales'].mean()\n",
+ "    return mean_unit_sales"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_clean_prediction(train_train, train_validation):\n",
+ "    \"\"\"Build an (id, prediction_sales) frame that predicts one constant for every validation row.\n",
+ "\n",
+ "    The constant comes from ``get_prediction_constant(train_train)``; the ids\n",
+ "    are taken from ``train_validation['id']``.\n",
+ "    \"\"\"\n",
+ "    constant = get_prediction_constant(train_train)\n",
+ "    validation_ids = train_validation.loc[:, 'id']\n",
+ "    # The scalar is repeated for every id.\n",
+ "    return pd.DataFrame({'id': validation_ids, 'prediction_sales': constant})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_item_expanded_df(test, items):\n",
+ "    \"\"\"Left-join the columns of ``items`` onto ``test`` by ``item_nbr``.\n",
+ "\n",
+ "    Every row of ``test`` is kept; rows without a match in ``items`` get\n",
+ "    missing values in the added columns.\n",
+ "    \"\"\"\n",
+ "    expanded = test.merge(items, how='left', on='item_nbr')\n",
+ "    return expanded"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_evaluation_using_constant_baseline(train_train, train_validation):\n",
+ "    \"\"\"Score the constant (mean unit_sales) baseline on the validation rows.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    train_train : DataFrame\n",
+ "        Training rows; the constant prediction is derived from them.\n",
+ "    train_validation : DataFrame\n",
+ "        Validation rows with ``id``, ``item_nbr`` and ``unit_sales`` columns.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    The value returned by ``nwrmsle`` for the constant prediction.\n",
+ "\n",
+ "    Reads the notebook-level ``items`` frame to look up the ``perishable`` flag.\n",
+ "    \"\"\"\n",
+ "    train_validation_prediction = get_clean_prediction(train_train, train_validation)\n",
+ "    train_validation_expanded = get_item_expanded_df(train_validation, items)\n",
+ "    # nwrmsle computes the row weight (1 + 0.25 * flag) itself. Passing\n",
+ "    # perishable*0.25+1 here would apply the weighting twice and give\n",
+ "    # 1.25 / 1.3125 instead of 1 / 1.25, so hand over the raw flag.\n",
+ "    train_validation_perishable = train_validation_expanded['perishable']\n",
+ "    return nwrmsle(train_validation_prediction['prediction_sales'], train_validation['unit_sales'], train_validation_perishable)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_validation_expanded = get_item_expanded_df(train_validation, items)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_validation_weights = train_validation_expanded ['perishable']*0.25+1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 1.00\n",
+ "1 1.25\n",
+ "2 1.00\n",
+ "3 1.00\n",
+ "4 1.00\n",
+ "Name: perishable, dtype: float64"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_weights.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def nwrmsle(predictions, targets, weights):\n",
+ "    \"\"\"Normalized weighted root mean squared logarithmic error.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    predictions : Series\n",
+ "        Predicted values.\n",
+ "    targets : Series\n",
+ "        True values; negative entries are scored as 0. The caller's Series is\n",
+ "        left unmodified.\n",
+ "    weights : Series\n",
+ "        Per-row flag; the weight applied to each row is ``1 + 0.25 * weights``.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    float\n",
+ "        sqrt(sum(w * (ln(pred + 1) - ln(target + 1))**2) / sum(w)).\n",
+ "\n",
+ "    Rows are paired by position (``.values``), not by index label, so the three\n",
+ "    inputs must be in the same row order.\n",
+ "    \"\"\"\n",
+ "    # Clip into a new Series. Assigning into `targets` rewrote the caller's\n",
+ "    # data and triggered SettingWithCopyWarning when it was a DataFrame slice.\n",
+ "    clipped_targets = targets.clip(lower=0)\n",
+ "    row_weights = 1 + 0.25 * weights\n",
+ "    log_square_errors = (np.log1p(predictions.values) - np.log1p(clipped_targets.values)) ** 2\n",
+ "    return np.sqrt(np.sum(row_weights.values * log_square_errors) / np.sum(row_weights.values))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408,)"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_prediction['prediction_sales'].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 2)"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_prediction.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408,)"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation['unit_sales'].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1679408, 6)"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1679408,)\n",
+ "(1679408,) (1679408,) (1679408,)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py:5088: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# nwrmsle turns its third argument into the row weight (1 + 0.25 * flag) itself,\n",
+ "# so pass the raw perishable flag; train_validation_weights is already 1 / 1.25\n",
+ "# and would be weighted a second time.\n",
+ "train_validation_metric = nwrmsle(train_validation_prediction['prediction_sales'], train_validation['unit_sales'], train_validation_expanded['perishable'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.0371859208825527"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_metric"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "On the leaderboard, the same constant prediction scores 1.710 (compared with 1.037 on validation above)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How to improve the similarity between validation performance and test performance?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Strategy 1: Remove Items from Training data set"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " onpromotion | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 96995 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 99197 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497042 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103501 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497043 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497044 | \n",
+ " 2017-08-16 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr onpromotion\n",
+ "0 125497040 2017-08-16 1 96995 False\n",
+ "1 125497041 2017-08-16 1 99197 False\n",
+ "2 125497042 2017-08-16 1 103501 False\n",
+ "3 125497043 2017-08-16 1 103520 False\n",
+ "4 125497044 2017-08-16 1 103665 False"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(3901,)"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.item_nbr.unique().shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.007690335811330428"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "30/3901"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def move_items_from_train_to_validation(train, validation, items_to_remove):\n",
+ "    '''Move every row of the given items out of `train` and into `validation`.\n",
+ "\n",
+ "    Parameters:\n",
+ "        train: DataFrame with an `item_nbr` column.\n",
+ "        validation: DataFrame the moved rows are appended to.\n",
+ "        items_to_remove: collection of `item_nbr` values to move.\n",
+ "\n",
+ "    Returns:\n",
+ "        (train2, validation2): `train` without the selected items, and\n",
+ "        `validation` followed by the rows taken out of `train`.\n",
+ "        Neither input frame is modified.\n",
+ "    '''\n",
+ "    is_moved = train.item_nbr.isin(items_to_remove)\n",
+ "    train2 = train[~is_moved]\n",
+ "    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;\n",
+ "    # like append, it keeps the original row labels of both frames.\n",
+ "    validation2 = pd.concat([validation, train[is_moved]])\n",
+ "    return train2, validation2\n",
+ "\n",
+ "\n",
+ "def move_random_items_from_train_to_validation(train, validation, num_items_to_remove):\n",
+ "    '''Move `num_items_to_remove` randomly chosen items from `train` into `validation`.\n",
+ "\n",
+ "    Items are drawn from the distinct `item_nbr` values of `train` using NumPy's\n",
+ "    global random state, so seed it beforehand for a reproducible split.\n",
+ "    np.random.choice raises ValueError if more items are requested than exist.\n",
+ "\n",
+ "    Returns:\n",
+ "        (train2, validation2) as produced by move_items_from_train_to_validation.\n",
+ "    '''\n",
+ "    train_items = train['item_nbr'].unique()\n",
+ "    # replace=False: the default samples with replacement and can draw the same\n",
+ "    # item twice, silently moving fewer distinct items than requested.\n",
+ "    items_to_remove = np.random.choice(train_items, num_items_to_remove, replace=False)\n",
+ "    train2, validation2 = move_items_from_train_to_validation(train, validation, items_to_remove)\n",
+ "    # The rows go to the validation split, not the test set, so say so in the log.\n",
+ "    print(\"Moved {} items from train data to validation data\".format(len(items_to_remove)))\n",
+ "    print(\"train data: {} -> {} rows\".format(train.shape[0], train2.shape[0]))\n",
+ "    print(\"validation data: {} -> {} rows\".format(validation.shape[0], validation2.shape[0]))\n",
+ "    return train2, validation2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Moved 30 items from train data to test data\n",
+ "train data: 123296175 -> 122487773 rows\n",
+ "validation data: 1679408 -> 2487810 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_items_to_move = 30\n",
+ "train_train2, train_validation2 = move_random_items_from_train_to_validation(train_train, train_validation, num_items_to_move)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# NOTE(review): this cell re-defines a function that an earlier cell already defines;\n",
+ "# the later definition silently wins. Keep only one copy when tidying the notebook.\n",
+ "def move_random_items_from_train_to_validation(train, validation, num_items_to_remove):\n",
+ "    '''Move `num_items_to_remove` randomly chosen items from `train` into `validation`.\n",
+ "\n",
+ "    Items are drawn from the distinct `item_nbr` values of `train` using NumPy's\n",
+ "    global random state, so seed it beforehand for a reproducible split.\n",
+ "    np.random.choice raises ValueError if more items are requested than exist.\n",
+ "\n",
+ "    Returns:\n",
+ "        (train2, validation2) as produced by move_items_from_train_to_validation.\n",
+ "    '''\n",
+ "    train_items = train['item_nbr'].unique()\n",
+ "    # replace=False: the default samples with replacement and can draw the same\n",
+ "    # item twice, silently moving fewer distinct items than requested.\n",
+ "    items_to_remove = np.random.choice(train_items, num_items_to_remove, replace=False)\n",
+ "    train2, validation2 = move_items_from_train_to_validation(train, validation, items_to_remove)\n",
+ "    # The rows go to the validation split, not the test set, so say so in the log.\n",
+ "    print(\"Moved {} items from train data to validation data\".format(len(items_to_remove)))\n",
+ "    print(\"train data: {} -> {} rows\".format(train.shape[0], train2.shape[0]))\n",
+ "    print(\"validation data: {} -> {} rows\".format(validation.shape[0], validation2.shape[0]))\n",
+ "    return train2, validation2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Moved 10 items from train data to test data\n",
+ "train data: 123296175 -> 122972539 rows\n",
+ "validation data: 1679408 -> 2003044 rows\n"
+ ]
+ }
+ ],
+ "source": [
+ "num_items_to_remove = 10\n",
+ "train_train2, train_validation2 = move_random_items_from_train_to_validation(train_train, train_validation, num_items_to_remove)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Clear memory for previous train_train and train_validation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "288"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_train = train_train2\n",
+ "train_validation = train_validation2\n",
+ "del train_train2\n",
+ "del train_validation2\n",
+ "gc.collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.80% of items in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.00804985717995326"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_percentage(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "16.51% of (item,store) pairs in the test data set are not seen in the train data set\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0.16514714604372147"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_unseen_item_store_pair_percentage(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now, many more items are unseen in the validation data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(2003044,)\n",
+ "(2003044,) (2003044,) (2003044,)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_validation_metric = get_evaluation_using_constant_baseline(train_train, train_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.033043851782858"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation_metric"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "On the leaderboard: 1.710"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Removing items hardly changed the validation score, which implies that we are doing a worse job of predicting seen items"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Analyze the performance on each group: seen (item, store), seen class, unseen class"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Group test data into the three groups"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def group_test_data(test, train):\n",
+ "    '''Split `test` rows into three groups by how much of them `train` has seen.\n",
+ "\n",
+ "    Returns (test_seen, test_unseen_class_seen, test_unseen_class_unseen):\n",
+ "      * test_seen: rows whose (item_nbr, store_nbr) pair occurs in `train`; its\n",
+ "        `unit_sales` column is the per-pair train mean, not a value from `test`.\n",
+ "      * test_unseen_class_seen: the remaining rows whose `class` occurs in `train`.\n",
+ "      * test_unseen_class_unseen: the remaining rows whose `class` does not.\n",
+ "    The two unseen frames are the output of get_item_expanded_df.\n",
+ "    Relies on the notebook-level `items` frame and on get_item_expanded_df.\n",
+ "    '''\n",
+ "    pair_cols = ['item_nbr', 'store_nbr']\n",
+ "    pair_means = train.groupby(pair_cols)['unit_sales'].mean().reset_index()\n",
+ "    # Drop any unit_sales already on `test` so the joined column is the train mean.\n",
+ "    test_cols = test.columns\n",
+ "    if 'unit_sales' in test_cols:\n",
+ "        test_cols = test_cols.drop('unit_sales')\n",
+ "    joined = pd.merge(test[test_cols], pair_means, on=pair_cols, how='left')\n",
+ "    # A missing mean after the left join means the pair never occurs in train.\n",
+ "    pair_is_seen = joined['unit_sales'].notnull()\n",
+ "    test_seen = joined[pair_is_seen]\n",
+ "    unseen_expanded = get_item_expanded_df(joined[~pair_is_seen], items)\n",
+ "    unseen_classes = set(unseen_expanded['class'].unique())\n",
+ "    train_classes = set(get_item_expanded_df(train, items)['class'].unique())\n",
+ "    classes_missing_from_train = unseen_classes - train_classes\n",
+ "    classes_in_train = unseen_classes - classes_missing_from_train\n",
+ "    test_unseen_class_seen = unseen_expanded[unseen_expanded['class'].isin(classes_in_train)]\n",
+ "    test_unseen_class_unseen = unseen_expanded[unseen_expanded['class'].isin(classes_missing_from_train)]\n",
+ "    return test_seen, test_unseen_class_seen, test_unseen_class_unseen"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_seen, test_unseen_class_seen,test_unseen_class_unseen = group_test_data(test, train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "True\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(test.shape[0]== test_seen.shape[0]+ test_unseen_class_seen.shape[0]+test_unseen_class_unseen.shape[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_baseline_prediction_test_seen(test_seen_expanded, train):\n",
+ "    '''Predict, for each test row, the mean train unit_sales of its (item, store) pair.\n",
+ "\n",
+ "    Rows whose pair does not occur in `train` are dropped from the result.\n",
+ "    Returns a frame with columns id, prediction_sales, perishable.\n",
+ "    '''\n",
+ "    pair_cols = ['item_nbr', 'store_nbr']\n",
+ "    pair_means = train.groupby(pair_cols)['unit_sales'].mean().reset_index()\n",
+ "    # Ignore any unit_sales already on the test frame; the joined mean replaces it.\n",
+ "    feature_cols = test_seen_expanded.columns\n",
+ "    if 'unit_sales' in feature_cols:\n",
+ "        feature_cols = feature_cols.drop('unit_sales')\n",
+ "    joined = pd.merge(test_seen_expanded[feature_cols], pair_means, on=pair_cols, how='left')\n",
+ "    has_mean = joined['unit_sales'].notnull().values\n",
+ "    predictions = joined.loc[has_mean, ['id', 'unit_sales', 'perishable']]\n",
+ "    return predictions.rename(columns={'unit_sales': 'prediction_sales'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_baseline_prediction_test_unseen_class_seen(test_unseen_class_seen, train_expanded):\n",
+ "    '''Predict, for each test row, the mean train unit_sales of the row's `class`.\n",
+ "\n",
+ "    Parameters:\n",
+ "        test_unseen_class_seen: frame with a `class` column; a `unit_sales`\n",
+ "            column, if present, is discarded.\n",
+ "        train_expanded: frame with `class` and `unit_sales` columns.\n",
+ "\n",
+ "    Returns:\n",
+ "        The test frame (without unit_sales) plus a `prediction_sales` column,\n",
+ "        which is NaN for classes that do not occur in `train_expanded`.\n",
+ "    '''\n",
+ "    wanted_classes = test_unseen_class_seen['class'].unique()\n",
+ "    train_sub = train_expanded[train_expanded['class'].isin(wanted_classes)]\n",
+ "    # Average only unit_sales: a frame-wide mean() is wasted work and raises\n",
+ "    # TypeError on non-numeric columns in pandas >= 2.0.\n",
+ "    class_means = train_sub.groupby('class')['unit_sales'].mean().reset_index()\n",
+ "    class_means = class_means.rename(columns={'unit_sales': 'prediction_sales'})\n",
+ "    # errors='ignore' also accepts inputs that carry no unit_sales column.\n",
+ "    features = test_unseen_class_seen.drop('unit_sales', axis=1, errors='ignore')\n",
+ "    return pd.merge(features, class_means, on='class', how='left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_baseline_prediction_test_unseen_class_unseen(test_unseen_class_unseen, train_expanded):\n",
+ "    '''Predict, for each test row, the mean train unit_sales of the row's `family`.\n",
+ "\n",
+ "    Parameters:\n",
+ "        test_unseen_class_unseen: frame with a `family` column; a `unit_sales`\n",
+ "            column, if present, is discarded.\n",
+ "        train_expanded: frame with `family` and `unit_sales` columns.\n",
+ "\n",
+ "    Returns:\n",
+ "        The test frame (without unit_sales) plus a `prediction_sales` column,\n",
+ "        which is NaN for families that do not occur in `train_expanded`.\n",
+ "    '''\n",
+ "    wanted_families = test_unseen_class_unseen['family'].unique()\n",
+ "    train_sub = train_expanded[train_expanded['family'].isin(wanted_families)]\n",
+ "    # Average only unit_sales: a frame-wide mean() is wasted work and raises\n",
+ "    # TypeError on non-numeric columns in pandas >= 2.0.\n",
+ "    family_means = train_sub.groupby('family')['unit_sales'].mean().reset_index()\n",
+ "    family_means = family_means.rename(columns={'unit_sales': 'prediction_sales'})\n",
+ "    # errors='ignore' also accepts inputs that carry no unit_sales column.\n",
+ "    features = test_unseen_class_unseen.drop('unit_sales', axis=1, errors='ignore')\n",
+ "    return pd.merge(features, family_means, on='family', how='left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean_predictions(predictions):\n",
+ "    '''Return predictions with negatives set to 0, rounded and cast to int.\n",
+ "\n",
+ "    Works on a copy, so the caller's data is left untouched; writing into the\n",
+ "    argument itself is what triggers pandas' SettingWithCopyWarning when a\n",
+ "    DataFrame column is passed in.\n",
+ "    '''\n",
+ "    cleaned = predictions.copy()\n",
+ "    cleaned[cleaned < 0] = 0\n",
+ "    return cleaned.round().astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean_targets(targets):\n",
+ "    '''Return a copy of `targets` with negative values replaced by 0.\n",
+ "\n",
+ "    The input is not modified, so a DataFrame column can be passed in without\n",
+ "    silently altering the source frame; use the returned value.\n",
+ "    '''\n",
+ "    cleaned = targets.copy()\n",
+ "    cleaned[cleaned < 0] = 0\n",
+ "    return cleaned"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_baseline_predictions(test, train):\n",
+ "    '''Build baseline predictions for every row of `test` from means of `train`.\n",
+ "\n",
+ "    `test` is split with group_test_data and each group gets its own predictor:\n",
+ "    seen (item, store) pairs, unseen pairs with a seen class, and unseen pairs\n",
+ "    with an unseen class. The three results are stacked and passed through\n",
+ "    clean_predictions.\n",
+ "\n",
+ "    Returns a frame with columns id, perishable, prediction_sales and a fresh\n",
+ "    0..n-1 index. Relies on the notebook-level `items` frame.\n",
+ "    '''\n",
+ "    cols_to_use = ['id', 'perishable', 'prediction_sales']\n",
+ "    test_seen, test_unseen_class_seen, test_unseen_class_unseen = group_test_data(test, train)\n",
+ "    train_expanded = get_item_expanded_df(train, items)\n",
+ "    test_seen_expanded = get_item_expanded_df(test_seen, items)\n",
+ "    prediction_parts = [\n",
+ "        get_baseline_prediction_test_seen(test_seen_expanded, train)[cols_to_use],\n",
+ "        get_baseline_prediction_test_unseen_class_seen(test_unseen_class_seen, train_expanded)[cols_to_use],\n",
+ "        get_baseline_prediction_test_unseen_class_unseen(test_unseen_class_unseen, train_expanded)[cols_to_use],\n",
+ "    ]\n",
+ "    # pd.concat replaces DataFrame.append (removed in pandas 2.0). ignore_index=True\n",
+ "    # gives unique row labels; the parts each carry their own overlapping labels,\n",
+ "    # which makes the label-aligned column assignment below fragile.\n",
+ "    baseline_predictions = pd.concat(prediction_parts, ignore_index=True)\n",
+ "    baseline_predictions['prediction_sales'] = clean_predictions(baseline_predictions['prediction_sales'])\n",
+ "    return baseline_predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# prediction_test_seen, prediction_test_unseen_class_seen, prediction_test_unseen_class_unseen=get_baseline_predictions(test, train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation = get_baseline_predictions(train_validation, train_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 123296175 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 123296176 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 123296177 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 123296178 | \n",
+ " 0 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 123296179 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 123296175 0 3\n",
+ "1 123296176 1 4\n",
+ "2 123296177 0 5\n",
+ "3 123296178 0 10\n",
+ "4 123296179 0 1"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "targets_validation = pd.merge(baseline_predictions_validation, train_validation, on='id', how='left')['unit_sales']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_evaluation(baseline_predictions_validation, targets_validation):\n",
+ "    '''Score baseline predictions against validation targets with nwrmsle.\n",
+ "\n",
+ "    Predictions come from the `prediction_sales` column and weights from the\n",
+ "    `perishable` column of `baseline_predictions_validation`; the targets are\n",
+ "    passed through clean_targets first.\n",
+ "    '''\n",
+ "    return nwrmsle(\n",
+ "        baseline_predictions_validation.prediction_sales,\n",
+ "        clean_targets(targets_validation),\n",
+ "        baseline_predictions_validation.perishable,\n",
+ "    )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(2003044,)\n",
+ "(2003044,) (2003044,) (2003044,)\n"
+ ]
+ }
+ ],
+ "source": [
+ "validation_metric = get_evaluation(baseline_predictions_validation, targets_validation)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.67828104376604859"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_metric"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Validation NWRMSLE: 0.67828\n",
+ "Validation MSE: 353.981\n"
+ ]
+ }
+ ],
+ "source": [
+ "mse = mean_squared_error(baseline_predictions_validation['prediction_sales'], targets_validation)\n",
+ "print('Validation NWRMSLE: %.5f' % (validation_metric))\n",
+ "print('Validation MSE: %.3f' % (mse))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "on Leaderboard: 1.369"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.67828104376604859"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_metric"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Submission"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "baseline_predictions_test = get_baseline_predictions(test, train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497043 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497044 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497045 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 125497040 0 2\n",
+ "1 125497041 0 3\n",
+ "2 125497043 0 3\n",
+ "3 125497044 1 4\n",
+ "4 125497045 0 5"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_submission=baseline_predictions_test[['id', 'prediction_sales']].rename(columns={'prediction_sales':'unit_sales'})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " unit_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497043 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497044 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497045 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id unit_sales\n",
+ "0 125497040 2\n",
+ "1 125497041 3\n",
+ "2 125497043 3\n",
+ "3 125497044 4\n",
+ "4 125497045 5"
+ ]
+ },
+ "execution_count": 75,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_submission.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 76,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "np.all(test_submission.unit_sales>=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "unit_sales int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_submission.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# test_submission.to_csv('baseline_submission_20171127.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Evaluation of each group"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_seen, validation_unseen_class_seen,validation_unseen_class_unseen = group_test_data(train_validation, train_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')"
+ ]
+ },
+ "execution_count": 80,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_seen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unseen_class_seen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_unseen_class_seen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unseen_class_unseen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_unseen_class_unseen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.406943132552255"
+ ]
+ },
+ "execution_count": 84,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.53563678093009"
+ ]
+ },
+ "execution_count": 85,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_baseline_predictions_validation.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "7.756378630680307"
+ ]
+ },
+ "execution_count": 86,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unseen_class_seen_baseline_predictions_validation.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "7.2"
+ ]
+ },
+ "execution_count": 87,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unseen_class_unseen_baseline_predictions_validation.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_seen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unseen_class_seen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_unseen_class_seen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unseen_class_unseen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_unseen_class_unseen.id)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "7.350565382095759"
+ ]
+ },
+ "execution_count": 91,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_test.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "6.949336092844724"
+ ]
+ },
+ "execution_count": 92,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_baseline_predictions_test.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.855705681997609"
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unseen_class_seen_baseline_predictions_test.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "7.0"
+ ]
+ },
+ "execution_count": 94,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unseen_class_unseen_baseline_predictions_test.prediction_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Validation and test look different in the means of their predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.573084378706207"
+ ]
+ },
+ "execution_count": 95,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_train.unit_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8.554865268437672"
+ ]
+ },
+ "execution_count": 96,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.unit_sales.mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Why do seen items have a low mean in the test data?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'perishable', 'prediction_sales'], dtype='object')"
+ ]
+ },
+ "execution_count": 97,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_baseline_predictions_test.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')"
+ ]
+ },
+ "execution_count": 98,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_seen_mean = test_seen.groupby(['item_nbr','store_nbr'])['unit_sales'].mean().reset_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.000000 1063\n",
+ "2.000000 387\n",
+ "1.500000 269\n",
+ "1.333333 211\n",
+ "3.000000 169\n",
+ "1.200000 142\n",
+ "1.666667 132\n",
+ "4.000000 110\n",
+ "1.250000 105\n",
+ "1.600000 90\n",
+ "1.400000 87\n",
+ "2.500000 83\n",
+ "1.166667 75\n",
+ "1.750000 74\n",
+ "2.333333 72\n",
+ "1.142857 70\n",
+ "1.800000 70\n",
+ "5.000000 69\n",
+ "1.714286 66\n",
+ "1.833333 65\n",
+ "1.428571 63\n",
+ "2.400000 61\n",
+ "2.666667 59\n",
+ "1.285714 58\n",
+ "1.571429 56\n",
+ "1.222222 55\n",
+ "6.000000 54\n",
+ "1.375000 53\n",
+ "3.333333 52\n",
+ "3.500000 52\n",
+ " ... \n",
+ "5.931522 1\n",
+ "5.983607 1\n",
+ "10.427236 1\n",
+ "6.328264 1\n",
+ "2.569665 1\n",
+ "4.409535 1\n",
+ "35.648855 1\n",
+ "4.074074 1\n",
+ "14.786531 1\n",
+ "1.416867 1\n",
+ "2.202128 1\n",
+ "89.903846 1\n",
+ "4.600505 1\n",
+ "2.375946 1\n",
+ "20.287293 1\n",
+ "4.557390 1\n",
+ "2.670120 1\n",
+ "2.405458 1\n",
+ "8.606928 1\n",
+ "2.559347 1\n",
+ "11.537549 1\n",
+ "4.129799 1\n",
+ "5.267884 1\n",
+ "12.886924 1\n",
+ "5.454833 1\n",
+ "15.486842 1\n",
+ "5.792963 1\n",
+ "4.558394 1\n",
+ "5.431433 1\n",
+ "3.750600 1\n",
+ "Name: unit_sales, Length: 139461, dtype: int64"
+ ]
+ },
+ "execution_count": 100,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen_mean.unit_sales.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(3370464, 5)"
+ ]
+ },
+ "execution_count": 101,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2003044, 6)"
+ ]
+ },
+ "execution_count": 102,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2659408, 6)"
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_seen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1672247, 6)"
+ ]
+ },
+ "execution_count": 104,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')"
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "validation_seen_mean = validation_seen.groupby(['item_nbr', 'store_nbr'])['unit_sales'].mean().reset_index()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1.000000 196\n",
+ "2.000000 183\n",
+ "1.500000 148\n",
+ "1.333333 109\n",
+ "2.500000 71\n",
+ "1.250000 70\n",
+ "4.000000 67\n",
+ "3.000000 66\n",
+ "1.750000 57\n",
+ "1.200000 47\n",
+ "1.666667 47\n",
+ "5.000000 43\n",
+ "1.428571 41\n",
+ "1.571429 40\n",
+ "1.375000 39\n",
+ "1.166667 37\n",
+ "1.222222 34\n",
+ "2.250000 34\n",
+ "6.000000 34\n",
+ "1.666667 31\n",
+ "1.125000 31\n",
+ "1.285714 30\n",
+ "2.750000 30\n",
+ "2.333333 30\n",
+ "1.142857 30\n",
+ "1.444444 29\n",
+ "3.500000 28\n",
+ "2.666667 27\n",
+ "1.300000 27\n",
+ "1.625000 26\n",
+ " ... \n",
+ "5.414894 1\n",
+ "5.689526 1\n",
+ "10.992721 1\n",
+ "4.405322 1\n",
+ "2.751479 1\n",
+ "3.373206 1\n",
+ "1.844920 1\n",
+ "3.732102 1\n",
+ "1.887872 1\n",
+ "3.881868 1\n",
+ "6.143824 1\n",
+ "4.322581 1\n",
+ "3.486506 1\n",
+ "4.022444 1\n",
+ "1.949593 1\n",
+ "3.965174 1\n",
+ "6.439222 1\n",
+ "9.427083 1\n",
+ "5.120801 1\n",
+ "1.376404 1\n",
+ "2.192547 1\n",
+ "11.182663 1\n",
+ "6.961364 1\n",
+ "7.380435 1\n",
+ "17.839545 1\n",
+ "1.691099 1\n",
+ "9.129005 1\n",
+ "1.983607 1\n",
+ "25.861189 1\n",
+ "9.211574 1\n",
+ "Name: unit_sales, Length: 132023, dtype: int64"
+ ]
+ },
+ "execution_count": 107,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen_mean.unit_sales.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(146060, 3)"
+ ]
+ },
+ "execution_count": 108,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "validation_seen_mean.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "baseline_predictions_validation.to_csv('baseline_predictions_validation', index=False)\n",
+ "baseline_predictions_test.to_csv('baseline_predictions_test', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 123296175 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 123296176 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 123296177 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 123296178 | \n",
+ " 0 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 123296179 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 123296175 0 3\n",
+ "1 123296176 1 4\n",
+ "2 123296177 0 5\n",
+ "3 123296178 0 10\n",
+ "4 123296179 0 1"
+ ]
+ },
+ "execution_count": 110,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_baseline_predictions_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " store_nbr | \n",
+ " item_nbr | \n",
+ " unit_sales | \n",
+ " onpromotion | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 123296175 | \n",
+ " 123296175 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103520 | \n",
+ " 1.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 123296176 | \n",
+ " 123296176 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 103665 | \n",
+ " 4.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 123296177 | \n",
+ " 123296177 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105574 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 123296178 | \n",
+ " 123296178 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105575 | \n",
+ " 6.0 | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 123296179 | \n",
+ " 123296179 | \n",
+ " 2017-07-26 | \n",
+ " 1 | \n",
+ " 105693 | \n",
+ " 2.0 | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date store_nbr item_nbr unit_sales onpromotion\n",
+ "123296175 123296175 2017-07-26 1 103520 1.0 False\n",
+ "123296176 123296176 2017-07-26 1 103665 4.0 False\n",
+ "123296177 123296177 2017-07-26 1 105574 9.0 False\n",
+ "123296178 123296178 2017-07-26 1 105575 6.0 False\n",
+ "123296179 123296179 2017-07-26 1 105693 2.0 True"
+ ]
+ },
+ "execution_count": 112,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seen_empirical = pd.read_csv('seen_empirical', header=None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 123296175 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 123296176 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 123296177 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 123296178 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 123296181 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0\n",
+ "0 123296175\n",
+ "1 123296176\n",
+ "2 123296177\n",
+ "3 123296178\n",
+ "4 123296181"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "seen_empirical.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "baseline_predictions_validation = pd.read_csv('baseline_predictions_validation')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 123296175 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 123296176 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 123296177 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 123296178 | \n",
+ " 0 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 123296179 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 123296175 0 3\n",
+ "1 123296176 1 4\n",
+ "2 123296177 0 5\n",
+ "3 123296178 0 10\n",
+ "4 123296179 0 1"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "baseline_predictions_validation_subset = baseline_predictions_validation[baseline_predictions_validation.id.isin(seen_empirical[0])].copy()  # rows whose id is in seen_empirical column 0; explicit copy so later column assignments on the subset do not raise SettingWithCopyWarning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1343027, 3)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_validation_subset.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_subset = train.loc[train['id'].isin(seen_empirical[0])]  # training rows whose id is listed in seen_empirical column 0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py:5088: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n",
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1343027,)\n",
+ "(1343027,) (1343027,) (1343027,)\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:\n",
+ "\n",
+ "\n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "1.1433622314160428"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "get_evaluation(baseline_predictions_validation_subset, train_subset['unit_sales'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "baseline_predictions_test = pd.read_csv('baseline_predictions_test')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " perishable | \n",
+ " prediction_sales | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 125497040 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 125497041 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 125497043 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 125497044 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 125497045 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id perishable prediction_sales\n",
+ "0 125497040 0 2\n",
+ "1 125497041 0 3\n",
+ "2 125497043 0 3\n",
+ "3 125497044 1 4\n",
+ "4 125497045 0 5"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "baseline_predictions_test.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}