diff --git a/.ipynb_checkpoints/new baseline model 29Nov17-checkpoint.ipynb b/.ipynb_checkpoints/new baseline model 29Nov17-checkpoint.ipynb new file mode 100644 index 0000000..c090793 --- /dev/null +++ b/.ipynb_checkpoints/new baseline model 29Nov17-checkpoint.ipynb @@ -0,0 +1,5969 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/vnd.plotly.v1+html": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "import plotly.offline as py\n", + "import plotly.graph_objs as go\n", + "py.init_notebook_mode()\n", + "import gc" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "seed = 46\n", + "np.random.seed(seed)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def load_data(data_path):\n", + " train = pd.read_csv('%s/train.csv' % data_path, parse_dates=['date'])\n", + " test = pd.read_csv('%s/test.csv' % data_path, parse_dates=['date'])\n", + " items = pd.read_csv('%s/items.csv' % data_path)\n", + " return train, test, items" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2821: DtypeWarning:\n", + "\n", + "Columns (5) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + "\n" + ] + } + ], + "source": [ + "data_path = './data'\n", + "train, test, items = load_data(data_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def print_summary_train_test(train, test):\n", + " print ('Train min/max date: %s / %s' % (train['date'].min(), train['date'].max()))\n", + " print ('Test min/max date: %s / %s' % ( test['date'].min(), test['date'].max()))\n", + " print ('')\n", + " print ('Number of days in train: %d' % ((train['date'].max() - train['date'].min()).days + 1))\n", + " print ('Number of days in validation: %d' % (( test['date'].max() - test['date'].min()).days + 1))\n", + " print ('')\n", + " print ('Train shape: %d rows' % train.shape[0])\n", + " print ('Test shape: %d rows' % test.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train min/max date: 2013-01-01 00:00:00 / 2017-08-15 00:00:00\n", + "Test min/max date: 2017-08-16 00:00:00 / 2017-08-31 00:00:00\n", + "\n", + "Number of days in train: 1688\n", + "Number of days in validation: 16\n", + "\n", + "Train shape: 125497040 rows\n", + "Test shape: 3370464 rows\n" + ] + } + ], + "source": [ + "print_summary_train_test(train, test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Divide Train data into Validation (a 16-day window near the end of the train data) and Training (the rest)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import splitter" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "train_last_date = train['date'].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-07-26 
00:00:00 2017-08-10 00:00:00\n" + ] + } + ], + "source": [ + "begin_of_validation, end_of_validation = splitter.get_validation_period(train_last_date)\n", + "print(begin_of_validation, end_of_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "train_train, train_validation = splitter.split_validation_train_by_validation_period(train, begin_of_validation, end_of_validation)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train min/max date: 2013-01-01 00:00:00 / 2017-07-25 00:00:00\n", + "Test min/max date: 2017-07-26 00:00:00 / 2017-08-10 00:00:00\n", + "\n", + "Number of days in train: 1667\n", + "Number of days in validation: 16\n", + "\n", + "Train shape: 123296175 rows\n", + "Test shape: 1679408 rows\n" + ] + } + ], + "source": [ + "print_summary_train_test(train_train, train_validation)\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation Metric" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import evaluation\n", + "from sklearn.metrics import mean_squared_error" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How many items in Test data set are not seen in Train data set \n", + "## vs. 
how many items in Validation are not seen in Training" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def get_unseen_item_percentage(train, test):\n", + " test_items = test['item_nbr'].unique()\n", + " train_items = train['item_nbr'].unique()\n", + " test_items_unseen_in_train = set(test_items) - set(train_items)\n", + " unseen_percentage = len(test_items_unseen_in_train)/len(test_items)\n", + " print(\"{:.2f}% of items in the test data set are not seen in the train data set\".format(unseen_percentage*100))\n", + " return unseen_percentage" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.54% of items in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.015380671622660855" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_percentage(train, test)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.55% of items in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.005454545454545455" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_percentage(train_train, train_validation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How many (item, store) in Test data set are not seen in Train data set\n", + "## vs. 
how many (item, store) in Validation are not seen in Training" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def get_unseen_item_store_pair_percentage(train, test):\n", + " cols_item_store = ['item_nbr', 'store_nbr']\n", + " train_item_store_grouped = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()\n", + " cols_to_use = test.columns.drop('unit_sales') if 'unit_sales' in test.columns else test.columns\n", + " test_join_train_item_store_grouped = pd.merge(test[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n", + " test_unseen = test_join_train_item_store_grouped[test_join_train_item_store_grouped['unit_sales'].isnull()]\n", + " unseen_percentage = test_unseen.shape[0]/test.shape[0]\n", + " print(\"{:.2f}% of (item,store) pairs in the test data set are not seen in the train data set\".format(unseen_percentage*100))\n", + " return unseen_percentage" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "21.10% of (item,store) pairs in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.21096679863662687" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_store_pair_percentage(train, test)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.13% of (item,store) pairs in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.0013326124443851642" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_store_pair_percentage(train_train, train_validation)" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "## Using constant prediction" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.563926854265649" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_train.unit_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.554865268437672" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.unit_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408, 6)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "train_validation_prediction = pd.DataFrame({'id': train_validation.loc[:, 'id'], 'prediction_sales': train_train.unit_sales.mean()})" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408, 2)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_prediction.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "test_prediction = pd.DataFrame({'id': test.loc[:, 'id'], 'prediction_sales': train.unit_sales.mean()})" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idprediction_sales
1232961751232961758.563927
1232961761232961768.563927
1232961771232961778.563927
1232961781232961788.563927
1232961791232961798.563927
\n", + "
" + ], + "text/plain": [ + " id prediction_sales\n", + "123296175 123296175 8.563927\n", + "123296176 123296176 8.563927\n", + "123296177 123296177 8.563927\n", + "123296178 123296178 8.563927\n", + "123296179 123296179 8.563927" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_prediction.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idprediction_sales
01254970408.554865
11254970418.554865
21254970428.554865
31254970438.554865
41254970448.554865
\n", + "
" + ], + "text/plain": [ + " id prediction_sales\n", + "0 125497040 8.554865\n", + "1 125497041 8.554865\n", + "2 125497042 8.554865\n", + "3 125497043 8.554865\n", + "4 125497044 8.554865" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_prediction.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def get_prediction_constant(train_train):\n", + " return train_train.unit_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "def get_clean_prediction(train_train, train_validation):\n", + " predictions = get_prediction_constant(train_train)\n", + " return pd.DataFrame({'id': train_validation.loc[:, 'id'], 'prediction_sales': predictions})" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "def get_item_expanded_df(test, items):\n", + " return pd.merge(test, items, on='item_nbr', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def get_evaluation_using_constant_baseline(train_train, train_validation):\n", + " train_validation_prediction = get_clean_prediction(train_train, train_validation)\n", + " train_validation_expanded = get_item_expanded_df(train_validation, items)\n", + " train_validation_weights = train_validation_expanded ['perishable']*0.25+1\n", + " return nwrmsle(train_validation_prediction['prediction_sales'], train_validation['unit_sales'], train_validation_weights)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "train_validation_expanded = 
get_item_expanded_df(train_validation, items)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "train_validation_weights = train_validation_expanded ['perishable']*0.25+1" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1.00\n", + "1 1.25\n", + "2 1.00\n", + "3 1.00\n", + "4 1.00\n", + "Name: perishable, dtype: float64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_weights.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "def nwrmsle(predictions, targets, weights):\n", + " print(targets.shape)\n", + " targets[targets<0]=0\n", + " weights = 1 + 0.25 * weights\n", + " print(predictions.shape, targets.shape, weights.shape)\n", + " log_square_errors = (np.log(predictions.values + 1) - np.log(targets.values + 1)) ** 2\n", + " return(np.sqrt(np.sum(weights.values * log_square_errors) / np.sum(weights)))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408,)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_prediction['prediction_sales'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408, 2)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_prediction.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408,)" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"train_validation['unit_sales'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408, 6)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1679408,)\n", + "(1679408,) (1679408,) (1679408,)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py:5088: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "train_validation_metric = nwrmsle(train_validation_prediction['prediction_sales'], train_validation['unit_sales'], train_validation_weights)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0371859208825527" + ] + }, + 
"execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_metric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In Leaderboard 1.710" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to improve the similarity between validation performance and test performance?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Strategy 1: Remove Items from Training data set" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbronpromotion
01254970402017-08-16196995False
11254970412017-08-16199197False
21254970422017-08-161103501False
31254970432017-08-161103520False
41254970442017-08-161103665False
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr onpromotion\n", + "0 125497040 2017-08-16 1 96995 False\n", + "1 125497041 2017-08-16 1 99197 False\n", + "2 125497042 2017-08-16 1 103501 False\n", + "3 125497043 2017-08-16 1 103520 False\n", + "4 125497044 2017-08-16 1 103665 False" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3901,)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.item_nbr.unique().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.007690335811330428" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "30/3901" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "def move_items_from_train_to_validation(train, validation, items_to_remove):\n", + " train2 = train[~train.item_nbr.isin(items_to_remove)]\n", + " validation_to_add = train[train.item_nbr.isin(items_to_remove)]\n", + " validation2 = validation.append(validation_to_add)\n", + " return train2, validation2\n", + "\n", + "\n", + "def move_random_items_from_train_to_validation(train, validation, num_items_to_remove):\n", + " train_items = train['item_nbr'].unique()\n", + " items_to_remove = np.random.choice(train_items, num_items_to_remove)\n", + " train2, validation2 = move_items_from_train_to_validation(train, validation, items_to_remove)\n", + " print(\"Moved {} items from train data to test data\".format(num_items_to_remove))\n", + " print(\"train data: {} -> {} rows\".format(train.shape[0], train2.shape[0]))\n", + " print(\"validation data: {} -> {} rows\".format(validation.shape[0], 
validation2.shape[0]))\n", + " return train2, validation2" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Moved 30 items from train data to test data\n", + "train data: 123296175 -> 122487773 rows\n", + "validation data: 1679408 -> 2487810 rows\n" + ] + } + ], + "source": [ + "num_items_to_move = 30\n", + "train_train2, train_validation2 = move_random_items_from_train_to_validation(train_train, train_validation, num_items_to_move)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "def move_random_items_from_train_to_validation(train, validation, num_items_to_remove):\n", + " train_items = train['item_nbr'].unique()\n", + " items_to_remove = np.random.choice(train_items, num_items_to_remove)\n", + " train2, validation2 = move_items_from_train_to_validation(train, validation, items_to_remove)\n", + " print(\"Moved {} items from train data to test data\".format(num_items_to_remove))\n", + " print(\"train data: {} -> {} rows\".format(train.shape[0], train2.shape[0]))\n", + " print(\"validation data: {} -> {} rows\".format(validation.shape[0], validation2.shape[0]))\n", + " return train2, validation2" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Moved 10 items from train data to test data\n", + "train data: 123296175 -> 122972539 rows\n", + "validation data: 1679408 -> 2003044 rows\n" + ] + } + ], + "source": [ + "num_items_to_remove = 10\n", + "train_train2, train_validation2 = move_random_items_from_train_to_validation(train_train, train_validation, num_items_to_remove)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Clear memory for previous train_train and train_validation" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "288" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_train = train_train2\n", + "train_validation = train_validation2\n", + "del train_train2\n", + "del train_validation2\n", + "gc.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.80% of items in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.00804985717995326" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_percentage(train_train, train_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16.51% of (item,store) pairs in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.16514714604372147" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_store_pair_percentage(train_train, train_validation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, many more items are unseen in the validation data" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2003044,)\n", + "(2003044,) (2003044,) (2003044,)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: 
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "train_validation_metric = get_evaluation_using_constant_baseline(train_train, train_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.033043851782858" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_metric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In Leaderboard 1.710" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Removing items hardly changed the validation score, which implies that we are doing a worse job of predicting seen items" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analyze the performance on each group: seen (item, store), seen class, unseen class" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Group test data into the three groups" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "def group_test_data(test, train):\n", + " def get_classes_from_expanded(df_expanded):\n", + " return df_expanded['class'].unique()\n", + " cols_item_store = ['item_nbr', 'store_nbr']\n", + " train_item_store_grouped = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()\n", + " cols_to_use = test.columns.drop('unit_sales') if 'unit_sales' in test.columns else test.columns\n", + " test_join_train_item_store_grouped = pd.merge(test[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n", + " test_seen = test_join_train_item_store_grouped[test_join_train_item_store_grouped['unit_sales'].notnull()]\n", + " test_unseen = test_join_train_item_store_grouped[test_join_train_item_store_grouped['unit_sales'].isnull()]\n", + " test_unseen_expanded = 
get_item_expanded_df(test_unseen, items)\n", + " test_unseen_class = get_classes_from_expanded(test_unseen_expanded)\n", + " train_expanded = get_item_expanded_df(train, items)\n", + " train_class = get_classes_from_expanded(train_expanded)\n", + " test_unseen_class_diff = set(test_unseen_class) - set(train_class)\n", + " test_unseen_class_same = set(test_unseen_class) - test_unseen_class_diff\n", + " test_unseen_class_seen = test_unseen_expanded[test_unseen_expanded['class'].isin(test_unseen_class_same)]\n", + " test_unseen_class_unseen = test_unseen_expanded[test_unseen_expanded['class'].isin(test_unseen_class_diff)]\n", + " return test_seen, test_unseen_class_seen, test_unseen_class_unseen" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "test_seen, test_unseen_class_seen,test_unseen_class_unseen = group_test_data(test, train)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" + ] + } + ], + "source": [ + "print(test.shape[0]== test_seen.shape[0]+ test_unseen_class_seen.shape[0]+test_unseen_class_unseen.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "def get_baseline_prediction_test_seen(test_seen_expanded, train):\n", + " cols_item_store = ['item_nbr', 'store_nbr']\n", + " cols_test_expanded = test_seen_expanded.columns\n", + " cols_prediction = ['id', 'unit_sales', 'perishable']\n", + " train_item_store_grouped = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()\n", + " cols_to_use = cols_test_expanded.drop('unit_sales') if 'unit_sales' in cols_test_expanded else cols_test_expanded\n", + " test_join_train_item_store_grouped = pd.merge(test_seen_expanded[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n", + " is_data_seen = 
test_join_train_item_store_grouped['unit_sales'].notnull()\n", + " join_seen = test_join_train_item_store_grouped[is_data_seen.values]\n", + " return join_seen[cols_prediction].rename(columns={'unit_sales':'prediction_sales'})" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "def get_baseline_prediction_test_unseen_class_seen(test_unseen_class_seen, train_expanded):\n", + " test_unseen_class_seen_class = test_unseen_class_seen['class'].unique()\n", + " train_sub = train_expanded[train_expanded['class'].isin(test_unseen_class_seen_class)]\n", + " train_sub_class_grouped = train_sub.groupby('class').mean().reset_index()\n", + " train_sub_class_grouped = train_sub_class_grouped[['class', 'unit_sales']]\n", + " test_unseen_class_seen_join_train_sub_class_grouped = pd.merge(test_unseen_class_seen, train_sub_class_grouped, on='class', how='left')\n", + " return test_unseen_class_seen_join_train_sub_class_grouped.drop('unit_sales_x', axis=1)\\\n", + " .rename(columns={'unit_sales_y':'prediction_sales'})\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "def get_baseline_prediction_test_unseen_class_unseen(test_unseen_class_unseen, train_expanded):\n", + " test_unseen_class_unseen_family = test_unseen_class_unseen['family'].unique()\n", + " train_sub = train_expanded[train_expanded['family'].isin(test_unseen_class_unseen_family)]\n", + " train_sub_family_grouped = train_sub.groupby('family').mean().reset_index()\n", + " train_sub_family_grouped = train_sub_family_grouped[['family', 'unit_sales']]\n", + " test_unseen_class_unseen_join_train_sub_family_grouped = pd.merge(test_unseen_class_unseen, train_sub_family_grouped, on='family', how='left')\n", + " return test_unseen_class_unseen_join_train_sub_family_grouped.drop('unit_sales_x', axis=1)\\\n", + " .rename(columns={'unit_sales_y':'prediction_sales'})\n", + " " + ] + }, + { + 
"cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "def clean_predictions(predictions):\n", + " predictions[predictions<0]=0\n", + " return predictions.round().astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "def clean_targets(targets):\n", + " targets[targets<0]=0\n", + " return targets" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "def get_baseline_predictions(test, train):\n", + " cols_to_use =['id', 'perishable', 'prediction_sales']\n", + " test_seen, test_unseen_class_seen, test_unseen_class_unseen = group_test_data(test, train)\n", + " train_expanded = get_item_expanded_df(train, items)\n", + " test_seen_expanded = get_item_expanded_df(test_seen, items)\n", + " prediction_test_seen = get_baseline_prediction_test_seen(test_seen_expanded, train)[cols_to_use]\n", + " prediction_test_unseen_class_seen = get_baseline_prediction_test_unseen_class_seen(test_unseen_class_seen, train_expanded)[cols_to_use]\n", + " prediction_test_unseen_class_unseen = get_baseline_prediction_test_unseen_class_unseen(test_unseen_class_unseen, train_expanded)[cols_to_use]\n", + " baseline_predictions = prediction_test_seen.append(prediction_test_unseen_class_seen).append(prediction_test_unseen_class_unseen)\n", + " cleaned_predictions = clean_predictions(baseline_predictions['prediction_sales'])\n", + " baseline_predictions.loc[:, 'prediction_sales']=cleaned_predictions\n", + " return baseline_predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "# prediction_test_seen, prediction_test_unseen_class_seen, prediction_test_unseen_class_unseen=get_baseline_predictions(test, train)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": 
[ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "baseline_predictions_validation = get_baseline_predictions(train_validation, train_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012329617503
112329617614
212329617705
3123296178010
412329617901
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 123296175 0 3\n", + "1 123296176 1 4\n", + "2 123296177 0 5\n", + "3 123296178 0 10\n", + "4 123296179 0 1" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "targets_validation = pd.merge(baseline_predictions_validation, train_validation, on='id', how='left')['unit_sales']" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "def get_evaluation(baseline_predictions_validation, targets_validation):\n", + " predictions = baseline_predictions_validation.prediction_sales\n", + " cleaned_targets = clean_targets(targets_validation)\n", + " weights = baseline_predictions_validation.perishable\n", + " return nwrmsle(predictions, cleaned_targets, weights)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2003044,)\n", + "(2003044,) (2003044,) (2003044,)\n" + ] + } + ], + "source": [ + "validation_metric = get_evaluation(baseline_predictions_validation, targets_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.67828104376604859" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_metric" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation NWRMSLE: 0.67828\n", + "Validation MSE: 353.981\n" + ] + } + ], + "source": [ + "mse = mean_squared_error(baseline_predictions_validation['prediction_sales'], targets_validation)\n", + "print('Validation NWRMSLE: 
%.5f' % (validation_metric))\n", + "print('Validation MSE: %.3f' % (mse))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "on Leaderboard: 1.369" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.67828104376604859" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_metric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Empirical baseline model" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2017-07-25 00:00:00')" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_train.date.max()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-07-26 00:00:00 2017-08-10 00:00:00\n" + ] + } + ], + "source": [ + "print(train_validation.date.min(), train_validation.date.max())" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "begin = pd.to_datetime('2017-07-05')\n", + "end = pd.to_datetime('2017-07-20')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "period_to_use_validation = train_train[(train_train.date>=begin) & (train_train.date<=end)]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1673324, 6)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "period_to_use_validation.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbrunit_salesonpromotion
1210942881210942882017-07-0511035201.0False
1210942891210942892017-07-0511036654.0False
1210942901210942902017-07-0511055747.0False
1210942911210942912017-07-05110557515.0False
1210942921210942922017-07-0511055771.0False
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr unit_sales onpromotion\n", + "121094288 121094288 2017-07-05 1 103520 1.0 False\n", + "121094289 121094289 2017-07-05 1 103665 4.0 False\n", + "121094290 121094290 2017-07-05 1 105574 7.0 False\n", + "121094291 121094291 2017-07-05 1 105575 15.0 False\n", + "121094292 121094292 2017-07-05 1 105577 1.0 False" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "period_to_use_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-07-05 00:00:00 2017-07-20 00:00:00\n" + ] + } + ], + "source": [ + "print(period_to_use_validation.date.min(), period_to_use_validation.date.max())" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.99% of items in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.009870129870129871" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_percentage(period_to_use_validation, train_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.23% of (item,store) pairs in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.012323390147004183" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_store_pair_percentage(period_to_use_validation, train_validation )" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.13% of items in 
def get_item_expanded_df(test, items):
    """Left-join the item metadata in `items` onto `test` by 'item_nbr'."""
    return pd.merge(test, items, on='item_nbr', how='left')


def group_test_data(test, train, items_df=None):
    """Split `test` rows by how much of them was observed in `train`.

    Parameters
    ----------
    test : pandas.DataFrame
        Needs 'item_nbr' and 'store_nbr'; a 'unit_sales' column, if
        present, is ignored.
    train : pandas.DataFrame
        Needs 'item_nbr', 'store_nbr' and 'unit_sales'.
    items_df : pandas.DataFrame, optional
        Item metadata with 'item_nbr' and 'class'.  Defaults to the
        notebook-level global `items`, which is what the original
        implementation always used.

    Returns
    -------
    (test_seen, test_unseen_class_seen, test_unseen_class_unseen)
        * test_seen: rows whose (item_nbr, store_nbr) pair occurs in
          `train`; 'unit_sales' holds the mean train sales of that pair.
        * test_unseen_class_seen: rows whose pair is new but whose item
          class belongs to at least one item present in `train`; item
          metadata columns are appended.
        * test_unseen_class_unseen: the remaining rows with a new pair,
          including rows whose class is missing; item metadata columns
          are appended.
    """
    if items_df is None:
        items_df = items  # fall back to the notebook-level item table

    cols_item_store = ['item_nbr', 'store_nbr']
    train_pair_means = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()

    if 'unit_sales' in test.columns:
        cols_to_use = test.columns.drop('unit_sales')
    else:
        cols_to_use = test.columns
    joined = pd.merge(test[cols_to_use], train_pair_means, on=cols_item_store, how='left')

    # A pair is "seen" when the left join found a train mean for it.
    pair_seen = joined['unit_sales'].notnull()
    test_seen = joined[pair_seen]
    test_unseen_expanded = get_item_expanded_df(joined[~pair_seen], items_df)

    # Classes represented in train, looked up directly in the item table
    # instead of merging the whole (potentially huge) train frame with it.
    train_item_ids = train['item_nbr'].unique()
    train_classes = items_df.loc[items_df['item_nbr'].isin(train_item_ids), 'class'].dropna().unique()

    # One boolean mask and its complement give an exact partition of the
    # unseen rows; the former set arithmetic could place rows with a NaN
    # class into both groups.
    class_seen = test_unseen_expanded['class'].isin(train_classes)
    test_unseen_class_seen = test_unseen_expanded[class_seen]
    test_unseen_class_unseen = test_unseen_expanded[~class_seen]
    return test_seen, test_unseen_class_seen, test_unseen_class_unseen
"validation_seen.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(20691, 9)" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_unseen_class_seen.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5, 9)" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_unseen_class_unseen.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.012323390147004183" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(1679408-1658712)/1679408" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "test_seen, test_unseen_class_seen,test_unseen_class_unseen = group_test_data(test, train_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3370464, 5)" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2343376, 6)" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1019312, 9)" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_unseen_class_seen.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 
81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(7776, 9)" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_unseen_class_unseen.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.3047319300844038" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(3370464-2343376)/3370464" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['2017-07-05T00:00:00.000000000', '2017-07-06T00:00:00.000000000',\n", + " '2017-07-07T00:00:00.000000000', '2017-07-08T00:00:00.000000000',\n", + " '2017-07-09T00:00:00.000000000', '2017-07-10T00:00:00.000000000',\n", + " '2017-07-11T00:00:00.000000000', '2017-07-12T00:00:00.000000000',\n", + " '2017-07-13T00:00:00.000000000', '2017-07-14T00:00:00.000000000',\n", + " '2017-07-15T00:00:00.000000000', '2017-07-16T00:00:00.000000000',\n", + " '2017-07-17T00:00:00.000000000', '2017-07-18T00:00:00.000000000',\n", + " '2017-07-19T00:00:00.000000000', '2017-07-20T00:00:00.000000000'], dtype='datetime64[ns]')" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "period_to_use_validation.date.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['2017-07-26T00:00:00.000000000', '2017-07-27T00:00:00.000000000',\n", + " '2017-07-28T00:00:00.000000000', '2017-07-29T00:00:00.000000000',\n", + " '2017-07-30T00:00:00.000000000', '2017-07-31T00:00:00.000000000',\n", + " '2017-08-01T00:00:00.000000000', '2017-08-02T00:00:00.000000000',\n", + " '2017-08-03T00:00:00.000000000', '2017-08-04T00:00:00.000000000',\n", + " '2017-08-05T00:00:00.000000000', '2017-08-06T00:00:00.000000000',\n", + 
" '2017-08-07T00:00:00.000000000', '2017-08-08T00:00:00.000000000',\n", + " '2017-08-09T00:00:00.000000000', '2017-08-10T00:00:00.000000000'], dtype='datetime64[ns]')" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.date.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2017-07-05 00:00:00')" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.date.min() - pd.DateOffset(days=21)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbrunit_salesonpromotion
1210942881210942882017-07-0511035201.0False
1210942891210942892017-07-0511036654.0False
1210942901210942902017-07-0511055747.0False
1210942911210942912017-07-05110557515.0False
1210942921210942922017-07-0511055771.0False
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr unit_sales onpromotion\n", + "121094288 121094288 2017-07-05 1 103520 1.0 False\n", + "121094289 121094289 2017-07-05 1 103665 4.0 False\n", + "121094290 121094290 2017-07-05 1 105574 7.0 False\n", + "121094291 121094291 2017-07-05 1 105575 15.0 False\n", + "121094292 121094292 2017-07-05 1 105577 1.0 False" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "period_to_use_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "end_of_validation_period - pd.DateOffset(days=15)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "period_to_use_validation['prediction_date']=period_to_use_validation['date']+pd.DateOffset(days=21)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbrunit_salesonpromotionprediction_date
1210942881210942882017-07-0511035201.0False2017-07-26
1210942891210942892017-07-0511036654.0False2017-07-26
1210942901210942902017-07-0511055747.0False2017-07-26
1210942911210942912017-07-05110557515.0False2017-07-26
1210942921210942922017-07-0511055771.0False2017-07-26
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr unit_sales onpromotion \\\n", + "121094288 121094288 2017-07-05 1 103520 1.0 False \n", + "121094289 121094289 2017-07-05 1 103665 4.0 False \n", + "121094290 121094290 2017-07-05 1 105574 7.0 False \n", + "121094291 121094291 2017-07-05 1 105575 15.0 False \n", + "121094292 121094292 2017-07-05 1 105577 1.0 False \n", + "\n", + " prediction_date \n", + "121094288 2017-07-26 \n", + "121094289 2017-07-26 \n", + "121094290 2017-07-26 \n", + "121094291 2017-07-26 \n", + "121094292 2017-07-26 " + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "period_to_use_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "period_to_use_validation = period_to_use_validation.rename(columns={'date':'date_old', 'prediction_date':'date'})" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddate_oldstore_nbritem_nbrunit_salesonpromotiondate
1210942881210942882017-07-0511035201.0False2017-07-26
1210942891210942892017-07-0511036654.0False2017-07-26
1210942901210942902017-07-0511055747.0False2017-07-26
1210942911210942912017-07-05110557515.0False2017-07-26
1210942921210942922017-07-0511055771.0False2017-07-26
\n", + "
" + ], + "text/plain": [ + " id date_old store_nbr item_nbr unit_sales onpromotion \\\n", + "121094288 121094288 2017-07-05 1 103520 1.0 False \n", + "121094289 121094289 2017-07-05 1 103665 4.0 False \n", + "121094290 121094290 2017-07-05 1 105574 7.0 False \n", + "121094291 121094291 2017-07-05 1 105575 15.0 False \n", + "121094292 121094292 2017-07-05 1 105577 1.0 False \n", + "\n", + " date \n", + "121094288 2017-07-26 \n", + "121094289 2017-07-26 \n", + "121094290 2017-07-26 \n", + "121094291 2017-07-26 \n", + "121094292 2017-07-26 " + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "period_to_use_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbrunit_salesonpromotion
1232961751232961752017-07-2611035201.0False
1232961761232961762017-07-2611036654.0False
1232961771232961772017-07-2611055749.0False
1232961781232961782017-07-2611055756.0False
1232961791232961792017-07-2611056932.0True
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr unit_sales onpromotion\n", + "123296175 123296175 2017-07-26 1 103520 1.0 False\n", + "123296176 123296176 2017-07-26 1 103665 4.0 False\n", + "123296177 123296177 2017-07-26 1 105574 9.0 False\n", + "123296178 123296178 2017-07-26 1 105575 6.0 False\n", + "123296179 123296179 2017-07-26 1 105693 2.0 True" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "cols_to_use=['date', 'store_nbr', 'item_nbr', 'unit_sales']" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(142663, 3)" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen.groupby(['store_nbr', 'item_nbr'])['unit_sales'].mean().reset_index().shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(147405, 3)" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "period_to_use_validation.groupby(['store_nbr', 'item_nbr'])['unit_sales'].mean().reset_index().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "validation_join_period = pd.merge(train_validation, period_to_use_validation[cols_to_use].rename(columns={'unit_sales':'prediction_sales'}), on=['date', 'store_nbr', 'item_nbr'], how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbrunit_salesonpromotionprediction_sales
01232961752017-07-2611035201.0False1.0
11232961762017-07-2611036654.0False4.0
21232961772017-07-2611055749.0False7.0
31232961782017-07-2611055756.0False15.0
41232961792017-07-2611056932.0TrueNaN
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr unit_sales onpromotion \\\n", + "0 123296175 2017-07-26 1 103520 1.0 False \n", + "1 123296176 2017-07-26 1 103665 4.0 False \n", + "2 123296177 2017-07-26 1 105574 9.0 False \n", + "3 123296178 2017-07-26 1 105575 6.0 False \n", + "4 123296179 2017-07-26 1 105693 2.0 True \n", + "\n", + " prediction_sales \n", + "0 1.0 \n", + "1 4.0 \n", + "2 7.0 \n", + "3 15.0 \n", + "4 NaN " + ] + }, + "execution_count": 118, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_join_period.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [], + "source": [ + "validation_seen_prediction = validation_join_period[validation_join_period.prediction_sales.notnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbrunit_salesonpromotionprediction_sales
01232961752017-07-2611035201.0False1.0
11232961762017-07-2611036654.0False4.0
21232961772017-07-2611055749.0False7.0
31232961782017-07-2611055756.0False15.0
61232961812017-07-26110585710.0False5.0
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr unit_sales onpromotion \\\n", + "0 123296175 2017-07-26 1 103520 1.0 False \n", + "1 123296176 2017-07-26 1 103665 4.0 False \n", + "2 123296177 2017-07-26 1 105574 9.0 False \n", + "3 123296178 2017-07-26 1 105575 6.0 False \n", + "6 123296181 2017-07-26 1 105857 10.0 False \n", + "\n", + " prediction_sales \n", + "0 1.0 \n", + "1 4.0 \n", + "2 7.0 \n", + "3 15.0 \n", + "6 5.0 " + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen_prediction.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen_prediction.prediction_sales.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7997026333088803" + ] + }, + "execution_count": 124, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen_prediction.shape[0]/train_validation.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [], + "source": [ + "validation_seen_prediction_expanded = pd.merge(validation_seen_prediction, items, on='item_nbr', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbrunit_salesonpromotionprediction_sales
01232961752017-07-2611035201.0False1.0
11232961762017-07-2611036654.0False4.0
21232961772017-07-2611055749.0False7.0
31232961782017-07-2611055756.0False15.0
61232961812017-07-26110585710.0False5.0
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr unit_sales onpromotion \\\n", + "0 123296175 2017-07-26 1 103520 1.0 False \n", + "1 123296176 2017-07-26 1 103665 4.0 False \n", + "2 123296177 2017-07-26 1 105574 9.0 False \n", + "3 123296178 2017-07-26 1 105575 6.0 False \n", + "6 123296181 2017-07-26 1 105857 10.0 False \n", + "\n", + " prediction_sales \n", + "0 1.0 \n", + "1 4.0 \n", + "2 7.0 \n", + "3 15.0 \n", + "6 5.0 " + ] + }, + "execution_count": 125, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen_prediction.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [], + "source": [ + "def clean_targets(targets):\n", + " targets[targets<0]=0\n", + " return targets\n", + "def clean_predictions(predictions):\n", + " predictions[predictions<0]=0\n", + " return predictions.round().astype(int)\n", + "def nwrmsle(predictions, targets, weights):\n", + " print(targets.shape)\n", + " targets[targets<0]=0\n", + " weights = 1 + 0.25 * weights\n", + " print(predictions.shape, targets.shape, weights.shape)\n", + " log_square_errors = (np.log(predictions.values + 1) - np.log(targets.values + 1)) ** 2\n", + " return(np.sqrt(np.sum(weights.values * log_square_errors) / np.sum(weights)))\n", + "def get_evaluation(baseline_predictions_validation, targets_validation):\n", + " predictions = baseline_predictions_validation.prediction_sales\n", + " predictions = clean_predictions(predictions)\n", + " cleaned_targets = clean_targets(targets_validation)\n", + " weights = baseline_predictions_validation.perishable\n", + " return nwrmsle(predictions, cleaned_targets, weights)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 137, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"np.any(validation_seen_prediction_expanded.prediction_sales<0)" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:9: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1343027,)\n", + "(1343027,) (1343027,) (1343027,)\n" + ] + }, + { + "data": { + "text/plain": [ + "0.71883112494880497" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_evaluation(validation_seen_prediction_expanded, validation_seen_prediction_expanded['unit_sales'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Comparison with the old baseline model on the same subset: 1.1433622314160428" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1343027,)" + ] + },
+ "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen_prediction_expanded.id.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [], + "source": [ + "validation_seen_prediction_expanded['id'].to_csv('seen_empirical', index=False, header=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## For Test data" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['2017-07-26T00:00:00.000000000', '2017-07-27T00:00:00.000000000',\n", + " '2017-07-28T00:00:00.000000000', '2017-07-29T00:00:00.000000000',\n", + " '2017-07-30T00:00:00.000000000', '2017-07-31T00:00:00.000000000',\n", + " '2017-08-01T00:00:00.000000000', '2017-08-02T00:00:00.000000000',\n", + " '2017-08-03T00:00:00.000000000', '2017-08-04T00:00:00.000000000',\n", + " '2017-08-05T00:00:00.000000000', '2017-08-06T00:00:00.000000000',\n", + " '2017-08-07T00:00:00.000000000', '2017-08-08T00:00:00.000000000',\n", + " '2017-08-09T00:00:00.000000000', '2017-08-10T00:00:00.000000000'], dtype='datetime64[ns]')" + ] + }, + "execution_count": 143, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.date.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['2017-08-16T00:00:00.000000000', '2017-08-17T00:00:00.000000000',\n", + " '2017-08-18T00:00:00.000000000', '2017-08-19T00:00:00.000000000',\n", + " '2017-08-20T00:00:00.000000000', '2017-08-21T00:00:00.000000000',\n", + " '2017-08-22T00:00:00.000000000', '2017-08-23T00:00:00.000000000',\n", + " '2017-08-24T00:00:00.000000000', '2017-08-25T00:00:00.000000000',\n", + " '2017-08-26T00:00:00.000000000', '2017-08-27T00:00:00.000000000',\n", + " '2017-08-28T00:00:00.000000000', '2017-08-29T00:00:00.000000000',\n", 
+ " '2017-08-30T00:00:00.000000000', '2017-08-31T00:00:00.000000000'], dtype='datetime64[ns]')" + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.date.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [], + "source": [ + "train_validation['prediction_date']=train_validation['date']+pd.DateOffset(days=21)" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [], + "source": [ + "train_validation = train_validation.rename(columns={'date':'date_old', 'prediction_date':'date'})" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddate_oldstore_nbritem_nbrunit_salesonpromotiondate
1232961751232961752017-07-2611035201.0False2017-08-16
1232961761232961762017-07-2611036654.0False2017-08-16
1232961771232961772017-07-2611055749.0False2017-08-16
1232961781232961782017-07-2611055756.0False2017-08-16
1232961791232961792017-07-2611056932.0True2017-08-16
\n", + "
" + ], + "text/plain": [ + " id date_old store_nbr item_nbr unit_sales onpromotion \\\n", + "123296175 123296175 2017-07-26 1 103520 1.0 False \n", + "123296176 123296176 2017-07-26 1 103665 4.0 False \n", + "123296177 123296177 2017-07-26 1 105574 9.0 False \n", + "123296178 123296178 2017-07-26 1 105575 6.0 False \n", + "123296179 123296179 2017-07-26 1 105693 2.0 True \n", + "\n", + " date \n", + "123296175 2017-08-16 \n", + "123296176 2017-08-16 \n", + "123296177 2017-08-16 \n", + "123296178 2017-08-16 \n", + "123296179 2017-08-16 " + ] + }, + "execution_count": 147, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": {}, + "outputs": [], + "source": [ + "cols_to_use=['date', 'store_nbr', 'item_nbr', 'unit_sales']" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [], + "source": [ + "test_join_period = pd.merge(test, train_validation[cols_to_use].rename(columns={'unit_sales':'prediction_sales'}), on=['date', 'store_nbr', 'item_nbr'], how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbronpromotionprediction_sales
01254970402017-08-16196995FalseNaN
11254970412017-08-16199197FalseNaN
21254970422017-08-161103501FalseNaN
31254970432017-08-161103520False1.0
41254970442017-08-161103665False4.0
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr onpromotion prediction_sales\n", + "0 125497040 2017-08-16 1 96995 False NaN\n", + "1 125497041 2017-08-16 1 99197 False NaN\n", + "2 125497042 2017-08-16 1 103501 False NaN\n", + "3 125497043 2017-08-16 1 103520 False 1.0\n", + "4 125497044 2017-08-16 1 103665 False 4.0" + ] + }, + "execution_count": 150, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_join_period.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [], + "source": [ + "test_seen_prediction = test_join_period[test_join_period.prediction_sales.notnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbronpromotionprediction_sales
31254970432017-08-161103520False1.0
41254970442017-08-161103665False4.0
51254970452017-08-161105574False9.0
61254970462017-08-161105575False6.0
91254970492017-08-161105693False2.0
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr onpromotion prediction_sales\n", + "3 125497043 2017-08-16 1 103520 False 1.0\n", + "4 125497044 2017-08-16 1 103665 False 4.0\n", + "5 125497045 2017-08-16 1 105574 False 9.0\n", + "6 125497046 2017-08-16 1 105575 False 6.0\n", + "9 125497049 2017-08-16 1 105693 False 2.0" + ] + }, + "execution_count": 152, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen_prediction.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1677035, 6)" + ] + }, + "execution_count": 153, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen_prediction.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3370464, 5)" + ] + }, + "execution_count": 155, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.4975679906386776" + ] + }, + "execution_count": 154, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen_prediction.shape[0]/test.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": {}, + "outputs": [], + "source": [ + "test_seen_prediction_expanded = pd.merge(test_seen_prediction, items, on='item_nbr', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbronpromotionprediction_sales
31254970432017-08-161103520False1.0
41254970442017-08-161103665False4.0
51254970452017-08-161105574False9.0
61254970462017-08-161105575False6.0
91254970492017-08-161105693False2.0
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr onpromotion prediction_sales\n", + "3 125497043 2017-08-16 1 103520 False 1.0\n", + "4 125497044 2017-08-16 1 103665 False 4.0\n", + "5 125497045 2017-08-16 1 105574 False 9.0\n", + "6 125497046 2017-08-16 1 105575 False 6.0\n", + "9 125497049 2017-08-16 1 105693 False 2.0" + ] + }, + "execution_count": 160, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen_prediction.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 161, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.all(test_seen_prediction.prediction_sales>=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py:5088: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + 
"/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "test_seen_prediction['prediction_sales'] = clean_predictions(test_seen_prediction.prediction_sales)" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 163, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.all(test_seen_prediction.prediction_sales>=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "date datetime64[ns]\n", + "store_nbr int64\n", + "item_nbr int64\n", + "onpromotion bool\n", + "prediction_sales int64\n", + "dtype: object" + ] + }, + "execution_count": 164, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen_prediction.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1677035, 6)" + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen_prediction.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_predictions_test = pd.read_csv('baseline_predictions_test')" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012549704002
112549704103
212549704303
312549704414
412549704505
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 125497040 0 2\n", + "1 125497041 0 3\n", + "2 125497043 0 3\n", + "3 125497044 1 4\n", + "4 125497045 0 5" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [], + "source": [ + "test_predictions_subset = baseline_predictions_test[baseline_predictions_test.id.isin(test_seen_prediction.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1677035, 3)" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_predictions_subset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [], + "source": [ + "test_predictions_subset2 = baseline_predictions_test[~baseline_predictions_test.id.isin(test_seen_prediction.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1693429, 3)" + ] + }, + "execution_count": 171, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_predictions_subset2.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 172, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_test.shape[0]==test_predictions_subset.shape[0]+test_predictions_subset2.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
212549704303
312549704414
412549704505
5125497046010
712549704901
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "2 125497043 0 3\n", + "3 125497044 1 4\n", + "4 125497045 0 5\n", + "5 125497046 0 10\n", + "7 125497049 0 1" + ] + }, + "execution_count": 173, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_predictions_subset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbronpromotionprediction_sales
31254970432017-08-161103520False1
41254970442017-08-161103665False4
51254970452017-08-161105574False9
61254970462017-08-161105575False6
91254970492017-08-161105693False2
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr onpromotion prediction_sales\n", + "3 125497043 2017-08-16 1 103520 False 1\n", + "4 125497044 2017-08-16 1 103665 False 4\n", + "5 125497045 2017-08-16 1 105574 False 9\n", + "6 125497046 2017-08-16 1 105575 False 6\n", + "9 125497049 2017-08-16 1 105693 False 2" + ] + }, + "execution_count": 174, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen_prediction.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 182, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/indexing.py:601: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "test_predictions_subset.loc[:, 'prediction_sales']=test_seen_prediction['prediction_sales']" + ] + }, + { + "cell_type": "code", + "execution_count": 183, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
21254970430NaN
312549704411.0
412549704504.0
512549704609.0
71254970490NaN
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "2 125497043 0 NaN\n", + "3 125497044 1 1.0\n", + "4 125497045 0 4.0\n", + "5 125497046 0 9.0\n", + "7 125497049 0 NaN" + ] + }, + "execution_count": 183, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_predictions_subset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [], + "source": [ + "cols_to_use=['id', 'prediction_sales']" + ] + }, + { + "cell_type": "code", + "execution_count": 188, + "metadata": {}, + "outputs": [], + "source": [ + "new_test_predictions = test_seen_prediction[cols_to_use].append(test_predictions_subset2[cols_to_use])" + ] + }, + { + "cell_type": "code", + "execution_count": 189, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3370464, 2)" + ] + }, + "execution_count": 189, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_test_predictions .shape" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idprediction_sales
31254970431
41254970444
51254970459
61254970466
91254970492
\n", + "
" + ], + "text/plain": [ + " id prediction_sales\n", + "3 125497043 1\n", + "4 125497044 4\n", + "5 125497045 9\n", + "6 125497046 6\n", + "9 125497049 2" + ] + }, + "execution_count": 190, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_test_predictions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": {}, + "outputs": [], + "source": [ + "submission = new_test_predictions.rename( columns = {'prediction_sales':'unit_sales'})" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idunit_sales
31254970431
41254970444
51254970459
61254970466
91254970492
\n", + "
" + ], + "text/plain": [ + " id unit_sales\n", + "3 125497043 1\n", + "4 125497044 4\n", + "5 125497045 9\n", + "6 125497046 6\n", + "9 125497049 2" + ] + }, + "execution_count": 196, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "submission.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "metadata": {}, + "outputs": [], + "source": [ + "submission.to_csv('empirical_29nov17.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'end_of_validation_period' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mend_of_validation_period\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDateOffset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdays\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m15\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'end_of_validation_period' is not defined" + ] + } + ], + "source": [ + "cols_item_store = ['item_nbr', 'store_nbr']\n", + "cols_test_expanded = test_seen_expanded.columns\n", + "cols_prediction = ['id', 'unit_sales', 'perishable']\n", + "cols_to_use = cols_test_expanded.drop('unit_sales') if 'unit_sales' in cols_test_expanded else cols_test_expanded\n", + "test_join_train = pd.merge(test_seen_expanded[cols_to_use], train, on=cols_item_store, how='left')\n", + "is_data_seen = test_join_train_item_store_grouped['unit_sales'].notnull()\n", + "join_seen = test_join_train_item_store_grouped[is_data_seen.values]\n", + "# return join_seen[cols_prediction].rename(columns={'unit_sales':'prediction_sales'})" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "test_seen_expanded = get_item_expanded_df(validation_seen, items)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_empirical_baseline_prediction_test_seen(test_seen_expanded, train):\n", + " cols_item_store = ['item_nbr', 'store_nbr']\n", + " cols_test_expanded = test_seen_expanded.columns\n", + " cols_prediction = ['id', 'unit_sales', 'perishable']\n", + " train_item_store_grouped = train.\n", + " cols_to_use = cols_test_expanded.drop('unit_sales') if 'unit_sales' in cols_test_expanded else cols_test_expanded\n", + " test_join_train_item_store_grouped = pd.merge(test_seen_expanded[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n", + " is_data_seen = test_join_train_item_store_grouped['unit_sales'].notnull()\n", + " join_seen = test_join_train_item_store_grouped[is_data_seen.values]\n", + " return join_seen[cols_prediction].rename(columns={'unit_sales':'prediction_sales'})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submission" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "baseline_predictions_test = get_baseline_predictions(test, train)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012549704002
112549704103
212549704303
312549704414
412549704505
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 125497040 0 2\n", + "1 125497041 0 3\n", + "2 125497043 0 3\n", + "3 125497044 1 4\n", + "4 125497045 0 5" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "test_submission=baseline_predictions_test[['id', 'prediction_sales']].rename(columns={'prediction_sales':'unit_sales'})" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idunit_sales
01254970402
11254970413
21254970433
31254970444
41254970455
\n", + "
" + ], + "text/plain": [ + " id unit_sales\n", + "0 125497040 2\n", + "1 125497041 3\n", + "2 125497043 3\n", + "3 125497044 4\n", + "4 125497045 5" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_submission.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.all(test_submission.unit_sales>=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "unit_sales int64\n", + "dtype: object" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_submission.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "# test_submission.to_csv('baseline_submission_20171127.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation of each group" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "validation_seen, validation_unseen_class_seen,validation_unseen_class_unseen = group_test_data(train_validation, train_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "seen_baseline_predictions_validation = baseline_predictions_validation[ 
baseline_predictions_validation.id.isin(validation_seen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "unseen_class_seen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_unseen_class_seen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "unseen_class_unseen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_unseen_class_unseen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.406943132552255" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_validation.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.53563678093009" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_baseline_predictions_validation.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.756378630680307" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unseen_class_seen_baseline_predictions_validation.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.2" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unseen_class_unseen_baseline_predictions_validation.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + 
"outputs": [], + "source": [ + "seen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_seen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "unseen_class_seen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_unseen_class_seen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "unseen_class_unseen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_unseen_class_unseen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.350565382095759" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_test.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6.949336092844724" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_baseline_predictions_test.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.855705681997609" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unseen_class_seen_baseline_predictions_test.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.0" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unseen_class_unseen_baseline_predictions_test.prediction_sales.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"Validation and test look different in their mean predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.573084378706207" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_train.unit_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.554865268437672" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.unit_sales.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Why do seen items have a low mean in the test data?" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'perishable', 'prediction_sales'], dtype='object')" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_baseline_predictions_test.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "test_seen_mean = test_seen.groupby(['item_nbr','store_nbr'])['unit_sales'].mean().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.000000 1063\n", + "2.000000 387\n", + "1.500000 269\n", + "1.333333 211\n", + "3.000000 169\n", + "1.200000 142\n", + "1.666667 132\n", + "4.000000 110\n", + "1.250000 105\n", + "1.600000
90\n", + "1.400000 87\n", + "2.500000 83\n", + "1.166667 75\n", + "1.750000 74\n", + "2.333333 72\n", + "1.142857 70\n", + "1.800000 70\n", + "5.000000 69\n", + "1.714286 66\n", + "1.833333 65\n", + "1.428571 63\n", + "2.400000 61\n", + "2.666667 59\n", + "1.285714 58\n", + "1.571429 56\n", + "1.222222 55\n", + "6.000000 54\n", + "1.375000 53\n", + "3.333333 52\n", + "3.500000 52\n", + " ... \n", + "5.931522 1\n", + "5.983607 1\n", + "10.427236 1\n", + "6.328264 1\n", + "2.569665 1\n", + "4.409535 1\n", + "35.648855 1\n", + "4.074074 1\n", + "14.786531 1\n", + "1.416867 1\n", + "2.202128 1\n", + "89.903846 1\n", + "4.600505 1\n", + "2.375946 1\n", + "20.287293 1\n", + "4.557390 1\n", + "2.670120 1\n", + "2.405458 1\n", + "8.606928 1\n", + "2.559347 1\n", + "11.537549 1\n", + "4.129799 1\n", + "5.267884 1\n", + "12.886924 1\n", + "5.454833 1\n", + "15.486842 1\n", + "5.792963 1\n", + "4.558394 1\n", + "5.431433 1\n", + "3.750600 1\n", + "Name: unit_sales, Length: 139461, dtype: int64" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen_mean.unit_sales.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3370464, 5)" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2003044, 6)" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2659408, 6)" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen.shape" + ] + }, + { + 
"cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1672247, 6)" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "validation_seen_mean = validation_seen.groupby(['item_nbr', 'store_nbr'])['unit_sales'].mean().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.000000 196\n", + "2.000000 183\n", + "1.500000 148\n", + "1.333333 109\n", + "2.500000 71\n", + "1.250000 70\n", + "4.000000 67\n", + "3.000000 66\n", + "1.750000 57\n", + "1.200000 47\n", + "1.666667 47\n", + "5.000000 43\n", + "1.428571 41\n", + "1.571429 40\n", + "1.375000 39\n", + "1.166667 37\n", + "1.222222 34\n", + "2.250000 34\n", + "6.000000 34\n", + "1.666667 31\n", + "1.125000 31\n", + "1.285714 30\n", + "2.750000 30\n", + "2.333333 30\n", + "1.142857 30\n", + "1.444444 29\n", + "3.500000 28\n", + "2.666667 27\n", + "1.300000 27\n", + "1.625000 26\n", + " ... 
\n", + "5.414894 1\n", + "5.689526 1\n", + "10.992721 1\n", + "4.405322 1\n", + "2.751479 1\n", + "3.373206 1\n", + "1.844920 1\n", + "3.732102 1\n", + "1.887872 1\n", + "3.881868 1\n", + "6.143824 1\n", + "4.322581 1\n", + "3.486506 1\n", + "4.022444 1\n", + "1.949593 1\n", + "3.965174 1\n", + "6.439222 1\n", + "9.427083 1\n", + "5.120801 1\n", + "1.376404 1\n", + "2.192547 1\n", + "11.182663 1\n", + "6.961364 1\n", + "7.380435 1\n", + "17.839545 1\n", + "1.691099 1\n", + "9.129005 1\n", + "1.983607 1\n", + "25.861189 1\n", + "9.211574 1\n", + "Name: unit_sales, Length: 132023, dtype: int64" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen_mean.unit_sales.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(146060, 3)" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen_mean.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_predictions_validation.to_csv('baseline_predictions_validation', index=False)\n", + "baseline_predictions_test.to_csv('baseline_predictions_test', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/.ipynb_checkpoints/validation_strategy_20171127-checkpoint.ipynb 
b/.ipynb_checkpoints/validation_strategy_20171127-checkpoint.ipynb new file mode 100644 index 0000000..1e60e29 --- /dev/null +++ b/.ipynb_checkpoints/validation_strategy_20171127-checkpoint.ipynb @@ -0,0 +1,3325 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/vnd.plotly.v1+html": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "import plotly.offline as py\n", + "import plotly.graph_objs as go\n", + "py.init_notebook_mode()\n", + "import gc" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "seed = 46\n", + "np.random.seed(seed)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def load_data(data_path):\n", + " train = pd.read_csv('%s/train.csv' % data_path, parse_dates=['date'])\n", + " test = pd.read_csv('%s/test.csv' % data_path, parse_dates=['date'])\n", + " items = pd.read_csv('%s/items.csv' % data_path)\n", + " return train, test, items" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2821: DtypeWarning:\n", + "\n", + "Columns (5) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + "\n" + ] + } + ], + "source": [ + "data_path = './data'\n", + "train, test, items = load_data(data_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def print_summary_train_test(train, test):\n", + " print ('Train min/max date: %s / %s' % (train['date'].min(), train['date'].max()))\n", + " print ('Test min/max date: %s / %s' % ( test['date'].min(), test['date'].max()))\n", + " print ('')\n", + " print ('Number of days in train: %d' % ((train['date'].max() - train['date'].min()).days + 1))\n", + " print ('Number of days in validation: %d' % (( test['date'].max() - test['date'].min()).days + 1))\n", + " print ('')\n", + " print ('Train shape: %d rows' % train.shape[0])\n", + " print ('Test shape: %d rows' % test.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train min/max date: 2013-01-01 00:00:00 / 2017-08-15 00:00:00\n", + "Test min/max date: 2017-08-16 00:00:00 / 2017-08-31 00:00:00\n", + "\n", + "Number of days in train: 1688\n", + "Number of days in validation: 16\n", + "\n", + "Train shape: 125497040 rows\n", + "Test shape: 3370464 rows\n" + ] + } + ], + "source": [ + "print_summary_train_test(train, test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Divide Train data into Validation(last two weeks of train data) and Training(the rest)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "import splitter" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "train_last_date = train['date'].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-07-26 
00:00:00 2017-08-10 00:00:00\n" + ] + } + ], + "source": [ + "begin_of_validation, end_of_validation = splitter.get_validation_period(train_last_date)\n", + "print(begin_of_validation, end_of_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "train_train, train_validation = splitter.split_validation_train_by_validation_period(train, begin_of_validation, end_of_validation)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train min/max date: 2013-01-01 00:00:00 / 2017-07-25 00:00:00\n", + "Test min/max date: 2017-07-26 00:00:00 / 2017-08-10 00:00:00\n", + "\n", + "Number of days in train: 1667\n", + "Number of days in validation: 16\n", + "\n", + "Train shape: 123296175 rows\n", + "Test shape: 1679408 rows\n" + ] + } + ], + "source": [ + "print_summary_train_test(train_train, train_validation)\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation Metric" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import evaluation\n", + "from sklearn.metrics import mean_squared_error" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How many items in Test data set are not seen in Train data set \n", + "## vs. 
how many items in Validation are not seen in Training" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def get_unseen_item_percentage(train, test):\n", + " test_items = test['item_nbr'].unique()\n", + " train_items = train['item_nbr'].unique()\n", + " test_items_unseen_in_train = set(test_items) - set(train_items)\n", + " unseen_percentage = len(test_items_unseen_in_train)/len(test_items)\n", + " print(\"{:.2f}% of items in the test data set are not seen in the train data set\".format(unseen_percentage*100))\n", + " return unseen_percentage" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.54% of items in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.015380671622660855" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_percentage(train, test)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.55% of items in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.005454545454545455" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_percentage(train_train, train_validation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How many (item, store) in Test data set are not seen in Train data set\n", + "## vs. 
how many (item, store) in Validation are not seen in Training" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def get_unseen_item_store_pair_percentage(train, test):\n", + " cols_item_store = ['item_nbr', 'store_nbr']\n", + " train_item_store_grouped = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()\n", + " cols_to_use = test.columns.drop('unit_sales') if 'unit_sales' in test.columns else test.columns\n", + " test_join_train_item_store_grouped = pd.merge(test[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n", + " test_unseen = test_join_train_item_store_grouped[test_join_train_item_store_grouped['unit_sales'].isnull()]\n", + " unseen_percentage = test_unseen.shape[0]/test.shape[0]\n", + " print(\"{:.2f}% of (item,store) pairs in the test data set are not seen in the train data set\".format(unseen_percentage*100))\n", + " return unseen_percentage" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "21.10% of (item,store) pairs in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.21096679863662687" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_store_pair_percentage(train, test)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.13% of (item,store) pairs in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.0013326124443851642" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_store_pair_percentage(train_train, train_validation)" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "## Using constant prediction" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.563926854265649" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_train.unit_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.554865268437672" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.unit_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408, 6)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "train_validation_prediction = pd.DataFrame({'id': train_validation.loc[:, 'id'], 'prediction_sales': train_train.unit_sales.mean()})" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408, 2)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_prediction.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "test_prediction = pd.DataFrame({'id': test.loc[:, 'id'], 'prediction_sales': train.unit_sales.mean()})" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idprediction_sales
1232961751232961758.563927
1232961761232961768.563927
1232961771232961778.563927
1232961781232961788.563927
1232961791232961798.563927
\n", + "
" + ], + "text/plain": [ + " id prediction_sales\n", + "123296175 123296175 8.563927\n", + "123296176 123296176 8.563927\n", + "123296177 123296177 8.563927\n", + "123296178 123296178 8.563927\n", + "123296179 123296179 8.563927" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_prediction.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idprediction_sales
01254970408.554865
11254970418.554865
21254970428.554865
31254970438.554865
41254970448.554865
\n", + "
" + ], + "text/plain": [ + " id prediction_sales\n", + "0 125497040 8.554865\n", + "1 125497041 8.554865\n", + "2 125497042 8.554865\n", + "3 125497043 8.554865\n", + "4 125497044 8.554865" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_prediction.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def get_prediction_constant(train_train):\n", + " return train_train.unit_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "def get_clean_prediction(train_train, train_validation):\n", + " predictions = get_prediction_constant(train_train)\n", + " return pd.DataFrame({'id': train_validation.loc[:, 'id'], 'prediction_sales': predictions})" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "def get_item_expanded_df(test, items):\n", + " return pd.merge(test, items, on='item_nbr', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def get_evaluation_using_constant_baseline(train_train, train_validation):\n", + " train_validation_prediction = get_clean_prediction(train_train, train_validation)\n", + " train_validation_expanded = get_item_expanded_df(train_validation, items)\n", + " train_validation_weights = train_validation_expanded ['perishable']*0.25+1\n", + " return nwrmsle(train_validation_prediction['prediction_sales'], train_validation['unit_sales'], train_validation_weights)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "train_validation_expanded = 
get_item_expanded_df(train_validation, items)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "train_validation_weights = train_validation_expanded ['perishable']*0.25+1" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1.00\n", + "1 1.25\n", + "2 1.00\n", + "3 1.00\n", + "4 1.00\n", + "Name: perishable, dtype: float64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_weights.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def nwrmsle(predictions, targets, weights):\n", + " print(targets.shape)\n", + " targets[targets<0]=0\n", + " weights = 1 + 0.25 * weights\n", + " print(predictions.shape, targets.shape, weights.shape)\n", + " log_square_errors = (np.log(predictions.values + 1) - np.log(targets.values + 1)) ** 2\n", + " return(np.sqrt(np.sum(weights.values * log_square_errors) / np.sum(weights)))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408,)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_prediction['prediction_sales'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408, 2)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_prediction.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408,)" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"train_validation['unit_sales'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408, 6)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1679408,)\n", + "(1679408,) (1679408,) (1679408,)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py:5088: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "train_validation_metric = nwrmsle(train_validation_prediction['prediction_sales'], train_validation['unit_sales'], train_validation_weights)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0371859208825527" + ] + }, + 
"execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_metric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Leaderboard score: 1.710" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to improve the similarity between validation performance and test performance?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Strategy 1: Remove Items from Training data set" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbronpromotion
01254970402017-08-16196995False
11254970412017-08-16199197False
21254970422017-08-161103501False
31254970432017-08-161103520False
41254970442017-08-161103665False
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr onpromotion\n", + "0 125497040 2017-08-16 1 96995 False\n", + "1 125497041 2017-08-16 1 99197 False\n", + "2 125497042 2017-08-16 1 103501 False\n", + "3 125497043 2017-08-16 1 103520 False\n", + "4 125497044 2017-08-16 1 103665 False" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3901,)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.item_nbr.unique().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.007690335811330428" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "30/3901" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "def move_items_from_train_to_validation(train, validation, items_to_remove):\n", + " train2 = train[~train.item_nbr.isin(items_to_remove)]\n", + " validation_to_add = train[train.item_nbr.isin(items_to_remove)]\n", + " validation2 = validation.append(validation_to_add)\n", + " return train2, validation2\n", + "\n", + "\n", + "def move_random_items_from_train_to_validation(train, validation, num_items_to_remove):\n", + " train_items = train['item_nbr'].unique()\n", + " items_to_remove = np.random.choice(train_items, num_items_to_remove)\n", + " train2, validation2 = move_items_from_train_to_validation(train, validation, items_to_remove)\n", + " print(\"Moved {} items from train data to test data\".format(num_items_to_remove))\n", + " print(\"train data: {} -> {} rows\".format(train.shape[0], train2.shape[0]))\n", + " print(\"validation data: {} -> {} rows\".format(validation.shape[0], 
validation2.shape[0]))\n", + " return train2, validation2" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Moved 30 items from train data to test data\n", + "train data: 123296175 -> 122487773 rows\n", + "validation data: 1679408 -> 2487810 rows\n" + ] + } + ], + "source": [ + "num_items_to_move = 30\n", + "train_train2, train_validation2 = move_random_items_from_train_to_validation(train_train, train_validation, num_items_to_move)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "def move_random_items_from_train_to_validation(train, validation, num_items_to_remove):\n", + " train_items = train['item_nbr'].unique()\n", + " items_to_remove = np.random.choice(train_items, num_items_to_remove)\n", + " train2, validation2 = move_items_from_train_to_validation(train, validation, items_to_remove)\n", + " print(\"Moved {} items from train data to test data\".format(num_items_to_remove))\n", + " print(\"train data: {} -> {} rows\".format(train.shape[0], train2.shape[0]))\n", + " print(\"validation data: {} -> {} rows\".format(validation.shape[0], validation2.shape[0]))\n", + " return train2, validation2" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Moved 10 items from train data to test data\n", + "train data: 123296175 -> 122972539 rows\n", + "validation data: 1679408 -> 2003044 rows\n" + ] + } + ], + "source": [ + "num_items_to_remove = 10\n", + "train_train2, train_validation2 = move_random_items_from_train_to_validation(train_train, train_validation, num_items_to_remove)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Clear memory for previous train_train and train_validation" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "288" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_train = train_train2\n", + "train_validation = train_validation2\n", + "del train_train2\n", + "del train_validation2\n", + "gc.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.80% of items in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.00804985717995326" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_percentage(train_train, train_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16.51% of (item,store) pairs in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.16514714604372147" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_store_pair_percentage(train_train, train_validation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, many more items are unseen in the validation data" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2003044,)\n", + "(2003044,) (2003044,) (2003044,)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: 
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "train_validation_metric = get_evaluation_using_constant_baseline(train_train, train_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.033043851782858" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_metric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "On Leaderboard: 1.710" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Removing items hardly changed the validation score, which implies that we are doing a worse job of predicting seen items" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analyze the performance on each group: seen (item, store), seen class, unseen class" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Group test data into the three groups" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "def group_test_data(test, train):\n", + " def get_classes_from_expanded(df_expanded):\n", + " return df_expanded['class'].unique()\n", + " cols_item_store = ['item_nbr', 'store_nbr']\n", + " train_item_store_grouped = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()\n", + " cols_to_use = test.columns.drop('unit_sales') if 'unit_sales' in test.columns else test.columns\n", + " test_join_train_item_store_grouped = pd.merge(test[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n", + " test_seen = test_join_train_item_store_grouped[test_join_train_item_store_grouped['unit_sales'].notnull()]\n", + " test_unseen = test_join_train_item_store_grouped[test_join_train_item_store_grouped['unit_sales'].isnull()]\n", + " test_unseen_expanded = 
get_item_expanded_df(test_unseen, items)\n", + " test_unseen_class = get_classes_from_expanded(test_unseen_expanded)\n", + " train_expanded = get_item_expanded_df(train, items)\n", + " train_class = get_classes_from_expanded(train_expanded)\n", + " test_unseen_class_diff = set(test_unseen_class) - set(train_class)\n", + " test_unseen_class_same = set(test_unseen_class) - test_unseen_class_diff\n", + " test_unseen_class_seen = test_unseen_expanded[test_unseen_expanded['class'].isin(test_unseen_class_same)]\n", + " test_unseen_class_unseen = test_unseen_expanded[test_unseen_expanded['class'].isin(test_unseen_class_diff)]\n", + " return test_seen, test_unseen_class_seen, test_unseen_class_unseen" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "test_seen, test_unseen_class_seen,test_unseen_class_unseen = group_test_data(test, train)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" + ] + } + ], + "source": [ + "print(test.shape[0]== test_seen.shape[0]+ test_unseen_class_seen.shape[0]+test_unseen_class_unseen.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "def get_baseline_prediction_test_seen(test_seen_expanded, train):\n", + " cols_item_store = ['item_nbr', 'store_nbr']\n", + " cols_test_expanded = test_seen_expanded.columns\n", + " cols_prediction = ['id', 'unit_sales', 'perishable']\n", + " train_item_store_grouped = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()\n", + " cols_to_use = cols_test_expanded.drop('unit_sales') if 'unit_sales' in cols_test_expanded else cols_test_expanded\n", + " test_join_train_item_store_grouped = pd.merge(test_seen_expanded[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n", + " is_data_seen = 
test_join_train_item_store_grouped['unit_sales'].notnull()\n", + " join_seen = test_join_train_item_store_grouped[is_data_seen.values]\n", + " return join_seen[cols_prediction].rename(columns={'unit_sales':'prediction_sales'})" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "def get_baseline_prediction_test_unseen_class_seen(test_unseen_class_seen, train_expanded):\n", + " test_unseen_class_seen_class = test_unseen_class_seen['class'].unique()\n", + " train_sub = train_expanded[train_expanded['class'].isin(test_unseen_class_seen_class)]\n", + " train_sub_class_grouped = train_sub.groupby('class').mean().reset_index()\n", + " train_sub_class_grouped = train_sub_class_grouped[['class', 'unit_sales']]\n", + " test_unseen_class_seen_join_train_sub_class_grouped = pd.merge(test_unseen_class_seen, train_sub_class_grouped, on='class', how='left')\n", + " return test_unseen_class_seen_join_train_sub_class_grouped.drop('unit_sales_x', axis=1)\\\n", + " .rename(columns={'unit_sales_y':'prediction_sales'})\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "def get_baseline_prediction_test_unseen_class_unseen(test_unseen_class_unseen, train_expanded):\n", + " test_unseen_class_unseen_family = test_unseen_class_unseen['family'].unique()\n", + " train_sub = train_expanded[train_expanded['family'].isin(test_unseen_class_unseen_family)]\n", + " train_sub_family_grouped = train_sub.groupby('family').mean().reset_index()\n", + " train_sub_family_grouped = train_sub_family_grouped[['family', 'unit_sales']]\n", + " test_unseen_class_unseen_join_train_sub_family_grouped = pd.merge(test_unseen_class_unseen, train_sub_family_grouped, on='family', how='left')\n", + " return test_unseen_class_unseen_join_train_sub_family_grouped.drop('unit_sales_x', axis=1)\\\n", + " .rename(columns={'unit_sales_y':'prediction_sales'})\n", + " " + ] + }, + { + 
"cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "def clean_predictions(predictions):\n", + " predictions[predictions<0]=0\n", + " return predictions.round().astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def clean_targets(targets):\n", + " targets[targets<0]=0\n", + " return targets" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "def get_baseline_predictions(test, train):\n", + " cols_to_use =['id', 'perishable', 'prediction_sales']\n", + " test_seen, test_unseen_class_seen, test_unseen_class_unseen = group_test_data(test, train)\n", + " train_expanded = get_item_expanded_df(train, items)\n", + " test_seen_expanded = get_item_expanded_df(test_seen, items)\n", + " prediction_test_seen = get_baseline_prediction_test_seen(test_seen_expanded, train)[cols_to_use]\n", + " prediction_test_unseen_class_seen = get_baseline_prediction_test_unseen_class_seen(test_unseen_class_seen, train_expanded)[cols_to_use]\n", + " prediction_test_unseen_class_unseen = get_baseline_prediction_test_unseen_class_unseen(test_unseen_class_unseen, train_expanded)[cols_to_use]\n", + " baseline_predictions = prediction_test_seen.append(prediction_test_unseen_class_seen).append(prediction_test_unseen_class_unseen)\n", + " cleaned_predictions = clean_predictions(baseline_predictions['prediction_sales'])\n", + " baseline_predictions.loc[:, 'prediction_sales']=cleaned_predictions\n", + " return baseline_predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "# prediction_test_seen, prediction_test_unseen_class_seen, prediction_test_unseen_class_unseen=get_baseline_predictions(test, train)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": 
[ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "baseline_predictions_validation = get_baseline_predictions(train_validation, train_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012329617503
112329617614
212329617705
3123296178010
412329617901
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 123296175 0 3\n", + "1 123296176 1 4\n", + "2 123296177 0 5\n", + "3 123296178 0 10\n", + "4 123296179 0 1" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "targets_validation = pd.merge(baseline_predictions_validation, train_validation, on='id', how='left')['unit_sales']" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def get_evaluation(baseline_predictions_validation, targets_validation):\n", + " predictions = baseline_predictions_validation.prediction_sales\n", + " cleaned_targets = clean_targets(targets_validation)\n", + " weights = baseline_predictions_validation.perishable\n", + " return nwrmsle(predictions, cleaned_targets, weights)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2003044,)\n", + "(2003044,) (2003044,) (2003044,)\n" + ] + } + ], + "source": [ + "validation_metric = get_evaluation(baseline_predictions_validation, targets_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.67828104376604859" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_metric" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation NWRMSLE: 0.67828\n", + "Validation MSE: 353.981\n" + ] + } + ], + "source": [ + "mse = mean_squared_error(baseline_predictions_validation['prediction_sales'], targets_validation)\n", + "print('Validation NWRMSLE: 
%.5f' % (validation_metric))\n", + "print('Validation MSE: %.3f' % (mse))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "on Leaderboard: 1.369" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.67828104376604859" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_metric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submission" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "baseline_predictions_test = get_baseline_predictions(test, train)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012549704002
112549704103
212549704303
312549704414
412549704505
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 125497040 0 2\n", + "1 125497041 0 3\n", + "2 125497043 0 3\n", + "3 125497044 1 4\n", + "4 125497045 0 5" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "test_submission=baseline_predictions_test[['id', 'prediction_sales']].rename(columns={'prediction_sales':'unit_sales'})" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idunit_sales
01254970402
11254970413
21254970433
31254970444
41254970455
\n", + "
" + ], + "text/plain": [ + " id unit_sales\n", + "0 125497040 2\n", + "1 125497041 3\n", + "2 125497043 3\n", + "3 125497044 4\n", + "4 125497045 5" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_submission.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.all(test_submission.unit_sales>=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "unit_sales int64\n", + "dtype: object" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_submission.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "# test_submission.to_csv('baseline_submission_20171127.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation of each group" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "validation_seen, validation_unseen_class_seen,validation_unseen_class_unseen = group_test_data(train_validation, train_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "seen_baseline_predictions_validation = baseline_predictions_validation[ 
baseline_predictions_validation.id.isin(validation_seen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "unseen_class_seen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_unseen_class_seen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "unseen_class_unseen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_unseen_class_unseen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.406943132552255" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_validation.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.53563678093009" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_baseline_predictions_validation.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.756378630680307" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unseen_class_seen_baseline_predictions_validation.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.2" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unseen_class_unseen_baseline_predictions_validation.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + 
"outputs": [], + "source": [ + "seen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_seen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "unseen_class_seen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_unseen_class_seen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "unseen_class_unseen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_unseen_class_unseen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.350565382095759" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_test.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6.949336092844724" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_baseline_predictions_test.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.855705681997609" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unseen_class_seen_baseline_predictions_test.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.0" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unseen_class_unseen_baseline_predictions_test.prediction_sales.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"Validation and test look different in the means of their predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.573084378706207" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_train.unit_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.554865268437672" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.unit_sales.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Why do seen items have a low mean in the test data?" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'perishable', 'prediction_sales'], dtype='object')" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_baseline_predictions_test.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "test_seen_mean = test_seen.groupby(['item_nbr','store_nbr'])['unit_sales'].mean().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.000000 1063\n", + "2.000000 387\n", + "1.500000 269\n", + "1.333333 211\n", + "3.000000 169\n", + "1.200000 142\n", + "1.666667 132\n", + "4.000000 110\n", + "1.250000 105\n", + "1.600000 
90\n", + "1.400000 87\n", + "2.500000 83\n", + "1.166667 75\n", + "1.750000 74\n", + "2.333333 72\n", + "1.142857 70\n", + "1.800000 70\n", + "5.000000 69\n", + "1.714286 66\n", + "1.833333 65\n", + "1.428571 63\n", + "2.400000 61\n", + "2.666667 59\n", + "1.285714 58\n", + "1.571429 56\n", + "1.222222 55\n", + "6.000000 54\n", + "1.375000 53\n", + "3.333333 52\n", + "3.500000 52\n", + " ... \n", + "5.931522 1\n", + "5.983607 1\n", + "10.427236 1\n", + "6.328264 1\n", + "2.569665 1\n", + "4.409535 1\n", + "35.648855 1\n", + "4.074074 1\n", + "14.786531 1\n", + "1.416867 1\n", + "2.202128 1\n", + "89.903846 1\n", + "4.600505 1\n", + "2.375946 1\n", + "20.287293 1\n", + "4.557390 1\n", + "2.670120 1\n", + "2.405458 1\n", + "8.606928 1\n", + "2.559347 1\n", + "11.537549 1\n", + "4.129799 1\n", + "5.267884 1\n", + "12.886924 1\n", + "5.454833 1\n", + "15.486842 1\n", + "5.792963 1\n", + "4.558394 1\n", + "5.431433 1\n", + "3.750600 1\n", + "Name: unit_sales, Length: 139461, dtype: int64" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen_mean.unit_sales.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3370464, 5)" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2003044, 6)" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2659408, 6)" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen.shape" + ] + }, + { + 
"cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1672247, 6)" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "validation_seen_mean = validation_seen.groupby(['item_nbr', 'store_nbr'])['unit_sales'].mean().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.000000 196\n", + "2.000000 183\n", + "1.500000 148\n", + "1.333333 109\n", + "2.500000 71\n", + "1.250000 70\n", + "4.000000 67\n", + "3.000000 66\n", + "1.750000 57\n", + "1.200000 47\n", + "1.666667 47\n", + "5.000000 43\n", + "1.428571 41\n", + "1.571429 40\n", + "1.375000 39\n", + "1.166667 37\n", + "1.222222 34\n", + "2.250000 34\n", + "6.000000 34\n", + "1.666667 31\n", + "1.125000 31\n", + "1.285714 30\n", + "2.750000 30\n", + "2.333333 30\n", + "1.142857 30\n", + "1.444444 29\n", + "3.500000 28\n", + "2.666667 27\n", + "1.300000 27\n", + "1.625000 26\n", + " ... 
\n", + "5.414894 1\n", + "5.689526 1\n", + "10.992721 1\n", + "4.405322 1\n", + "2.751479 1\n", + "3.373206 1\n", + "1.844920 1\n", + "3.732102 1\n", + "1.887872 1\n", + "3.881868 1\n", + "6.143824 1\n", + "4.322581 1\n", + "3.486506 1\n", + "4.022444 1\n", + "1.949593 1\n", + "3.965174 1\n", + "6.439222 1\n", + "9.427083 1\n", + "5.120801 1\n", + "1.376404 1\n", + "2.192547 1\n", + "11.182663 1\n", + "6.961364 1\n", + "7.380435 1\n", + "17.839545 1\n", + "1.691099 1\n", + "9.129005 1\n", + "1.983607 1\n", + "25.861189 1\n", + "9.211574 1\n", + "Name: unit_sales, Length: 132023, dtype: int64" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen_mean.unit_sales.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(146060, 3)" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen_mean.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_predictions_validation.to_csv('baseline_predictions_validation', index=False)\n", + "baseline_predictions_test.to_csv('baseline_predictions_test', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012329617503
112329617614
212329617705
3123296178010
412329617901
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 123296175 0 3\n", + "1 123296176 1 4\n", + "2 123296177 0 5\n", + "3 123296178 0 10\n", + "4 123296179 0 1" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_baseline_predictions_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbrunit_salesonpromotion
1232961751232961752017-07-2611035201.0False
1232961761232961762017-07-2611036654.0False
1232961771232961772017-07-2611055749.0False
1232961781232961782017-07-2611055756.0False
1232961791232961792017-07-2611056932.0True
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr unit_sales onpromotion\n", + "123296175 123296175 2017-07-26 1 103520 1.0 False\n", + "123296176 123296176 2017-07-26 1 103665 4.0 False\n", + "123296177 123296177 2017-07-26 1 105574 9.0 False\n", + "123296178 123296178 2017-07-26 1 105575 6.0 False\n", + "123296179 123296179 2017-07-26 1 105693 2.0 True" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "seen_empirical = pd.read_csv('seen_empirical', header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
0123296175
1123296176
2123296177
3123296178
4123296181
\n", + "
" + ], + "text/plain": [ + " 0\n", + "0 123296175\n", + "1 123296176\n", + "2 123296177\n", + "3 123296178\n", + "4 123296181" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_empirical.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_predictions_validation = pd.read_csv('baseline_predictions_validation')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012329617503
112329617614
212329617705
3123296178010
412329617901
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 123296175 0 3\n", + "1 123296176 1 4\n", + "2 123296177 0 5\n", + "3 123296178 0 10\n", + "4 123296179 0 1" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_predictions_validation_subset = baseline_predictions_validation[baseline_predictions_validation.id.isin(seen_empirical[[0]][0])]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1343027, 3)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_validation_subset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "train_subset = train[train.id.isin(seen_empirical[[0]][0])]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py:5088: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + 
"/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1343027,)\n", + "(1343027,) (1343027,) (1343027,)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "1.1433622314160428" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_evaluation(baseline_predictions_validation_subset, train_subset['unit_sales'])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_predictions_test = pd.read_csv('baseline_predictions_test')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012549704002
112549704103
212549704303
312549704414
412549704505
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 125497040 0 2\n", + "1 125497041 0 3\n", + "2 125497043 0 3\n", + "3 125497044 1 4\n", + "4 125497045 0 5" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebook b/notebook new file mode 100644 index 0000000..1e60e29 --- /dev/null +++ b/notebook @@ -0,0 +1,3325 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/vnd.plotly.v1+html": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "import plotly.offline as py\n", + "import plotly.graph_objs as go\n", + "py.init_notebook_mode()\n", + "import gc" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "seed = 46\n", + "np.random.seed(seed)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "def load_data(data_path):\n", + " train = pd.read_csv('%s/train.csv' % data_path, parse_dates=['date'])\n", + " test = pd.read_csv('%s/test.csv' % data_path, parse_dates=['date'])\n", + " items = pd.read_csv('%s/items.csv' % 
data_path)\n", + " return train, test, items" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2821: DtypeWarning:\n", + "\n", + "Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "\n" + ] + } + ], + "source": [ + "data_path = './data'\n", + "train, test, items = load_data(data_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def print_summary_train_test(train, test):\n", + " print ('Train min/max date: %s / %s' % (train['date'].min(), train['date'].max()))\n", + " print ('Test min/max date: %s / %s' % ( test['date'].min(), test['date'].max()))\n", + " print ('')\n", + " print ('Number of days in train: %d' % ((train['date'].max() - train['date'].min()).days + 1))\n", + " print ('Number of days in validation: %d' % (( test['date'].max() - test['date'].min()).days + 1))\n", + " print ('')\n", + " print ('Train shape: %d rows' % train.shape[0])\n", + " print ('Test shape: %d rows' % test.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train min/max date: 2013-01-01 00:00:00 / 2017-08-15 00:00:00\n", + "Test min/max date: 2017-08-16 00:00:00 / 2017-08-31 00:00:00\n", + "\n", + "Number of days in train: 1688\n", + "Number of days in validation: 16\n", + "\n", + "Train shape: 125497040 rows\n", + "Test shape: 3370464 rows\n" + ] + } + ], + "source": [ + "print_summary_train_test(train, test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Divide Train data into Validation(last two weeks of train data) and Training(the rest)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + 
"outputs": [], + "source": [ + "import splitter" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "train_last_date = train['date'].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-07-26 00:00:00 2017-08-10 00:00:00\n" + ] + } + ], + "source": [ + "begin_of_validation, end_of_validation = splitter.get_validation_period(train_last_date)\n", + "print(begin_of_validation, end_of_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "train_train, train_validation = splitter.split_validation_train_by_validation_period(train, begin_of_validation, end_of_validation)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train min/max date: 2013-01-01 00:00:00 / 2017-07-25 00:00:00\n", + "Test min/max date: 2017-07-26 00:00:00 / 2017-08-10 00:00:00\n", + "\n", + "Number of days in train: 1667\n", + "Number of days in validation: 16\n", + "\n", + "Train shape: 123296175 rows\n", + "Test shape: 1679408 rows\n" + ] + } + ], + "source": [ + "print_summary_train_test(train_train, train_validation)\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation Metric" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import evaluation\n", + "from sklearn.metrics import mean_squared_error" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How many items in Test data set are not seen in Train data set \n", + "## vs. 
how many items in Validation are not seen in Training" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def get_unseen_item_percentage(train, test):\n", + " test_items = test['item_nbr'].unique()\n", + " train_items = train['item_nbr'].unique()\n", + " test_items_unseen_in_train = set(test_items) - set(train_items)\n", + " unseen_percentage = len(test_items_unseen_in_train)/len(test_items)\n", + " print(\"{:.2f}% of items in the test data set are not seen in the train data set\".format(unseen_percentage*100))\n", + " return unseen_percentage" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.54% of items in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.015380671622660855" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_percentage(train, test)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.55% of items in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.005454545454545455" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_percentage(train_train, train_validation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How many (item, store) in Test data set are not seen in Train data set\n", + "## vs. 
how many (item, store) in Validation are not seen in Training" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def get_unseen_item_store_pair_percentage(train, test):\n", + " cols_item_store = ['item_nbr', 'store_nbr']\n", + " train_item_store_grouped = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()\n", + " cols_to_use = test.columns.drop('unit_sales') if 'unit_sales' in test.columns else test.columns\n", + " test_join_train_item_store_grouped = pd.merge(test[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n", + " test_unseen = test_join_train_item_store_grouped[test_join_train_item_store_grouped['unit_sales'].isnull()]\n", + " unseen_percentage = test_unseen.shape[0]/test.shape[0]\n", + " print(\"{:.2f}% of (item,store) pairs in the test data set are not seen in the train data set\".format(unseen_percentage*100))\n", + " return unseen_percentage" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "21.10% of (item,store) pairs in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.21096679863662687" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_store_pair_percentage(train, test)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.13% of (item,store) pairs in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.0013326124443851642" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_store_pair_percentage(train_train, train_validation)" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "## Using constant prediction" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.563926854265649" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_train.unit_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.554865268437672" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.unit_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408, 6)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "train_validation_prediction = pd.DataFrame({'id': train_validation.loc[:, 'id'], 'prediction_sales': train_train.unit_sales.mean()})" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408, 2)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_prediction.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "test_prediction = pd.DataFrame({'id': test.loc[:, 'id'], 'prediction_sales': train.unit_sales.mean()})" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idprediction_sales
1232961751232961758.563927
1232961761232961768.563927
1232961771232961778.563927
1232961781232961788.563927
1232961791232961798.563927
\n", + "
" + ], + "text/plain": [ + " id prediction_sales\n", + "123296175 123296175 8.563927\n", + "123296176 123296176 8.563927\n", + "123296177 123296177 8.563927\n", + "123296178 123296178 8.563927\n", + "123296179 123296179 8.563927" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_prediction.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idprediction_sales
01254970408.554865
11254970418.554865
21254970428.554865
31254970438.554865
41254970448.554865
\n", + "
" + ], + "text/plain": [ + " id prediction_sales\n", + "0 125497040 8.554865\n", + "1 125497041 8.554865\n", + "2 125497042 8.554865\n", + "3 125497043 8.554865\n", + "4 125497044 8.554865" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_prediction.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def get_prediction_constant(train_train):\n", + " return train_train.unit_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "def get_clean_prediction(train_train, train_validation):\n", + " predictions = get_prediction_constant(train_train)\n", + " return pd.DataFrame({'id': train_validation.loc[:, 'id'], 'prediction_sales': predictions})" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "def get_item_expanded_df(test, items):\n", + " return pd.merge(test, items, on='item_nbr', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "def get_evaluation_using_constant_baseline(train_train, train_validation):\n", + " train_validation_prediction = get_clean_prediction(train_train, train_validation)\n", + " train_validation_expanded = get_item_expanded_df(train_validation, items)\n", + " train_validation_weights = train_validation_expanded ['perishable']*0.25+1\n", + " return nwrmsle(train_validation_prediction['prediction_sales'], train_validation['unit_sales'], train_validation_weights)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "train_validation_expanded = 
get_item_expanded_df(train_validation, items)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "train_validation_weights = train_validation_expanded ['perishable']*0.25+1" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1.00\n", + "1 1.25\n", + "2 1.00\n", + "3 1.00\n", + "4 1.00\n", + "Name: perishable, dtype: float64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_weights.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def nwrmsle(predictions, targets, weights):\n", + " print(targets.shape)\n", + " targets[targets<0]=0\n", + " weights = 1 + 0.25 * weights\n", + " print(predictions.shape, targets.shape, weights.shape)\n", + " log_square_errors = (np.log(predictions.values + 1) - np.log(targets.values + 1)) ** 2\n", + " return(np.sqrt(np.sum(weights.values * log_square_errors) / np.sum(weights)))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408,)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_prediction['prediction_sales'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408, 2)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_prediction.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408,)" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"train_validation['unit_sales'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1679408, 6)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1679408,)\n", + "(1679408,) (1679408,) (1679408,)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py:5088: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "train_validation_metric = nwrmsle(train_validation_prediction['prediction_sales'], train_validation['unit_sales'], train_validation_weights)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0371859208825527" + ] + }, + 
"execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_metric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In Leaderboard 1.710" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to improve the similarity between validation performance and test performance?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Strategy 1: Remove Items from Training data set" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbronpromotion
01254970402017-08-16196995False
11254970412017-08-16199197False
21254970422017-08-161103501False
31254970432017-08-161103520False
41254970442017-08-161103665False
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr onpromotion\n", + "0 125497040 2017-08-16 1 96995 False\n", + "1 125497041 2017-08-16 1 99197 False\n", + "2 125497042 2017-08-16 1 103501 False\n", + "3 125497043 2017-08-16 1 103520 False\n", + "4 125497044 2017-08-16 1 103665 False" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3901,)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.item_nbr.unique().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.007690335811330428" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "30/3901" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "def move_items_from_train_to_validation(train, validation, items_to_remove):\n", + " train2 = train[~train.item_nbr.isin(items_to_remove)]\n", + " validation_to_add = train[train.item_nbr.isin(items_to_remove)]\n", + " validation2 = validation.append(validation_to_add)\n", + " return train2, validation2\n", + "\n", + "\n", + "def move_random_items_from_train_to_validation(train, validation, num_items_to_remove):\n", + " train_items = train['item_nbr'].unique()\n", + " items_to_remove = np.random.choice(train_items, num_items_to_remove)\n", + " train2, validation2 = move_items_from_train_to_validation(train, validation, items_to_remove)\n", + " print(\"Moved {} items from train data to test data\".format(num_items_to_remove))\n", + " print(\"train data: {} -> {} rows\".format(train.shape[0], train2.shape[0]))\n", + " print(\"validation data: {} -> {} rows\".format(validation.shape[0], 
validation2.shape[0]))\n", + " return train2, validation2" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Moved 30 items from train data to test data\n", + "train data: 123296175 -> 122487773 rows\n", + "validation data: 1679408 -> 2487810 rows\n" + ] + } + ], + "source": [ + "num_items_to_move = 30\n", + "train_train2, train_validation2 = move_random_items_from_train_to_validation(train_train, train_validation, num_items_to_move)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "def move_random_items_from_train_to_validation(train, validation, num_items_to_remove):\n", + " train_items = train['item_nbr'].unique()\n", + " items_to_remove = np.random.choice(train_items, num_items_to_remove)\n", + " train2, validation2 = move_items_from_train_to_validation(train, validation, items_to_remove)\n", + " print(\"Moved {} items from train data to test data\".format(num_items_to_remove))\n", + " print(\"train data: {} -> {} rows\".format(train.shape[0], train2.shape[0]))\n", + " print(\"validation data: {} -> {} rows\".format(validation.shape[0], validation2.shape[0]))\n", + " return train2, validation2" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Moved 10 items from train data to test data\n", + "train data: 123296175 -> 122972539 rows\n", + "validation data: 1679408 -> 2003044 rows\n" + ] + } + ], + "source": [ + "num_items_to_remove = 10\n", + "train_train2, train_validation2 = move_random_items_from_train_to_validation(train_train, train_validation, num_items_to_remove)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Clear memory for previous train_train and train_validation" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/plain": [ + "288" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_train = train_train2\n", + "train_validation = train_validation2\n", + "del train_train2\n", + "del train_validation2\n", + "gc.collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.80% of items in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.00804985717995326" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_percentage(train_train, train_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16.51% of (item,store) pairs in the test data set are not seen in the train data set\n" + ] + }, + { + "data": { + "text/plain": [ + "0.16514714604372147" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_unseen_item_store_pair_percentage(train_train, train_validation)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, much more items are unseen in validation data" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2003044,)\n", + "(2003044,) (2003044,) (2003044,)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: 
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "train_validation_metric = get_evaluation_using_constant_baseline(train_train, train_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.033043851782858" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation_metric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In Leaderboard 1.710" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Removing items hardly changed the validation score, which implies that we are doing worse job on predicting seen items" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analyze the performance on each group: seen (item, store), seen class, unseen class" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Group test data into the three groups" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "def group_test_data(test, train):\n", + " def get_classes_from_expanded(df_expanded):\n", + " return df_expanded['class'].unique()\n", + " cols_item_store = ['item_nbr', 'store_nbr']\n", + " train_item_store_grouped = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()\n", + " cols_to_use = test.columns.drop('unit_sales') if 'unit_sales' in test.columns else test.columns\n", + " test_join_train_item_store_grouped = pd.merge(test[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n", + " test_seen = test_join_train_item_store_grouped[test_join_train_item_store_grouped['unit_sales'].notnull()]\n", + " test_unseen = test_join_train_item_store_grouped[test_join_train_item_store_grouped['unit_sales'].isnull()]\n", + " test_unseen_expanded = 
get_item_expanded_df(test_unseen, items)\n", + " test_unseen_class = get_classes_from_expanded(test_unseen_expanded)\n", + " train_expanded = get_item_expanded_df(train, items)\n", + " train_class = get_classes_from_expanded(train_expanded)\n", + " test_unseen_class_diff = set(test_unseen_class) - set(train_class)\n", + " test_unseen_class_same = set(test_unseen_class) - test_unseen_class_diff\n", + " test_unseen_class_seen = test_unseen_expanded[test_unseen_expanded['class'].isin(test_unseen_class_same)]\n", + " test_unseen_class_unseen = test_unseen_expanded[test_unseen_expanded['class'].isin(test_unseen_class_diff)]\n", + " return test_seen, test_unseen_class_seen, test_unseen_class_unseen" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "test_seen, test_unseen_class_seen,test_unseen_class_unseen = group_test_data(test, train)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" + ] + } + ], + "source": [ + "print(test.shape[0]== test_seen.shape[0]+ test_unseen_class_seen.shape[0]+test_unseen_class_unseen.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "def get_baseline_prediction_test_seen(test_seen_expanded, train):\n", + " cols_item_store = ['item_nbr', 'store_nbr']\n", + " cols_test_expanded = test_seen_expanded.columns\n", + " cols_prediction = ['id', 'unit_sales', 'perishable']\n", + " train_item_store_grouped = train.groupby(cols_item_store)['unit_sales'].mean().reset_index()\n", + " cols_to_use = cols_test_expanded.drop('unit_sales') if 'unit_sales' in cols_test_expanded else cols_test_expanded\n", + " test_join_train_item_store_grouped = pd.merge(test_seen_expanded[cols_to_use], train_item_store_grouped, on=cols_item_store, how='left')\n", + " is_data_seen = 
test_join_train_item_store_grouped['unit_sales'].notnull()\n", + " join_seen = test_join_train_item_store_grouped[is_data_seen.values]\n", + " return join_seen[cols_prediction].rename(columns={'unit_sales':'prediction_sales'})" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "def get_baseline_prediction_test_unseen_class_seen(test_unseen_class_seen, train_expanded):\n", + " test_unseen_class_seen_class = test_unseen_class_seen['class'].unique()\n", + " train_sub = train_expanded[train_expanded['class'].isin(test_unseen_class_seen_class)]\n", + " train_sub_class_grouped = train_sub.groupby('class').mean().reset_index()\n", + " train_sub_class_grouped = train_sub_class_grouped[['class', 'unit_sales']]\n", + " test_unseen_class_seen_join_train_sub_class_grouped = pd.merge(test_unseen_class_seen, train_sub_class_grouped, on='class', how='left')\n", + " return test_unseen_class_seen_join_train_sub_class_grouped.drop('unit_sales_x', axis=1)\\\n", + " .rename(columns={'unit_sales_y':'prediction_sales'})\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "def get_baseline_prediction_test_unseen_class_unseen(test_unseen_class_unseen, train_expanded):\n", + " test_unseen_class_unseen_family = test_unseen_class_unseen['family'].unique()\n", + " train_sub = train_expanded[train_expanded['family'].isin(test_unseen_class_unseen_family)]\n", + " train_sub_family_grouped = train_sub.groupby('family').mean().reset_index()\n", + " train_sub_family_grouped = train_sub_family_grouped[['family', 'unit_sales']]\n", + " test_unseen_class_unseen_join_train_sub_family_grouped = pd.merge(test_unseen_class_unseen, train_sub_family_grouped, on='family', how='left')\n", + " return test_unseen_class_unseen_join_train_sub_family_grouped.drop('unit_sales_x', axis=1)\\\n", + " .rename(columns={'unit_sales_y':'prediction_sales'})\n", + " " + ] + }, + { + 
"cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "def clean_predictions(predictions):\n", + " predictions[predictions<0]=0\n", + " return predictions.round().astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def clean_targets(targets):\n", + " targets[targets<0]=0\n", + " return targets" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "def get_baseline_predictions(test, train):\n", + " cols_to_use =['id', 'perishable', 'prediction_sales']\n", + " test_seen, test_unseen_class_seen, test_unseen_class_unseen = group_test_data(test, train)\n", + " train_expanded = get_item_expanded_df(train, items)\n", + " test_seen_expanded = get_item_expanded_df(test_seen, items)\n", + " prediction_test_seen = get_baseline_prediction_test_seen(test_seen_expanded, train)[cols_to_use]\n", + " prediction_test_unseen_class_seen = get_baseline_prediction_test_unseen_class_seen(test_unseen_class_seen, train_expanded)[cols_to_use]\n", + " prediction_test_unseen_class_unseen = get_baseline_prediction_test_unseen_class_unseen(test_unseen_class_unseen, train_expanded)[cols_to_use]\n", + " baseline_predictions = prediction_test_seen.append(prediction_test_unseen_class_seen).append(prediction_test_unseen_class_unseen)\n", + " cleaned_predictions = clean_predictions(baseline_predictions['prediction_sales'])\n", + " baseline_predictions.loc[:, 'prediction_sales']=cleaned_predictions\n", + " return baseline_predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "# prediction_test_seen, prediction_test_unseen_class_seen, prediction_test_unseen_class_unseen=get_baseline_predictions(test, train)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": 
[ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "baseline_predictions_validation = get_baseline_predictions(train_validation, train_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012329617503
112329617614
212329617705
3123296178010
412329617901
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 123296175 0 3\n", + "1 123296176 1 4\n", + "2 123296177 0 5\n", + "3 123296178 0 10\n", + "4 123296179 0 1" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "targets_validation = pd.merge(baseline_predictions_validation, train_validation, on='id', how='left')['unit_sales']" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def get_evaluation(baseline_predictions_validation, targets_validation):\n", + " predictions = baseline_predictions_validation.prediction_sales\n", + " cleaned_targets = clean_targets(targets_validation)\n", + " weights = baseline_predictions_validation.perishable\n", + " return nwrmsle(predictions, cleaned_targets, weights)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2003044,)\n", + "(2003044,) (2003044,) (2003044,)\n" + ] + } + ], + "source": [ + "validation_metric = get_evaluation(baseline_predictions_validation, targets_validation)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.67828104376604859" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_metric" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation NWRMSLE: 0.67828\n", + "Validation MSE: 353.981\n" + ] + } + ], + "source": [ + "mse = mean_squared_error(baseline_predictions_validation['prediction_sales'], targets_validation)\n", + "print('Validation NWRMSLE: 
%.5f' % (validation_metric))\n", + "print('Validation MSE: %.3f' % (mse))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "on Leaderboard: 1.369" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.67828104376604859" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_metric" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Submission" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + } + ], + "source": [ + "baseline_predictions_test = get_baseline_predictions(test, train)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012549704002
112549704103
212549704303
312549704414
412549704505
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 125497040 0 2\n", + "1 125497041 0 3\n", + "2 125497043 0 3\n", + "3 125497044 1 4\n", + "4 125497045 0 5" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "test_submission=baseline_predictions_test[['id', 'prediction_sales']].rename(columns={'prediction_sales':'unit_sales'})" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idunit_sales
01254970402
11254970413
21254970433
31254970444
41254970455
\n", + "
" + ], + "text/plain": [ + " id unit_sales\n", + "0 125497040 2\n", + "1 125497041 3\n", + "2 125497043 3\n", + "3 125497044 4\n", + "4 125497045 5" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_submission.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.all(test_submission.unit_sales>=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "unit_sales int64\n", + "dtype: object" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_submission.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "# test_submission.to_csv('baseline_submission_20171127.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation of each group" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "validation_seen, validation_unseen_class_seen,validation_unseen_class_unseen = group_test_data(train_validation, train_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "seen_baseline_predictions_validation = baseline_predictions_validation[ 
baseline_predictions_validation.id.isin(validation_seen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "unseen_class_seen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_unseen_class_seen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "unseen_class_unseen_baseline_predictions_validation = baseline_predictions_validation[ baseline_predictions_validation.id.isin(validation_unseen_class_unseen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.406943132552255" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_validation.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.53563678093009" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_baseline_predictions_validation.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.756378630680307" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unseen_class_seen_baseline_predictions_validation.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.2" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unseen_class_unseen_baseline_predictions_validation.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + 
"outputs": [], + "source": [ + "seen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_seen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "unseen_class_seen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_unseen_class_seen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "unseen_class_unseen_baseline_predictions_test = baseline_predictions_test[ baseline_predictions_test.id.isin(test_unseen_class_unseen.id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.350565382095759" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_test.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6.949336092844724" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_baseline_predictions_test.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.855705681997609" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unseen_class_seen_baseline_predictions_test.prediction_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7.0" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unseen_class_unseen_baseline_predictions_test.prediction_sales.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"Validation and test look different in the means of their predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.573084378706207" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_train.unit_sales.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8.554865268437672" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train.unit_sales.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Why do seen items have a low mean in the test data?" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'perishable', 'prediction_sales'], dtype='object')" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_baseline_predictions_test.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "test_seen_mean = test_seen.groupby(['item_nbr','store_nbr'])['unit_sales'].mean().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.000000 1063\n", + "2.000000 387\n", + "1.500000 269\n", + "1.333333 211\n", + "3.000000 169\n", + "1.200000 142\n", + "1.666667 132\n", + "4.000000 110\n", + "1.250000 105\n", + "1.600000 
90\n", + "1.400000 87\n", + "2.500000 83\n", + "1.166667 75\n", + "1.750000 74\n", + "2.333333 72\n", + "1.142857 70\n", + "1.800000 70\n", + "5.000000 69\n", + "1.714286 66\n", + "1.833333 65\n", + "1.428571 63\n", + "2.400000 61\n", + "2.666667 59\n", + "1.285714 58\n", + "1.571429 56\n", + "1.222222 55\n", + "6.000000 54\n", + "1.375000 53\n", + "3.333333 52\n", + "3.500000 52\n", + " ... \n", + "5.931522 1\n", + "5.983607 1\n", + "10.427236 1\n", + "6.328264 1\n", + "2.569665 1\n", + "4.409535 1\n", + "35.648855 1\n", + "4.074074 1\n", + "14.786531 1\n", + "1.416867 1\n", + "2.202128 1\n", + "89.903846 1\n", + "4.600505 1\n", + "2.375946 1\n", + "20.287293 1\n", + "4.557390 1\n", + "2.670120 1\n", + "2.405458 1\n", + "8.606928 1\n", + "2.559347 1\n", + "11.537549 1\n", + "4.129799 1\n", + "5.267884 1\n", + "12.886924 1\n", + "5.454833 1\n", + "15.486842 1\n", + "5.792963 1\n", + "4.558394 1\n", + "5.431433 1\n", + "3.750600 1\n", + "Name: unit_sales, Length: 139461, dtype: int64" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen_mean.unit_sales.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3370464, 5)" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2003044, 6)" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2659408, 6)" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_seen.shape" + ] + }, + { + 
"cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1672247, 6)" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'date', 'store_nbr', 'item_nbr', 'onpromotion', 'unit_sales'], dtype='object')" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "validation_seen_mean = validation_seen.groupby(['item_nbr', 'store_nbr'])['unit_sales'].mean().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.000000 196\n", + "2.000000 183\n", + "1.500000 148\n", + "1.333333 109\n", + "2.500000 71\n", + "1.250000 70\n", + "4.000000 67\n", + "3.000000 66\n", + "1.750000 57\n", + "1.200000 47\n", + "1.666667 47\n", + "5.000000 43\n", + "1.428571 41\n", + "1.571429 40\n", + "1.375000 39\n", + "1.166667 37\n", + "1.222222 34\n", + "2.250000 34\n", + "6.000000 34\n", + "1.666667 31\n", + "1.125000 31\n", + "1.285714 30\n", + "2.750000 30\n", + "2.333333 30\n", + "1.142857 30\n", + "1.444444 29\n", + "3.500000 28\n", + "2.666667 27\n", + "1.300000 27\n", + "1.625000 26\n", + " ... 
\n", + "5.414894 1\n", + "5.689526 1\n", + "10.992721 1\n", + "4.405322 1\n", + "2.751479 1\n", + "3.373206 1\n", + "1.844920 1\n", + "3.732102 1\n", + "1.887872 1\n", + "3.881868 1\n", + "6.143824 1\n", + "4.322581 1\n", + "3.486506 1\n", + "4.022444 1\n", + "1.949593 1\n", + "3.965174 1\n", + "6.439222 1\n", + "9.427083 1\n", + "5.120801 1\n", + "1.376404 1\n", + "2.192547 1\n", + "11.182663 1\n", + "6.961364 1\n", + "7.380435 1\n", + "17.839545 1\n", + "1.691099 1\n", + "9.129005 1\n", + "1.983607 1\n", + "25.861189 1\n", + "9.211574 1\n", + "Name: unit_sales, Length: 132023, dtype: int64" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen_mean.unit_sales.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(146060, 3)" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "validation_seen_mean.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_predictions_validation.to_csv('baseline_predictions_validation', index=False)\n", + "baseline_predictions_test.to_csv('baseline_predictions_test', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012329617503
112329617614
212329617705
3123296178010
412329617901
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 123296175 0 3\n", + "1 123296176 1 4\n", + "2 123296177 0 5\n", + "3 123296178 0 10\n", + "4 123296179 0 1" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_baseline_predictions_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatestore_nbritem_nbrunit_salesonpromotion
1232961751232961752017-07-2611035201.0False
1232961761232961762017-07-2611036654.0False
1232961771232961772017-07-2611055749.0False
1232961781232961782017-07-2611055756.0False
1232961791232961792017-07-2611056932.0True
\n", + "
" + ], + "text/plain": [ + " id date store_nbr item_nbr unit_sales onpromotion\n", + "123296175 123296175 2017-07-26 1 103520 1.0 False\n", + "123296176 123296176 2017-07-26 1 103665 4.0 False\n", + "123296177 123296177 2017-07-26 1 105574 9.0 False\n", + "123296178 123296178 2017-07-26 1 105575 6.0 False\n", + "123296179 123296179 2017-07-26 1 105693 2.0 True" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "seen_empirical = pd.read_csv('seen_empirical', header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
0123296175
1123296176
2123296177
3123296178
4123296181
\n", + "
" + ], + "text/plain": [ + " 0\n", + "0 123296175\n", + "1 123296176\n", + "2 123296177\n", + "3 123296178\n", + "4 123296181" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "seen_empirical.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_predictions_validation = pd.read_csv('baseline_predictions_validation')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012329617503
112329617614
212329617705
3123296178010
412329617901
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 123296175 0 3\n", + "1 123296176 1 4\n", + "2 123296177 0 5\n", + "3 123296178 0 10\n", + "4 123296179 0 1" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_validation.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_predictions_validation_subset = baseline_predictions_validation[baseline_predictions_validation.id.isin(seen_empirical[[0]][0])]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1343027, 3)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_validation_subset.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "train_subset = train[train.id.isin(seen_empirical[[0]][0])]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py:5088: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n", + 
"/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1343027,)\n", + "(1343027,) (1343027,) (1343027,)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinyang/anaconda/envs/py35/lib/python3.5/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "1.1433622314160428" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_evaluation(baseline_predictions_validation_subset, train_subset['unit_sales'])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_predictions_test = pd.read_csv('baseline_predictions_test')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idperishableprediction_sales
012549704002
112549704103
212549704303
312549704414
412549704505
\n", + "
" + ], + "text/plain": [ + " id perishable prediction_sales\n", + "0 125497040 0 2\n", + "1 125497041 0 3\n", + "2 125497043 0 3\n", + "3 125497044 1 4\n", + "4 125497045 0 5" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "baseline_predictions_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}