This repository has been archived by the owner on Apr 6, 2020. It is now read-only.

Commit

removed some features
arifwider committed Dec 22, 2017
1 parent 8940924 commit 5da7ff5
Showing 3 changed files with 5 additions and 36 deletions.
2 changes: 1 addition & 1 deletion src/decision_tree.py
@@ -98,7 +98,7 @@ def make_predictions(clf, validate, train):
print("Making prediction on validation data")
validate_dropped = validate.drop('unit_sales', axis=1).fillna(-1)
validate_preds = clf.predict(validate_dropped)
# validate_preds = overwrite_unseen_prediction_with_zero(validate_preds, train, validate)
validate_preds = overwrite_unseen_prediction_with_zero(validate_preds, train, validate)
return validate_preds


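The re-enabled overwrite_unseen_prediction_with_zero helper is defined elsewhere in src/decision_tree.py and is not part of this diff. As a rough sketch only, assuming from its name that it zeroes out predictions for items that never appear in the training data, it might look something like this (not the repository's actual implementation):

import numpy as np


def overwrite_unseen_prediction_with_zero(preds, train, validate):
    # Hypothetical sketch: items present in the validation frame but never
    # seen during training get their prediction forced to zero.
    seen_items = set(train['item_nbr'].unique())
    unseen_mask = ~validate['item_nbr'].isin(seen_items).values
    preds = np.asarray(preds, dtype=float).copy()
    preds[unseen_mask] = 0.0
    return preds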
31 changes: 0 additions & 31 deletions src/merger.py
@@ -119,16 +119,6 @@ def add_tables(base_table, tables):
     return bigTable


-def add_percentage_transactions(bigTable):
-    bigTable['percent_in_transactions'] = bigTable.unit_sales / bigTable.transactions
-    return bigTable
-
-
-def add_transactions_per_capita(bigTable):
-    bigTable['transactions_per_capita'] = bigTable.transactions / bigTable.residents
-    return bigTable
-
-
 def write_data_to_s3(table, filename, timestamp, sample=''):
     s3resource = boto3.resource('s3')
     s3client = boto3.client('s3')
@@ -147,15 +137,6 @@ def write_data_to_s3(table, filename, timestamp, sample=''):
     s3resource.Object(s3bucket, '{key}/{filename}'.format(key=key, filename=filename)).put(Body=csv_buffer.getvalue())


-def add_sales_variance(bigTable):
-    """ Adds a new column reporting the variance
-    in unit_sales for each (item, store) tuple
-    """
-    df = bigTable.groupby(['store_nbr', 'item_nbr'])['unit_sales'].var().reset_index()
-    bigTable2 = bigTable.merge(df.rename(columns={'unit_sales': 'item_store_sales_variance'}), on=['store_nbr', 'item_nbr'])
-    return bigTable2
-
-
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser()
@@ -186,15 +167,6 @@ def add_sales_variance(bigTable):
print("Adding days off")
bigTable = add_days_off(bigTable, tables)

print("Adding item sales per store transaction")
bigTable = add_percentage_transactions(bigTable)

print("Adding transactions per capita")
bigTable = add_transactions_per_capita(bigTable)

print("Calculating item-store sale variance")
bigTable = add_sales_variance(bigTable)

# Make test.csv have the same features as bigTable
# TODO: Make this less spaghetti code by doing both
# merges at the same time
@@ -210,9 +182,6 @@ def add_sales_variance(bigTable):
print("Adding days off")
bigTestTable = add_days_off(bigTestTable, tables)

print("Adding transactions per capita")
bigTestTable = add_transactions_per_capita(bigTestTable)

print("Adding NaNs for item-store sale variance")
print("Adding NaNs for item sales per store transaction")
print("Adding NaNs for unit_sales")
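For reference, a tiny illustrative example of what the removed merger.py helpers computed: a per-transaction sales share, transactions per capita, and the per-(store_nbr, item_nbr) variance of unit_sales merged back in as item_store_sales_variance. The logic mirrors the deleted functions above; the small DataFrame is made up for illustration.

import pandas as pd

# Made-up frame with the columns the removed helpers expected.
bigTable = pd.DataFrame({
    'store_nbr': [1, 1, 1, 2],
    'item_nbr': [10, 10, 10, 10],
    'unit_sales': [3.0, 5.0, 7.0, 4.0],
    'transactions': [100, 120, 110, 90],
    'residents': [5000, 5000, 5000, 8000],
})

# Equivalent of add_percentage_transactions and add_transactions_per_capita:
bigTable['percent_in_transactions'] = bigTable.unit_sales / bigTable.transactions
bigTable['transactions_per_capita'] = bigTable.transactions / bigTable.residents

# Equivalent of add_sales_variance: per (store, item) variance merged back in.
variance = bigTable.groupby(['store_nbr', 'item_nbr'])['unit_sales'].var().reset_index()
bigTable = bigTable.merge(
    variance.rename(columns={'unit_sales': 'item_store_sales_variance'}),
    on=['store_nbr', 'item_nbr'])

print(bigTable[['store_nbr', 'item_nbr', 'item_store_sales_variance']])
# Store 1 / item 10 gets a sample variance of 4.0; the single row for
# store 2 / item 10 gets NaN.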
8 changes: 4 additions & 4 deletions src/splitter.py
@@ -5,11 +5,11 @@
 from io import StringIO


-def get_validation_period(latest_date_train):
-    # we want from wednesday to thursday (encoded as int 3) for a 15 day period
+def get_validation_period(latest_date_train, days_back=15):
+    # for Kaggle we want from Wednesday to Thursday for a 15 day period
     offset = (latest_date_train.weekday() - 3) % 7
     end_of_validation_period = latest_date_train - pd.DateOffset(days=offset)
-    begin_of_validation_period = end_of_validation_period - pd.DateOffset(days=15)
+    begin_of_validation_period = end_of_validation_period - pd.DateOffset(days=days_back)
     return (begin_of_validation_period, end_of_validation_period)


@@ -71,7 +71,7 @@ def move_random_items_from_train_to_validation(train, validation, num_items_to_r

 latest_date = train['date'].max()

-begin_of_validation, end_of_validation = get_validation_period(latest_date)
+begin_of_validation, end_of_validation = get_validation_period(latest_date, 15)

 print("Splitting data between {} and {}".format(begin_of_validation, end_of_validation))
 train_train, train_validation = split_validation_train_by_validation_period(train, begin_of_validation,
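A quick usage sketch of the refactored get_validation_period (the function body is copied from the diff above; the date is made up): the end of the validation period is the most recent Thursday (weekday 3) on or before the latest training date, and the start is days_back days earlier.

import pandas as pd


def get_validation_period(latest_date_train, days_back=15):
    # for Kaggle we want from Wednesday to Thursday for a 15 day period
    offset = (latest_date_train.weekday() - 3) % 7
    end_of_validation_period = latest_date_train - pd.DateOffset(days=offset)
    begin_of_validation_period = end_of_validation_period - pd.DateOffset(days=days_back)
    return (begin_of_validation_period, end_of_validation_period)


# Example with a made-up latest training date (2017-08-15, a Tuesday, weekday 1):
latest = pd.Timestamp('2017-08-15')
begin, end = get_validation_period(latest, 15)
# offset = (1 - 3) % 7 = 5, so end = 2017-08-10 (a Thursday)
# and begin = 2017-07-26 (a Wednesday), a 15-day span.
print(begin.date(), end.date())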
