From fe37ddf757c494389de2e170bd4c3bb82697998e Mon Sep 17 00:00:00 2001
From: Isabel
Date: Thu, 19 Oct 2023 13:02:57 +0100
Subject: [PATCH 1/5] updating for new npm scores

---
 ahl_targets/getters/get_data.py           |   3 +-
 ahl_targets/getters/simulated_outcomes.py |   4 +-
 ahl_targets/notebooks/npm_annex.py        | 141 +++++++++++++++++-----
 3 files changed, 111 insertions(+), 37 deletions(-)

diff --git a/ahl_targets/getters/get_data.py b/ahl_targets/getters/get_data.py
index 3778955..47c305e 100644
--- a/ahl_targets/getters/get_data.py
+++ b/ahl_targets/getters/get_data.py
@@ -492,8 +492,9 @@ def get_npm() -> pd.DataFrame:
     """
     return download_obj(
         BUCKET_NAME,
-        "in_home/processed/npm_with_nut.csv",
+        "in_home/processed/npm_with_nut.parquet",
         download_as="dataframe",
+        kwargs_boto={"Config": TransferConfig(io_chunksize=20947892)},
     )
diff --git a/ahl_targets/getters/simulated_outcomes.py b/ahl_targets/getters/simulated_outcomes.py
index 87ad662..a239267 100644
--- a/ahl_targets/getters/simulated_outcomes.py
+++ b/ahl_targets/getters/simulated_outcomes.py
@@ -30,9 +30,7 @@ def npm_agg() -> pd.DataFrame:
     """ """
     return pd.read_csv(
-        load_with_encoding(
-            "ahl-private-data", "in_home/data_outputs/targets_annex/npm_agg.csv"
-        ),
+        load_with_encoding("ahl-private-data", "in_home/processed/targets/npm_agg.csv"),
         encoding="ISO-8859-1",
     )
diff --git a/ahl_targets/notebooks/npm_annex.py b/ahl_targets/notebooks/npm_annex.py
index a3b4f18..96fb38b 100644
--- a/ahl_targets/notebooks/npm_annex.py
+++ b/ahl_targets/notebooks/npm_annex.py
@@ -19,9 +19,39 @@
 os.mkdir(path)


+def npm_density_plot(plt_df_sub):
+    chart = (
+        alt.Chart(plt_df_sub)
+        .transform_density(
+            "npm_w", as_=["size", "density"], groupby=["when"], bandwidth=2
+        )
+        .mark_line()
+        .encode(
+            x=alt.X(
+                "size:Q",
+                axis=alt.Axis(
+                    title="Sales weighted average NPM score",
+                ),
+            ),
+            y=alt.Y("density:Q", axis=alt.Axis(title="Weighted sales (%)", format="%")),
+            color=alt.Color("when:N", legend=alt.Legend(title="")),
+        )
+    )
+    return configure_plots(
+        chart,
+        "",
+        "",
+        16,
+        14,
+        14,
+    )
+
+
 # read data
 store_data = get_data.model_data()
 results_df = get_sim_data.npm_agg()
+npm_data = get_data.get_npm()
+

 # create aggregate data with weights
 store_weight_npm = su.weighted_npm(store_data)
@@ -82,7 +112,9 @@
     ignore_index=True,
 )
 # Save as csv (for use in chart Y)
-avg_retailer.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartY.csv", index=False)
+avg_retailer.to_csv(
+    PROJECT_DIR / "outputs/reports/chart_csv/chartY_updated.csv", index=False
+)


 # Generate before-after variables
@@ -107,47 +139,91 @@
     / store_weight_npm.groupby(["product_code"])["kg_w"].sum()
 ).reset_index(name="npm_w")

-baseline_prod.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartC.csv")
-
+alt.data_transformers.disable_max_rows()

+### Creating data for chart C (with updated NPM data) ###
+chart_c_df = baseline_prod.copy()
+chart_c_df["npm_w"] = ((-2) * chart_c_df["npm_w"]) + 70
+npm_density_plot(chart_c_df)
+chart_c_df.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartC_updated.csv")

-# Data for chart C
-baseline_prod["npm_rounded"] = baseline_prod["npm_w"].round(0)
+chart_c_df["npm_rounded"] = chart_c_df["npm_w"].round(0)
 # Percent of products with each NPM score
 npm_share = (
-    (baseline_prod["npm_rounded"].value_counts(normalize=True) * 100)
+    (chart_c_df["npm_rounded"].value_counts(normalize=True) * 100)
     .reset_index()
     .rename(columns={"index": "npm", "npm_rounded": "Percent Share"})
 )
+npm_share.to_csv(
+    PROJECT_DIR / "outputs/reports/chart_csv/chartC2_alternative_npm_share.csv",
+    index=False,
+)

-alt.data_transformers.disable_max_rows()
+### Chart B - nutrient distribution ###
+npm_store_df = npm_data.merge(
+    store_data[["PurchaseId", "Period", "store_cat", "is_food", "itemisation_level_3"]],
+    left_on=["purchase_id", "period"],
+    right_on=["PurchaseId", "Period"],
+    how="inner",
+).drop(columns=["PurchaseId", "Period"])

-def npm_density_plot(plt_df_sub):
-    chart = (
-        alt.Chart(plt_df_sub)
-        .transform_density(
-            "npm_w", as_=["size", "density"], groupby=["when"], bandwidth=2
-        )
-        .mark_line()
-        .encode(
-            x=alt.X(
-                "size:Q",
-                axis=alt.Axis(
-                    title="Sales weighted average NPM score",
-                ),
-            ),
-            y=alt.Y("density:Q", axis=alt.Axis(title="Weighted sales (%)", format="%")),
-            color=alt.Color("when:N", legend=alt.Legend(title="")),
-        )
-    )
-    return configure_plots(
-        chart,
-        "",
-        "",
-        16,
-        14,
-        14,
+# Products grouped by NPM score to get avg. sugar, salt, etc. per 100g
+prod_per_100 = (
+    npm_store_df.groupby(["product_code"])[
+        [
+            "kcal_per_100g",
+            "sat_per_100g",
+            "prot_per_100g",
+            "sug_per_100g",
+            "sod_per_100g",
+            "fibre_per_100g",
+        ]
+    ]
+    .mean()
+    .reset_index()
+)
+prod_100_npm = prod_per_100.merge(
+    chart_c_df[["product_code", "npm_w"]],
+    left_on="product_code",
+    right_on="product_code",
+).drop(["product_code"], axis=1)
+
+prod_100_npm["npm_w"] = prod_100_npm["npm_w"].round(0)
+
+prod_100_npm.rename(
+    columns={
+        "npm_w": "npm_score",
+    },
+    inplace=True,
+)
+
+
+prod_100_npm = (
+    prod_100_npm.groupby(["npm_score"])
+    .mean()
+    .reset_index()
+    .melt(
+        id_vars=["npm_score"],
+        var_name="component",
+        value_name="per 100g",
     )
+)
+# Saving CSV file (for chartB)
+prod_100_npm.to_csv(
+    PROJECT_DIR / f"outputs/reports/chart_csv/chartB_updated.csv", index=False
+)
+
+
+#### Previous code (for reference) ####
+
+# Data for chart C
+baseline_prod["npm_rounded"] = baseline_prod["npm_w"].round(0)
+# Percent of products with each NPM score
+npm_share = (
+    (baseline_prod["npm_rounded"].value_counts(normalize=True) * 100)
+    .reset_index()
+    .rename(columns={"index": "npm", "npm_rounded": "Percent Share"})
+)

 # Updated version of Chart C with new NPM data
@@ -162,7 +238,6 @@ def npm_density_plot(plt_df_sub):
     driver=webdr,
 )

-
 # Save as csv (for use in chart C)
 npm_share.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartC2_v2.csv", index=False)
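
NOTE (editorial, not part of the commit): the chart C block above rescales the
sales-weighted NPM score as ((-2) * npm_w) + 70. The commit does not say why, but
assuming the usual UK 2004/05 NPM score range of roughly -15 (healthiest) to 40
(least healthy), this linear map flips the direction and stretches the range to
roughly 0-100, so higher values read as healthier. A minimal sketch with made-up
scores:

    import pandas as pd

    # Hypothetical NPM scores spanning the assumed -15..40 range
    npm = pd.Series([-15, 0, 4, 40])

    # The patch's transform: -15 -> 100, 0 -> 70, 4 -> 62, 40 -> -10,
    # i.e. the scale is inverted and shifted so "healthier" scores are larger
    rescaled = (-2) * npm + 70
    print(rescaled.tolist())  # [100, 70, 62, -10]

Separately, the new kwargs_boto argument in get_npm() uses TransferConfig, which
(assuming it is boto3's transfer configuration class) requires
"from boto3.s3.transfer import TransferConfig" to be in scope in get_data.py; the
hunk shown does not add that import, so presumably it already exists in the file.
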
"outputs/reports/chart_csv/chartC2_alternative_npm_share.csv", + index=False, +) -alt.data_transformers.disable_max_rows() +### Chart B - nutrient distribution ### +npm_store_df = npm_data.merge( + store_data[["PurchaseId", "Period", "store_cat", "is_food", "itemisation_level_3"]], + left_on=["purchase_id", "period"], + right_on=["PurchaseId", "Period"], + how="inner", +).drop(columns=["PurchaseId", "Period"]) -def npm_density_plot(plt_df_sub): - chart = ( - alt.Chart(plt_df_sub) - .transform_density( - "npm_w", as_=["size", "density"], groupby=["when"], bandwidth=2 - ) - .mark_line() - .encode( - x=alt.X( - "size:Q", - axis=alt.Axis( - title="Sales weighted average NPM score", - ), - ), - y=alt.Y("density:Q", axis=alt.Axis(title="Weighted sales (%)", format="%")), - color=alt.Color("when:N", legend=alt.Legend(title="")), - ) - ) - return configure_plots( - chart, - "", - "", - 16, - 14, - 14, +# Products grouped by NPM score to get avg: sugar, salt...ect per 100g +prod_per_100 = ( + npm_store_df.groupby(["product_code"])[ + [ + "kcal_per_100g", + "sat_per_100g", + "prot_per_100g", + "sug_per_100g", + "sod_per_100g", + "fibre_per_100g", + ] + ] + .mean() + .reset_index() +) +prod_100_npm = prod_per_100.merge( + chart_c_df[["product_code", "npm_w"]], + left_on="product_code", + right_on="product_code", +).drop(["product_code"], axis=1) + +prod_100_npm["npm_w"] = prod_100_npm["npm_w"].round(0) + +prod_100_npm.rename( + columns={ + "npm_w": "npm_score", + }, + inplace=True, +) + + +prod_100_npm = ( + prod_100_npm.groupby(["npm_score"]) + .mean() + .reset_index() + .melt( + id_vars=["npm_score"], + var_name="component", + value_name="per 100g", ) +) +# Saving CSV file (for chartB) +prod_100_npm.to_csv( + PROJECT_DIR / f"outputs/reports/chart_csv/chartB_updated.csv", index=False +) + + +#### Previous code (for reference) #### + +# Data for chart C +baseline_prod["npm_rounded"] = baseline_prod["npm_w"].round(0) +# Percent of products with each NPM score +npm_share = ( + (baseline_prod["npm_rounded"].value_counts(normalize=True) * 100) + .reset_index() + .rename(columns={"index": "npm", "npm_rounded": "Percent Share"}) +) # Updated version of Chart C with new NPM data @@ -162,7 +238,6 @@ def npm_density_plot(plt_df_sub): driver=webdr, ) - # Save as csv (for use in chart C) npm_share.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartC2_v2.csv", index=False) From 62542e8112dea4fcbb98204ffaf9bd4afb59039d Mon Sep 17 00:00:00 2001 From: Isabel Date: Thu, 19 Oct 2023 16:57:48 +0100 Subject: [PATCH 2/5] adding in info on hfss and high npm scores --- ahl_targets/notebooks/npm_annex.py | 57 ++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/ahl_targets/notebooks/npm_annex.py b/ahl_targets/notebooks/npm_annex.py index 96fb38b..4e5a7ff 100644 --- a/ahl_targets/notebooks/npm_annex.py +++ b/ahl_targets/notebooks/npm_annex.py @@ -1,5 +1,6 @@ from ahl_targets.getters import simulated_outcomes as get_sim_data from ahl_targets.getters import get_data +from ahl_targets.pipeline import product_transformation as pt from ahl_targets.utils.plotting import configure_plots from ahl_targets.utils.altair_save_utils import ( google_chrome_driver_setup, @@ -57,6 +58,62 @@ def npm_density_plot(plt_df_sub): store_weight_npm = su.weighted_npm(store_data) store_weight_npm["prod_weight_g"] = store_weight_npm.pipe(su.prod_weight_g) + +# HFSS info +store_data_hfss = pt.type(store_data) +store_data_hfss = pt.in_scope(store_data_hfss) + +store_data_hfss["weight_kcal"] = ( + store_data_hfss["Gross Up 
Weight"] * store_data_hfss["Energy KCal"] +) +store_data_hfss["weight_vol"] = ( + store_data_hfss["Gross Up Weight"] * store_data_hfss["volume_up"] +) +store_data_hfss["weight_prod"] = store_data_hfss["Gross Up Weight"] +store_data_hfss["weight_none"] = 1 + +# HFSS volume weighted shares +hfss_shares_volume = ( + store_data_hfss.groupby(["in_scope"])["weight_vol"].sum() + / store_data_hfss["weight_vol"].sum() +) +# HFSS product weighted shares +hfss_shares_prod = ( + store_data_hfss.groupby(["in_scope"])["weight_prod"].sum() + / store_data_hfss["weight_prod"].sum() +) + +hfss_shares_none = ( + store_data_hfss.groupby(["in_scope"])["weight_none"].sum() + / store_data_hfss["weight_none"].sum() +) + +hfss_shares_kcal = ( + store_data_hfss.groupby(["in_scope"])["weight_kcal"].sum() + / store_data_hfss["weight_kcal"].sum() +) + +# Create new column high NPM >= 4 (1 else 0) +store_data_hfss["high_npm"] = store_data_hfss["npm_score"].apply( + lambda x: 1 if x >= 4 else 0 +) + +hfss_high_volume = ( + store_data_hfss.groupby(["high_npm"])["weight_vol"].sum() + / store_data_hfss["weight_vol"].sum() +) + +hfss_high_prod = ( + store_data_hfss.groupby(["high_npm"])["weight_prod"].sum() + / store_data_hfss["weight_prod"].sum() +) + +hfss_high_kcal = ( + store_data_hfss.groupby(["high_npm"])["weight_kcal"].sum() + / store_data_hfss["weight_kcal"].sum() +) + + # average across all iterations avg = ( results_df.groupby( From a72f0f41c6b325698342e04ba3d165ab9ea26960 Mon Sep 17 00:00:00 2001 From: Isabel Date: Fri, 20 Oct 2023 09:41:51 +0100 Subject: [PATCH 3/5] updating weighted averages --- ahl_targets/notebooks/npm_annex.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/ahl_targets/notebooks/npm_annex.py b/ahl_targets/notebooks/npm_annex.py index 4e5a7ff..a27b501 100644 --- a/ahl_targets/notebooks/npm_annex.py +++ b/ahl_targets/notebooks/npm_annex.py @@ -69,8 +69,7 @@ def npm_density_plot(plt_df_sub): store_data_hfss["weight_vol"] = ( store_data_hfss["Gross Up Weight"] * store_data_hfss["volume_up"] ) -store_data_hfss["weight_prod"] = store_data_hfss["Gross Up Weight"] -store_data_hfss["weight_none"] = 1 +store_data_hfss["unweighted"] = store_data_hfss["Gross Up Weight"] # HFSS volume weighted shares hfss_shares_volume = ( @@ -78,14 +77,9 @@ def npm_density_plot(plt_df_sub): / store_data_hfss["weight_vol"].sum() ) # HFSS product weighted shares -hfss_shares_prod = ( - store_data_hfss.groupby(["in_scope"])["weight_prod"].sum() - / store_data_hfss["weight_prod"].sum() -) - -hfss_shares_none = ( - store_data_hfss.groupby(["in_scope"])["weight_none"].sum() - / store_data_hfss["weight_none"].sum() +hfss_shares_unweighted = ( + store_data_hfss.groupby(["in_scope"])["unweighted"].sum() + / store_data_hfss["unweighted"].sum() ) hfss_shares_kcal = ( @@ -93,6 +87,16 @@ def npm_density_plot(plt_df_sub): / store_data_hfss["weight_kcal"].sum() ) +# Shares of unique products sold (sort by purchase date) +unique_prods = store_data_hfss.sort_values( + by=["Purchase Date"], ascending=False +).drop_duplicates(subset=["product_code"], keep="first") +unique_prods_sold = ( + unique_prods.groupby(["in_scope"])["product_code"].nunique() + / unique_prods["product_code"].nunique() +) + + # Create new column high NPM >= 4 (1 else 0) store_data_hfss["high_npm"] = store_data_hfss["npm_score"].apply( lambda x: 1 if x >= 4 else 0 @@ -103,9 +107,9 @@ def npm_density_plot(plt_df_sub): / store_data_hfss["weight_vol"].sum() ) -hfss_high_prod = ( - 
From 724a693dc16336b43ed079c5e499b331e8da9e88 Mon Sep 17 00:00:00 2001
From: ElenaMariani
Date: Mon, 6 Nov 2023 15:28:24 +0000
Subject: [PATCH 4/5] remove salt from chart B

---
 ahl_targets/notebooks/npm_annex.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/ahl_targets/notebooks/npm_annex.py b/ahl_targets/notebooks/npm_annex.py
index a27b501..baa1324 100644
--- a/ahl_targets/notebooks/npm_annex.py
+++ b/ahl_targets/notebooks/npm_annex.py
@@ -222,7 +222,20 @@ def npm_density_plot(plt_df_sub):

 ### Chart B - nutrient distribution ###
 npm_store_df = npm_data.merge(
-    store_data[["PurchaseId", "Period", "store_cat", "is_food", "itemisation_level_3"]],
+    store_data[
+        [
+            "PurchaseId",
+            "Period",
+            "store_cat",
+            "is_food",
+            "itemisation_level_3",
+            "rst_4_extended",
+            "rst_4_market",
+            "rst_4_market_sector",
+            "rst_4_sub_market",
+            "rst_4_trading_area",
+        ]
+    ],
     left_on=["purchase_id", "period"],
     right_on=["PurchaseId", "Period"],
     how="inner",
 ).drop(columns=["PurchaseId", "Period"])

 # Products grouped by NPM score to get avg. sugar, salt, etc. per 100g
 prod_per_100 = (
-    npm_store_df.groupby(["product_code"])[
+    npm_store_df.groupby(["product_code", "rst_4_market"])[
         [
             "kcal_per_100g",
             "sat_per_100g",
             "prot_per_100g",
             "sug_per_100g",
             "sod_per_100g",
             "fibre_per_100g",
         ]
     ]
     .mean()
     .reset_index()
 )
+
+# remove salt
+
+prod_per_100 = prod_per_100[prod_per_100["rst_4_market"] != "Salt"]
+
 prod_100_npm = prod_per_100.merge(
     chart_c_df[["product_code", "npm_w"]],
     left_on="product_code",
     right_on="product_code",
 ).drop(["product_code"], axis=1)
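
NOTE (editorial, not part of the commit): after patch 4 the chart B pipeline
averages nutrients per product (now grouped within rst_4_market so the Salt
category can be filtered out), then melts the per-NPM-score means from wide to
long — one row per (npm_score, component) pair, the shape most plotting code
expects. A toy sketch of that reshape (nutrient values are invented):

    import pandas as pd

    wide = pd.DataFrame(
        {
            "npm_score": [1.0, 2.0],
            "sug_per_100g": [10.0, 12.0],
            "sod_per_100g": [0.2, 0.3],
        }
    )

    # Wide -> long: one row per npm_score/component combination
    long_df = wide.melt(
        id_vars=["npm_score"], var_name="component", value_name="per 100g"
    )
    # Yields 4 rows: (1.0, sug, 10.0), (2.0, sug, 12.0), (1.0, sod, 0.2), (2.0, sod, 0.3)
    print(long_df)
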
From 894deec476fd80051d513b5a64cc6a3e754cb1f0 Mon Sep 17 00:00:00 2001
From: Isabel
Date: Tue, 28 Nov 2023 09:54:40 +0000
Subject: [PATCH 5/5] adding changes to produce charts for retailers

---
 ahl_targets/notebooks/npm_annex.py | 46 ++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/ahl_targets/notebooks/npm_annex.py b/ahl_targets/notebooks/npm_annex.py
index baa1324..2232d6b 100644
--- a/ahl_targets/notebooks/npm_annex.py
+++ b/ahl_targets/notebooks/npm_annex.py
@@ -53,6 +53,12 @@ def npm_density_plot(plt_df_sub):
 results_df = get_sim_data.npm_agg()
 npm_data = get_data.get_npm()

+# Get product categories
+prod_cats = (
+    store_data[["Product Code", "store_cat", "rst_4_market_sector"]]
+    .drop_duplicates()
+    .copy()
+)

 # create aggregate data with weights
 store_weight_npm = su.weighted_npm(store_data)
@@ -153,6 +159,15 @@ def npm_density_plot(plt_df_sub):
     .sum()
     / store_weight_npm["kg_w"].groupby(store_weight_npm["store_cat"]).sum()
 ).reset_index(name="npm")
+
+avg_retailer = (
+    (store_weight_npm["kg_w"] * store_weight_npm["npm_score"])
+    .groupby(store_weight_npm["store_cat"])
+    .sum()
+    / store_weight_npm["kg_w"].sum()
+).reset_index(name="npm")
+
+
 # Add in row manually for where store == 'Target' and npm == avg['mean_npm_kg_new'] where npm_reduction == 3, sales_change_low == 5 and sales_change_high == 10
 avg_retailer = pd.concat(
     [
@@ -177,6 +192,37 @@ def npm_density_plot(plt_df_sub):
     PROJECT_DIR / "outputs/reports/chart_csv/chartY_updated.csv", index=False
 )

+# Category level NPM - for specific retailer
+store_weight_npm_cat = (
+    store_weight_npm.copy()
+    .merge(
+        prod_cats,
+        left_on=["product_code", "store_cat"],
+        right_on=["Product Code", "store_cat"],
+        how="left",
+    )
+    .drop(columns=["Product Code"])
+)
+
+store_weight_npm_cat = store_weight_npm_cat[
+    store_weight_npm_cat["store_cat"] == "Total Asda"
+].copy()
+
+retailer_npm_cat = (
+    (store_weight_npm_cat["npm_score"] * store_weight_npm_cat["kg_w"])
+    .groupby(store_weight_npm_cat["rst_4_market_sector"])
+    .sum()
+    / store_weight_npm_cat.groupby(["rst_4_market_sector"])["kg_w"].sum()
+).reset_index(name="npm_w")
+
+# Plot NPM by category (horizontal bar chart) sorted by npm
+retailer_npm_cat.sort_values(by="npm_w", ascending=True).plot.barh(
+    x="rst_4_market_sector",
+    y="npm_w",
+    figsize=(7, 5),
+    legend=False,
+)
+

 # Generate before-after variables
 baseline_columns = avg.filter(like="_baseline")
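
NOTE (editorial, not part of the commit): patch 5 assigns avg_retailer twice.
The first (pre-existing) version divides each retailer's kg-weighted NPM sum by
that retailer's own kg_w total — a within-retailer weighted average. The second
version, which overwrites it, divides by the kg_w total across all retailers, so
each figure is that retailer's contribution to the overall average (the
contributions sum to the overall weighted mean) rather than a per-retailer mean.
A minimal sketch of the difference, with made-up weights:

    import pandas as pd

    toy = pd.DataFrame(
        {
            "store_cat": ["A", "A", "B"],
            "kg_w": [1.0, 1.0, 2.0],
            "npm_score": [10.0, 20.0, 30.0],
        }
    )

    weighted = toy["kg_w"] * toy["npm_score"]

    # Denominator = each store's own weight: per-retailer weighted average
    per_store = weighted.groupby(toy["store_cat"]).sum() / toy["kg_w"].groupby(
        toy["store_cat"]
    ).sum()
    print(per_store.to_dict())  # {'A': 15.0, 'B': 30.0}

    # Denominator = total weight: contribution to the overall average
    contribution = weighted.groupby(toy["store_cat"]).sum() / toy["kg_w"].sum()
    print(contribution.to_dict())  # {'A': 7.5, 'B': 15.0}; sums to 22.5, the overall mean

Which of the two is intended for chart Y is not stated in the commit message, so
the overwrite is left as the author wrote it.
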