From fe37ddf757c494389de2e170bd4c3bb82697998e Mon Sep 17 00:00:00 2001
From: Isabel
Date: Thu, 19 Oct 2023 13:02:57 +0100
Subject: [PATCH 1/5] updating for new npm scores

---
 ahl_targets/getters/get_data.py           |   3 +-
 ahl_targets/getters/simulated_outcomes.py |   4 +-
 ahl_targets/notebooks/npm_annex.py        | 141 +++++++++++++++++-----
 3 files changed, 111 insertions(+), 37 deletions(-)

diff --git a/ahl_targets/getters/get_data.py b/ahl_targets/getters/get_data.py
index 3778955..47c305e 100644
--- a/ahl_targets/getters/get_data.py
+++ b/ahl_targets/getters/get_data.py
@@ -492,8 +492,9 @@ def get_npm() -> pd.DataFrame:
     """
     return download_obj(
         BUCKET_NAME,
-        "in_home/processed/npm_with_nut.csv",
+        "in_home/processed/npm_with_nut.parquet",
         download_as="dataframe",
+        kwargs_boto={"Config": TransferConfig(io_chunksize=20947892)},
     )
diff --git a/ahl_targets/getters/simulated_outcomes.py b/ahl_targets/getters/simulated_outcomes.py
index 87ad662..a239267 100644
--- a/ahl_targets/getters/simulated_outcomes.py
+++ b/ahl_targets/getters/simulated_outcomes.py
@@ -30,9 +30,7 @@ def npm_agg() -> pd.DataFrame:
     """ """
     return pd.read_csv(
-        load_with_encoding(
-            "ahl-private-data", "in_home/data_outputs/targets_annex/npm_agg.csv"
-        ),
+        load_with_encoding("ahl-private-data", "in_home/processed/targets/npm_agg.csv"),
         encoding="ISO-8859-1",
     )
diff --git a/ahl_targets/notebooks/npm_annex.py b/ahl_targets/notebooks/npm_annex.py
index a3b4f18..96fb38b 100644
--- a/ahl_targets/notebooks/npm_annex.py
+++ b/ahl_targets/notebooks/npm_annex.py
@@ -19,9 +19,39 @@
 os.mkdir(path)


+def npm_density_plot(plt_df_sub):
+    chart = (
+        alt.Chart(plt_df_sub)
+        .transform_density(
+            "npm_w", as_=["size", "density"], groupby=["when"], bandwidth=2
+        )
+        .mark_line()
+        .encode(
+            x=alt.X(
+                "size:Q",
+                axis=alt.Axis(
+                    title="Sales weighted average NPM score",
+                ),
+            ),
+            y=alt.Y("density:Q", axis=alt.Axis(title="Weighted sales (%)", format="%")),
+            color=alt.Color("when:N", legend=alt.Legend(title="")),
+        )
+    )
+    return configure_plots(
+        chart,
+        "",
+        "",
+        16,
+        14,
+        14,
+    )
+
+
 # read data
 store_data = get_data.model_data()
 results_df = get_sim_data.npm_agg()
+npm_data = get_data.get_npm()
+

 # create aggregate data with weights
 store_weight_npm = su.weighted_npm(store_data)
@@ -82,7 +112,9 @@
     ignore_index=True,
 )
 # Save as csv (for use in chart Y)
-avg_retailer.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartY.csv", index=False)
+avg_retailer.to_csv(
+    PROJECT_DIR / "outputs/reports/chart_csv/chartY_updated.csv", index=False
+)


 # Generate before-after variables
@@ -107,47 +139,91 @@
     / store_weight_npm.groupby(["product_code"])["kg_w"].sum()
 ).reset_index(name="npm_w")

-baseline_prod.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartC.csv")
-
+alt.data_transformers.disable_max_rows()

+### Creating data for chart C (with updated NPM data) ###
+chart_c_df = baseline_prod.copy()
+chart_c_df["npm_w"] = ((-2) * chart_c_df["npm_w"]) + 70
+npm_density_plot(chart_c_df)
+chart_c_df.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartC_updated.csv")

-# Data for chart C
-baseline_prod["npm_rounded"] = baseline_prod["npm_w"].round(0)
+chart_c_df["npm_rounded"] = chart_c_df["npm_w"].round(0)
 # Percent of products with each NPM score
 npm_share = (
-    (baseline_prod["npm_rounded"].value_counts(normalize=True) * 100)
+    (chart_c_df["npm_rounded"].value_counts(normalize=True) * 100)
     .reset_index()
     .rename(columns={"index": "npm", "npm_rounded": "Percent Share"})
 )
+npm_share.to_csv(
+    PROJECT_DIR / "outputs/reports/chart_csv/chartC2_alternative_npm_share.csv",
+    index=False,
+)

-alt.data_transformers.disable_max_rows()
+### Chart B - nutrient distribution ###
+npm_store_df = npm_data.merge(
+    store_data[["PurchaseId", "Period", "store_cat", "is_food", "itemisation_level_3"]],
+    left_on=["purchase_id", "period"],
+    right_on=["PurchaseId", "Period"],
+    how="inner",
+).drop(columns=["PurchaseId", "Period"])

-def npm_density_plot(plt_df_sub):
-    chart = (
-        alt.Chart(plt_df_sub)
-        .transform_density(
-            "npm_w", as_=["size", "density"], groupby=["when"], bandwidth=2
-        )
-        .mark_line()
-        .encode(
-            x=alt.X(
-                "size:Q",
-                axis=alt.Axis(
-                    title="Sales weighted average NPM score",
-                ),
-            ),
-            y=alt.Y("density:Q", axis=alt.Axis(title="Weighted sales (%)", format="%")),
-            color=alt.Color("when:N", legend=alt.Legend(title="")),
-        )
-    )
-    return configure_plots(
-        chart,
-        "",
-        "",
-        16,
-        14,
-        14,
+# Products grouped by NPM score to get avg. sugar, salt, etc. per 100g
+prod_per_100 = (
+    npm_store_df.groupby(["product_code"])[
+        [
+            "kcal_per_100g",
+            "sat_per_100g",
+            "prot_per_100g",
+            "sug_per_100g",
+            "sod_per_100g",
+            "fibre_per_100g",
+        ]
+    ]
+    .mean()
+    .reset_index()
+)
+prod_100_npm = prod_per_100.merge(
+    chart_c_df[["product_code", "npm_w"]],
+    left_on="product_code",
+    right_on="product_code",
+).drop(["product_code"], axis=1)
+
+prod_100_npm["npm_w"] = prod_100_npm["npm_w"].round(0)
+
+prod_100_npm.rename(
+    columns={
+        "npm_w": "npm_score",
+    },
+    inplace=True,
+)
+
+
+prod_100_npm = (
+    prod_100_npm.groupby(["npm_score"])
+    .mean()
+    .reset_index()
+    .melt(
+        id_vars=["npm_score"],
+        var_name="component",
+        value_name="per 100g",
     )
+)
+# Saving CSV file (for chartB)
+prod_100_npm.to_csv(
+    PROJECT_DIR / f"outputs/reports/chart_csv/chartB_updated.csv", index=False
+)
+
+
+#### Previous code (for reference) ####
+
+# Data for chart C
+baseline_prod["npm_rounded"] = baseline_prod["npm_w"].round(0)
+# Percent of products with each NPM score
+npm_share = (
+    (baseline_prod["npm_rounded"].value_counts(normalize=True) * 100)
+    .reset_index()
+    .rename(columns={"index": "npm", "npm_rounded": "Percent Share"})
+)

 # Updated version of Chart C with new NPM data
@@ -162,7 +238,6 @@ def npm_density_plot(plt_df_sub):
     driver=webdr,
 )

-
 # Save as csv (for use in chart C)
 npm_share.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartC2_v2.csv", index=False)
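
NOTE (editorial, not part of the commit): the chart C block above rescales the
sales-weighted NPM score as ((-2) * npm_w) + 70. The commit does not say why, but
assuming the usual UK 2004/05 NPM score range of roughly -15 (healthiest) to 40
(least healthy), this linear map flips the direction and stretches the range to
roughly 0-100, so higher values read as healthier. A minimal sketch with made-up
scores:

    import pandas as pd

    # Hypothetical NPM scores spanning the assumed -15..40 range
    npm = pd.Series([-15, 0, 4, 40])

    # The patch's transform: -15 -> 100, 0 -> 70, 4 -> 62, 40 -> -10,
    # i.e. the scale is inverted and shifted so "healthier" scores are larger
    rescaled = (-2) * npm + 70
    print(rescaled.tolist())  # [100, 70, 62, -10]

Separately, the new kwargs_boto argument in get_npm() uses TransferConfig, which
(assuming it is boto3's transfer configuration class) requires
"from boto3.s3.transfer import TransferConfig" to be in scope in get_data.py; the
hunk shown does not add that import, so presumably it already exists in the file.
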
"outputs/reports/chart_csv/chartC2_alternative_npm_share.csv", + index=False, +) -alt.data_transformers.disable_max_rows() +### Chart B - nutrient distribution ### +npm_store_df = npm_data.merge( + store_data[["PurchaseId", "Period", "store_cat", "is_food", "itemisation_level_3"]], + left_on=["purchase_id", "period"], + right_on=["PurchaseId", "Period"], + how="inner", +).drop(columns=["PurchaseId", "Period"]) -def npm_density_plot(plt_df_sub): - chart = ( - alt.Chart(plt_df_sub) - .transform_density( - "npm_w", as_=["size", "density"], groupby=["when"], bandwidth=2 - ) - .mark_line() - .encode( - x=alt.X( - "size:Q", - axis=alt.Axis( - title="Sales weighted average NPM score", - ), - ), - y=alt.Y("density:Q", axis=alt.Axis(title="Weighted sales (%)", format="%")), - color=alt.Color("when:N", legend=alt.Legend(title="")), - ) - ) - return configure_plots( - chart, - "", - "", - 16, - 14, - 14, +# Products grouped by NPM score to get avg: sugar, salt...ect per 100g +prod_per_100 = ( + npm_store_df.groupby(["product_code"])[ + [ + "kcal_per_100g", + "sat_per_100g", + "prot_per_100g", + "sug_per_100g", + "sod_per_100g", + "fibre_per_100g", + ] + ] + .mean() + .reset_index() +) +prod_100_npm = prod_per_100.merge( + chart_c_df[["product_code", "npm_w"]], + left_on="product_code", + right_on="product_code", +).drop(["product_code"], axis=1) + +prod_100_npm["npm_w"] = prod_100_npm["npm_w"].round(0) + +prod_100_npm.rename( + columns={ + "npm_w": "npm_score", + }, + inplace=True, +) + + +prod_100_npm = ( + prod_100_npm.groupby(["npm_score"]) + .mean() + .reset_index() + .melt( + id_vars=["npm_score"], + var_name="component", + value_name="per 100g", ) +) +# Saving CSV file (for chartB) +prod_100_npm.to_csv( + PROJECT_DIR / f"outputs/reports/chart_csv/chartB_updated.csv", index=False +) + + +#### Previous code (for reference) #### + +# Data for chart C +baseline_prod["npm_rounded"] = baseline_prod["npm_w"].round(0) +# Percent of products with each NPM score +npm_share = ( + (baseline_prod["npm_rounded"].value_counts(normalize=True) * 100) + .reset_index() + .rename(columns={"index": "npm", "npm_rounded": "Percent Share"}) +) # Updated version of Chart C with new NPM data @@ -162,7 +238,6 @@ def npm_density_plot(plt_df_sub): driver=webdr, ) - # Save as csv (for use in chart C) npm_share.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartC2_v2.csv", index=False) From 62542e8112dea4fcbb98204ffaf9bd4afb59039d Mon Sep 17 00:00:00 2001 From: Isabel Date: Thu, 19 Oct 2023 16:57:48 +0100 Subject: [PATCH 2/5] adding in info on hfss and high npm scores --- ahl_targets/notebooks/npm_annex.py | 57 ++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/ahl_targets/notebooks/npm_annex.py b/ahl_targets/notebooks/npm_annex.py index 96fb38b..4e5a7ff 100644 --- a/ahl_targets/notebooks/npm_annex.py +++ b/ahl_targets/notebooks/npm_annex.py @@ -1,5 +1,6 @@ from ahl_targets.getters import simulated_outcomes as get_sim_data from ahl_targets.getters import get_data +from ahl_targets.pipeline import product_transformation as pt from ahl_targets.utils.plotting import configure_plots from ahl_targets.utils.altair_save_utils import ( google_chrome_driver_setup, @@ -57,6 +58,62 @@ def npm_density_plot(plt_df_sub): store_weight_npm = su.weighted_npm(store_data) store_weight_npm["prod_weight_g"] = store_weight_npm.pipe(su.prod_weight_g) + +# HFSS info +store_data_hfss = pt.type(store_data) +store_data_hfss = pt.in_scope(store_data_hfss) + +store_data_hfss["weight_kcal"] = ( + store_data_hfss["Gross Up 
Weight"] * store_data_hfss["Energy KCal"] +) +store_data_hfss["weight_vol"] = ( + store_data_hfss["Gross Up Weight"] * store_data_hfss["volume_up"] +) +store_data_hfss["weight_prod"] = store_data_hfss["Gross Up Weight"] +store_data_hfss["weight_none"] = 1 + +# HFSS volume weighted shares +hfss_shares_volume = ( + store_data_hfss.groupby(["in_scope"])["weight_vol"].sum() + / store_data_hfss["weight_vol"].sum() +) +# HFSS product weighted shares +hfss_shares_prod = ( + store_data_hfss.groupby(["in_scope"])["weight_prod"].sum() + / store_data_hfss["weight_prod"].sum() +) + +hfss_shares_none = ( + store_data_hfss.groupby(["in_scope"])["weight_none"].sum() + / store_data_hfss["weight_none"].sum() +) + +hfss_shares_kcal = ( + store_data_hfss.groupby(["in_scope"])["weight_kcal"].sum() + / store_data_hfss["weight_kcal"].sum() +) + +# Create new column high NPM >= 4 (1 else 0) +store_data_hfss["high_npm"] = store_data_hfss["npm_score"].apply( + lambda x: 1 if x >= 4 else 0 +) + +hfss_high_volume = ( + store_data_hfss.groupby(["high_npm"])["weight_vol"].sum() + / store_data_hfss["weight_vol"].sum() +) + +hfss_high_prod = ( + store_data_hfss.groupby(["high_npm"])["weight_prod"].sum() + / store_data_hfss["weight_prod"].sum() +) + +hfss_high_kcal = ( + store_data_hfss.groupby(["high_npm"])["weight_kcal"].sum() + / store_data_hfss["weight_kcal"].sum() +) + + # average across all iterations avg = ( results_df.groupby( From a72f0f41c6b325698342e04ba3d165ab9ea26960 Mon Sep 17 00:00:00 2001 From: Isabel Date: Fri, 20 Oct 2023 09:41:51 +0100 Subject: [PATCH 3/5] updating weighted averages --- ahl_targets/notebooks/npm_annex.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/ahl_targets/notebooks/npm_annex.py b/ahl_targets/notebooks/npm_annex.py index 4e5a7ff..a27b501 100644 --- a/ahl_targets/notebooks/npm_annex.py +++ b/ahl_targets/notebooks/npm_annex.py @@ -69,8 +69,7 @@ def npm_density_plot(plt_df_sub): store_data_hfss["weight_vol"] = ( store_data_hfss["Gross Up Weight"] * store_data_hfss["volume_up"] ) -store_data_hfss["weight_prod"] = store_data_hfss["Gross Up Weight"] -store_data_hfss["weight_none"] = 1 +store_data_hfss["unweighted"] = store_data_hfss["Gross Up Weight"] # HFSS volume weighted shares hfss_shares_volume = ( @@ -78,14 +77,9 @@ def npm_density_plot(plt_df_sub): / store_data_hfss["weight_vol"].sum() ) # HFSS product weighted shares -hfss_shares_prod = ( - store_data_hfss.groupby(["in_scope"])["weight_prod"].sum() - / store_data_hfss["weight_prod"].sum() -) - -hfss_shares_none = ( - store_data_hfss.groupby(["in_scope"])["weight_none"].sum() - / store_data_hfss["weight_none"].sum() +hfss_shares_unweighted = ( + store_data_hfss.groupby(["in_scope"])["unweighted"].sum() + / store_data_hfss["unweighted"].sum() ) hfss_shares_kcal = ( @@ -93,6 +87,16 @@ def npm_density_plot(plt_df_sub): / store_data_hfss["weight_kcal"].sum() ) +# Shares of unique products sold (sort by purchase date) +unique_prods = store_data_hfss.sort_values( + by=["Purchase Date"], ascending=False +).drop_duplicates(subset=["product_code"], keep="first") +unique_prods_sold = ( + unique_prods.groupby(["in_scope"])["product_code"].nunique() + / unique_prods["product_code"].nunique() +) + + # Create new column high NPM >= 4 (1 else 0) store_data_hfss["high_npm"] = store_data_hfss["npm_score"].apply( lambda x: 1 if x >= 4 else 0 @@ -103,9 +107,9 @@ def npm_density_plot(plt_df_sub): / store_data_hfss["weight_vol"].sum() ) -hfss_high_prod = ( - 
From 724a693dc16336b43ed079c5e499b331e8da9e88 Mon Sep 17 00:00:00 2001
From: ElenaMariani
Date: Mon, 6 Nov 2023 15:28:24 +0000
Subject: [PATCH 4/5] remove salt from chart B

---
 ahl_targets/notebooks/npm_annex.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/ahl_targets/notebooks/npm_annex.py b/ahl_targets/notebooks/npm_annex.py
index a27b501..baa1324 100644
--- a/ahl_targets/notebooks/npm_annex.py
+++ b/ahl_targets/notebooks/npm_annex.py
@@ -222,7 +222,20 @@ def npm_density_plot(plt_df_sub):

 ### Chart B - nutrient distribution ###
 npm_store_df = npm_data.merge(
-    store_data[["PurchaseId", "Period", "store_cat", "is_food", "itemisation_level_3"]],
+    store_data[
+        [
+            "PurchaseId",
+            "Period",
+            "store_cat",
+            "is_food",
+            "itemisation_level_3",
+            "rst_4_extended",
+            "rst_4_market",
+            "rst_4_market_sector",
+            "rst_4_sub_market",
+            "rst_4_trading_area",
+        ]
+    ],
     left_on=["purchase_id", "period"],
     right_on=["PurchaseId", "Period"],
     how="inner",
 ).drop(columns=["PurchaseId", "Period"])

 # Products grouped by NPM score to get avg. sugar, salt, etc. per 100g
 prod_per_100 = (
-    npm_store_df.groupby(["product_code"])[
+    npm_store_df.groupby(["product_code", "rst_4_market"])[
         [
             "kcal_per_100g",
             "sat_per_100g",
             "prot_per_100g",
             "sug_per_100g",
             "sod_per_100g",
             "fibre_per_100g",
         ]
     ]
     .mean()
     .reset_index()
 )
+
+# remove salt
+
+prod_per_100 = prod_per_100[prod_per_100["rst_4_market"] != "Salt"]
+
 prod_100_npm = prod_per_100.merge(
     chart_c_df[["product_code", "npm_w"]],
     left_on="product_code",
     right_on="product_code",
 ).drop(["product_code"], axis=1)
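
NOTE (editorial, not part of the commit): after patch 4 the chart B pipeline
averages nutrients per product (now grouped within rst_4_market so the Salt
category can be filtered out), then melts the per-NPM-score means from wide to
long — one row per (npm_score, component) pair, the shape most plotting code
expects. A toy sketch of that reshape (nutrient values are invented):

    import pandas as pd

    wide = pd.DataFrame(
        {
            "npm_score": [1.0, 2.0],
            "sug_per_100g": [10.0, 12.0],
            "sod_per_100g": [0.2, 0.3],
        }
    )

    # Wide -> long: one row per npm_score/component combination
    long_df = wide.melt(
        id_vars=["npm_score"], var_name="component", value_name="per 100g"
    )
    # Yields 4 rows: (1.0, sug, 10.0), (2.0, sug, 12.0), (1.0, sod, 0.2), (2.0, sod, 0.3)
    print(long_df)
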
From 894deec476fd80051d513b5a64cc6a3e754cb1f0 Mon Sep 17 00:00:00 2001
From: Isabel
Date: Tue, 28 Nov 2023 09:54:40 +0000
Subject: [PATCH 5/5] adding changes to produce charts for retailers

---
 ahl_targets/notebooks/npm_annex.py | 46 ++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/ahl_targets/notebooks/npm_annex.py b/ahl_targets/notebooks/npm_annex.py
index baa1324..2232d6b 100644
--- a/ahl_targets/notebooks/npm_annex.py
+++ b/ahl_targets/notebooks/npm_annex.py
@@ -53,6 +53,12 @@ def npm_density_plot(plt_df_sub):
 results_df = get_sim_data.npm_agg()
 npm_data = get_data.get_npm()

+# Get product categories
+prod_cats = (
+    store_data[["Product Code", "store_cat", "rst_4_market_sector"]]
+    .drop_duplicates()
+    .copy()
+)

 # create aggregate data with weights
 store_weight_npm = su.weighted_npm(store_data)
@@ -153,6 +159,15 @@ def npm_density_plot(plt_df_sub):
     .sum()
     / store_weight_npm["kg_w"].groupby(store_weight_npm["store_cat"]).sum()
 ).reset_index(name="npm")
+
+avg_retailer = (
+    (store_weight_npm["kg_w"] * store_weight_npm["npm_score"])
+    .groupby(store_weight_npm["store_cat"])
+    .sum()
+    / store_weight_npm["kg_w"].sum()
+).reset_index(name="npm")
+
+
 # Add in row manually for where store == 'Target' and npm == avg['mean_npm_kg_new'] where npm_reduction == 3, sales_change_low == 5 and sales_change_high == 10
 avg_retailer = pd.concat(
     [
@@ -177,6 +192,37 @@ def npm_density_plot(plt_df_sub):
     PROJECT_DIR / "outputs/reports/chart_csv/chartY_updated.csv", index=False
 )

+# Category level NPM - for specific retailer
+store_weight_npm_cat = (
+    store_weight_npm.copy()
+    .merge(
+        prod_cats,
+        left_on=["product_code", "store_cat"],
+        right_on=["Product Code", "store_cat"],
+        how="left",
+    )
+    .drop(columns=["Product Code"])
+)
+
+store_weight_npm_cat = store_weight_npm_cat[
+    store_weight_npm_cat["store_cat"] == "Total Asda"
+].copy()
+
+retailer_npm_cat = (
+    (store_weight_npm_cat["npm_score"] * store_weight_npm_cat["kg_w"])
+    .groupby(store_weight_npm_cat["rst_4_market_sector"])
+    .sum()
+    / store_weight_npm_cat.groupby(["rst_4_market_sector"])["kg_w"].sum()
+).reset_index(name="npm_w")
+
+# Plot NPM by category (horizontal bar chart) sorted by npm
+retailer_npm_cat.sort_values(by="npm_w", ascending=True).plot.barh(
+    x="rst_4_market_sector",
+    y="npm_w",
+    figsize=(7, 5),
+    legend=False,
+)
+

 # Generate before-after variables
 baseline_columns = avg.filter(like="_baseline")
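
NOTE (editorial, not part of the commit): patch 5 assigns avg_retailer twice.
The first (pre-existing) version divides each retailer's kg-weighted NPM sum by
that retailer's own kg_w total — a within-retailer weighted average. The second
version, which overwrites it, divides by the kg_w total across all retailers, so
each figure is that retailer's contribution to the overall average (the
contributions sum to the overall weighted mean) rather than a per-retailer mean.
A minimal sketch of the difference, with made-up weights:

    import pandas as pd

    toy = pd.DataFrame(
        {
            "store_cat": ["A", "A", "B"],
            "kg_w": [1.0, 1.0, 2.0],
            "npm_score": [10.0, 20.0, 30.0],
        }
    )

    weighted = toy["kg_w"] * toy["npm_score"]

    # Denominator = each store's own weight: per-retailer weighted average
    per_store = weighted.groupby(toy["store_cat"]).sum() / toy["kg_w"].groupby(
        toy["store_cat"]
    ).sum()
    print(per_store.to_dict())  # {'A': 15.0, 'B': 30.0}

    # Denominator = total weight: contribution to the overall average
    contribution = weighted.groupby(toy["store_cat"]).sum() / toy["kg_w"].sum()
    print(contribution.to_dict())  # {'A': 7.5, 'B': 15.0}; sums to 22.5, the overall mean

Which of the two is intended for chart Y is not stated in the commit message, so
the overwrite is left as the author wrote it.
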