Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updating charts with new npm scores #77

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ahl_targets/getters/get_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,8 +492,9 @@ def get_npm() -> pd.DataFrame:
"""
return download_obj(
BUCKET_NAME,
"in_home/processed/npm_with_nut.csv",
"in_home/processed/npm_with_nut.parquet",
download_as="dataframe",
kwargs_boto={"Config": TransferConfig(io_chunksize=20947892)},
)


Expand Down
4 changes: 1 addition & 3 deletions ahl_targets/getters/simulated_outcomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@ def npm_agg() -> pd.DataFrame:
""" """

return pd.read_csv(
load_with_encoding(
"ahl-private-data", "in_home/data_outputs/targets_annex/npm_agg.csv"
),
load_with_encoding("ahl-private-data", "in_home/processed/targets/npm_agg.csv"),
encoding="ISO-8859-1",
)

Expand Down
198 changes: 165 additions & 33 deletions ahl_targets/notebooks/npm_annex.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ahl_targets.getters import simulated_outcomes as get_sim_data
from ahl_targets.getters import get_data
from ahl_targets.pipeline import product_transformation as pt
from ahl_targets.utils.plotting import configure_plots
from ahl_targets.utils.altair_save_utils import (
google_chrome_driver_setup,
Expand All @@ -19,14 +20,100 @@
os.mkdir(path)


def npm_density_plot(plt_df_sub):
    """Build a line chart of the estimated density of ``npm_w`` per ``when`` group.

    Args:
        plt_df_sub: Data handed to ``alt.Chart``; the chart reads its
            ``npm_w`` and ``when`` fields.

    Returns:
        The result of ``configure_plots`` applied to the assembled chart.
    """
    # Encoding channels, defined up front so the chart pipeline stays short.
    x_channel = alt.X(
        "size:Q",
        axis=alt.Axis(title="Sales weighted average NPM score"),
    )
    y_channel = alt.Y(
        "density:Q",
        axis=alt.Axis(title="Weighted sales (%)", format="%"),
    )
    colour_channel = alt.Color("when:N", legend=alt.Legend(title=""))

    # Density estimate of npm_w, computed separately for each value of `when`
    # and exposed to the encoding as the `size` / `density` fields.
    density_chart = alt.Chart(plt_df_sub).transform_density(
        "npm_w",
        as_=["size", "density"],
        groupby=["when"],
        bandwidth=2,
    )
    line_chart = density_chart.mark_line().encode(
        x=x_channel,
        y=y_channel,
        color=colour_channel,
    )

    # Two empty strings and three numbers are forwarded positionally
    # (presumably titles and font sizes -- confirm against configure_plots).
    return configure_plots(line_chart, "", "", 16, 14, 14)


# read data
store_data = get_data.model_data()
results_df = get_sim_data.npm_agg()
npm_data = get_data.get_npm()


# create aggregate data with weights
store_weight_npm = su.weighted_npm(store_data)
store_weight_npm["prod_weight_g"] = store_weight_npm.pipe(su.prod_weight_g)


# HFSS info
store_data_hfss = pt.type(store_data)
store_data_hfss = pt.in_scope(store_data_hfss)

store_data_hfss["weight_kcal"] = (
store_data_hfss["Gross Up Weight"] * store_data_hfss["Energy KCal"]
)
store_data_hfss["weight_vol"] = (
store_data_hfss["Gross Up Weight"] * store_data_hfss["volume_up"]
)
store_data_hfss["weight_prod"] = store_data_hfss["Gross Up Weight"]
store_data_hfss["weight_none"] = 1

# HFSS volume weighted shares
hfss_shares_volume = (
store_data_hfss.groupby(["in_scope"])["weight_vol"].sum()
/ store_data_hfss["weight_vol"].sum()
)
# HFSS product weighted shares
hfss_shares_prod = (
store_data_hfss.groupby(["in_scope"])["weight_prod"].sum()
/ store_data_hfss["weight_prod"].sum()
)

hfss_shares_none = (
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that this is not very interpretable as it's just giving the sample average (rather than the population). By unweighted we normally mean the share calculated on the unique set of products and I think this is actually what the figure in the appendix refers to.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ElenaMariani I think I just need to remove this variable, as it's not used in the document. I think I had created it previously just to sense-check the values against some in a spreadsheet, and so copied it over to this file.

It's a good point about what the appendix is referring to (I think I did also have the unweighted mean in another script but didn't copy it over here). Let's discuss on our call later.

store_data_hfss.groupby(["in_scope"])["weight_none"].sum()
/ store_data_hfss["weight_none"].sum()
)

hfss_shares_kcal = (
store_data_hfss.groupby(["in_scope"])["weight_kcal"].sum()
/ store_data_hfss["weight_kcal"].sum()
)

# Create new column high_npm: 1 when npm_score >= 4, else 0.
# A vectorised comparison replaces the row-by-row apply; missing (NaN) scores
# compare as False and therefore still map to 0.
store_data_hfss["high_npm"] = (store_data_hfss["npm_score"] >= 4).astype("int64")

hfss_high_volume = (
store_data_hfss.groupby(["high_npm"])["weight_vol"].sum()
/ store_data_hfss["weight_vol"].sum()
)

hfss_high_prod = (
store_data_hfss.groupby(["high_npm"])["weight_prod"].sum()
/ store_data_hfss["weight_prod"].sum()
)

hfss_high_kcal = (
store_data_hfss.groupby(["high_npm"])["weight_kcal"].sum()
/ store_data_hfss["weight_kcal"].sum()
)


# average across all iterations
avg = (
results_df.groupby(
Expand Down Expand Up @@ -82,7 +169,9 @@
ignore_index=True,
)
# Save as csv (for use in chart Y)
avg_retailer.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartY.csv", index=False)
avg_retailer.to_csv(
PROJECT_DIR / "outputs/reports/chart_csv/chartY_updated.csv", index=False
)


# Generate before-after variables
Expand All @@ -107,47 +196,91 @@
/ store_weight_npm.groupby(["product_code"])["kg_w"].sum()
).reset_index(name="npm_w")

baseline_prod.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartC.csv")

alt.data_transformers.disable_max_rows()
### Creating data for chart C (with updated NPM data) ###
chart_c_df = baseline_prod.copy()
chart_c_df["npm_w"] = ((-2) * chart_c_df["npm_w"]) + 70
npm_density_plot(chart_c_df)
chart_c_df.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartC_updated.csv")

# Data for chart C
baseline_prod["npm_rounded"] = baseline_prod["npm_w"].round(0)
chart_c_df["npm_rounded"] = chart_c_df["npm_w"].round(0)
# Percent of products with each NPM score
npm_share = (
(baseline_prod["npm_rounded"].value_counts(normalize=True) * 100)
(chart_c_df["npm_rounded"].value_counts(normalize=True) * 100)
.reset_index()
.rename(columns={"index": "npm", "npm_rounded": "Percent Share"})
)
npm_share.to_csv(
PROJECT_DIR / "outputs/reports/chart_csv/chartC2_alternative_npm_share.csv",
index=False,
)

alt.data_transformers.disable_max_rows()

### Chart B - nutrient distribution ###
npm_store_df = npm_data.merge(
store_data[["PurchaseId", "Period", "store_cat", "is_food", "itemisation_level_3"]],
left_on=["purchase_id", "period"],
right_on=["PurchaseId", "Period"],
how="inner",
).drop(columns=["PurchaseId", "Period"])

def npm_density_plot(plt_df_sub):
chart = (
alt.Chart(plt_df_sub)
.transform_density(
"npm_w", as_=["size", "density"], groupby=["when"], bandwidth=2
)
.mark_line()
.encode(
x=alt.X(
"size:Q",
axis=alt.Axis(
title="Sales weighted average NPM score",
),
),
y=alt.Y("density:Q", axis=alt.Axis(title="Weighted sales (%)", format="%")),
color=alt.Color("when:N", legend=alt.Legend(title="")),
)
)
return configure_plots(
chart,
"",
"",
16,
14,
14,
# Average nutrient values per 100g (sugar, sodium, etc.) for each product; grouped by NPM score further down
prod_per_100 = (
npm_store_df.groupby(["product_code"])[
[
"kcal_per_100g",
"sat_per_100g",
"prot_per_100g",
"sug_per_100g",
"sod_per_100g",
"fibre_per_100g",
]
]
.mean()
.reset_index()
)
prod_100_npm = prod_per_100.merge(
chart_c_df[["product_code", "npm_w"]],
left_on="product_code",
right_on="product_code",
).drop(["product_code"], axis=1)

prod_100_npm["npm_w"] = prod_100_npm["npm_w"].round(0)

prod_100_npm.rename(
columns={
"npm_w": "npm_score",
},
inplace=True,
)


prod_100_npm = (
prod_100_npm.groupby(["npm_score"])
.mean()
.reset_index()
.melt(
id_vars=["npm_score"],
var_name="component",
value_name="per 100g",
)
)
# Saving CSV file (for chartB)
# Plain string literal: the path has no placeholders, so no f-string is needed
# (consistent with the other chart_csv exports in this script).
prod_100_npm.to_csv(
    PROJECT_DIR / "outputs/reports/chart_csv/chartB_updated.csv", index=False
)


#### Previous code (for reference) ####

# Data for chart C
baseline_prod["npm_rounded"] = baseline_prod["npm_w"].round(0)
# Percent of products with each NPM score
npm_share = (
(baseline_prod["npm_rounded"].value_counts(normalize=True) * 100)
.reset_index()
.rename(columns={"index": "npm", "npm_rounded": "Percent Share"})
)


# Updated version of Chart C with new NPM data
Expand All @@ -162,7 +295,6 @@ def npm_density_plot(plt_df_sub):
driver=webdr,
)


# Save as csv (for use in chart C)
npm_share.to_csv(PROJECT_DIR / "outputs/reports/chart_csv/chartC2_v2.csv", index=False)

Expand Down