Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run npm model on adult intake #99

Draft
wants to merge 16 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions ahl_targets/analysis/change_checks_Oct24/compare_new_to_old_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""This file does two things:
- Merges the new values back to the original file to check that for each unique purchase (defined by purchase_id and period) the key values are the same
(ie merging + transfer from diets) has worked as expected

On investigation, I found that there were unexpected products in the new file that weren't in the old file.
These were products that were in categories that were intentionally added back in.
The majority of these products had missing NPM scores, so were likely excluded in the original analysis for this reason.
~100 did have NPM scores, and it's hard to tell why they weren't included. However, this is a negligible number accounting for <1 kcal per person per day, so I've removed them and saved an updated file.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from nesta_ds_utils.loading_saving.S3 import upload_obj
from ahl_targets import BUCKET_NAME, PROJECT_DIR
from ahl_targets.utils import simulation_utils as su
from ahl_targets.getters import get_data
from ahl_targets.getters import simulated_outcomes as get_sim_data
import yaml


from ahl_targets.utils import diets
from ahl_targets.getters import get_data_v2 as g2
import logging


if __name__ == "__main__":

    # Scaling constants used to convert total kcal into per-person-per-day figures.
    # NOTE(review): 51,718,632 is presumably the UK adult population — confirm source.
    adult_pop = 51718632
    no_days = 365

    # Read in data: the original model file and the new (v2) model file

    orig_data = get_data.model_data()
    df = g2.new_model_data()

    # Merge NPM score and energy density onto the new data, keyed on purchase_id + period
    npm = get_data.full_npm()

    df_npm = df.merge(
        npm[["purchase_id", "period", "npm_score", "kcal_per_100g"]],
        on=["purchase_id", "period"],
        how="left",
    )

    # Ensure unique product IDs: concatenate purchase_id and period in both frames
    df_npm["unique_id"] = df_npm["purchase_id"].astype(str) + df_npm["period"].astype(
        str
    )
    orig_data["unique_id"] = orig_data["purchase_id"].astype(str) + orig_data[
        "period"
    ].astype(str)

    # Flag each record in the new file with whether it appeared in the original analysis
    df_npm["is_in_old"] = df_npm["unique_id"].isin(orig_data["unique_id"])

    # Rename variables to the equivalent column names used in the old model
    df_npm = df_npm.rename(
        columns={
            "panel_id": "Panel Id",
            "gross_up_weight": "Gross Up Weight",
            "volume": "volume_up",
            "store_level_3": "store_cat",
            "energy_kcal": "Energy KCal",
            "quantity": "Quantity",
            "spend": "Spend",
        }
    )

    # Compare matched products: left-join the new values onto the original file so
    # per-product differences in ED, NPM and volume can be checked (all matched)

    store_data_comp = orig_data.merge(
        df_npm[["purchase_id", "period", "npm_score", "kcal_per_100g", "volume_up"]],
        on=["purchase_id", "period"],
        how="left",
        suffixes=("", "_new"),
    )

    # Difference columns: 0 (or NaN for unmatched rows) means old and new agree
    store_data_comp["npm_diff"] = (
        store_data_comp["npm_score"] - store_data_comp["npm_score_new"]
    )
    store_data_comp["volume_diff"] = (
        store_data_comp["volume_up"] - store_data_comp["volume_up_new"]
    )
    store_data_comp["ed_diff"] = (
        store_data_comp["ed"] - store_data_comp["kcal_per_100g"]
    )

    logging.info(
        f"Number of records with different NPM: {store_data_comp[store_data_comp['npm_diff'] != 0].shape[0]}"
    )
    logging.info(
        f"Number of records with different volume: {store_data_comp[store_data_comp['volume_diff'] != 0].shape[0]}"
    )
    logging.info(
        f"Number of records with different ed: {store_data_comp[store_data_comp['ed_diff'] != 0].shape[0]}"
    )

    # Remove products that weren't in the categories added back in and weren't in the old model

    # List of categories intentionally added to the original targets file
    to_keep = [
        "Cooking Oils",
        "Total Ice Cream",
        "Fresh Cream",
        "Lards+Compounds",
        "Vinegar",
        "Breakfast Cereals",
        "Defined Milk+Cream Prd(B)",
    ]

    # Filter to products that weren't in the old model AND aren't in the added-back categories
    added_new = df_npm[~df_npm["is_in_old"]]
    added_surprise = added_new[~added_new["rst_4_market"].isin(to_keep)]

    # The majority of these have missing NPM scores (likely why they were excluded originally)

    logging.info(f"Total surprise additions: {added_surprise.shape[0]}")
    logging.info(
        f"Number dropped due to missing NPM scores: {added_surprise['npm_score'].isna().sum()}"
    )

    # Overall kcal per person per day of the surprise additions is minimal

    logging.info(
        f"Total kcal per person per day of surprise additions: {added_surprise['energy_kcal_weighted'].sum()/adult_pop/no_days}"
    )

    # Therefore, just remove them and check the baseline effect
    logging.info("Saving list of products to remove")

    # NOTE(review): this filtered `added_new` is not used again in this script —
    # looks like leftover from an earlier check; confirm before removing.
    added_new = added_new[~added_new["unique_id"].isin(added_surprise["unique_id"])]

    # Persist the removal list to S3 (CSV, no index column)
    upload_obj(
        added_surprise,
        BUCKET_NAME,
        "in_home/processed/targets/oct_24_update/additions_to_remove.csv",
        kwargs_writing={"index": False},
    )
135 changes: 135 additions & 0 deletions ahl_targets/analysis/change_checks_Oct24/new_swa_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# ---
# jupyter:
# jupytext:
# cell_metadata_filter: -all
# comment_magics: true
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.16.2
# kernelspec:
# display_name: ahl_targets
# language: python
# name: python3
# ---

# +
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from nesta_ds_utils.loading_saving.S3 import upload_obj
from ahl_targets import BUCKET_NAME, PROJECT_DIR
from ahl_targets.utils import simulation_utils as su
from ahl_targets.getters import get_data
from ahl_targets.getters import simulated_outcomes as get_sim_data
import yaml


from ahl_targets.utils import diets
from ahl_targets.getters import get_data_v2 as g2
import logging

# -

# Categories that were intentionally added back into the targets file
to_keep = [
    "Cooking Oils",
    "Total Ice Cream",
    "Fresh Cream",
    "Lards+Compounds",
    "Vinegar",
    "Breakfast Cereals",
    "Defined Milk+Cream Prd(B)",
]

# Product lookup (product_code -> market/category columns) used by get_swas
prod_table = get_data.product_metadata()

# Read in data: aggregated frames, with and without the volume adjustment
agg_df = g2.get_agg_data()
agg_df_adj = g2.get_agg_data_vol_adjusted()


def get_swas(store_weight_npm, prod_metadata=None):
    """Compute sales-weighted-average (SWA) NPM scores at three levels.

    Args:
        store_weight_npm: DataFrame with at least the columns ``kg_w`` (volume
            weight, assumed to already sum to 1 across the whole frame),
            ``npm_score``, ``store_cat`` and ``product_code``.
        prod_metadata: Product lookup containing ``product_code``,
            ``rst_4_market`` and ``rst_4_extended``. Defaults to the
            module-level ``prod_table``.

    Returns:
        A 4-tuple of:
        - total SWA NPM across the whole frame (float),
        - SWA NPM per ``store_cat`` (Series indexed by store),
        - SWA NPM per ``rst_4_market`` (Series indexed by market),
        - the working DataFrame with the intermediate weight columns and the
          merged-on market columns.

    Note:
        The input frame is mutated in place (``swa_npm``, ``kg_w_store_cat``
        and ``swa_npm_store_cat`` columns are added) before the merge creates
        a new frame; callers should not rely on it staying unchanged.
    """
    # None sentinel: resolve the module-level default at call time rather than
    # freezing whatever `prod_table` held when the function was defined.
    if prod_metadata is None:
        prod_metadata = prod_table

    # Market-wide SWA NPM: kg_w is assumed to already be a share of total volume
    store_weight_npm["swa_npm"] = (
        store_weight_npm["kg_w"] * store_weight_npm["npm_score"]
    )
    total_swa_npm = store_weight_npm["swa_npm"].sum()

    # Re-normalise weights within each store, then sum weighted scores by store
    store_weight_npm["kg_w_store_cat"] = store_weight_npm[
        "kg_w"
    ] / store_weight_npm.groupby("store_cat")["kg_w"].transform("sum")
    store_weight_npm["swa_npm_store_cat"] = (
        store_weight_npm["kg_w_store_cat"] * store_weight_npm["npm_score"]
    )
    store_swa_npm = (
        store_weight_npm["swa_npm_store_cat"]
        .groupby(store_weight_npm["store_cat"])
        .sum()
    )

    # Attach market categories; merge returns a new frame, so from here on the
    # caller's original frame is no longer modified
    store_weight_npm = store_weight_npm.merge(
        prod_metadata[["product_code", "rst_4_market", "rst_4_extended"]],
        on=["product_code"],
        how="left",
    )

    # Re-normalise weights within each rst_4_market, then sum weighted scores
    store_weight_npm["kg_w_rst_4_market"] = store_weight_npm[
        "kg_w"
    ] / store_weight_npm.groupby("rst_4_market")["kg_w"].transform("sum")
    store_weight_npm["swa_npm_rst_4_market"] = (
        store_weight_npm["kg_w_rst_4_market"] * store_weight_npm["npm_score"]
    )
    rst_4_market_swa_npm = (
        store_weight_npm["swa_npm_rst_4_market"]
        .groupby(store_weight_npm["rst_4_market"])
        .sum()
    )

    return total_swa_npm, store_swa_npm, rst_4_market_swa_npm, store_weight_npm


# +
# Import original data
# This was used to check that the SWA function works as expected (produces the
# same results as the original data). The import is commented out to reduce
# memory usage in the notebook.
# Original results saved here: https://docs.google.com/spreadsheets/d/1ED3rxbJzZNi6lgRLUvwsL0NWz3FLSko4G6oJ2ohbSM0/edit?gid=0#gid=0)

# orig_data = get_data.model_data()

# store_weight_npm_orig = su.weighted_npm(orig_data)
# store_weight_npm_orig["prod_weight_g"] = store_weight_npm_orig.pipe(su.prod_weight_g)


# Check the SWA NPM for the original file
# total_swa_npm_orig, swa_by_store_orig, swa_by_market, agg_df_orig = get_swas(store_weight_npm_orig)

# +
# Compare SWA for the volume-adjusted and non-volume-adjusted data

total_swa, swa_by_store, swa_by_market, swa_all = get_swas(agg_df)
total_swa_adj, swa_by_store_adj, swa_by_market_adj, swa_all_adj = get_swas(agg_df_adj)

# +
# Show SWA NPM values for the volume-adjusted added-back categories

swa_by_market_adj = swa_by_market_adj.reset_index()

# `display` is an IPython/notebook builtin; this cell is meant to run via jupytext
display(swa_by_market_adj[swa_by_market_adj["rst_4_market"].isin(to_keep)])

print(f"Total SWA NPM for volume-adjusted data: {total_swa_adj}")

# +
# Count HFSS products, overall and excluding the added-back categories
# NOTE(review): HFSS is flagged as npm_score > 4; the UK NPM "less healthy"
# cut-off for food is a score of 4 or more — confirm whether this should be >= 4.
agg_df_adj["hfss"] = agg_df_adj["npm_score"] > 4

agg_df_adj["hfss"].value_counts()

agg_df_adj[~agg_df_adj["rst_4_market"].isin(to_keep)]["hfss"].value_counts()
Loading