diff --git a/ahl_targets/analysis/change_checks_Oct24/compare_new_to_old_file.py b/ahl_targets/analysis/change_checks_Oct24/compare_new_to_old_file.py new file mode 100644 index 0000000..0c9521e --- /dev/null +++ b/ahl_targets/analysis/change_checks_Oct24/compare_new_to_old_file.py @@ -0,0 +1,141 @@ +"""This file does two things: +- Merges the new values back to the original file to check that for each unique purchase (defined by purchase_id and period) the key values are the same +(ie merging + transfer from diets) has worked as expected + +On investigation, I found that there were unexpected products in the new file that weren't in the old file. +These were products that were in categories that were intentionally added back in. +The majority of these products had missing NPM scores, so were likely excluded in the original analysis for this reason. +~100 had, and it's hard to tell why they weren't included. However, this is a negligible number accounting for <1kcal pp per day, so I've removed them and saved an updated file. +""" + +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + + +from nesta_ds_utils.loading_saving.S3 import upload_obj +from ahl_targets import BUCKET_NAME, PROJECT_DIR +from ahl_targets.utils import simulation_utils as su +from ahl_targets.getters import get_data +from ahl_targets.getters import simulated_outcomes as get_sim_data +import yaml + + +from ahl_targets.utils import diets +from ahl_targets.getters import get_data_v2 as g2 +import logging + + +if __name__ == "__main__": + + adult_pop = 51718632 + no_days = 365 + + # Read in data + + orig_data = get_data.model_data() + df = g2.new_model_data() + + # Merge on NPM score + npm = get_data.full_npm() + + df_npm = df.merge( + npm[["purchase_id", "period", "npm_score", "kcal_per_100g"]], + on=["purchase_id", "period"], + how="left", + ) + + # Ensure unique products IDs + df_npm["unique_id"] = df_npm["purchase_id"].astype(str) + df_npm["period"].astype( + str + ) + orig_data["unique_id"] = orig_data["purchase_id"].astype(str) + orig_data[ + "period" + ].astype(str) + + # Update the new data file with an indicator of whether it was in the original analysis + df_npm["is_in_old"] = df_npm["unique_id"].isin(orig_data["unique_id"]) + + # Rename variables the equivalent in the old model + df_npm = df_npm.rename( + columns={ + "panel_id": "Panel Id", + "gross_up_weight": "Gross Up Weight", + "volume": "volume_up", + "store_level_3": "store_cat", + "energy_kcal": "Energy KCal", + "quantity": "Quantity", + "spend": "Spend", + } + ) + + # Get the same products and check ED and NPM score differences at product level - all matches + + store_data_comp = orig_data.merge( + df_npm[["purchase_id", "period", "npm_score", "kcal_per_100g", "volume_up"]], + on=["purchase_id", "period"], + how="left", + suffixes=("", "_new"), + ) + + store_data_comp["npm_diff"] = ( + store_data_comp["npm_score"] - store_data_comp["npm_score_new"] + ) + store_data_comp["volume_diff"] = ( + store_data_comp["volume_up"] - store_data_comp["volume_up_new"] + ) + store_data_comp["ed_diff"] = ( + store_data_comp["ed"] - store_data_comp["kcal_per_100g"] + ) + + logging.info( + f"Number of records with different NPM: {store_data_comp[store_data_comp['npm_diff'] != 0].shape[0]}" + ) + logging.info( + f"Number of records with different volume: {store_data_comp[store_data_comp['volume_diff'] != 0].shape[0]}" + ) + logging.info( + f"Number of records with different ed: {store_data_comp[store_data_comp['ed_diff'] != 0].shape[0]}" + ) + + # 
Remove products that weren't in the categories added back in and weren't in the old model + + # List of categories intentionally added to original targets file + to_keep = [ + "Cooking Oils", + "Total Ice Cream", + "Fresh Cream", + "Lards+Compounds", + "Vinegar", + "Breakfast Cereals", + "Defined Milk+Cream Prd(B)", + ] + + # Filter for products that weren't in the old model, or in the categories to add + added_new = df_npm[~df_npm["is_in_old"]] + added_surprise = added_new[~added_new["rst_4_market"].isin(to_keep)] + + # The majority of these have missing NPM scores (likely why they were excluded in the original) + + logging.info(f"Total surprise additions: {added_surprise.shape[0]}") + logging.info( + f"Number dropped due to missing NPM scores: {added_surprise['npm_score'].isna().sum()}" + ) + + # Overall kcal per person per day is minimal + + logging.info( + f"Total kcal per person per day of surprise additions: {added_surprise['energy_kcal_weighted'].sum()/adult_pop/no_days}" + ) + + # Therefore, just remove them and check the baseline effect + logging.info("Saving list of products to remove") + + added_new = added_new[~added_new["unique_id"].isin(added_surprise["unique_id"])] + + upload_obj( + added_surprise, + BUCKET_NAME, + "in_home/processed/targets/oct_24_update/additions_to_remove.csv", + kwargs_writing={"index": False}, + ) diff --git a/ahl_targets/analysis/change_checks_Oct24/new_swa_analysis.py b/ahl_targets/analysis/change_checks_Oct24/new_swa_analysis.py new file mode 100644 index 0000000..759ce14 --- /dev/null +++ b/ahl_targets/analysis/change_checks_Oct24/new_swa_analysis.py @@ -0,0 +1,135 @@ +# --- +# jupyter: +# jupytext: +# cell_metadata_filter: -all +# comment_magics: true +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.16.2 +# kernelspec: +# display_name: ahl_targets +# language: python +# name: python3 +# --- + +# + +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + + +from nesta_ds_utils.loading_saving.S3 import upload_obj +from ahl_targets import BUCKET_NAME, PROJECT_DIR +from ahl_targets.utils import simulation_utils as su +from ahl_targets.getters import get_data +from ahl_targets.getters import simulated_outcomes as get_sim_data +import yaml + + +from ahl_targets.utils import diets +from ahl_targets.getters import get_data_v2 as g2 +import logging + +# - + +to_keep = [ + "Cooking Oils", + "Total Ice Cream", + "Fresh Cream", + "Lards+Compounds", + "Vinegar", + "Breakfast Cereals", + "Defined Milk+Cream Prd(B)", +] + +prod_table = get_data.product_metadata() + +# Read in data +agg_df = g2.get_agg_data() +agg_df_adj = g2.get_agg_data_vol_adjusted() + + +def get_swas(store_weight_npm, prod_metadata=prod_table): + + # Calculate SWA NPM + store_weight_npm["swa_npm"] = ( + store_weight_npm["kg_w"] * store_weight_npm["npm_score"] + ) + + # Calculate SWA NPM by market + total_swa_npm = store_weight_npm["swa_npm"].sum() + + # Calculate SWA NPM by store + store_weight_npm["kg_w_store_cat"] = store_weight_npm[ + "kg_w" + ] / store_weight_npm.groupby("store_cat")["kg_w"].transform("sum") + store_weight_npm["swa_npm_store_cat"] = ( + store_weight_npm["kg_w_store_cat"] * store_weight_npm["npm_score"] + ) + store_swa_npm = ( + store_weight_npm["swa_npm_store_cat"] + .groupby(store_weight_npm["store_cat"]) + .sum() + ) + + # Merge back markets + store_weight_npm = store_weight_npm.merge( + prod_metadata[["product_code", "rst_4_market", "rst_4_extended"]], + on=["product_code"], + how="left", 
+ ) + + # Calculate SWA NPM by rst_4_market + store_weight_npm["kg_w_rst_4_market"] = store_weight_npm[ + "kg_w" + ] / store_weight_npm.groupby("rst_4_market")["kg_w"].transform("sum") + store_weight_npm["swa_npm_rst_4_market"] = ( + store_weight_npm["kg_w_rst_4_market"] * store_weight_npm["npm_score"] + ) + rst_4_market_swa_npm = ( + store_weight_npm["swa_npm_rst_4_market"] + .groupby(store_weight_npm["rst_4_market"]) + .sum() + ) + + return total_swa_npm, store_swa_npm, rst_4_market_swa_npm, store_weight_npm + + +# + +# Import original data +# I did this to check that the SWA function works as expected (produces the same results as the original data). Have commented out the import to reduce the memory usage in the notebook +# Original results saved here: https://docs.google.com/spreadsheets/d/1ED3rxbJzZNi6lgRLUvwsL0NWz3FLSko4G6oJ2ohbSM0/edit?gid=0#gid=0) + +# orig_data = get_data.model_data() + +# store_weight_npm_orig = su.weighted_npm(orig_data) +# store_weight_npm_orig["prod_weight_g"] = store_weight_npm_orig.pipe(su.prod_weight_g) + + +# Check the SWA NPM for the original file +# total_swa_npm_orig, swa_by_store_orig, swa_by_market, agg_df_orig = get_swas(store_weight_npm_orig) + +# + +# Compare SWA for volume-adjusted and non-volume adjusted data + +total_swa, swa_by_store, swa_by_market, swa_all = get_swas(agg_df) +total_swa_adj, swa_by_store_adj, swa_by_market_adj, swa_all_adj = get_swas(agg_df_adj) + +# + +# Plot SWA NPM values for volume-adjusted added categories + +swa_by_market_adj = swa_by_market_adj.reset_index() + +display(swa_by_market_adj[swa_by_market_adj["rst_4_market"].isin(to_keep)]) + +print(f"Total SWA NPM for volume-adjusted data: {total_swa_adj}") + +# + +# Get count of HFSS products with and within added categories +agg_df_adj["hfss"] = agg_df_adj["npm_score"] > 4 + +agg_df_adj["hfss"].value_counts() + +agg_df_adj[~agg_df_adj["rst_4_market"].isin(to_keep)]["hfss"].value_counts() diff --git a/ahl_targets/getters/get_data_v2.py b/ahl_targets/getters/get_data_v2.py new file mode 100644 index 0000000..40e8f24 --- /dev/null +++ b/ahl_targets/getters/get_data_v2.py @@ -0,0 +1,141 @@ +"""Getters for files required for the 2024 updates. Some of these are taken from the diets repository: https://github.com/nestauk/ahl_diets_evidence""" + +# Imports +from nesta_ds_utils.loading_saving.S3 import download_obj +import pandas as pd +from ahl_targets import BUCKET_NAME +from boto3.s3.transfer import TransferConfig + + +# Functions +def new_model_data() -> pd.DataFrame: + """Reads the new model data file. + + Returns: + pd.DataFrame: new model data + """ + + return download_obj( + BUCKET_NAME, + "in_home/processed/retailer_targets_baseline/baseline_retailer_targets_input_2024.parquet", + download_as="dataframe", + kwargs_boto={"Config": TransferConfig(io_chunksize=20947892)}, + ) + + +def coefficients_2024() -> pd.DataFrame: + """Reads the new coefficients file. + + Returns: + pd.DataFrame: new coefficients + """ + + return download_obj( + BUCKET_NAME, + "in_home/processed/targets/oct_24_update/coefficients.parquet", + download_as="dataframe", + kwargs_boto={"Config": TransferConfig(io_chunksize=20947892)}, + ) + + +def inhome_purchase_apr_dec(): + """Returns the in-home purchase subset based on project requirements. 
+ Returns: + pd.DataFrame: in-home purchase subset + """ + return download_obj( + BUCKET_NAME, + "diets_sotn/purchase_files_subsetted/inhome_purchase_apr_dec.parquet", + download_as="dataframe", + kwargs_boto={"Config": TransferConfig(io_chunksize=20947892)}, + ) + + +def nutrition_inhome() -> pd.DataFrame: + """Reads the nutrition data file. + + Returns: + pd.DataFrame: nutrition data + """ + + return download_obj( + BUCKET_NAME, + "in_home/processed/nutrition_clean.parquet", + download_as="dataframe", + kwargs_boto={"Config": TransferConfig(io_chunksize=20947892)}, + ) + + +def product_table_inhome() -> pd.DataFrame: + """Reads the product table file. + + Returns: + pd.DataFrame: product table + """ + + return download_obj( + BUCKET_NAME, + "in_home/processed/product_metadata.csv", + download_as="dataframe", + ) + + +def store_table_inhome() -> pd.DataFrame: + """Reads the store table file. + + Returns: + pd.DataFrame: store table + """ + + return download_obj( + BUCKET_NAME, + "in_home/processed/store_table.csv", + download_as="dataframe", + ) + + +def get_demographics_data(): + """Returns the household demographics data for all individuals in the both in home and out of home panels. + Returns: + pd.DataFrame: demographics data + """ + return download_obj( + BUCKET_NAME, + "ooh/processed/household_demog_table_v3.csv", + download_as="dataframe", + ) + + +def get_products_to_drop(): + """Returns a series of products that were not included in the original analysis (mostly due to missing NPM scores). + Returns: + pd.Series: unique_ids of products to drop + """ + return download_obj( + BUCKET_NAME, + "in_home/processed/targets/oct_24_update/additions_to_remove.csv", + download_as="dataframe", + ) + + +def df_npm_2024(): + """Returns the new model data (2024) with npm merged on. + Returns: + pd.DataFrame: new model data + """ + return download_obj( + BUCKET_NAME, + "in_home/processed/targets/oct_24_update/df_npm.parquet", + download_as="dataframe", + kwargs_boto={"Config": TransferConfig(io_chunksize=20947892)}, + ) + + +def get_agg_data_2024(): + """Returns the new model data (2024) aggregated by store. *This is the input to the npm_simulation model in* `ahl_targets/pipeline/2024_update/simulation_npm.py`.""" + return download_obj( + BUCKET_NAME, + "in_home/processed/targets/oct_24_update/store_weight.parquet", + download_as="dataframe", + kwargs_boto={"Config": TransferConfig(io_chunksize=20947892)}, + ) diff --git a/ahl_targets/pipeline/2024_update/README.md b/ahl_targets/pipeline/2024_update/README.md new file mode 100644 index 0000000..39a2e87 --- /dev/null +++ b/ahl_targets/pipeline/2024_update/README.md @@ -0,0 +1,94 @@ +UPDATES IN PROGRESS (27/11/24) - README TO BE UPDATED! + +In October 2024 we updated the targets model used to produce retailer targets numbers. + +The update changed the input data to: + +- [FOR NOW] Read in data from the ahl_diets_evidence repo, created in this script: +- [TBC FUTURE WORK] Read in data from ahl_core_data, including updated NPM scores (with differences for <0.25% to original scores) +- Adjust the kcal and volume values to reflect the adult proportions of household intake created in the diets work +- Add in categories excluded in the retailer targets original work as they were measured in litres, but judged as in scope when reviewing NPM technical guidance (https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/694145/Annex__A_the_2018_review_of_the_UK_nutrient_profiling_model.pdf). 
The categories added back in are the rst_4_market categories "Cooking Oils", "Total Ice Cream", "Fresh Cream", "Lards+Compounds", "Vinegar", "Breakfast Cereals", "Defined Milk+Cream Prd(B)".
+- Adjust the volume of the added categories to create equivalent measures between kg and litres
+- Remove 35176 products which were not included in the original targets work and are not in the categories intentionally added back in. Of these, 35130 were missing NPM values and the remaining 46 account for just 0.82kcal pp per day. Analysis of this and creation of the list of products to remove are saved in
+
+These changes are implemented in
+
+[TBC] We also made minor adjustments to the model to:
+
+[TBC] This resulted in changes to both the population total kcal reduction from the model, and the SWA NPM baseline of each store. Updated charts are saved in:
+
+###############################################
+
+Updated Readme in progress below:
+
+# NPM Modelling Update 02/12/2024
+
+All of these changes are implemented in [].
+
+## Summary
+
+This folder contains the scripts used to update the input data and methodology used in calculating retailer health targets. Since the development of this model, subsequent work, such as the "Diets: State of the Nation" (DSotN) report, has enabled us to iterate on and improve the impact modelling. Here we present an overview of these updates and their impact, followed by more detail on the changes.
+
+## Outcome
+
+### Impact
+
+On kcal_pp_per_day
+On swa_npm shifts by store
+
+Include figures
+
+## Updates overview
+
+### 1. Input data update
+
+The input data file has been updated to be consistent with the DSotN output. This should have no material effect, as the underlying data are the same; they are simply processed more consistently, which makes cross-project work easier. This file is currently created here [to add]
+
+[To do: Update to read in from `ahl_core_data` instead]
+
+### 2. Only consider adult calorie intake
+
+The model is updated to calculate the impact of the retailer targets model on adult calorie intake only. This makes the mean more meaningful, as it is no longer dragged down by age groups that consume far fewer calories per day.
+
+This is done using a methodology developed in the DSotN project, whereby the average kcal intake of children is estimated from their age and gender, and a conversion factor for the proportion of a household's intake attributable to adults is calculated. The functions used to calculate this can be found in `ahl_targets/utils/diets.py`.
+
+### 3. Increasing the number of categories in scope
+
+Several food categories were excluded from the original modelling because they are measured in litres, but they are judged to be in scope after reviewing the [NPM technical guidance](https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/694145/Annex__A_the_2018_review_of_the_UK_nutrient_profiling_model.pdf). These categories are:
+
+- Cooking Oils
+- Total Ice Cream
+- Fresh Cream
+- Lards+Compounds
+- Vinegar
+- Breakfast Cereals
+- Defined Milk+Cream Prd(B)
+
+N.B. The category field used in the data is `rst_4_market`.
+
+### 4. Apply specific gravity adjustment
+
+The above food categories have their volume measured in litres and have varying densities, so the water-density approximation (1L ~ 1kg) is insufficiently accurate. Currently this adjustment is applied at the `rst_4_market_sector` category level and uses guidelines found here [To add]
+
+### 5. 
Remove data entry errors + +There exist some outlying products in the data that are now deemed to be data entry errors. Those removed include those that appear to have an energy density of greater than 900 kcal/100g (impossible) and those with a volume of 0. This is best practice although the impact on the result is negligible: 0.0075% of the total products are removed. + +## Detail: Quantifying the impact of changes + +The natural metrics for quantfying the impact of the changes above is through their effects on: + +- The mean calorie reduction per person per day in the simulation outcome +- The npm reduction for each store predicted in the simulated outcome + +The table below shows the impact of each of these updated steps (cumulatively): + +[to add] + +## Detail: Understanding the relationship of the input data with the results of the DSotN report + +The published mean kcal consumption of food bought for purchase in-home is x. The flowchart below shows how this number becomes the y input to the retailer targets model. + +## Detail: Additional robustness considerations + +... diff --git a/ahl_targets/pipeline/coefficients.py b/ahl_targets/pipeline/2024_update/coefficients.py similarity index 83% rename from ahl_targets/pipeline/coefficients.py rename to ahl_targets/pipeline/2024_update/coefficients.py index a45581a..9ea22cd 100644 --- a/ahl_targets/pipeline/coefficients.py +++ b/ahl_targets/pipeline/2024_update/coefficients.py @@ -1,4 +1,6 @@ -from ahl_targets.getters import get_data +"""2024 Update: re-calculating the coefficients for the new baseline file""" + +from ahl_targets.getters import get_data_v2 as g2 import pandas as pd import statsmodels.api as sm from nesta_ds_utils.loading_saving.S3 import upload_obj @@ -6,10 +8,12 @@ if __name__ == "__main__": - store_data = get_data.model_data() - reg_data = store_data[ - ["product_code", "npm_score", "ed", "rst_4_market_sector"] - ].drop_duplicates() + store_data = g2.df_npm_2024() + reg_data = ( + store_data[["product_code", "npm_score", "ed", "rst_4_market_sector"]] + .drop_duplicates() + .dropna() + ) coefficients = [] error = [] @@ -81,13 +85,6 @@ upload_obj( coefficients_df, BUCKET_NAME, - "in_home/processed/targets/coefficients.csv", - kwargs_writing={"index": False}, - ) - - upload_obj( - combined_df, - BUCKET_NAME, - "in_home/processed/targets/ed_npm_regression_output.csv", - kwargs_writing={"index": False}, + "in_home/processed/targets/oct_24_update/coefficients.parquet", + kwargs_writing={"compression": "zstd", "engine": "pyarrow"}, ) diff --git a/ahl_targets/pipeline/2024_update/create_new_datafile.py b/ahl_targets/pipeline/2024_update/create_new_datafile.py new file mode 100644 index 0000000..d9e62fa --- /dev/null +++ b/ahl_targets/pipeline/2024_update/create_new_datafile.py @@ -0,0 +1,125 @@ +""" +2024 Update: This script generates the new input data file for the retailer targets model. For detailed information on the updates applied and reasoning refer to the README: `ahl_targets/pipeline/2024_update/README.md`. +""" + +from nesta_ds_utils.loading_saving.S3 import upload_obj +from ahl_targets import BUCKET_NAME +from ahl_targets.utils import simulation_utils as su +from ahl_targets.getters import get_data +from ahl_targets.utils import diets +from ahl_targets.getters import get_data_v2 as g2 +import logging + +# Define variables +adult_pop = 51718632 +no_days = 365 + +if __name__ == "__main__": + logging.info("Building model data. 
This script takes about 10 minutes to run") + # Read in the new model file (details on how this is created can be found in the diets repository: https://github.com/nestauk/ahl_diets_evidence/blob/45_targets_data/ahl_diets_evidence/pipeline/number_calories_gb_retailer_checks.py) + df = g2.new_model_data() + + # Merge on NPM scores + npm = get_data.full_npm() + + df_npm = df.merge( + npm[["purchase_id", "period", "npm_score"]], + on=["purchase_id", "period"], + how="left", + ) + + # Create unique products IDs + df_npm["unique_id"] = df_npm["purchase_id"].astype(str) + df_npm["period"].astype( + str + ) + + # Quick fix: Rename variables the same names as in the old model. This allows us to reuse the same functions. + df_npm = df_npm.rename( + columns={ + "panel_id": "Panel Id", + "gross_up_weight": "Gross Up Weight", + "adjusted_volume": "volume_up", + "store_level_3": "store_cat", + "energy_kcal": "Energy KCal", + "quantity": "Quantity", + "spend": "Spend", + } + ) + + ## Apply adult intake conversion factors + logging.info("Adjusting kcal and volume values to reflect adult intake") + + # Load demographic data and calculate adult intake conversion factors + demog_df = g2.get_demographics_data() + adult_intake = diets.adult_intake(demog_df) + + # Get total prop intake for each household + adult_intake = adult_intake.groupby("Panel Id")["prop_intake"].sum().reset_index() + + # Merge adult intake to store_data + df_npm = df_npm.merge( + adult_intake[["Panel Id", "prop_intake"]], on="Panel Id", how="left" + ) + + ##Adjust kcal and volume values to reflect adult intake + + # Set original names to "old" + df_npm = df_npm.rename( + columns={"Energy KCal": "old_kcal", "volume_up": "old_volume_up"} + ) + + # Adjust kcal and volume by adult prop intake + df_npm["Energy KCal"] = df_npm["old_kcal"] * df_npm["prop_intake"] + df_npm["volume_up"] = df_npm["old_volume_up"] * df_npm["prop_intake"] + df_npm["weighted_kcal"] = df_npm["Energy KCal"] * df_npm["Gross Up Weight"] + + #### Update: This section drops a few products that didn't appear in the original file. Going to keep them in for now, not sure why they were dropped. + + # Drop products that weren't in the old file (and aren't intentially added back in) + + added_surprise = g2.get_products_to_drop() + + df_npm = df_npm[~df_npm["unique_id"].isin(added_surprise["unique_id"].astype(str))] + + logging.info( + f"Kcal pp per day baseline: {df_npm['weighted_kcal'].sum() / adult_pop / no_days}" + ) + + ## Calculate weighted NPM dataframe + logging.info("Calculating weighted NPM dataframe (model input)") + + # Load product data + prod_table = get_data.product_metadata() + + # Calculate weighted npm for each store + store_weight_npm = su.weighted_npm(df_npm) + + # Calculate product weights in grams + store_weight_npm["prod_weight_g"] = store_weight_npm.pipe(su.prod_weight_g) + + logging.info( + "NPM SWA baseline: {}".format( + (store_weight_npm["npm_score"] * store_weight_npm["kg_w"]).sum() + ) + ) + + save_prompt = input( + "Would you like to save and overwrite the existing model on S3? 
(y/n)" + ) + + if save_prompt == "y": + logging.info("Saving the new data files to S3") + + upload_obj( + df_npm, + BUCKET_NAME, + "in_home/processed/targets/oct_24_update/df_npm.parquet", + kwargs_writing={"compression": "zstd", "engine": "pyarrow"}, + ) + + upload_obj( + store_weight_npm, + BUCKET_NAME, + "in_home/processed/targets/oct_24_update/store_weight.parquet", + kwargs_writing={"compression": "zstd", "engine": "pyarrow"}, + ) diff --git a/ahl_targets/pipeline/simulation_npm.py b/ahl_targets/pipeline/2024_update/simulation_npm.py similarity index 77% rename from ahl_targets/pipeline/simulation_npm.py rename to ahl_targets/pipeline/2024_update/simulation_npm.py index b06d835..8855fb3 100644 --- a/ahl_targets/pipeline/simulation_npm.py +++ b/ahl_targets/pipeline/2024_update/simulation_npm.py @@ -1,41 +1,36 @@ +""" +### 2024 Update ### + +This is a copy of the original npm simulation model (`ahl_targets/pipeline/simulation_npm_legacy.py`) with the following adjustements: +- The input data has been updated based on following of updated model assumptions. +- A few additional edits to improve model performance and readability. + +See full details of the 2024 update to the npm simulation model in `ahl_targets/pipeline/2024_update/README.md`. + +""" + import pandas as pd import numpy as np from nesta_ds_utils.loading_saving.S3 import upload_obj from ahl_targets import BUCKET_NAME, PROJECT_DIR from ahl_targets.utils import simulation_utils as su from ahl_targets.getters import get_data -from ahl_targets.getters import simulated_outcomes as get_sim_data import yaml +import logging +from ahl_targets.getters import get_data_v2 as g2 -if __name__ == "__main__": - with open( - f"{PROJECT_DIR}/ahl_targets/config/npm_model.yaml", - "r", - ) as f: - modeling_params = yaml.safe_load(f) - - num_iterations = modeling_params["num_iterations"] - product_share_reform_values = modeling_params["product_share_reform_values"] - product_share_sales_values = modeling_params["product_share_sales_values"] - npm_reduction_values = modeling_params["npm_decrease_values"] - unhealthy_sales_change_values = modeling_params["unhealthy_sales_change_values"] - healthy_sales_change_values = modeling_params["healthy_sales_change_values"] - - # set seed for reproducibility - - np.random.seed(42) - - # read data - - store_data = get_data.model_data() - prod_table = get_data.product_metadata() - - store_weight_npm = su.weighted_npm(store_data) - store_weight_npm["prod_weight_g"] = store_weight_npm.pipe(su.prod_weight_g) - - # Print the coefficients table - coefficients_df = get_sim_data.coefficients_df() +def simulation_npm( + store_weight_npm: pd.DataFrame, + num_iterations: list, + product_share_reform_values: list, + product_share_sales_values: list, + npm_reduction_values: list, + unhealthy_sales_change_values: list, + healthy_sales_change_values: list, + coefficients_df: pd.DataFrame, + prod_table: pd.DataFrame, +) -> pd.DataFrame: results = [] results_data = [] @@ -52,6 +47,8 @@ high_npm = store_weight_npm[npm_cut].copy() low_npm = store_weight_npm[~npm_cut].copy() + # Note there are a few products with NaN npm values that will be assigned to low_npm. 
+ unique_products = pd.DataFrame( store_weight_npm[(store_weight_npm["npm_score"] >= 4)][ "product_code" @@ -147,15 +144,16 @@ mean_npm_kg_baseline = ( randomised["kg_w"] * randomised["npm_score"] ).sum() + mean_npm_kcal_baseline = ( randomised["kcal_w"] * randomised["npm_score"] ).sum() kcal_pp_baseline = ( - randomised["total_kcal"].sum() / 65121700 / 365 + randomised["total_kcal"].sum() / 51718632 / 365 ) kcal_pp_new = ( - randomised["new_kcal_tot"].sum() / 65121700 / 365 + randomised["new_kcal_tot"].sum() / 51718632 / 365 ) total_prod_baseline = randomised["total_prod"].sum() @@ -163,7 +161,7 @@ spend_baseline = ( ((randomised["total_prod"] * randomised["spend"]).sum()) - / 65121700 + / 51718632 / 52 ) spend_new = ( @@ -173,7 +171,7 @@ * randomised["spend"] ).sum() ) - / 65121700 + / 51718632 / 52 ) @@ -208,15 +206,86 @@ ) ) - # Create the DataFrame from the list of results - results_df = pd.DataFrame(results) + return pd.DataFrame(results) + + +if __name__ == "__main__": + with open( + f"{PROJECT_DIR}/ahl_targets/config/npm_model.yaml", + "r", + ) as f: + modeling_params = yaml.safe_load(f) + + num_iterations = modeling_params["num_iterations"] + product_share_reform_values = modeling_params["product_share_reform_values"] + product_share_sales_values = modeling_params["product_share_sales_values"] + npm_reduction_values = modeling_params["npm_decrease_values"] + unhealthy_sales_change_values = modeling_params["unhealthy_sales_change_values"] + healthy_sales_change_values = modeling_params["healthy_sales_change_values"] + + # set seed for reproducibility + + # np.random.seed(42) - results_data_df = pd.concat(results_data, ignore_index=True) + # Read in main dataframe - # upload to S3 - upload_obj( - results_df, - BUCKET_NAME, - "in_home/processed/targets/npm_agg.csv", - kwargs_writing={"index": False}, + store_weight_npm = g2.get_agg_data_2024() + + logging.info( + "kcal pp baseline: {}".format( + store_weight_npm["total_kcal"].sum() / 51718632 / 365 + ) + ) + logging.info( + "NPM SWA baseline: {}".format( + (store_weight_npm["npm_score"] * store_weight_npm["kg_w"]).sum() + ) ) + + # Read in product metadata + prod_table = get_data.product_metadata() + + # Get coefficients for relationship between NPM and ED + coefficients_df = g2.coefficients_2024() + + # Run simulation + results_df = simulation_npm( + store_weight_npm, + num_iterations, + product_share_reform_values, + product_share_sales_values, + npm_reduction_values, + unhealthy_sales_change_values, + healthy_sales_change_values, + coefficients_df, + prod_table, + ) + + kcal_diff = ( + (results_df["kcal_pp_baseline"] - results_df["kcal_pp_new"]).mean().round(2) + ) + + # Print kcal pp baseline + logging.info("Kcal pp new: {}".format(results_df["kcal_pp_new"].mean())) + logging.info("NPM SWA new: {}".format(results_df["mean_npm_kg_new"].mean())) + + logging.info( + "Difference in kcal pp: {}".format( + (results_df["kcal_pp_baseline"] - results_df["kcal_pp_new"]).mean() + ) + ) + + save_prompt = input( + "Would you like to save and overwrite the existing model on S3? 
(y/n)" + ) + + if save_prompt == "y": + + logging.info("Saving the model output to S3") + + upload_obj( + results_df, + BUCKET_NAME, + f"in_home/processed/targets/oct_24_update/model_results_{kcal_diff}.csv", + kwargs_writing={"index": False}, + ) diff --git a/ahl_targets/pipeline/coefficients_legacy.py b/ahl_targets/pipeline/coefficients_legacy.py new file mode 100644 index 0000000..ab2f91d --- /dev/null +++ b/ahl_targets/pipeline/coefficients_legacy.py @@ -0,0 +1,99 @@ +""" +######### +This script is now outdated. Please find updated npm model in `pipeline/2024_updates/ahl_targets/pipeline/2024_updates/simulation_npm.py`. +######### +""" + +from ahl_targets.getters import get_data +import pandas as pd +import statsmodels.api as sm +from nesta_ds_utils.loading_saving.S3 import upload_obj +from ahl_targets import BUCKET_NAME + + +if __name__ == "__main__": + store_data = get_data.model_data() + reg_data = store_data[ + ["product_code", "npm_score", "ed", "rst_4_market_sector"] + ].drop_duplicates() + + coefficients = [] + error = [] + stderror = [] + lowci = [] + highci = [] + + # Iterate over unique categories and run the regression for each subset + for category in reg_data["rst_4_market_sector"].unique(): + subset = reg_data[reg_data["rst_4_market_sector"] == category] + + # Fit the regression model + X = subset["npm_score"] + X = sm.add_constant(X) # Add a constant term to include intercept + y = subset["ed"] + model = sm.OLS(y, X) + results = model.fit() + + # Extract the intercept and coefficient values + coefficient = results.params[1] + + # Add the coefficients to the DataFrame + coefficients.append( + {"rst_4_market_sector": category, "Coefficient": coefficient} + ) + + # Extract standard error + std_error = results.bse[1] + + # Add the standard error to the DataFrame + stderror.append({"rst_4_market_sector": category, "Standard Error": std_error}) + + # Extract R squared + rsquared = results.rsquared + + # Add the R squared to the DataFrame + error.append({"rst_4_market_sector": category, "R Squared": rsquared}) + + # Extract confidence intervals for coefficients + low = results.conf_int(alpha=0.05).iloc[1, 0] + high = results.conf_int(alpha=0.05).iloc[1, 1] + + lowci.append({"rst_4_market_sector": category, "Low CI": low}) + highci.append({"rst_4_market_sector": category, "High CI": high}) + + # Print the coefficients table + coefficients_df = pd.DataFrame(coefficients) + + # Print the error table + error_df = pd.DataFrame(error) + + # Print the standard error table + stderror_df = pd.DataFrame(stderror) + + # Print the confidence interval tables + lowci_df = pd.DataFrame(lowci) + highci_df = pd.DataFrame(highci) + + # combine tables + combined_df = ( + coefficients_df.merge(stderror_df, on="rst_4_market_sector") + .merge(error_df, on="rst_4_market_sector") + .merge(lowci_df, on="rst_4_market_sector") + .merge(highci_df, on="rst_4_market_sector") + .drop_duplicates() + ) + + # upload to S3 + upload_obj( + coefficients_df, + BUCKET_NAME, + "in_home/processed/targets/coefficients.csv", + kwargs_writing={"index": False}, + ) + + upload_obj( + combined_df, + BUCKET_NAME, + "in_home/processed/targets/ed_npm_regression_output.csv", + kwargs_writing={"index": False}, + ) diff --git a/ahl_targets/pipeline/model_data_diets.py b/ahl_targets/pipeline/model_data_diets.py new file mode 100644 index 0000000..0f777b2 --- /dev/null +++ b/ahl_targets/pipeline/model_data_diets.py @@ -0,0 +1,183 @@ +""" +This script aims to prepare the data by applying the same exclusions as in diets: +1) Cutting 
out Jan-Mar data +2) Leaving drinks in +3) Leaving products with (ed<900) in - Flag - if this is making the change we probably shouldn't switch to doing this! +4) Leaving all stores in +""" + +from ahl_targets.getters import get_data +from ahl_targets.utils import stores_transformation as stores +from ahl_targets.utils import product_transformation as product +from nesta_ds_utils.loading_saving.S3 import upload_obj +from ahl_targets.utils.io import load_with_encoding +import logging +from ahl_targets import BUCKET_NAME +import pandas as pd +from ahl_targets import PROJECT_DIR + + +# Getters for files that have local paths +def get_store_itemisation_line() -> pd.DataFrame: + """Reads the dataset of store lines.""" + return pd.read_csv( + load_with_encoding( + BUCKET_NAME, "in_home/latest_data/store_itemisation_lines.csv" + ), + encoding="ISO-8859-1", + ) + + +def get_store_itemisation_coding() -> pd.DataFrame: + """Reads the dataset of store codes.""" + return pd.read_csv( + load_with_encoding( + BUCKET_NAME, "in_home/latest_data/store_itemisation_coding.csv" + ), + encoding="ISO-8859-1", + ) + + +if __name__ == "__main__": + store_coding = get_store_itemisation_coding() + store_lines = get_store_itemisation_line() + pur_rec_vol = get_data.purchase_records_volume() + prod_table = get_data.product_metadata() + npm = get_data.full_npm() + nut = ( + get_data.nutrition() + ) # Flag: Have downloaded this file from : https://eu-west-2.console.aws.amazon.com/s3/object/ahl-private-data?region=eu-west-2&bucketType=general&prefix=in_home/latest_data/nutrition_data.csv + + logging.info("This script takes about 20 minutes to run") + + # define stores to keep + keep_stores = [ + "Total Tesco", + "Total Sainsbury's", + "Total Asda", + "Total Morrisons", + "Aldi", + "Lidl", + "Total Waitrose", + "The Co-Operative", + "Total Marks & Spencer", + "Total Iceland", + "Ocado Internet", + ] + # create custom taxonomy + store_levels = stores.custom_taxonomy( + stores.taxonomy( + store_coding, + store_lines, + ) + ) + + ### 4) Leaving all stores in ### + # subset to stores that are part of this analysis + # stores_sub = store_levels[store_levels["store_cat"].isin(keep_stores)] + + # stores that are not part of the analysis + stores_not_sub = store_levels[~store_levels["store_cat"].isin(keep_stores)] + + logging.info("merge purchase record with product info") + dat1 = pur_rec_vol.merge( + prod_table, left_on="Product Code", right_on="product_code" + ) + + # 2) Leaving drinks in + # logging.info("retain food only") + # dat1 = product.is_food(dat1).query("is_food == 1") + + logging.info("filter to kg only") + # filter to kg only + dat2_original = dat1.query("reported_volume == 'Kilos'") + + # For info - how many products are filtered? + logging.info( + f"% of sales volume filtered out: {1 - (dat2_original.Quantity * dat2_original.Volume * dat2_original['Gross Up Weight']).sum() / (dat1.Quantity * dat1.Volume * dat1['Gross Up Weight']).sum()}" + ) + # 57% filtered!! Is this right!? 
This will also filter out drinks as it filters out L + + # Let's include L and say 1L = 1kg + dat2 = dat1.query( + "(reported_volume == 'Kilos') | (reported_volume == 'Litres')" + ).copy() + + logging.info( + f"% of sales volume filtered out: {1 - (dat2.Quantity * dat2.Volume * dat2['Gross Up Weight']).sum() / (dat1.Quantity * dat1.Volume * dat1['Gross Up Weight']).sum()}" + ) + # 5% filtered + + logging.info("merge with store data and npm") + + # Merge with npm and store names (model data) + store_data = dat2.merge( + store_levels, left_on="Store Code", right_on="store_id" + ).merge( + npm[["purchase_id", "period", "npm_score", "kcal_per_100g"]], + left_on=["PurchaseId", "Period"], + right_on=["purchase_id", "period"], + ) + + store_data.rename(columns={"kcal_per_100g": "ed"}, inplace=True) + + # remove implausible values (not removing anymore but keeping into see how many are stripped) + store_data_ed = store_data[store_data["ed"] < 900].copy() + + logging.info( + f'% of sales volume filtered by ed < 900: {1 - (store_data_ed.Quantity * store_data_ed.Volume * store_data_ed["Gross Up Weight"]).sum() / (store_data.Quantity * store_data.Volume * store_data["Gross Up Weight"]).sum()}' + ) + + # Cutting out Jan-Mar data (1) + # Filter data from April 2021 onwards + store_data = store_data[ + pd.to_datetime(store_data["Purchase Date"], format="%d/%m/%Y") >= "01/04/2021" + ] + + # Local save store data at this point for crashes + store_data.to_csv( + PROJECT_DIR / "outputs/model_data_unfiltered_pre.csv", index=False + ) + + out = store_data.merge( + nut[["Purchase Number", "Purchase Period", "Energy KCal"]], + left_on=["PurchaseId", "Period"], + right_on=["Purchase Number", "Purchase Period"], + ) + + # # Merge with npm and store names (NOT model data) + # store_data_not_sub = dat2.merge( + # stores_not_sub, left_on="Store Code", right_on="store_id" + # ).merge( + # npm[["purchase_id", "period", "npm_score", "kcal_per_100g"]], + # left_on=["PurchaseId", "Period"], + # right_on=["purchase_id", "period"], + # ) + + # store_data_not_sub.rename(columns={"kcal_per_100g": "ed"}, inplace=True) + + # # remove implausible values + # store_data_not_sub = store_data_not_sub[store_data_not_sub["ed"] < 900].copy() + + # out_not_sub = store_data_not_sub.merge( + # nut[["Purchase Number", "Purchase Period", "Energy KCal"]], + # left_on=["PurchaseId", "Period"], + # right_on=["Purchase Number", "Purchase Period"], + # ) + + # upload_obj( + # out, + # BUCKET_NAME, + # "in_home/processed/targets/model_data.parquet", + # kwargs_writing={"compression": "zstd", "engine": "pyarrow"}, + # ) + + # upload_obj( + # out_not_sub, + # BUCKET_NAME, + # "in_home/processed/targets/excluded_data.parquet", + # kwargs_writing={"compression": "zstd", "engine": "pyarrow"}, + # ) + + # Local save + out.to_csv(PROJECT_DIR / "outputs/model_data_unfiltered.csv", index=False) diff --git a/ahl_targets/pipeline/simulation_npm_legacy.py b/ahl_targets/pipeline/simulation_npm_legacy.py new file mode 100644 index 0000000..e0ed8b4 --- /dev/null +++ b/ahl_targets/pipeline/simulation_npm_legacy.py @@ -0,0 +1,228 @@ +""" +######### +This script is now outdated. Please find updated npm model in `pipeline/2024_updates/ahl_targets/pipeline/2024_updates/simulation_npm.py`. 
+######### +""" + +import pandas as pd +import numpy as np +from nesta_ds_utils.loading_saving.S3 import upload_obj +from ahl_targets import BUCKET_NAME, PROJECT_DIR +from ahl_targets.utils import simulation_utils as su +from ahl_targets.getters import get_data +from ahl_targets.getters import simulated_outcomes as get_sim_data +import yaml + + +if __name__ == "__main__": + with open( + f"{PROJECT_DIR}/ahl_targets/config/npm_model.yaml", + "r", + ) as f: + modeling_params = yaml.safe_load(f) + + num_iterations = modeling_params["num_iterations"] + product_share_reform_values = modeling_params["product_share_reform_values"] + product_share_sales_values = modeling_params["product_share_sales_values"] + npm_reduction_values = modeling_params["npm_decrease_values"] + unhealthy_sales_change_values = modeling_params["unhealthy_sales_change_values"] + healthy_sales_change_values = modeling_params["healthy_sales_change_values"] + + # set seed for reproducibility + + np.random.seed(42) + + # read data + + store_data = get_data.model_data() + prod_table = get_data.product_metadata() + + store_weight_npm = su.weighted_npm(store_data) + store_weight_npm["prod_weight_g"] = store_weight_npm.pipe(su.prod_weight_g) + + # Print the coefficients table + coefficients_df = get_sim_data.coefficients_df() + + results = [] + results_data = [] + + # Nested loop to iterate through different values of product_share and ed_reduction + for product_share_reform in product_share_reform_values: + for product_share_sale in product_share_sales_values: + for npm_reduction in npm_reduction_values: + for sales_change_high in unhealthy_sales_change_values: + for sales_change_low in healthy_sales_change_values: + # Repeat the code num_iterations times + for _ in range(num_iterations[0]): + npm_cut = store_weight_npm["npm_score"] >= 4 + high_npm = store_weight_npm[npm_cut].copy() + low_npm = store_weight_npm[~npm_cut].copy() + + unique_products = pd.DataFrame( + store_weight_npm[(store_weight_npm["npm_score"] >= 4)][ + "product_code" + ].unique(), + columns=["product_code"], + ) + + unique_products["indicator_reform"] = np.random.choice( + [0, 1], + size=len(unique_products), + p=[1 - product_share_reform, product_share_reform], + ) + + high_npm = high_npm.merge( + unique_products, on="product_code", how="left" + ) + high_npm["indicator_sale"] = np.random.choice( + [0, 1], + size=len(high_npm), + p=[1 - product_share_sale, product_share_sale], + ) + high_npm["new_total_prod"] = np.where( + high_npm["indicator_sale"] == 1, + high_npm["total_prod"] * (1 - sales_change_high / 100), + high_npm["total_prod"], + ) + + low_npm["indicator_reform"] = 0 + low_npm["indicator_sale"] = np.random.choice( + [0, 1], + size=len(low_npm), + p=[1 - product_share_sale, product_share_sale], + ) + low_npm["new_total_prod"] = np.where( + low_npm["indicator_sale"] == 1, + low_npm["total_prod"] * (1 + sales_change_low / 100), + low_npm["total_prod"], + ) + + randomised = pd.concat( + [high_npm, low_npm], ignore_index=True + ) + randomised["new_total_kg"] = ( + randomised["new_total_prod"] + * randomised["prod_weight_g"] + / 1000 + ) + randomised["new_npm"] = randomised.pipe( + su.apply_reduction_npm, npm_reduction + ) + + randomised = randomised.merge( + prod_table[["rst_4_market_sector", "product_code"]], + on="product_code", + ).merge( + coefficients_df, on="rst_4_market_sector", how="left" + ) + + randomised["ed_pred"] = np.where( + npm_reduction > 0, + randomised["ed"] + - randomised["Coefficient"] * npm_reduction, + randomised["ed"], + ) + + 
randomised["new_ed"] = np.where( + randomised["indicator_reform"] == 1, + randomised["ed_pred"], + randomised["ed"], + ) + randomised["new_kcal_tot"] = ( + randomised["new_ed"] + / 100 + * randomised["prod_weight_g"] + * randomised["new_total_prod"] + ) + randomised["kcal_w_new"] = ( + randomised["new_kcal_tot"] + / randomised["new_kcal_tot"].sum() + ) + randomised["kg_w_new"] = ( + randomised["new_total_kg"] + / randomised["new_total_kg"].sum() + ) + + mean_npm_kg_new = ( + randomised["kg_w_new"] * randomised["new_npm"] + ).sum() + mean_npm_kcal_new = ( + randomised["kcal_w_new"] * randomised["new_npm"] + ).sum() + + mean_npm_kg_baseline = ( + randomised["kg_w"] * randomised["npm_score"] + ).sum() + mean_npm_kcal_baseline = ( + randomised["kcal_w"] * randomised["npm_score"] + ).sum() + + kcal_pp_baseline = ( + randomised["total_kcal"].sum() / 65121700 / 365 + ) + kcal_pp_new = ( + randomised["new_kcal_tot"].sum() / 65121700 / 365 + ) + + total_prod_baseline = randomised["total_prod"].sum() + total_prod_new = randomised["new_total_prod"].sum() + + spend_baseline = ( + ((randomised["total_prod"] * randomised["spend"]).sum()) + / 65121700 + / 52 + ) + spend_new = ( + ( + ( + randomised["new_total_prod"] + * randomised["spend"] + ).sum() + ) + / 65121700 + / 52 + ) + + # Append the results to the list + results.append( + { + "product_share_reform": product_share_reform, + "product_share_sale": product_share_sale, + "sales_change_high": sales_change_high, + "sales_change_low": sales_change_low, + "npm_reduction": npm_reduction, + "mean_npm_kg_new": mean_npm_kg_new, + "mean_npm_kcal_new": mean_npm_kcal_new, + "mean_npm_kg_baseline": mean_npm_kg_baseline, + "mean_npm_kcal_baseline": mean_npm_kcal_baseline, + "kcal_pp_baseline": kcal_pp_baseline, + "kcal_pp_new": kcal_pp_new, + "total_prod_baseline": total_prod_baseline, + "total_prod_new": total_prod_new, + "spend_baseline": spend_baseline, + "spend_new": spend_new, + } + ) + + results_data.append( + randomised.assign( + product_share_reform=product_share_reform, + product_share_sale=product_share_sale, + npm_reduction=npm_reduction, + sales_change_high=sales_change_high, + sales_change_low=sales_change_low, + ) + ) + + # Create the DataFrame from the list of results + results_df = pd.DataFrame(results) + + results_data_df = pd.concat(results_data, ignore_index=True) + + # upload to S3 + upload_obj( + results_df, + BUCKET_NAME, + "in_home/processed/targets/npm_agg.csv", + kwargs_writing={"index": False}, + ) diff --git a/ahl_targets/pipeline/simulation_npm_v2.py b/ahl_targets/pipeline/simulation_npm_v2.py new file mode 100644 index 0000000..a628d05 --- /dev/null +++ b/ahl_targets/pipeline/simulation_npm_v2.py @@ -0,0 +1,292 @@ +""" +This is a quick run of testing the predicted kcal per person per day reductions on the **adult population** only. +Approach: + +- Merge on the proportion of a household's intake that is consumed by adults (from functions pulled from diets work) to the input data. +- Otherwise leave the inputs and script unchanged. + +__________ +Results: +New baseline: 1787.8 +New adjusted: 1732.4 +Diff: 55.3 + +The old baseline was 1629.2 calories per person per day. + +Results v2: +This uses the input generated in `pipeline/model_data_diets.py` which should have the same assumptions as the diets work. +Baseline: 1549 + +How is it even lower! There seem to be some problems with the file, including a different set of column names coming out of the model_data.py script to those currently in the S3 bucket. 
Other issues are: +- The 'Period' and 'Purchase Date' columns do not match, so unsure when the date of purchases are + +__________ +Causes for potential concern: +- The new baseline is much lower than on diets (2393.3). This is surprising as it's not obvious where there would large differences between the purchase data. +- Have new data deliveries from Kantar meant that the Panel ID merge is no longer matching to the right people? (I don't think so) +- ... +""" + +import pandas as pd +import numpy as np +from nesta_ds_utils.loading_saving.S3 import upload_obj +from ahl_targets import BUCKET_NAME, PROJECT_DIR +from ahl_targets.utils import simulation_utils as su +from ahl_targets.getters import get_data +from ahl_targets.getters import simulated_outcomes as get_sim_data +import yaml + + +if __name__ == "__main__": + with open( + f"{PROJECT_DIR}/ahl_targets/config/npm_model.yaml", + "r", + ) as f: + modeling_params = yaml.safe_load(f) + + num_iterations = modeling_params["num_iterations"] + product_share_reform_values = modeling_params["product_share_reform_values"] + product_share_sales_values = modeling_params["product_share_sales_values"] + npm_reduction_values = modeling_params["npm_decrease_values"] + unhealthy_sales_change_values = modeling_params["unhealthy_sales_change_values"] + healthy_sales_change_values = modeling_params["healthy_sales_change_values"] + + # set seed for reproducibility + + np.random.seed(42) + + # read data + + # store_data = get_data.model_data() + + store_data = pd.read_csv( + PROJECT_DIR / "outputs/model_data_unfiltered.csv" + ) # The data with diets specs applied + + ########## v2 edit ########## + from ahl_targets.utils import diets + from ahl_targets.getters import get_data_v2 as g2 + + # Get adult intake + demog_df = g2.get_demographics_data() + adult_intake = diets.adult_intake(demog_df) + + # Get total prop intake for each household + adult_intake = adult_intake.groupby("Panel Id")["prop_intake"].sum().reset_index() + + # Merge adult intake to store_data + store_data = store_data.merge( + adult_intake[["Panel Id", "prop_intake"]], on="Panel Id", how="left" + ) + + ##Adjust kcal and volume values to reflect adult intake + + # Temp edit: Volume = volume up, is this right? 
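+    # prop_intake is the share of the household's recommended energy intake that is
+    # attributable to its adult members, so scaling kcal and volume by it restricts
+    # the baseline to adult consumption only.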
+ store_data["volume_up"] = store_data["Volume"].astype(float) + store_data["Energy KCal"] = store_data["Energy KCal"].astype(float) + + # Set original names to "old" + store_data = store_data.rename( + columns={"Energy KCal": "old_kcal", "volume_up": "old_volume_up"} + ) + + # Adjust kcal and volume by adult prop intake + store_data["Energy KCal"] = store_data["old_kcal"] * store_data["prop_intake"] + store_data["volume_up"] = store_data["old_volume_up"] * store_data["prop_intake"] + + ############################# + + prod_table = get_data.product_metadata() + + store_weight_npm = su.weighted_npm(store_data) + store_weight_npm["prod_weight_g"] = store_weight_npm.pipe(su.prod_weight_g) + + # Print the coefficients table + coefficients_df = get_sim_data.coefficients_df() + + results = [] + results_data = [] + + # Nested loop to iterate through different values of product_share and ed_reduction + for product_share_reform in product_share_reform_values: + for product_share_sale in product_share_sales_values: + for npm_reduction in npm_reduction_values: + for sales_change_high in unhealthy_sales_change_values: + for sales_change_low in healthy_sales_change_values: + # Repeat the code num_iterations times + for _ in range(num_iterations[0]): + npm_cut = store_weight_npm["npm_score"] >= 4 + high_npm = store_weight_npm[npm_cut].copy() + low_npm = store_weight_npm[~npm_cut].copy() + + unique_products = pd.DataFrame( + store_weight_npm[(store_weight_npm["npm_score"] >= 4)][ + "product_code" + ].unique(), + columns=["product_code"], + ) + + unique_products["indicator_reform"] = np.random.choice( + [0, 1], + size=len(unique_products), + p=[1 - product_share_reform, product_share_reform], + ) + + high_npm = high_npm.merge( + unique_products, on="product_code", how="left" + ) + high_npm["indicator_sale"] = np.random.choice( + [0, 1], + size=len(high_npm), + p=[1 - product_share_sale, product_share_sale], + ) + high_npm["new_total_prod"] = np.where( + high_npm["indicator_sale"] == 1, + high_npm["total_prod"] * (1 - sales_change_high / 100), + high_npm["total_prod"], + ) + + low_npm["indicator_reform"] = 0 + low_npm["indicator_sale"] = np.random.choice( + [0, 1], + size=len(low_npm), + p=[1 - product_share_sale, product_share_sale], + ) + low_npm["new_total_prod"] = np.where( + low_npm["indicator_sale"] == 1, + low_npm["total_prod"] * (1 + sales_change_low / 100), + low_npm["total_prod"], + ) + + randomised = pd.concat( + [high_npm, low_npm], ignore_index=True + ) + randomised["new_total_kg"] = ( + randomised["new_total_prod"] + * randomised["prod_weight_g"] + / 1000 + ) + randomised["new_npm"] = randomised.pipe( + su.apply_reduction_npm, npm_reduction + ) + + randomised = randomised.merge( + prod_table[["rst_4_market_sector", "product_code"]], + on="product_code", + ).merge( + coefficients_df, on="rst_4_market_sector", how="left" + ) + + randomised["ed_pred"] = np.where( + npm_reduction > 0, + randomised["ed"] + - randomised["Coefficient"] * npm_reduction, + randomised["ed"], + ) + + randomised["new_ed"] = np.where( + randomised["indicator_reform"] == 1, + randomised["ed_pred"], + randomised["ed"], + ) + randomised["new_kcal_tot"] = ( + randomised["new_ed"] + / 100 + * randomised["prod_weight_g"] + * randomised["new_total_prod"] + ) + randomised["kcal_w_new"] = ( + randomised["new_kcal_tot"] + / randomised["new_kcal_tot"].sum() + ) + randomised["kg_w_new"] = ( + randomised["new_total_kg"] + / randomised["new_total_kg"].sum() + ) + + mean_npm_kg_new = ( + randomised["kg_w_new"] * randomised["new_npm"] + 
).sum() + mean_npm_kcal_new = ( + randomised["kcal_w_new"] * randomised["new_npm"] + ).sum() + + mean_npm_kg_baseline = ( + randomised["kg_w"] * randomised["npm_score"] + ).sum() + mean_npm_kcal_baseline = ( + randomised["kcal_w"] * randomised["npm_score"] + ).sum() + + kcal_pp_baseline = ( + randomised["total_kcal"].sum() / 51718632 / 365 + ) + kcal_pp_new = ( + randomised["new_kcal_tot"].sum() / 51718632 / 365 + ) + + total_prod_baseline = randomised["total_prod"].sum() + total_prod_new = randomised["new_total_prod"].sum() + + spend_baseline = ( + ((randomised["total_prod"] * randomised["spend"]).sum()) + / 51718632 + / 52 + ) + spend_new = ( + ( + ( + randomised["new_total_prod"] + * randomised["spend"] + ).sum() + ) + / 51718632 + / 52 + ) + + # Append the results to the list + results.append( + { + "product_share_reform": product_share_reform, + "product_share_sale": product_share_sale, + "sales_change_high": sales_change_high, + "sales_change_low": sales_change_low, + "npm_reduction": npm_reduction, + "mean_npm_kg_new": mean_npm_kg_new, + "mean_npm_kcal_new": mean_npm_kcal_new, + "mean_npm_kg_baseline": mean_npm_kg_baseline, + "mean_npm_kcal_baseline": mean_npm_kcal_baseline, + "kcal_pp_baseline": kcal_pp_baseline, + "kcal_pp_new": kcal_pp_new, + "total_prod_baseline": total_prod_baseline, + "total_prod_new": total_prod_new, + "spend_baseline": spend_baseline, + "spend_new": spend_new, + } + ) + + results_data.append( + randomised.assign( + product_share_reform=product_share_reform, + product_share_sale=product_share_sale, + npm_reduction=npm_reduction, + sales_change_high=sales_change_high, + sales_change_low=sales_change_low, + ) + ) + + # Create the DataFrame from the list of results + results_df = pd.DataFrame(results) + + results_data_df = pd.concat(results_data, ignore_index=True) + + # upload to S3 + # upload_obj( + # results_df, + # BUCKET_NAME, + # "in_home/processed/targets/npm_agg.csv", + # kwargs_writing={"index": False}, + # ) + + # Local save + results_df.to_csv(PROJECT_DIR / "outputs/npm_sim_results.csv", index=False) diff --git a/ahl_targets/utils/diets.py b/ahl_targets/utils/diets.py new file mode 100644 index 0000000..96c4e6f --- /dev/null +++ b/ahl_targets/utils/diets.py @@ -0,0 +1,109 @@ +"""These are utils functions taken from the diets repo""" + +import pandas as pd +import numpy as np + +## Functions used to calculate the adult intake conversion factors + + +def ind_size_conv(pan_ind: pd.DataFrame) -> pd.DataFrame: + """ + Using the adult equivilent conversion factor for reccomended daily intake of kcal. + Creates prop_intake field for each individual, which represents the proportion of the household size + (using the adult equivilent conversion factor for reccomended daily intake of kcal) that the individual represents. 
+    Link: https://www.researchgate.net/figure/Adult-equivalent-conversion-factors-for-estimated-calorie-requirements-according-to-age_tbl1_49704894
+
+    Args:
+        pan_ind (pd.DataFrame): Pandas dataframe of household members
+    Returns:
+        pd.DataFrame: Individual-level table with each member's conversion factor and their prop_intake share of the household total
+    """
+    age_group = ["0-1", "1-3", "4-6", "7-10", "11-14", "15-18", "19-24", "25-50", "51+"]
+
+    d = {
+        "age_group": ["0-1", "1-3", "4-6", "7-10", "11-14", "15-18", "19-24", "25-50", "51+", "11-14", "15-18", "19-24", "25-50", "51+"],
+        "group": ["children", "children", "children", "children", "M", "M", "M", "M", "M", "F", "F", "F", "F", "F"],
+        "conversion": [0.29, 0.51, 0.71, 0.78, 0.98, 1.18, 1.14, 1.14, 0.90, 0.86, 0.86, 0.86, 0.86, 0.75],
+    }
+    conversion_table = pd.DataFrame(data=d)
+
+    bins = [0, 1, 4, 7, 11, 15, 19, 25, 51, 150]
+
+    pan_ind["age_group"] = pd.cut(
+        pan_ind["Age"], bins=bins, labels=age_group, right=False
+    )
+    pan_ind["group"] = np.where(pan_ind["Age"] < 11, "children", pan_ind["Gender"])
+    pan_ind_conv = pan_ind.merge(
+        conversion_table, how="left", on=["age_group", "group"]
+    ).copy()
+    hh_conv = (
+        pan_ind_conv.groupby(["Panel Id"])["conversion"]
+        .sum()
+        .reset_index(name="hh_conv")
+    )
+    # merge back to pan_ind_conv
+    pan_ind_conv = pan_ind_conv.merge(hh_conv, how="left", on="Panel Id")
+    pan_ind_conv["prop_intake"] = pan_ind_conv["conversion"] / pan_ind_conv["hh_conv"]
+    return pan_ind_conv
+
+
+def adult_intake(demog_df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Calculate the proportion of household energy (kcal) intake attributable to each individual aged 18 or over.
+
+    Args:
+        demog_df (pd.DataFrame): The demographic DataFrame of household members.
+
+    Returns:
+        pd.DataFrame: The individual table, restricted to adults, with each adult's prop_intake share.
+    """
+    ind_table_conv = ind_size_conv(demog_df)
+    # subset to >= 18
+    ind_table_conv_18 = ind_table_conv[ind_table_conv["Age"] >= 18].copy()
+    return ind_table_conv_18
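+
+
+if __name__ == "__main__":
+    # Illustrative usage sketch only, loosely mirroring how create_new_datafile.py
+    # applies these helpers. The toy demographics and purchase frames below are
+    # hypothetical; the real inputs come from get_data_v2.get_demographics_data()
+    # and the new model data file.
+    demog = pd.DataFrame(
+        {
+            "Panel Id": [1, 1, 1],
+            "Age": [42, 39, 8],
+            "Gender": ["M", "F", "F"],
+        }
+    )
+    purchases = pd.DataFrame({"Panel Id": [1, 1], "Energy KCal": [500.0, 300.0]})
+
+    # Adult share of the household's recommended intake, summed per household
+    hh_adult_share = (
+        adult_intake(demog).groupby("Panel Id")["prop_intake"].sum().reset_index()
+    )
+
+    # Merge onto purchases and scale kcal down to the adult-attributable portion
+    purchases = purchases.merge(hh_adult_share, on="Panel Id", how="left")
+    purchases["adult_kcal"] = purchases["Energy KCal"] * purchases["prop_intake"]
+    print(purchases)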