Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run npm model on adult intake #99

Draft
wants to merge 16 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions ahl_targets/analysis/change_checks_Oct24/compare_new_to_old_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""This file does two things:
- Merges the new values back to the original file to check that for each unique purchase (defined by purchase_id and period) the key values are the same
(ie merging + transfer from diets) has worked as expected

On investigation, I found that there were unexpected products in the new file that weren't in the old file.
These were products that were in categories that were intentionally added back in.
The majority of these products had missing NPM scores, so were likely excluded in the original analysis for this reason.
~100 did have NPM scores, and it's hard to tell why they weren't included. However, this is a negligible number accounting for <1 kcal per person per day, so I've removed them and saved an updated file.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from nesta_ds_utils.loading_saving.S3 import upload_obj
from ahl_targets import BUCKET_NAME, PROJECT_DIR
from ahl_targets.utils import simulation_utils as su
from ahl_targets.getters import get_data
from ahl_targets.getters import simulated_outcomes as get_sim_data
import yaml


from ahl_targets.utils import diets
from ahl_targets.getters import get_data_v2 as g2
import logging


if __name__ == "__main__":

    # Scaling constants used to convert total kcal into per-person-per-day figures.
    # NOTE(review): 51,718,632 is presumably the UK adult population — confirm source.
    adult_pop = 51718632
    no_days = 365

    # Read in data: the original model file and the new (v2) model file

    orig_data = get_data.model_data()
    df = g2.new_model_data()

    # Merge NPM score and energy density onto the new data, keyed on purchase_id + period
    npm = get_data.full_npm()

    df_npm = df.merge(
        npm[["purchase_id", "period", "npm_score", "kcal_per_100g"]],
        on=["purchase_id", "period"],
        how="left",
    )

    # Ensure unique product IDs: concatenate purchase_id and period in both frames
    df_npm["unique_id"] = df_npm["purchase_id"].astype(str) + df_npm["period"].astype(
        str
    )
    orig_data["unique_id"] = orig_data["purchase_id"].astype(str) + orig_data[
        "period"
    ].astype(str)

    # Flag each record in the new file with whether it appeared in the original analysis
    df_npm["is_in_old"] = df_npm["unique_id"].isin(orig_data["unique_id"])

    # Rename variables to the equivalent column names used in the old model
    df_npm = df_npm.rename(
        columns={
            "panel_id": "Panel Id",
            "gross_up_weight": "Gross Up Weight",
            "volume": "volume_up",
            "store_level_3": "store_cat",
            "energy_kcal": "Energy KCal",
            "quantity": "Quantity",
            "spend": "Spend",
        }
    )

    # Compare matched products: left-join the new values onto the original file so
    # per-product differences in ED, NPM and volume can be checked (all matched)

    store_data_comp = orig_data.merge(
        df_npm[["purchase_id", "period", "npm_score", "kcal_per_100g", "volume_up"]],
        on=["purchase_id", "period"],
        how="left",
        suffixes=("", "_new"),
    )

    # Difference columns: 0 (or NaN for unmatched rows) means old and new agree
    store_data_comp["npm_diff"] = (
        store_data_comp["npm_score"] - store_data_comp["npm_score_new"]
    )
    store_data_comp["volume_diff"] = (
        store_data_comp["volume_up"] - store_data_comp["volume_up_new"]
    )
    store_data_comp["ed_diff"] = (
        store_data_comp["ed"] - store_data_comp["kcal_per_100g"]
    )

    logging.info(
        f"Number of records with different NPM: {store_data_comp[store_data_comp['npm_diff'] != 0].shape[0]}"
    )
    logging.info(
        f"Number of records with different volume: {store_data_comp[store_data_comp['volume_diff'] != 0].shape[0]}"
    )
    logging.info(
        f"Number of records with different ed: {store_data_comp[store_data_comp['ed_diff'] != 0].shape[0]}"
    )

    # Remove products that weren't in the categories added back in and weren't in the old model

    # List of categories intentionally added to the original targets file
    to_keep = [
        "Cooking Oils",
        "Total Ice Cream",
        "Fresh Cream",
        "Lards+Compounds",
        "Vinegar",
        "Breakfast Cereals",
        "Defined Milk+Cream Prd(B)",
    ]

    # Filter to products that weren't in the old model AND aren't in the added-back categories
    added_new = df_npm[~df_npm["is_in_old"]]
    added_surprise = added_new[~added_new["rst_4_market"].isin(to_keep)]

    # The majority of these have missing NPM scores (likely why they were excluded originally)

    logging.info(f"Total surprise additions: {added_surprise.shape[0]}")
    logging.info(
        f"Number dropped due to missing NPM scores: {added_surprise['npm_score'].isna().sum()}"
    )

    # Overall kcal per person per day of the surprise additions is minimal

    logging.info(
        f"Total kcal per person per day of surprise additions: {added_surprise['energy_kcal_weighted'].sum()/adult_pop/no_days}"
    )

    # Therefore, just remove them and check the baseline effect
    logging.info("Saving list of products to remove")

    # NOTE(review): this filtered `added_new` is not used again in this script —
    # looks like leftover from an earlier check; confirm before removing.
    added_new = added_new[~added_new["unique_id"].isin(added_surprise["unique_id"])]

    # Persist the removal list to S3 (CSV, no index column)
    upload_obj(
        added_surprise,
        BUCKET_NAME,
        "in_home/processed/targets/oct_24_update/additions_to_remove.csv",
        kwargs_writing={"index": False},
    )
135 changes: 135 additions & 0 deletions ahl_targets/analysis/change_checks_Oct24/new_swa_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# ---
# jupyter:
# jupytext:
# cell_metadata_filter: -all
# comment_magics: true
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.16.2
# kernelspec:
# display_name: ahl_targets
# language: python
# name: python3
# ---

# +
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from nesta_ds_utils.loading_saving.S3 import upload_obj
from ahl_targets import BUCKET_NAME, PROJECT_DIR
from ahl_targets.utils import simulation_utils as su
from ahl_targets.getters import get_data
from ahl_targets.getters import simulated_outcomes as get_sim_data
import yaml


from ahl_targets.utils import diets
from ahl_targets.getters import get_data_v2 as g2
import logging

# -

# Categories that were intentionally added back into the targets file
to_keep = [
    "Cooking Oils",
    "Total Ice Cream",
    "Fresh Cream",
    "Lards+Compounds",
    "Vinegar",
    "Breakfast Cereals",
    "Defined Milk+Cream Prd(B)",
]

# Product lookup (product_code -> market/category columns) used by get_swas
prod_table = get_data.product_metadata()

# Read in data: aggregated frames, with and without the volume adjustment
agg_df = g2.get_agg_data()
agg_df_adj = g2.get_agg_data_vol_adjusted()


def get_swas(store_weight_npm, prod_metadata=None):
    """Compute sales-weighted-average (SWA) NPM scores at three levels.

    Args:
        store_weight_npm: DataFrame with at least the columns ``kg_w`` (volume
            weight, assumed to already sum to 1 across the whole frame),
            ``npm_score``, ``store_cat`` and ``product_code``.
        prod_metadata: Product lookup containing ``product_code``,
            ``rst_4_market`` and ``rst_4_extended``. Defaults to the
            module-level ``prod_table``.

    Returns:
        A 4-tuple of:
        - total SWA NPM across the whole frame (float),
        - SWA NPM per ``store_cat`` (Series indexed by store),
        - SWA NPM per ``rst_4_market`` (Series indexed by market),
        - the working DataFrame with the intermediate weight columns and the
          merged-on market columns.

    Note:
        The input frame is mutated in place (``swa_npm``, ``kg_w_store_cat``
        and ``swa_npm_store_cat`` columns are added) before the merge creates
        a new frame; callers should not rely on it staying unchanged.
    """
    # None sentinel: resolve the module-level default at call time rather than
    # freezing whatever `prod_table` held when the function was defined.
    if prod_metadata is None:
        prod_metadata = prod_table

    # Market-wide SWA NPM: kg_w is assumed to already be a share of total volume
    store_weight_npm["swa_npm"] = (
        store_weight_npm["kg_w"] * store_weight_npm["npm_score"]
    )
    total_swa_npm = store_weight_npm["swa_npm"].sum()

    # Re-normalise weights within each store, then sum weighted scores by store
    store_weight_npm["kg_w_store_cat"] = store_weight_npm[
        "kg_w"
    ] / store_weight_npm.groupby("store_cat")["kg_w"].transform("sum")
    store_weight_npm["swa_npm_store_cat"] = (
        store_weight_npm["kg_w_store_cat"] * store_weight_npm["npm_score"]
    )
    store_swa_npm = (
        store_weight_npm["swa_npm_store_cat"]
        .groupby(store_weight_npm["store_cat"])
        .sum()
    )

    # Attach market categories; merge returns a new frame, so from here on the
    # caller's original frame is no longer modified
    store_weight_npm = store_weight_npm.merge(
        prod_metadata[["product_code", "rst_4_market", "rst_4_extended"]],
        on=["product_code"],
        how="left",
    )

    # Re-normalise weights within each rst_4_market, then sum weighted scores
    store_weight_npm["kg_w_rst_4_market"] = store_weight_npm[
        "kg_w"
    ] / store_weight_npm.groupby("rst_4_market")["kg_w"].transform("sum")
    store_weight_npm["swa_npm_rst_4_market"] = (
        store_weight_npm["kg_w_rst_4_market"] * store_weight_npm["npm_score"]
    )
    rst_4_market_swa_npm = (
        store_weight_npm["swa_npm_rst_4_market"]
        .groupby(store_weight_npm["rst_4_market"])
        .sum()
    )

    return total_swa_npm, store_swa_npm, rst_4_market_swa_npm, store_weight_npm


# +
# Import original data
# This was used to check that the SWA function works as expected (produces the
# same results as the original data). The import is commented out to reduce
# memory usage in the notebook.
# Original results saved here: https://docs.google.com/spreadsheets/d/1ED3rxbJzZNi6lgRLUvwsL0NWz3FLSko4G6oJ2ohbSM0/edit?gid=0#gid=0)

# orig_data = get_data.model_data()

# store_weight_npm_orig = su.weighted_npm(orig_data)
# store_weight_npm_orig["prod_weight_g"] = store_weight_npm_orig.pipe(su.prod_weight_g)


# Check the SWA NPM for the original file
# total_swa_npm_orig, swa_by_store_orig, swa_by_market, agg_df_orig = get_swas(store_weight_npm_orig)

# +
# Compare SWA for the volume-adjusted and non-volume-adjusted data

total_swa, swa_by_store, swa_by_market, swa_all = get_swas(agg_df)
total_swa_adj, swa_by_store_adj, swa_by_market_adj, swa_all_adj = get_swas(agg_df_adj)

# +
# Show SWA NPM values for the volume-adjusted added-back categories

swa_by_market_adj = swa_by_market_adj.reset_index()

# `display` is an IPython/notebook builtin; this cell is meant to run via jupytext
display(swa_by_market_adj[swa_by_market_adj["rst_4_market"].isin(to_keep)])

print(f"Total SWA NPM for volume-adjusted data: {total_swa_adj}")

# +
# Count HFSS products, overall and excluding the added-back categories
# NOTE(review): HFSS is flagged as npm_score > 4; the UK NPM "less healthy"
# cut-off for food is a score of 4 or more — confirm whether this should be >= 4.
agg_df_adj["hfss"] = agg_df_adj["npm_score"] > 4

agg_df_adj["hfss"].value_counts()

agg_df_adj[~agg_df_adj["rst_4_market"].isin(to_keep)]["hfss"].value_counts()
Loading