65 reweighting Scottish EPC records #90

Open · wants to merge 16 commits into base: dev

Commits (16)
13b82be
add getters to load scottish census tenure and property type datasets…
crispy-wonton Sep 24, 2024
a489ed2
add new file paths to config for tenure and property type from scotti…
crispy-wonton Sep 24, 2024
3f57d9f
update property type category names in prepare_sample.py
crispy-wonton Sep 24, 2024
0ccc110
update functions in prepare_target.py to use EW and Scottish data for…
crispy-wonton Sep 24, 2024
5e2b02a
update run_compute_epc_weights to allow reweighting of Scotland with …
crispy-wonton Nov 28, 2024
7bc5030
Merge branch 'dev' into 65_scottish_reweighting
crispy-wonton Nov 28, 2024
8daf295
update tenure categories in target data to align with EPC in get_targ…
crispy-wonton Nov 28, 2024
20be010
fix bug in get_target.py
crispy-wonton Nov 29, 2024
ae39b1e
add TODO note to reweight_epc.py to prevent erroring out of pipeline …
crispy-wonton Nov 29, 2024
82861c7
Merge branch 'dev' into 65_scottish_reweighting
crispy-wonton Nov 29, 2024
a2026d7
remove epc use_cols from base config
crispy-wonton Nov 29, 2024
288ff37
remove unused property_type raw getter from get_datasets.py and base …
crispy-wonton Nov 29, 2024
12d3d4c
add LSOA to weights dict in run_compute_epc_weights.py to retain LSOA…
crispy-wonton Nov 29, 2024
c0a3264
add TODO note in get_target.py to add Scottish nrooms target data if …
crispy-wonton Dec 20, 2024
ad74000
add notation to get_target.py for clarity
crispy-wonton Dec 20, 2024
0de7a2c
update docstring at the top of run_compute_epc_weights.py to explain …
crispy-wonton Dec 20, 2024
22 changes: 2 additions & 20 deletions asf_heat_pump_suitability/config/base.yaml
@@ -2,8 +2,8 @@ data_source:
gb_ons_postcode_dir_url: "https://www.arcgis.com/sharing/rest/content/items/487a5ba62c8b4da08f01eb3c08e304f6/data" # Aug 2023 data
gb_ons_postcode_dir_file_path: "Data/ONSPD_AUG_2023_UK.csv" # Aug 2023 data
UK_ons_postcode_dir: "s3://asf-heat-pump-suitability/source_data/ONSPD_AUG_2023_UK.csv"
EW_census_housing_characteristics: "s3://asf-heat-pump-suitability/source_data/2021census_Mar2023update_housing_characteristics_E_W.xlsx" # 2021 census, Mar 2023 update
EW_census_tenure: "s3://asf-heat-pump-suitability/source_data/2021census_2023Mar_tenure_E_W.csv"
S_census_tenure: "s3://asf-heat-pump-suitability/source_data/2022_Scotlands_census_tenure_S.csv"
EW_census_number_of_rooms: "s3://asf-heat-pump-suitability/source_data/2021census_Mar2023_number_of_rooms_E_W.csv"
EW_census_number_of_households: "s3://asf-heat-pump-suitability/source_data/2021_vMar2023_census_numberofhouseholds_EW.csv"
EW_census_land_area: "s3://asf-heat-pump-suitability/source_data/2021_vMar2021_census_landareaKM_EW.csv"
@@ -12,6 +12,7 @@ data_source:
GB_ons_garden_space_access: "s3://asf-heat-pump-suitability/source_data/ONS_Apr2020_access_to_garden_space.xlsx"
GB_osopen_uprn_latlon: "s3://asf-heat-pump-suitability/source_data/osopenuprn_202405_csv.zip"
EW_census_accommodation_type: "s3://asf-heat-pump-suitability/source_data/2021census_Mar2023_accommodation_type_E_W.csv"
S_census_accommodation_type: "s3://asf-heat-pump-suitability/source_data/2022_Scotlands_census_accommodation_type_S.csv"
UK_ons_lad_bounds: "s3://asf-heat-pump-suitability/source_data/Local_Authority_Districts_December_2023_Boundaries_UK_BFE_-2600600853110041429/LAD_DEC_2023_UK_BFE.shp"
EW_inspire_land_extent_dir: "s3://asf-heat-pump-suitability/source_data/inspire_ew/"
S_inspire_land_extent_dir: "s3://asf-heat-pump-suitability/source_data/inspire_scotland/"
@@ -48,25 +49,6 @@ data_source:
EW_inspire_url: "https://use-land-property-data.service.gov.uk/datasets/inspire/download"
S_scottish_gov_DZ2011_boundaries: "s3://asf-heat-pump-suitability/source_data/2014_Scottish_Government_DataZoneBoundaries_2011_S/SG_DataZone_Bdry_2011.shp"
S_NRScotland_dwellings: "s3://asf-heat-pump-suitability/source_data/June2024_NRScotland_households_and_dwellings_S.xlsx"
usecols:
epc:
- COUNTRY
- ADDRESS1
- ADDRESS2
- POSTCODE
- CURRENT_ENERGY_EFFICIENCY
- CURRENT_ENERGY_RATING
- CURR_ENERGY_RATING_NUM
- ENERGY_RATING_CAT
- UPRN
- TENURE
- PROPERTY_TYPE
- BUILT_FORM
- CONSTRUCTION_AGE_BAND
- CO2_EMISSIONS_CURRENT
- NUMBER_HABITABLE_ROOMS
- HEATING_SYSTEM
- HEATING_FUEL
mapping:
build_year_pre_cols:
- BP_PRE_1900
212 changes: 122 additions & 90 deletions asf_heat_pump_suitability/getters/get_target.py
@@ -1,10 +1,10 @@
import polars as pl
import warnings
import polars.selectors as cs

from asf_heat_pump_suitability import config
from asf_heat_pump_suitability.getters import base_getters


# TODO will need to add number of rooms target data for Scotland if we revert to using it
def get_df_target_nrooms() -> pl.DataFrame:
"""
Get dataframe of counts of total number of rooms for properties in all LSOAs in England and Wales. Where number of rooms
@@ -34,10 +34,23 @@ def get_df_target_nrooms() -> pl.DataFrame:
return df


def get_df_target_property_type_uncensored() -> pl.DataFrame:
def transform_df_target_property_type() -> pl.DataFrame:
"""
Get dataframe of property type counts for all LSOAs in England and Wales. Dataframe has no censored values. Source:
census data 2021.
Load and transform property type counts per LSOA/data zone for England, Scotland, and Wales from census data.

Returns:
pl.DataFrame: property type counts for England, Scotland, and Wales per LSOA
"""
ew_df = load_transform_df_target_property_type_ew()
s_df = load_transform_df_target_property_type_scotland()
s_df = s_df.select(ew_df.columns)

return pl.concat([ew_df, s_df], how="vertical")


def load_transform_df_target_property_type_ew() -> pl.DataFrame:
"""
Get dataframe of property type counts for all LSOAs in England and Wales from census data.

Returns:
pl.Dataframe: counts of property type for all LSOAs in England and Wales
@@ -78,50 +91,130 @@ def get_df_target_property_type_uncensored() -> pl.DataFrame:
)
.rename(
{
"Detached": "Detached whole house or bungalow",
"Semi-detached": "Semi-detached whole house or bungalow",
"Terraced": "Terraced (including end-terrace) whole house or bungalow",
"Terraced": "Terraced (including end-terrace)",
"A caravan or other mobile or temporary structure": "Caravan or other mobile or temporary structure",
}
)
)

return df


def get_df_target_property_type(fill_censored: int = 1) -> pl.DataFrame:
def load_transform_df_target_property_type_scotland() -> pl.DataFrame:
"""
Get dataframe of property type counts for all LSOAs in England and Wales, and fill censored values (counts below 10)
with given constant. Source: census data 2021.

Args:
fill_censored (int): value to fill censored values with, [0-10]. Default 0.
Load and transform dataframe of property type counts for data zones in Scotland from census data.

Returns:
pl.Dataframe: counts of property type for all LSOAs in England and Wales
pl.Dataframe: counts of property type for all data zones in Scotland
"""
content = base_getters.get_content_from_s3_path(
config["data_source"]["EW_census_housing_characteristics"]
df = pl.read_csv(
config["data_source"]["S_census_accommodation_type"],
skip_rows=10,
columns=list(range(0, 11)),
infer_schema_length=10000,
)
df = pl.read_excel(content, sheet_name="2c", engine="calamine")

# Remove empty header rows
df = (
df.rename(df[2].to_dicts().pop())
.slice(
3,
df[1:]
.drop_nulls(subset=cs.numeric())
.drop(
[
"Whole house or bungalow: Total",
"Flat, maisonette or apartment: Total",
"All occupied households",
]
)
.drop(["Area Name"])
.rename({"Area Code": "lsoa"})
)
df = _fill_df_censored_values(df, fill_censored)
flats_cols = [col for col in df.columns if "Flat" in col]
df = (
df.with_columns(
pl.sum_horizontal(flats_cols).alias("Flat, maisonette or apartment")
)
.drop(flats_cols)
.rename(
{
col: col.replace("Whole house or bungalow: ", "")
for col in df.select(cs.numeric()).columns
}
)
.rename(
{"Type of accomodation": "lsoa"}
) # The Data Zone (lsoa) column name is mislabelled due to .csv formatting
)

# A small number of rows seem to erroneously have zero values for all property types, we need to remove them
df = df.filter(
pl.sum_horizontal(
[
"Detached",
"Semi-detached",
"Terraced (including end-terrace)",
"Caravan or other mobile or temporary structure",
"Flat, maisonette or apartment",
]
)
!= 0
)

return df


def get_df_target_tenure_uncensored() -> pl.DataFrame:
def transform_df_target_tenure() -> pl.DataFrame:
"""
Load and transform tenure type counts per LSOA/data zone for England, Scotland, and Wales from census data.

Returns:
pl.DataFrame: tenure type counts per LSOA/data zone for England, Scotland, and Wales
"""
ew_df = load_transform_df_target_tenure_ew()
s_df = load_transform_df_target_tenure_scotland()
s_df = s_df.select(ew_df.columns)

return pl.concat([ew_df, s_df], how="vertical")


def load_transform_df_target_tenure_scotland() -> pl.DataFrame:
"""
Load and transform tenure type counts per data zone in Scotland from census data.

Returns:
pl.DataFrame: tenure type counts per data zone in Scotland
"""
df = pl.read_csv(
config["data_source"]["S_census_tenure"],
skip_rows=10,
columns=list(range(1, 4)),
infer_schema_length=10000,
)
df = (
df.drop_nulls()
.rename({"Intermediate Zone - Data Zone 2011": "lsoa"})
.pivot("Household Tenure", index="lsoa", values="Count")
.drop([col for col in df.columns if "Total" in col])
)
private_rental = [col for col in df.columns if "Private" in col]
private_rental.extend(["Lives Rent Free"])
df = df.with_columns(
pl.sum_horizontal([col for col in df.columns if "Owned" in col]).alias(
"owner-occupied"
),
pl.sum_horizontal(private_rental).alias("rental (private)"),
pl.sum_horizontal([col for col in df.columns if "Social" in col]).alias(
"rental (social)"
),
)

# A small number of rows seem to erroneously have zero values for all tenure types, we need to remove them
df = df.filter(
pl.sum_horizontal(["owner-occupied", "rental (social)", "rental (private)"])
!= 0
)

return df.select(["lsoa", "owner-occupied", "rental (social)", "rental (private)"])


def load_transform_df_target_tenure_ew() -> pl.DataFrame:
"""
Get dataframe of tenure type counts for all LSOAs in England and Wales. Dataframe has no censored values. Source:
census data 2021.
Get dataframe of tenure type counts for all LSOAs in England and Wales from census data.

Returns:
pl.Dataframe: counts of tenure type for all LSOAs in England and Wales
@@ -168,44 +261,6 @@ def get_df_target_tenure_uncensored() -> pl.DataFrame:
return df


def get_df_target_tenure(fill_censored: int = 1) -> pl.DataFrame:
"""
Get dataframe of tenure type counts for all LSOAs in England and Wales, and fill censored values (counts below 10)
with given constant. Source: census data 2021.

Args:
fill_censored (int): value to fill censored values with, [0-10]. Default 0.

Returns:
pl.Dataframe: counts of tenure type for all LSOAs in England and Wales
"""
content = base_getters.get_content_from_path(
config["data_source"]["EW_census_housing_characteristics"]
)
df = pl.read_excel(content, sheet_name="3c", engine="calamine")

# Remove empty header rows
df = (
df.rename(df[2].to_dicts().pop())
.slice(
3,
)
.drop(["Area Name"])
.rename(
{
"Area Code": "lsoa",
"Owned or shared ownership": "owner-occupied",
"Social Rented": "rental (social)",
"Private Rented or lives rent free": "rental (private)",
}
)
)

df = _fill_df_censored_values(df, fill_censored)

return df


def get_df_target_build_year(
pre_cols: list = config["mapping"]["build_year_pre_cols"],
post_cols: list = config["mapping"]["build_year_post_cols"],
@@ -250,26 +305,3 @@ def get_df_target_build_year_la() -> pl.DataFrame:
df = df.select(["lsoa", "pre_1930", "post_1930", "unknown"])

return df


def _fill_df_censored_values(df: pl.DataFrame, val: int) -> pl.DataFrame:
"""
Fill censored values in a target dataframe with a given value.

Args:
df (pl.DataFrame): dataframe
val (int): value to fill censored values with, [0-10]

Returns:
pl.DataFrame: dataframe with filled values
"""
if not (0 <= val <= 10):
warnings.warn(
"Value to fill censored target data should be within range [0-10]. "
"Values outside this range may significantly change target proportions."
)
cols = df.columns
cols.remove("lsoa")
df = df.with_columns([pl.col(cols).str.replace("c", f"{val}").cast(pl.Int64)])

return df
@@ -71,21 +71,21 @@ def add_col_property_type(df: pl.DataFrame) -> pl.DataFrame:
pl.col("PROPERTY_TYPE").is_in(["House", "Bungalow"]),
pl.col("BUILT_FORM") == "Detached",
)
.then(pl.lit("Detached whole house or bungalow"))
.then(pl.lit("Detached"))
.when(
pl.col("PROPERTY_TYPE").is_in(["House", "Bungalow"]),
pl.col("BUILT_FORM") == "Semi-Detached",
)
.then(pl.lit("Semi-detached whole house or bungalow"))
.then(pl.lit("Semi-detached"))
.when(
pl.col("PROPERTY_TYPE").is_in(["House", "Bungalow"]),
pl.col("BUILT_FORM").is_in(terraced),
)
.then(pl.lit("Terraced (including end-terrace) whole house or bungalow"))
.then(pl.lit("Terraced (including end-terrace)"))
.when(pl.col("PROPERTY_TYPE").is_in(["Flat", "Maisonette"]))
.then(pl.lit("Flat, maisonette or apartment"))
.when(pl.col("PROPERTY_TYPE").is_in(["Park home"]))
.then(pl.lit("A caravan or other mobile or temporary structure"))
.then(pl.lit("Caravan or other mobile or temporary structure"))
.alias("property_type")
)

@@ -63,11 +63,9 @@ def get_dict_dfs_counts(
count_dict = {}

if "property_type" in features:
count_dict["property_type"] = (
get_target.get_df_target_property_type_uncensored()
)
count_dict["property_type"] = get_target.transform_df_target_property_type()
if "tenure" in features:
count_dict["tenure"] = get_target.get_df_target_tenure_uncensored()
count_dict["tenure"] = get_target.transform_df_target_tenure()
if "build_year" in features:
if not use_la_build_year:
count_dict["build_year"] = get_target.get_df_target_build_year()
@@ -88,6 +88,8 @@ def generate_balance_sample(
sample = sample.filter(~pl.col(feature).is_in(missing))
lost_rows = len_before - len(sample)

# TODO generating dummies will fail and cause pipeline error if all rows are removed from sample in code above
Collaborator:
What should be done about this? Raise an error when this occurs?

Collaborator Author:
See line below :)

# TODO we need to check if len(sample) > 0 and only proceed with remaining code if it is

I don't think it should error out.

Basically there is an edge case where all the rows from the EPC sample for a specific LSOA are removed in the preprocessing for weighting. The case I found was a Scottish Data Zone which had only 1 EPC record left in the sample by the time it got to this stage of the reweighting pipeline.

At the point in the code where I left these comments, the pipeline checks that the EPC subsample (in this case, 1 row) only contains categories that appear in the target (census) data for that LSOA. E.g. say the row has property type == flat, but the census data for this LSOA records a count of 0 for flats. The current pipeline will remove all rows with flats from the EPC subsample, because we can't reweight something to 0 (we are reweighting across multiple dimensions).

Therefore, in this edge case, all rows are removed.

The next step is to check which categories are present in the target but missing from the sample. E.g. the target data may show the LSOA has 500 detached houses. The pipeline then appends dummy rows with these missing categories to the subsample, e.g. it will try to add a dummy row with property type == detached house.

In this edge case, the error is actually caused by the dummy generation: the code attempts to append a dummy row onto a dataframe, but because the dataframe has no rows left, it fails.

Ultimately, we need to update the pipeline so that it recognises when this happens and skips reweighting that LSOA. If there are no rows, there is no data left to reweight anyway.
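To illustrate the check proposed in the TODO, here is a minimal sketch of how the pipeline could skip an LSOA whose sub-sample is empty before the dummy-generation step. The helper name sample_has_rows, the warning message, and the skip-rather-than-error behaviour are assumptions for illustration, not part of this PR.

import warnings

import polars as pl


def sample_has_rows(sample: pl.DataFrame, lsoa: str) -> bool:
    """Hypothetical guard: return True if any EPC rows remain for this LSOA after preprocessing."""
    if sample.is_empty():
        # Nothing left to reweight for this LSOA, so warn and let the caller skip it
        # instead of failing when dummy rows are appended to an empty dataframe.
        warnings.warn(f"No EPC rows left for {lsoa} after preprocessing; skipping reweighting.")
        return False
    return True

The caller would then only run generate_df_dummies and the subsequent reweighting when sample_has_rows(sample, lsoa) returns True, and move on to the next LSOA otherwise.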

# TODO we need to check if len(sample) > 0 and only proceed with remaining code if it is
# Add dummy rows for feature categories missing from sample but present in target
dummies = generate_df_dummies(lsoa_marginals=lsoa_marginals, sample=sample)
sample = pl.concat([sample, dummies[sample.columns]])