Merge branch 'dev' into 84_bugfix_avg_garden

nestauk · Jan 7, 2025 · c349b1a · c349b1a
2 parents 35efc74 + a691bb6
commit c349b1a
Show file tree

Hide file tree

Showing 27 changed files with 690 additions and 649 deletions.
diff --git a/README.md b/README.md
@@ -61,7 +61,17 @@ loops (SGLs).
 This pipeline therefore computes a conventional score and a Nesta score for each of the four tech types listed: eight
 heat pump suitability scores are calculated in total per LSOA. Scores are first computed per property based on presence/
 absence of certain characteristics of the property/area using a simple additive model (see table below). Scores are then
-averaged per property before finally aggregating to LSOA level.
+averaged per property and weighted\* before finally aggregating to LSOA level. Note that a property must have at least 4
+of the features required to calculate heat pump suitability in order to be assigned a suitability score, and an LSOA must
+have data for at least 15 properties to be included in the final suitability per LSOA dataset.
+
+_\*Scores will only be weighted for an LSOA if the proportion of EPC properties in that LSOA that have a weight is above a
+specified threshold - the default threshold (and the threshold we have used for our published results) is 50%. Individual
+properties do not receive a weight if they are missing data required for weighting._
+
+_If the threshold is not met for a given LSOA, suitability scores for that LSOA will be unweighted and labelled as such.
+Unweighted scores may not accurately represent the suitability of an LSOA for a given heating technology as a whole and
+should therefore be interpreted with caution._
 
 |                                                                                   | ASHP (S) | ASHP (N) | GSHP (S) | GSHP (N) | SGL (S) | SGL (N) | HN (S) | HN (N) |
 | --------------------------------------------------------------------------------- | -------- | -------- | -------- | -------- | ------- | ------- | ------ | ------ |

diff --git a/asf_heat_pump_suitability/analysis/exploratory/aggregate_build_year_per_la.py b/asf_heat_pump_suitability/analysis/exploratory/aggregate_build_year_per_la.py
@@ -58,7 +58,7 @@
         la_build_year, how="left", on=["LAD23CD", "LAD23NM"]
     )
 
-    save_utils.save_parquet_to_s3(
+    save_utils.save_to_s3(
         df=la_build_year,
         path="s3://asf-heat-pump-suitability/source_data_minor_edits/2015cdrc_dwelling_ages_E_W_per_la_02.parquet",
     )
diff --git a/asf_heat_pump_suitability/analysis/exploratory/subset_epc_north_england.py b/asf_heat_pump_suitability/analysis/exploratory/subset_epc_north_england.py
@@ -58,4 +58,4 @@
 
     # Save
     save_as = "s3://asf-heat-pump-suitability/source_data_minor_edits/northern_england_epc_processed_dedupl-0.parquet"
-    save_utils.save_parquet_to_s3(epc_df_north, save_as)
+    save_utils.save_to_s3(epc_df_north, save_as)
diff --git a/asf_heat_pump_suitability/getters/get_datasets.py b/asf_heat_pump_suitability/getters/get_datasets.py
@@ -180,7 +180,7 @@ def load_gdf_welsh_gov_conservation_areas(**kwargs) -> gpd.GeoDataFrame:
 
 def get_df_ons_number_of_households() -> pl.DataFrame:
     """
-    Get raw ONS 'Number of households' dataset.
+    Get raw ONS 'Number of households' per LSOA for England and Wales.
 
     Returns:
         pl.DataFrame: raw ONS 'Number of households' dataset
@@ -201,7 +201,8 @@ def get_df_ons_number_of_households() -> pl.DataFrame:
 
 def get_df_ons_land_area() -> pl.DataFrame:
     """
-    Get raw ONS 'land area' dataset.
+    Get raw ONS 'land area' dataset. Contains Standard Area Measurements of ‘Land Area’ (Area to Mean High Water
+    Excluding Area of Inland Water) for England and Wales.
 
     Returns:
         pl.DataFrame: raw ONS 'land area' dataset
@@ -278,7 +279,7 @@ def load_gdf_scotgov_data_zone_bounds(**kwargs) -> gpd.GeoDataFrame:
         **kwargs for geopandas.read_file()
 
     Returns:
-        gpd.GeoDataFrame: boundary polygons and area data for 2011 Scottish Data Zones
+        gpd.GeoDataFrame: boundary polygons and standard area measurement data for 2011 Scottish Data Zones
     """
     return gpd.read_file(
         config["data_source"]["S_scottish_gov_DZ2011_boundaries"], **kwargs

diff --git a/asf_heat_pump_suitability/pipeline/README.md b/asf_heat_pump_suitability/pipeline/README.md
@@ -11,98 +11,46 @@ asf_heat_pump_suitability/pipeline
 ├───reweight_epc/
 │    Modules with functions to prepare and conduct reweighting with IPF
 ├───run_scripts/
-│    Scripts to weight EPC and add new features
+│    All run scripts to weight EPC, add new features, and calculate suitability
 ├───sampling/
 │    Scripts to generate samples of EPC data, e.g. for use in testing
 ├───suitability/
-│    Scripts to calculate heat pump suitability from enhanced EPC data
+│    Modules with functions to calculate heat pump suitability
 ```
 
 ## Run full pipeline to generate heat pump suitability scores
 
-To weight the EPC data, add the new features, and calculate heat pump suitability per property / LSOA, run the following
-files in order as shown below. For each script below, we list the specific inputs and outputs for the 2023 Q4 EPC
-dataset as examples, but arguments can be adjusted as required:
+To calculate heat pump suitability, you first need to produce the required inputs.
+To weight the EPC data, add new features, and estimate the garden size of properties in preparation for calculating suitability, you can run the
+following files in any order. All scripts take the preprocessed and deduplicated EPC dataset (output from `asf-daps`) in
+parquet file format as input via the `--epc` argument. Ensure you set the `--year` and `--quarter` arguments to correspond to those of the EPC
+dataset when running each script. See the script `.py` files for more detailed running instructions.
 
-1. `run_scripts/run_compute_epc_weights.py`
+E.g. to run the pipeline for 2023 Q4 EPC data, you would set the following args for each script in the table below:
 
-   **Purpose**: Add LSOA & MSOA data to each EPC row and weight each EPC property according to LSOA with Iterative Proportional Fitting.
+- `--epc s3://asf-daps/lakehouse/processed/epc/old/deduplicated/processed_dedupl-0.parquet`
+- `--year 2023`
+- `--quarter 4`
 
-   **Run**:
+|             Script             | Purpose                                                                                                                          | Inputs                                                                                                                                                                                                                                  | Output filename                                                                                        | Output description                                                                                                                                        |
+| :----------------------------: | :------------------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------- |
+|  `run_compute_epc_weights.py`  | Weight properties with Iterative Proportional Fitting per LSOA / Data Zone to reduce bias.                                       | EPC preprocessed and deduplicated parquet file from `asf-daps`.                                                                                                                                                                         | `[DATE]_[EPC_YEAR]_[EPC_Q]_EPC_weights.parquet`; `[DATE]_[EPC_YEAR]_[EPC_Q]_EPC_weights_stats.parquet` | Weighted EPC data and weighting run stats. Unweighted rows are not retained.                                                                              |
+|     `run_add_features.py`      | Add new features to the EPC dataset.                                                                                             | EPC preprocessed and deduplicated parquet file from `asf-daps`.                                                                                                                                                                         | `[DATE]_[EPC_YEAR]_[EPC_Q]_EPC_features.parquet`                                                       | Full preprocessed and deduplicated EPC dataset with all features added to each record where available.                                                    |
+| `run_calculate_garden_size.py` | Calculate estimated garden size for EPC UPRNs where available from INSPIRE land registry data and Microsoft building footprints. | EPC preprocessed and deduplicated parquet file from `asf-daps` and `inspire_file_bounds_[NATION(S)].geojson`. The `geojson` file contains the geospatial boundary polygons of the INSPIRE land extent files for the specified nation.\* | `[DATE]_[EPC_YEAR]_[EPC_Q]_EPC_garden_size_estimates_[NATION(S)].parquet`                              | EPC UPRNs with estimated garden sizes for the specified nation(s) (of England & Wales; Scotland; or all). UPRNs not matched to a garden are not retained. |
 
-   `python asf_heat_pump_suitability/pipeline/run_scripts/run_compute_epc_weights.py --epc_path s3://asf-daps/lakehouse/processed/epc/deduplicated/processed_dedupl-0.parquet -y 2023 -q 4`
+To calculate heat pump suitability per property / LSOA, you can then run the following file:
 
-   **Inputs**: preprocessed deduplicated EPC dataset
+|             Script             | Purpose                                                                                                                                                                                                                                                           | Inputs                                                   | Output filename                                                                                                                                                                                            | Output description                                                                                                       |
+| :----------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------------------------------------------------------------------------------------------------------- |
+| `run_calculate_suitability.py` | Calculate heat pump suitability of properties and LSOAs for four tech types (air-source heat pumps, ground-source heat pumps, shared ground loops, and heat networks) using conventional view criteria and Nesta view criteria, so 8 suitability scores in total. | EPC weights; EPC features; and EPC garden size estimates | `[DATE]_[EPC_YEAR]_[EPC_Q]_heat_pump_suitability_per_lsoa.parquet`; `[DATE]_[EPC_YEAR]_[EPC_Q]_heat_pump_suitability_per_lsoa.csv`; `[DATE]_[EPC_YEAR]_[EPC_Q]_heat_pump_suitability_per_property.parquet` | Heat pump suitability scores for four different tech types in Nesta and 'conventional' views per property, and per LSOA. |
 
-   **Outputs**:
+\*To produce the `inspire_file_bounds_[NATION(S)].geojson` file, run the files below in the given order. See the script `.py` files
+for more detailed running instructions.
 
-   - EPC dataset with weights: `s3://asf-heat-pump-suitability/outputs/2023Q4/20240824_2023_Q4_EPC_weighted.parquet`
-   - Processing time and number of rows lost per LSOA: `s3://asf-heat-pump-suitability/outputs/2023Q4/20240824_2023_Q4_EPC_weighted_stats.parquet`
-
-2. `run_scripts/run_add_features.py`
-
-   **Purpose**: Add new features to the EPC dataset:
-
-   - mean average garden size per MSOA
-   - lat/lon per UPRN
-   - property density per LSOA
-   - off gas properties by postcode
-   - listed building status per UPRN
-   - England and Wales building conservation area flag per UPRN
-
-   **Run**:
-
-   `python asf_heat_pump_suitability/pipeline/run_scripts/run_add_features.py --epc_path s3://asf-heat-pump-suitability/outputs/2023Q4/20240824_2023_Q4_EPC_weighted.parquet -y 2023 -q 4`
-
-   **Inputs**: EPC dataset with weights
-
-   **Outputs**: EPC dataset with weights and features: `s3://asf-heat-pump-suitability/outputs/2023Q4/20240827_2023_Q4_EPC_weighted_features.parquet`
-
-3. `run_scripts/run_calculate_garden_size.py`
-
-   **Purpose**: Calculate estimated garden size for EPC UPRNs where available from INSPIRE land registry data and Microsoft building
-   footprints.
-
-   **Run**:
-
-   `python asf_heat_pump_suitability/pipeline/run_scripts/run_calculate_garden_size.py --epc_path s3://asf-heat-pump-suitability/outputs/2023Q4/20240827_2023_Q4_EPC_weighted_features.parquet -y 2023 -q 4 --use_mapping s3://asf-heat-pump-suitability/source_data/2023_land_parcels_with_file_polygons.geojson`
-
-   **Inputs**: EPC dataset with weights and features
-
-   **Outputs**: estimated garden size for EPC UPRNs. NB: output contains only UPRNs matched to a garden.
-   `s3://asf-heat-pump-suitability/outputs/2023Q4/20240901_2023_Q4_EPC_garden_size_estimates_[01/02].parquet`
-
-4. `run_scripts/run_scripts/run_process_garden_size.py`
-
-   **Purpose**: Clean and process garden size estimate data and join to EPC data.
-
-   **Run**:
-   `python asf_heat_pump_suitability/pipeline/run_scripts/run_process_garden_size.py --epc_path s3://asf-heat-pump-suitability/outputs/2023Q4/20240827_2023_Q4_EPC_weighted_features.parquet --gardens_path s3://asf-heat-pump-suitability/outputs/2023Q4/20240904_2023_Q4_EPC_garden_size_estimates_complete.parquet`
-
-   **Inputs**:
-
-   - EPC dataset with weights and features
-   - Garden size estimates for EPC UPRNs
-
-     **Outputs**: EPC dataset with weights, features, and estimated garden size
-     `s3://asf-heat-pump-suitability/outputs/2023Q4/20240904_2023_Q4_EPC_weighted_features_gardens.parquet`
-
-5. `suitability/calculate_suitability.py`
-
-   **Purpose**: Calculate heat pump suitability of properties and LSOAs for four tech types (air-source heat pumps, ground-source heat
-   pumps, shared ground loops, and heat networks) using conventional view criteria and Nesta view criteria, so 8 suitability
-   scores in total.
-
-   **Run**:
-
-   `python asf_heat_pump_suitability/pipeline/suitability/calculate_suitability.py --epc_path s3://asf-heat-pump-suitability/outputs/2023Q4/20240904_2023_Q4_EPC_weighted_features_gardens.parquet`
-
-   **Inputs**: EPC dataset with weights and all features
-
-   **Outputs**:
-
-   - Heat pump suitability scores per EPC property: `s3://asf-heat-pump-suitability/outputs/2023Q4/20240830_2023_Q4_heat_pump_suitability_per_property.parquet`
-   - Heat pump suitability scores per LSOA: `s3://asf-heat-pump-suitability/outputs/2023Q4/20240830_2023_Q4_heat_pump_suitability_per_lsoa.parquet`
+1. `run_stream_inspire_files.py` - stream INSPIRE land registry files for Scotland from ROS webpage and/or INSPIRE files for England and Wales
+   from government website to S3 asf-heat-pump-suitability bucket. Files are unzipped during streaming and
+   saved to S3 in unzipped format.
+2. `run_get_inspire_file_bounds.py` - generate bounding polygons of each INSPIRE land registry file and save to S3.
 
 ## Get sample EPC datasets
 

diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/anchor_properties.py b/asf_heat_pump_suitability/pipeline/prepare_features/anchor_properties.py
@@ -1,6 +1,6 @@
 """
 Module for identifying and analyzing potential anchor properties in LSOAs.
-This script can be run independentally and will output a CSV file with a list of LSOAs, the number of anchor properties in each LSOA, and the categories of anchor properties present.
+This script can be run independently and will output a CSV file with a list of LSOAs, the number of anchor properties in each LSOA, and the categories of anchor properties present.
 """
 
 # TODO implement building footprint data for improved identification accuracy
@@ -10,6 +10,7 @@
 
 import geopandas as gpd
 import pandas as pd
+import polars as pl
 
 from asf_heat_pump_suitability import config
 from asf_heat_pump_suitability.getters import get_datasets
@@ -101,12 +102,12 @@ def load_gdf_and_process_poi() -> gpd.GeoDataFrame:
     return anchor_properties
 
 
-def identify_anchor_properties_gdf() -> gpd.GeoDataFrame:
+def identify_anchor_properties_df() -> pl.DataFrame:
     """
     Identify and analyze anchor properties within LSOAs/DataZones.
 
     Returns:
-        gpd.GeoDataFrame: Summary of anchor properties by LSOA/DataZone, containing columns:
+        pl.DataFrame: Summary of anchor properties by LSOA containing columns:
             - lsoa: Unique identifier for the LSOA/DataZone
             - lsoa_name: Name of the LSOA/DataZone
             - anchor_count: Number of anchor properties in the LSOA/DataZone
@@ -173,7 +174,7 @@ def identify_anchor_properties_gdf() -> gpd.GeoDataFrame:
             f"Found {lsoa_anchor_summary['has_anchor_property'].sum()} LSOAs with suitable anchor properties"
         )
 
-        return lsoa_anchor_summary
+        return pl.from_pandas(lsoa_anchor_summary)
 
     except Exception as e:
         logger.error(f"Error in anchor property analysis: {str(e)}")
@@ -182,10 +183,10 @@ def identify_anchor_properties_gdf() -> gpd.GeoDataFrame:
 
 if __name__ == "__main__":
     try:
-        results = identify_anchor_properties_gdf()
+        results = identify_anchor_properties_df()
 
         output_path = Path("outputs/reports/anchor_property_analysis.csv")
-        results.to_csv(output_path, index=False)
+        results.write_csv(output_path)
         logger.info(f"Results saved to {output_path}")
 
     except Exception as e:

diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/garden_size.py b/asf_heat_pump_suitability/pipeline/prepare_features/garden_size.py
@@ -1,6 +1,7 @@
 import geopandas as gpd
 import logging
 import pandas as pd
+import polars as pl
 
 
 def match_series_files_land_building(
@@ -126,3 +127,28 @@ def generate_gdf_garden_size(
     )
 
     return gardens_gdf
+
+
+def deduplicate_df_garden_size(df: pl.DataFrame) -> pl.DataFrame:
+    """
+    Deduplicate UPRNs matched to multiple gardens by taking the median size of the multiple gardens (after removing
+    gardens above the 97th percentile of garden area that are matched to duplicated UPRNs).
+
+    Args:
+        df (pl.DataFrame): UPRNs with garden size estimates
+
+    Returns:
+        pl.DataFrame: deduplicated UPRNs with garden size estimates
+    """
+    df = df.with_columns(pl.col("UPRN").is_duplicated().alias("UPRN_duplicated"))
+    # Remove gardens with area above the 97th percentile if they are matched to duplicate UPRNs
+    df = df.filter(
+        ~(
+            pl.col("UPRN_duplicated")
+            & (pl.col("garden_area_m2") > df["garden_area_m2"].quantile(quantile=0.97))
+        )
+    )
+    # Collapse to one row per UPRN using the median garden size (unchanged for UPRNs with a single garden)
+    df = df.group_by("UPRN").agg(pl.median("garden_area_m2"))
+
+    return df
diff --git a/asf_heat_pump_suitability/pipeline/prepare_features/grid_capacity.py b/asf_heat_pump_suitability/pipeline/prepare_features/grid_capacity.py
@@ -1,12 +1,12 @@
 import re
-import os
 from typing import Any
 import logging
 import argparse
 
 import numpy as np
 import pandas as pd
 import geopandas as gpd
+import polars as pl
 
 from asf_heat_pump_suitability.getters import get_datasets
 from asf_heat_pump_suitability import config
@@ -239,7 +239,7 @@ def assess_heatpump_suitability(
     return lsoa_data
 
 
-def calculate_grid_capacity() -> pd.DataFrame:
+def calculate_grid_capacity() -> pl.DataFrame:
     """
     Calculate the grid capacity for heat pump installations across all LSOAs.
 
@@ -290,7 +290,7 @@ def calculate_grid_capacity() -> pd.DataFrame:
     # Rename LSOA column for consistency
     result = result.rename(columns={"LSOA21CD": "lsoa"})
 
-    return result
+    return pl.from_pandas(result)
 
 
 def parse_arguments() -> argparse.Namespace:
@@ -325,4 +325,4 @@ def parse_arguments() -> argparse.Namespace:
     )
 
     if args.save_as:
-        grid_capacity_results.to_csv(args.save_as, index=False)
+        grid_capacity_results.write_csv(args.save_as)