Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Enhance CONSTRUCTION_AGE_BAND, disallow unknowns in UPRN, and fix heating features #74

Merged
merged 4 commits into from
Jun 7, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 17 additions & 19 deletions asf_core_data/getters/epc/epc_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,7 @@ def get_cert_rec_files(data_path, dir_name, scotland_data=False):
"""

if str(data_path) == "S3":

if scotland_data:

directories = [
Path(f).name
for f in data_getters.get_s3_dir_files(path_to_dir=str(dir_name))
Expand Down Expand Up @@ -127,9 +125,7 @@ def load_scotland_data(
data_download.extract_data(data_path / RAW_SCOTLAND_DATA_ZIP)

if scot_usecols is not None:

if v2_batch:

for i, col in enumerate(scot_usecols):
if col in base_config.scotland_field_fix_dict.keys():
scot_usecols[i] = base_config.scotland_field_fix_dict[col]
Expand Down Expand Up @@ -166,7 +162,6 @@ def load_scotland_data(
scot_usecols.remove("ENERGY_TARIFF")

if "UPRN" in scot_usecols:

scot_usecols.remove("UPRN")
scot_usecols.append("Property_UPRN")

Expand Down Expand Up @@ -203,7 +198,6 @@ def load_scotland_data(
epc_certs["COUNTRY"] = "Scotland"

if v2_batch:

# clean_dict = {"m²": "m2", "£": "£", "": ""}
# for col in epc_certs.columns:
# for enc_issue in clean_dict.keys():
Expand Down Expand Up @@ -270,7 +264,6 @@ def load_england_wales_data(
add_country_f = True

if subset in [None, "GB", "all"]:

additional_samples = 0

# Splitting samples across nations
Expand Down Expand Up @@ -428,7 +421,6 @@ def load_raw_epc_data(

# Get Scotland data
if subset in ["Scotland", "GB"]:

# data_check = True if subset == "Scotland" else False

epc_scotland_df = load_scotland_data(
Expand Down Expand Up @@ -461,9 +453,7 @@ def load_raw_epc_data(

# Merge the two datasets for GB
elif subset == "GB":

for country in ["Wales", "England"]:

epc_df = load_england_wales_data(
data_path=data_path,
rel_data_path=wales_england_path,
Expand Down Expand Up @@ -675,7 +665,6 @@ def filter_by_year(

# If year is given for filtering
if year != "all" and year is not None:

if up_to:
epc_df = epc_df.loc[epc_df["INSPECTION_DATE"].dt.year <= year]
else:
Expand All @@ -685,14 +674,23 @@ def filter_by_year(
selection_dict = {"first entry": "first", "latest entry": "last"}

if selection in ["first entry", "latest entry"]:

epc_df = (
epc_df.sort_values("INSPECTION_DATE", ascending=True)
.drop_duplicates(
subset=[building_identifier], keep=selection_dict[selection]
)
.sort_index()
)
epc_df = epc_df.sort_values("INSPECTION_DATE", ascending=True)

# Dealing with EPC entries with missing UPRN
uprn_missing = epc_df[pd.isnull(epc_df["UPRN"])]
uprn_missing = uprn_missing.drop_duplicates(
subset=["ADDRESS1", "ADDRESS2", "POSTCODE"], keep=selection_dict[selection]
).sort_index()
sofiapinto marked this conversation as resolved.
Show resolved Hide resolved

# Dealing with EPC entries with known UPRN
epc_df = epc_df[~pd.isnull(epc_df["UPRN"])]
sofiapinto marked this conversation as resolved.
Show resolved Hide resolved
epc_df = epc_df.drop_duplicates(
subset=[building_identifier], keep=selection_dict[selection]
).sort_index()

# Concatenating datasets together and sorting by inspection_date again
epc_df = pd.concat([epc_df, uprn_missing])
epc_df = epc_df.sort_values("INSPECTION_DATE", ascending=True)

elif selection is None:
epc_df = epc_df
sofiapinto marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
25 changes: 17 additions & 8 deletions asf_core_data/pipeline/preprocessing/data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@
from asf_core_data import PROJECT_DIR, get_yaml_config, Path
from asf_core_data.config import base_config
from asf_core_data.pipeline.preprocessing import data_cleaning_utils
from asf_core_data.pipeline.preprocessing.feature_engineering import (
enhance_construction_age_band,
)

# ---------------------------------------------------------------------------------

Expand Down Expand Up @@ -267,7 +270,6 @@ def create_efficiency_mapping(efficiency_set, only_first=base_config.ONLY_FIRST_
efficiency_map = {}

for eff in efficiency_set:

# If efficiency is float (incl. NaN)
if isinstance(eff, float):
efficiency_map[eff] = np.nan
Expand Down Expand Up @@ -314,7 +316,6 @@ def clean_EFF_SCORES(df):
for feat in df.columns:
# If efficiency feature, get respective mapping
if feat.endswith("_EFF"):

df[feat] = df[feat].str.lower()
map_dict = create_efficiency_mapping(list(df[feat].unique()))

Expand Down Expand Up @@ -373,7 +374,6 @@ def standardise_dates(
]

for feature in date_features:

# Fix years starting with 00 -> 20..
df[feature] = (
df[feature].astype(str).str.replace(r"00(\d\d)", r"20\1", regex=True)
Expand Down Expand Up @@ -401,11 +401,13 @@ def standardise_unknowns(df):
pandas.DataFrame: Dataframe with cleaned up unknown values.
"""
for feat in df.columns:

if feat in data_cleaning_utils.numeric_features:
df[feat] = df[feat].replace(data_cleaning_utils.invalid_values, np.nan)
else:
df[feat] = df[feat].replace(data_cleaning_utils.invalid_values, "unknown")
if feat != "UPRN":
df[feat] = df[feat].replace(
data_cleaning_utils.invalid_values, "unknown"
)

return df

Expand All @@ -421,9 +423,7 @@ def standardise_features(df):
"""

for feat in df.columns:

if feat in data_cleaning_utils.features_to_standardise:

df[feat] = df[feat].str.strip()
feat_clean_dict = data_cleaning_utils.feature_cleaning_dict[feat]

Expand Down Expand Up @@ -486,7 +486,6 @@ def custom_clean_features(df, cap_features=False):
# [Additional cleaning functions here]

if cap_features:

cap_value_dict = {
"NUMBER_HABITABLE_ROOMS": 10,
"NUMBER_HEATED_ROOMS": 10,
Expand Down Expand Up @@ -518,6 +517,16 @@ def clean_epc_data(df):
df = standardise_dates(df)
df = custom_clean_features(df)

df["CONSTRUCTION_AGE_BAND"] = df.apply(
lambda x: enhance_construction_age_band(
x["CONSTRUCTION_AGE_BAND"],
x["TRANSACTION_TYPE"],
x["INSPECTION_DATE"],
x["COUNTRY"],
),
axis=1,
)

return df


Expand Down
Loading