From 5284022305ee0f8c30404bb9f3bd50c4c63237f6 Mon Sep 17 00:00:00 2001 From: lkstrp Date: Mon, 10 Jun 2024 09:46:36 +0200 Subject: [PATCH 1/7] fix: resolve auto downcasting warning --- powerplantmatching/cleaning.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/powerplantmatching/cleaning.py b/powerplantmatching/cleaning.py index 88b2c7a5..2e6e7472 100644 --- a/powerplantmatching/cleaning.py +++ b/powerplantmatching/cleaning.py @@ -17,6 +17,7 @@ """ Functions for vertically cleaning a dataset. """ + from __future__ import absolute_import, print_function import logging @@ -449,7 +450,10 @@ def aggregate_units( df = cliques(df, duplicates) df = df.groupby("grouped").agg(props_for_groups) - df[str_cols] = df[str_cols].replace("", np.nan) + + # Downcasting in replace is deprecated + with pd.option_context("future.no_silent_downcasting", True): + df[str_cols] = df[str_cols].replace("", np.nan).infer_objects(copy=False) df = ( df.assign( From 826f444de77cc8f45be9896fa030d5f0b8cec1ed Mon Sep 17 00:00:00 2001 From: lkstrp Date: Mon, 10 Jun 2024 15:14:37 +0200 Subject: [PATCH 2/7] feat: add MaStR data --- powerplantmatching/data.py | 96 +++++++++++++++++++++ powerplantmatching/package_data/config.yaml | 16 +++- 2 files changed, 109 insertions(+), 3 deletions(-) diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py index a4c18869..ebaaa613 100644 --- a/powerplantmatching/data.py +++ b/powerplantmatching/data.py @@ -2101,6 +2101,102 @@ def GEM(raw=False, update=False, config=None): return pd.concat(data, ignore_index=True) +def MASTR( + raw=False, + update=False, + config=None, +): + """ + Get the Marktstammdatenregister (MaStR) dataset. + + Provided by the German Federal Network Agency (Bundesnetzagentur / BNetzA) and + contains data on Germany, Austria and Switzerland. 
+ + Parameters + ---------- + raw : Boolean, default False + Whether to return the original dataset + update: bool, default False + Whether to update the data from the url. + config : dict, default None + Add custom specific configuration, + e.g. powerplantmatching.config.get_config(target_countries='Italy'), + defaults to powerplantmatching.config.get_config() + + """ + config = get_config() if config is None else config + + fn = get_raw_file("MASTR", update=update, config=config) + + file_suffixes = { + "Bioenergy": "biomass.csv", + "Combustion": "combustion.csv", + "Nuclear": "nuclear.csv", + "Hydro": "hydro.csv", + # "Wind": "wind.csv", # TODO: Needs performance discussion + # "Solar": "solar.csv", + } + df = pd.DataFrame() + with ZipFile(fn, "r") as file: + for fueltype, suffix in file_suffixes.items(): + for name in file.namelist(): + if name.endswith(suffix): + df = pd.concat( + [ + df, + pd.read_csv(file.open(name), low_memory=False).assign( + Filesuffix=fueltype + ), + ] + ) + break + df = df.reset_index(drop=True) + + if raw: + return df + + RENAME_COLUMNS = { + "EinheitMastrNummer": "projectID", + "NameKraftwerk": "Name", + "Land": "Country", + "Nettonennleistung": "Capacity", + "Inbetriebnahmedatum": "DateIn", + "DatumEndgueltigeStilllegung": "DateOut", + "Laengengrad": "lon", + "Breitengrad": "lat", + } + COUNTRY_MAP = { + "Deutschland": "Germany", + "Österreich": "Austria", + "Schweiz": "Switzerland", + } + + df = ( + df.drop(columns=["Name"]) + .rename(columns=RENAME_COLUMNS) + .assign( + projectID=lambda df: "MASTR-" + df.projectID, + Country=lambda df: df.Country.map(COUNTRY_MAP), + Capacity=lambda df: df.Capacity / 1e3, # kW to MW + DateIn=lambda df: pd.to_datetime(df.DateIn).dt.year, + DateOut=lambda df: pd.to_datetime(df.DateOut).dt.year, + Technology=np.nan, + Set="PP", + ) + .loc[lambda df: df.Capacity > 1] # TODO: Needs performance discussion + .pipe( + gather_specifications, + config=config, + parse_columns=["Filesuffix", "Energietraeger"], 
+ ) + .pipe(clean_name) + .pipe(set_column_name, "MASTR") + .pipe(config_filter, config) + ) + + return df + + # deprecated alias for GGPT @deprecated( deprecated_in="0.5.5", diff --git a/powerplantmatching/package_data/config.yaml b/powerplantmatching/package_data/config.yaml index 80118d7b..e9f58210 100644 --- a/powerplantmatching/package_data/config.yaml +++ b/powerplantmatching/package_data/config.yaml @@ -24,6 +24,7 @@ matching_sources: - BEYONDCOAL - WIKIPEDIA - GEM + - MASTR # fully_included_sources, these sources are included even without match to the final dataset fully_included_sources: @@ -195,6 +196,14 @@ GHPT: reliability_score: 4 fn: Global-Hydropower-Tracker-May-2023.csv url: https://raw.githubusercontent.com/pz-max/gem-powerplant-data/main/Global-Hydropower-Tracker-latest.csv + +MASTR: + net_capacity: true + reliability_score: 8 + fn: bnetza_open_mastr_2023-08-08_B.zip + url: https://zenodo.org/records/8225106/files/bnetza_open_mastr_2023-08-08_B.zip + + # ---------------------------------------------------------------------------- # # Data Structure Config # # ---------------------------------------------------------------------------- # @@ -273,6 +282,7 @@ target_fueltypes: combined cycle, fossil gas, mixed fossil fuels, + erdgas ] Hydro: [ @@ -285,9 +295,9 @@ target_fueltypes: hydroelectric, wasserkraft, ] - Hard Coal: [coal, coke] - Lignite: [brown coal, lignite, peat] - Oil: [oil, diesel] + Hard Coal: [coal, coke, steinkohle] + Lignite: [brown coal, lignite, peat, braunkohle] + Oil: [oil, diesel, mineralölprodukte] Geothermal: "" Solar: "" Waste: "" From ea76ab13db72b10d9b765722ee3316dcb6941d9a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Jan 2025 15:07:55 +0000 Subject: [PATCH 3/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- powerplantmatching/cleaning.py | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/powerplantmatching/cleaning.py b/powerplantmatching/cleaning.py index d7bf4d41..3fbb3c5f 100644 --- a/powerplantmatching/cleaning.py +++ b/powerplantmatching/cleaning.py @@ -17,8 +17,6 @@ Functions for vertically cleaning a dataset. """ -from __future__ import absolute_import, print_function - import logging import networkx as nx From fd7cc3b27ab5db90ecfb5b46751545ec2e74c41a Mon Sep 17 00:00:00 2001 From: Fabian Neumann Date: Fri, 3 Jan 2025 10:27:06 +0100 Subject: [PATCH 4/7] improve data cleaning and performance --- powerplantmatching/data.py | 91 +++++++++++++-------- powerplantmatching/package_data/config.yaml | 26 +++--- 2 files changed, 73 insertions(+), 44 deletions(-) diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py index ea75518a..77eb8259 100644 --- a/powerplantmatching/data.py +++ b/powerplantmatching/data.py @@ -2168,68 +2168,93 @@ def MASTR( """ config = get_config() if config is None else config - fn = get_raw_file("MASTR", update=update, config=config) + RENAME_COLUMNS = { + "EinheitMastrNummer": "projectID", + "NameKraftwerk": "Name", + "Land": "Country", + "Nettonennleistung": "Capacity", + "Inbetriebnahmedatum": "DateIn", + "DatumEndgueltigeStilllegung": "DateOut", + "EinheitBetriebsstatus": "Status", + "Laengengrad": "lon", + "Breitengrad": "lat", + } + COUNTRY_MAP = { + "Deutschland": "Germany", + "Österreich": "Austria", + "Schweiz": "Switzerland", + } + PARSE_COLUMNS = [ + "ArtDerWasserkraftanlage", + "Biomasseart", + "Filesuffix", + "Energietraeger", + "Hauptbrennstoff", + "NameStromerzeugungseinheit", + ] + fn = get_raw_file("MASTR", update=update, config=config) file_suffixes = { "Bioenergy": "biomass.csv", "Combustion": "combustion.csv", "Nuclear": "nuclear.csv", "Hydro": "hydro.csv", - # "Wind": "wind.csv", # TODO: Needs performance discussion - # "Solar": "solar.csv", + "Wind": "wind.csv", + "Solar": "solar.csv", } - df = pd.DataFrame() + data_frames = [] with ZipFile(fn, "r") as file: for fueltype, suffix in 
file_suffixes.items(): for name in file.namelist(): if name.endswith(suffix): - df = pd.concat( - [ - df, - pd.read_csv(file.open(name), low_memory=False).assign( - Filesuffix=fueltype - ), - ] + available_columns = pd.read_csv(file.open(name), nrows=0).columns + target_columns = [ + "GeplantesInbetriebnahmedatum", + "ThermischeNutzleistung", + "KwkMastrNummer", + ] + target_columns = ( + target_columns + + PARSE_COLUMNS + + list(RENAME_COLUMNS.keys()) ) + usecols = available_columns.intersection(target_columns) + df = pd.read_csv(file.open(name), usecols=usecols).assign(Filesuffix=fueltype) + data_frames.append(df) break - df = df.reset_index(drop=True) + df = pd.concat(data_frames).reset_index(drop=True) if raw: return df - RENAME_COLUMNS = { - "EinheitMastrNummer": "projectID", - "NameKraftwerk": "Name", - "Land": "Country", - "Nettonennleistung": "Capacity", - "Inbetriebnahmedatum": "DateIn", - "DatumEndgueltigeStilllegung": "DateOut", - "Laengengrad": "lon", - "Breitengrad": "lat", - } - COUNTRY_MAP = { - "Deutschland": "Germany", - "Österreich": "Austria", - "Schweiz": "Switzerland", - } + status_list = config["MASTR"].get("status", ["In Betrieb"]) # noqa: F841 + capacity_threshold_kw = 50 df = ( - df.drop(columns=["Name"]) - .rename(columns=RENAME_COLUMNS) + raw.rename(columns=RENAME_COLUMNS) + .query("Status in @status_list") + .loc[lambda df: df.Capacity > capacity_threshold_kw] .assign( projectID=lambda df: "MASTR-" + df.projectID, Country=lambda df: df.Country.map(COUNTRY_MAP), Capacity=lambda df: df.Capacity / 1e3, # kW to MW DateIn=lambda df: pd.to_datetime(df.DateIn).dt.year, DateOut=lambda df: pd.to_datetime(df.DateOut).dt.year, - Technology=np.nan, - Set="PP", ) - .loc[lambda df: df.Capacity > 1] # TODO: Needs performance discussion + .assign( + DateIn=lambda df: df["DateIn"].combine_first( + pd.to_datetime(df["GeplantesInbetriebnahmedatum"]).dt.year + ), + ) .pipe( gather_specifications, config=config, - parse_columns=["Filesuffix", 
"Energietraeger"], + parse_columns=PARSE_COLUMNS, + ) + .assign( + Set=lambda df: df["Set"].where( + df["KwkMastrNummer"].isna() & df["ThermischeNutzleistung"].isna(), "CHP" + ), ) .pipe(clean_name) .pipe(set_column_name, "MASTR") diff --git a/powerplantmatching/package_data/config.yaml b/powerplantmatching/package_data/config.yaml index e8e18036..cbaeb634 100644 --- a/powerplantmatching/package_data/config.yaml +++ b/powerplantmatching/package_data/config.yaml @@ -35,7 +35,8 @@ fully_included_sources: - JRC: Country not in ['Switzerland', 'Albania', 'United Kingdom', 'Norway'] - OPSD: Country not in ['Switzerland', 'Italy', 'Spain', 'Norway', 'Austria'] - BEYONDCOAL - - GEM + - GEM: Country != 'Germany' + - MASTR parallel_duke_processes: false @@ -207,6 +208,7 @@ GHPT: MASTR: net_capacity: true reliability_score: 8 + status: ["In Betrieb", "In Planung", "Endgültig stillgelegt"] fn: bnetza_open_mastr_2023-08-08_B.zip url: https://zenodo.org/records/8225106/files/bnetza_open_mastr_2023-08-08_B.zip @@ -277,8 +279,8 @@ target_fueltypes: # given by the list. An empty string results in a regex expression containing only the key. # Parsed of representatives at the top may be overwritten by representatives further below. 
Other: ".*" - Solid Biomass: [biological, bioenergy, agricultural, wood, biomass] - Biogas: [biogas] + Solid Biomass: [biological, bioenergy, agricultural, wood, biomass, feste biomasse] + Biogas: [biogas, biomethan, gasförmige biomasse] Nuclear: [nuclear] Natural Gas: [ @@ -290,7 +292,8 @@ target_fueltypes: combined cycle, fossil gas, mixed fossil fuels, - erdgas + erdgas, + andere gase, ] Hydro: [ @@ -302,13 +305,14 @@ target_fueltypes: hydro, hydroelectric, wasserkraft, + wasser, ] Hard Coal: [coal, coke, steinkohle] Lignite: [brown coal, lignite, peat, braunkohle] Oil: [oil, diesel, mineralölprodukte] Geothermal: "" Solar: "" - Waste: "" + Waste: ["abfall.*", "waste"] Wind: "" Battery: [Electro-chemical, battery] target_sets: @@ -337,12 +341,12 @@ target_technologies: # A list will be converted to a regex expression matching all words (case-insensitive) # given by the list. An empty string results in a regex expression containing only the key. # Parsed of representatives at the top may be overwritten by representatives further below. 
- CCGT: [ccgt, gas, natural gas] - OCGT: [ocgt] - Steam Turbine: [steam, turbine] - Combustion Engine: [combustion engine] - Run-Of-River: [run-off, run off, run of river, run-of-river, ror] - Pumped Storage: [pumped hydro, pumped] + CCGT: [ccgt, gas, natural gas, gasturbinen mit abhitzekessel] + OCGT: [ocgt, gasturbinen ohne abhitzekessel] + Steam Turbine: [steam, turbine, kondensationsmaschine, gegendruckmaschine, dampfmotor] + Combustion Engine: [combustion engine, verbrennungsmotor, stirlingmotor] + Run-Of-River: [run-off, run off, run of river, run-of-river, ror, laufwasseranlage] + Pumped Storage: [pumped hydro, pumped, speicherwasseranlage] Reservoir: "" Marine: "" Onshore: "" From 50824bab62fbcaefd6d9055659f2a41aa2b90a51 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Jan 2025 09:27:19 +0000 Subject: [PATCH 5/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- powerplantmatching/data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py index 77eb8259..23816d65 100644 --- a/powerplantmatching/data.py +++ b/powerplantmatching/data.py @@ -2214,12 +2214,12 @@ def MASTR( "KwkMastrNummer", ] target_columns = ( - target_columns - + PARSE_COLUMNS - + list(RENAME_COLUMNS.keys()) + target_columns + PARSE_COLUMNS + list(RENAME_COLUMNS.keys()) ) usecols = available_columns.intersection(target_columns) - df = pd.read_csv(file.open(name), usecols=usecols).assign(Filesuffix=fueltype) + df = pd.read_csv(file.open(name), usecols=usecols).assign( + Filesuffix=fueltype + ) data_frames.append(df) break df = pd.concat(data_frames).reset_index(drop=True) From 34c408af11a3107763dd7f298cd0b9012166a6e9 Mon Sep 17 00:00:00 2001 From: Fabian Neumann Date: Fri, 3 Jan 2025 10:45:24 +0100 Subject: [PATCH 6/7] fix typo --- powerplantmatching/data.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py index 23816d65..ea210f35 100644 --- a/powerplantmatching/data.py +++ b/powerplantmatching/data.py @@ -2231,7 +2231,7 @@ def MASTR( capacity_threshold_kw = 50 df = ( - raw.rename(columns=RENAME_COLUMNS) + df.rename(columns=RENAME_COLUMNS) .query("Status in @status_list") .loc[lambda df: df.Capacity > capacity_threshold_kw] .assign( From 6282178ec21b532f59fa3c3eff6c8b813313ee79 Mon Sep 17 00:00:00 2001 From: Fabian Neumann Date: Fri, 3 Jan 2025 14:23:58 +0100 Subject: [PATCH 7/7] reset threshold to 1 MW for now --- powerplantmatching/data.py | 2 +- powerplantmatching/package_data/config.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py index ea210f35..9f1d231e 100644 --- a/powerplantmatching/data.py +++ b/powerplantmatching/data.py @@ -2228,7 +2228,7 @@ def MASTR( return df status_list = config["MASTR"].get("status", ["In Betrieb"]) # noqa: F841 - capacity_threshold_kw = 50 + capacity_threshold_kw = 1000 df = ( df.rename(columns=RENAME_COLUMNS) diff --git a/powerplantmatching/package_data/config.yaml b/powerplantmatching/package_data/config.yaml index cbaeb634..de55eea6 100644 --- a/powerplantmatching/package_data/config.yaml +++ b/powerplantmatching/package_data/config.yaml @@ -35,7 +35,7 @@ fully_included_sources: - JRC: Country not in ['Switzerland', 'Albania', 'United Kingdom', 'Norway'] - OPSD: Country not in ['Switzerland', 'Italy', 'Spain', 'Norway', 'Austria'] - BEYONDCOAL - - GEM: Country != 'Germany' + - GEM: Country != 'Germany' or Fueltype == 'Solar' - MASTR