From 5284022305ee0f8c30404bb9f3bd50c4c63237f6 Mon Sep 17 00:00:00 2001
From: lkstrp <lkstrp@pm.me>
Date: Mon, 10 Jun 2024 09:46:36 +0200
Subject: [PATCH 1/7] fix: resolve auto downcasting warning

---
 powerplantmatching/cleaning.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/powerplantmatching/cleaning.py b/powerplantmatching/cleaning.py
index 88b2c7a5..2e6e7472 100644
--- a/powerplantmatching/cleaning.py
+++ b/powerplantmatching/cleaning.py
@@ -17,6 +17,7 @@
 """
 Functions for vertically cleaning a dataset.
 """
+
 from __future__ import absolute_import, print_function
 
 import logging
@@ -449,7 +450,10 @@ def aggregate_units(
 
     df = cliques(df, duplicates)
     df = df.groupby("grouped").agg(props_for_groups)
-    df[str_cols] = df[str_cols].replace("", np.nan)
+
+    # Downcasting in replace is deprecated
+    with pd.option_context("future.no_silent_downcasting", True):
+        df[str_cols] = df[str_cols].replace("", np.nan).infer_objects(copy=False)
 
     df = (
         df.assign(

From 826f444de77cc8f45be9896fa030d5f0b8cec1ed Mon Sep 17 00:00:00 2001
From: lkstrp <lkstrp@pm.me>
Date: Mon, 10 Jun 2024 15:14:37 +0200
Subject: [PATCH 2/7] feat: add MaStR data

---
 powerplantmatching/data.py                  | 96 +++++++++++++++++++++
 powerplantmatching/package_data/config.yaml | 16 +++-
 2 files changed, 109 insertions(+), 3 deletions(-)

diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py
index a4c18869..ebaaa613 100644
--- a/powerplantmatching/data.py
+++ b/powerplantmatching/data.py
@@ -2101,6 +2101,102 @@ def GEM(raw=False, update=False, config=None):
     return pd.concat(data, ignore_index=True)
 
 
+def MASTR(
+    raw=False,
+    update=False,
+    config=None,
+):
+    """
+    Get the Marktstammdatenregister (MaStR) dataset.
+
+    Provided by the German Federal Network Agency (Bundesnetzagentur / BNetzA) and
+    contains data on Germany, Austria and Switzerland.
+
+    Parameters
+    ----------
+    raw : bool, default False
+        Whether to return the concatenated CSV data without any cleaning.
+    update : bool, default False
+        Whether to update the data from the url.
+    config : dict, default None
+        Add custom specific configuration,
+        e.g. powerplantmatching.config.get_config(target_countries='Italy'),
+        defaults to powerplantmatching.config.get_config().
+
+    """
+    config = get_config() if config is None else config
+
+    fn = get_raw_file("MASTR", update=update, config=config)
+
+    file_suffixes = {
+        "Bioenergy": "biomass.csv",
+        "Combustion": "combustion.csv",
+        "Nuclear": "nuclear.csv",
+        "Hydro": "hydro.csv",
+        # "Wind": "wind.csv",  # TODO: Needs performance discussion
+        # "Solar": "solar.csv",
+    }
+    df = pd.DataFrame()
+    with ZipFile(fn, "r") as file:
+        for fueltype, suffix in file_suffixes.items():
+            for name in file.namelist():
+                if name.endswith(suffix):
+                    df = pd.concat(
+                        [
+                            df,
+                            pd.read_csv(file.open(name), low_memory=False).assign(
+                                Filesuffix=fueltype
+                            ),
+                        ]
+                    )
+                    break
+    df = df.reset_index(drop=True)
+
+    if raw:
+        return df
+
+    RENAME_COLUMNS = {
+        "EinheitMastrNummer": "projectID",
+        "NameKraftwerk": "Name",
+        "Land": "Country",
+        "Nettonennleistung": "Capacity",
+        "Inbetriebnahmedatum": "DateIn",
+        "DatumEndgueltigeStilllegung": "DateOut",
+        "Laengengrad": "lon",
+        "Breitengrad": "lat",
+    }
+    COUNTRY_MAP = {
+        "Deutschland": "Germany",
+        "Österreich": "Austria",
+        "Schweiz": "Switzerland",
+    }
+
+    df = (
+        df.drop(columns=["Name"])
+        .rename(columns=RENAME_COLUMNS)
+        .assign(
+            projectID=lambda df: "MASTR-" + df.projectID,
+            Country=lambda df: df.Country.map(COUNTRY_MAP),
+            Capacity=lambda df: df.Capacity / 1e3,  # kW to MW
+            DateIn=lambda df: pd.to_datetime(df.DateIn).dt.year,
+            DateOut=lambda df: pd.to_datetime(df.DateOut).dt.year,
+            Technology=np.nan,
+            Set="PP",
+        )
+        .loc[lambda df: df.Capacity > 1]  # TODO: Needs performance discussion
+        .pipe(
+            gather_specifications,
+            config=config,
+            parse_columns=["Filesuffix", "Energietraeger"],
+        )
+        .pipe(clean_name)
+        .pipe(set_column_name, "MASTR")
+        .pipe(config_filter, config)
+    )
+
+    return df
+
+
 # deprecated alias for GGPT
 @deprecated(
     deprecated_in="0.5.5",
diff --git a/powerplantmatching/package_data/config.yaml b/powerplantmatching/package_data/config.yaml
index 80118d7b..e9f58210 100644
--- a/powerplantmatching/package_data/config.yaml
+++ b/powerplantmatching/package_data/config.yaml
@@ -24,6 +24,7 @@ matching_sources:
   - BEYONDCOAL
   - WIKIPEDIA
   - GEM
+  - MASTR
 
 # fully_included_sources, these sources are included even without match to the final dataset
 fully_included_sources:
@@ -195,6 +196,14 @@ GHPT:
   reliability_score: 4
   fn: Global-Hydropower-Tracker-May-2023.csv
   url: https://raw.githubusercontent.com/pz-max/gem-powerplant-data/main/Global-Hydropower-Tracker-latest.csv
+
+MASTR:
+  net_capacity: true
+  reliability_score: 8
+  fn: bnetza_open_mastr_2023-08-08_B.zip
+  url: https://zenodo.org/records/8225106/files/bnetza_open_mastr_2023-08-08_B.zip
+
+  
 # ---------------------------------------------------------------------------- #
 #                             Data Structure Config                            #
 # ---------------------------------------------------------------------------- #
@@ -273,6 +282,7 @@ target_fueltypes:
       combined cycle,
       fossil gas,
       mixed fossil fuels,
+      erdgas
     ]
   Hydro:
     [
@@ -285,9 +295,9 @@ target_fueltypes:
       hydroelectric,
       wasserkraft,
     ]
-  Hard Coal: [coal, coke]
-  Lignite: [brown coal, lignite, peat]
-  Oil: [oil, diesel]
+  Hard Coal: [coal, coke, steinkohle]
+  Lignite: [brown coal, lignite, peat, braunkohle]
+  Oil: [oil, diesel, mineralölprodukte]
   Geothermal: ""
   Solar: ""
   Waste: ""

From ea76ab13db72b10d9b765722ee3316dcb6941d9a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 2 Jan 2025 15:07:55 +0000
Subject: [PATCH 3/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 powerplantmatching/cleaning.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/powerplantmatching/cleaning.py b/powerplantmatching/cleaning.py
index d7bf4d41..3fbb3c5f 100644
--- a/powerplantmatching/cleaning.py
+++ b/powerplantmatching/cleaning.py
@@ -17,8 +17,6 @@
 Functions for vertically cleaning a dataset.
 """
 
-from __future__ import absolute_import, print_function
-
 import logging
 
 import networkx as nx

From fd7cc3b27ab5db90ecfb5b46751545ec2e74c41a Mon Sep 17 00:00:00 2001
From: Fabian Neumann <fabian.neumann@outlook.de>
Date: Fri, 3 Jan 2025 10:27:06 +0100
Subject: [PATCH 4/7] improve data cleaning and performance

---
 powerplantmatching/data.py                  | 91 +++++++++++++--------
 powerplantmatching/package_data/config.yaml | 26 +++---
 2 files changed, 73 insertions(+), 44 deletions(-)

diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py
index ea75518a..77eb8259 100644
--- a/powerplantmatching/data.py
+++ b/powerplantmatching/data.py
@@ -2168,68 +2168,93 @@ def MASTR(
     """
     config = get_config() if config is None else config
 
-    fn = get_raw_file("MASTR", update=update, config=config)
+    RENAME_COLUMNS = {
+        "EinheitMastrNummer": "projectID",
+        "NameKraftwerk": "Name",
+        "Land": "Country",
+        "Nettonennleistung": "Capacity",
+        "Inbetriebnahmedatum": "DateIn",
+        "DatumEndgueltigeStilllegung": "DateOut",
+        "EinheitBetriebsstatus": "Status",
+        "Laengengrad": "lon",
+        "Breitengrad": "lat",
+    }
+    COUNTRY_MAP = {
+        "Deutschland": "Germany",
+        "Österreich": "Austria",
+        "Schweiz": "Switzerland",
+    }
+    PARSE_COLUMNS = [
+        "ArtDerWasserkraftanlage",
+        "Biomasseart",
+        "Filesuffix",
+        "Energietraeger",
+        "Hauptbrennstoff",
+        "NameStromerzeugungseinheit",
+    ]
 
+    fn = get_raw_file("MASTR", update=update, config=config)
     file_suffixes = {
         "Bioenergy": "biomass.csv",
         "Combustion": "combustion.csv",
         "Nuclear": "nuclear.csv",
         "Hydro": "hydro.csv",
-        # "Wind": "wind.csv",  # TODO: Needs performance discussion
-        # "Solar": "solar.csv",
+        "Wind": "wind.csv",
+        "Solar": "solar.csv",
     }
-    df = pd.DataFrame()
+    data_frames = []
     with ZipFile(fn, "r") as file:
         for fueltype, suffix in file_suffixes.items():
             for name in file.namelist():
                 if name.endswith(suffix):
-                    df = pd.concat(
-                        [
-                            df,
-                            pd.read_csv(file.open(name), low_memory=False).assign(
-                                Filesuffix=fueltype
-                            ),
-                        ]
+                    available_columns = pd.read_csv(file.open(name), nrows=0).columns
+                    target_columns = [
+                        "GeplantesInbetriebnahmedatum",
+                        "ThermischeNutzleistung",
+                        "KwkMastrNummer",
+                    ]
+                    target_columns = (
+                        target_columns
+                        + PARSE_COLUMNS
+                        + list(RENAME_COLUMNS.keys())
                     )
+                    usecols = available_columns.intersection(target_columns)
+                    df = pd.read_csv(file.open(name), usecols=usecols).assign(Filesuffix=fueltype)
+                    data_frames.append(df)
                     break
-    df = df.reset_index(drop=True)
+    df = pd.concat(data_frames).reset_index(drop=True)
 
     if raw:
         return df
 
-    RENAME_COLUMNS = {
-        "EinheitMastrNummer": "projectID",
-        "NameKraftwerk": "Name",
-        "Land": "Country",
-        "Nettonennleistung": "Capacity",
-        "Inbetriebnahmedatum": "DateIn",
-        "DatumEndgueltigeStilllegung": "DateOut",
-        "Laengengrad": "lon",
-        "Breitengrad": "lat",
-    }
-    COUNTRY_MAP = {
-        "Deutschland": "Germany",
-        "Österreich": "Austria",
-        "Schweiz": "Switzerland",
-    }
+    status_list = config["MASTR"].get("status", ["In Betrieb"])  # noqa: F841
+    capacity_threshold_kw = 50
 
     df = (
-        df.drop(columns=["Name"])
-        .rename(columns=RENAME_COLUMNS)
+        raw.rename(columns=RENAME_COLUMNS)
+        .query("Status in @status_list")
+        .loc[lambda df: df.Capacity > capacity_threshold_kw]
         .assign(
             projectID=lambda df: "MASTR-" + df.projectID,
             Country=lambda df: df.Country.map(COUNTRY_MAP),
             Capacity=lambda df: df.Capacity / 1e3,  # kW to MW
             DateIn=lambda df: pd.to_datetime(df.DateIn).dt.year,
             DateOut=lambda df: pd.to_datetime(df.DateOut).dt.year,
-            Technology=np.nan,
-            Set="PP",
         )
-        .loc[lambda df: df.Capacity > 1]  # TODO: Needs performance discussion
+        .assign(
+            DateIn=lambda df: df["DateIn"].combine_first(
+                pd.to_datetime(df["GeplantesInbetriebnahmedatum"]).dt.year
+            ),
+        )
         .pipe(
             gather_specifications,
             config=config,
-            parse_columns=["Filesuffix", "Energietraeger"],
+            parse_columns=PARSE_COLUMNS,
+        )
+        .assign(
+            Set=lambda df: df["Set"].where(
+                df["KwkMastrNummer"].isna() & df["ThermischeNutzleistung"].isna(), "CHP"
+            ),
         )
         .pipe(clean_name)
         .pipe(set_column_name, "MASTR")
diff --git a/powerplantmatching/package_data/config.yaml b/powerplantmatching/package_data/config.yaml
index e8e18036..cbaeb634 100644
--- a/powerplantmatching/package_data/config.yaml
+++ b/powerplantmatching/package_data/config.yaml
@@ -35,7 +35,8 @@ fully_included_sources:
   - JRC: Country not in ['Switzerland', 'Albania', 'United Kingdom', 'Norway']
   - OPSD: Country not in ['Switzerland', 'Italy', 'Spain', 'Norway', 'Austria']
   - BEYONDCOAL
-  - GEM
+  - GEM: Country != 'Germany'
+  - MASTR
 
 
 parallel_duke_processes: false
@@ -207,6 +208,7 @@ GHPT:
 MASTR:
   net_capacity: true
   reliability_score: 8
+  status: ["In Betrieb", "In Planung", "Endgültig stillgelegt"]
   fn: bnetza_open_mastr_2023-08-08_B.zip
   url: https://zenodo.org/records/8225106/files/bnetza_open_mastr_2023-08-08_B.zip
 
@@ -277,8 +279,8 @@ target_fueltypes:
   # given by the list. An empty string results in a regex expression containing only the key.
   # Parsed of representatives at the top may be overwritten by representatives further below.
   Other: ".*"
-  Solid Biomass: [biological, bioenergy, agricultural, wood, biomass]
-  Biogas: [biogas]
+  Solid Biomass: [biological, bioenergy, agricultural, wood, biomass, feste biomasse]
+  Biogas: [biogas, biomethan, gasförmige biomasse]
   Nuclear: [nuclear]
   Natural Gas:
     [
@@ -290,7 +292,8 @@ target_fueltypes:
       combined cycle,
       fossil gas,
       mixed fossil fuels,
-      erdgas
+      erdgas,
+      andere gase,
     ]
   Hydro:
     [
@@ -302,13 +305,14 @@ target_fueltypes:
       hydro,
       hydroelectric,
       wasserkraft,
+      wasser,
     ]
   Hard Coal: [coal, coke, steinkohle]
   Lignite: [brown coal, lignite, peat, braunkohle]
   Oil: [oil, diesel, mineralölprodukte]
   Geothermal: ""
   Solar: ""
-  Waste: ""
+  Waste: ["abfall.*", "waste"]
   Wind: ""
   Battery: [Electro-chemical, battery]
 target_sets:
@@ -337,12 +341,12 @@ target_technologies:
   # A list will be converted to a regex expression matching all words (case-insensitive)
   # given by the list. An empty string results in a regex expression containing only the key.
   # Parsed of representatives at the top may be overwritten by representatives further below.
-  CCGT: [ccgt, gas, natural gas]
-  OCGT: [ocgt]
-  Steam Turbine: [steam, turbine]
-  Combustion Engine: [combustion engine]
-  Run-Of-River: [run-off, run off, run of river, run-of-river, ror]
-  Pumped Storage: [pumped hydro, pumped]
+  CCGT: [ccgt, gas, natural gas, gasturbinen mit abhitzekessel]
+  OCGT: [ocgt, gasturbinen ohne abhitzekessel]
+  Steam Turbine: [steam, turbine, kondensationsmaschine, gegendruckmaschine, dampfmotor]
+  Combustion Engine: [combustion engine, verbrennungsmotor, stirlingmotor]
+  Run-Of-River: [run-off, run off, run of river, run-of-river, ror, laufwasseranlage]
+  Pumped Storage: [pumped hydro, pumped, speicherwasseranlage]
   Reservoir: ""
   Marine: ""
   Onshore: ""

From 50824bab62fbcaefd6d9055659f2a41aa2b90a51 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 3 Jan 2025 09:27:19 +0000
Subject: [PATCH 5/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 powerplantmatching/data.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py
index 77eb8259..23816d65 100644
--- a/powerplantmatching/data.py
+++ b/powerplantmatching/data.py
@@ -2214,12 +2214,12 @@ def MASTR(
                         "KwkMastrNummer",
                     ]
                     target_columns = (
-                        target_columns
-                        + PARSE_COLUMNS
-                        + list(RENAME_COLUMNS.keys())
+                        target_columns + PARSE_COLUMNS + list(RENAME_COLUMNS.keys())
                     )
                     usecols = available_columns.intersection(target_columns)
-                    df = pd.read_csv(file.open(name), usecols=usecols).assign(Filesuffix=fueltype)
+                    df = pd.read_csv(file.open(name), usecols=usecols).assign(
+                        Filesuffix=fueltype
+                    )
                     data_frames.append(df)
                     break
     df = pd.concat(data_frames).reset_index(drop=True)

From 34c408af11a3107763dd7f298cd0b9012166a6e9 Mon Sep 17 00:00:00 2001
From: Fabian Neumann <fabian.neumann@outlook.de>
Date: Fri, 3 Jan 2025 10:45:24 +0100
Subject: [PATCH 6/7] fix typo: rename columns on df, not the raw flag

---
 powerplantmatching/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py
index 23816d65..ea210f35 100644
--- a/powerplantmatching/data.py
+++ b/powerplantmatching/data.py
@@ -2231,7 +2231,7 @@ def MASTR(
     capacity_threshold_kw = 50
 
     df = (
-        raw.rename(columns=RENAME_COLUMNS)
+        df.rename(columns=RENAME_COLUMNS)
         .query("Status in @status_list")
         .loc[lambda df: df.Capacity > capacity_threshold_kw]
         .assign(

From 6282178ec21b532f59fa3c3eff6c8b813313ee79 Mon Sep 17 00:00:00 2001
From: Fabian Neumann <fabian.neumann@outlook.de>
Date: Fri, 3 Jan 2025 14:23:58 +0100
Subject: [PATCH 7/7] reset threshold to 1 MW for now

---
 powerplantmatching/data.py                  | 2 +-
 powerplantmatching/package_data/config.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py
index ea210f35..9f1d231e 100644
--- a/powerplantmatching/data.py
+++ b/powerplantmatching/data.py
@@ -2228,7 +2228,7 @@ def MASTR(
         return df
 
     status_list = config["MASTR"].get("status", ["In Betrieb"])  # noqa: F841
-    capacity_threshold_kw = 50
+    capacity_threshold_kw = 1000
 
     df = (
         df.rename(columns=RENAME_COLUMNS)
diff --git a/powerplantmatching/package_data/config.yaml b/powerplantmatching/package_data/config.yaml
index cbaeb634..de55eea6 100644
--- a/powerplantmatching/package_data/config.yaml
+++ b/powerplantmatching/package_data/config.yaml
@@ -35,7 +35,7 @@ fully_included_sources:
   - JRC: Country not in ['Switzerland', 'Albania', 'United Kingdom', 'Norway']
   - OPSD: Country not in ['Switzerland', 'Italy', 'Spain', 'Norway', 'Austria']
   - BEYONDCOAL
-  - GEM: Country != 'Germany'
+  - GEM: Country != 'Germany' or Fueltype == 'Solar'
   - MASTR