PyPSA · fneum · Aug 19, 2025 · Aug 19, 2025 · Aug 19, 2025 · Aug 19, 2025
diff --git a/.gitignore b/.gitignore
@@ -99,3 +99,6 @@ test.ipynb
 
 # uv
 uv.lock
+
+# jupyter
+*.ipynb
diff --git a/doc/release-notes.rst b/doc/release-notes.rst
@@ -8,6 +8,20 @@ Upcoming Version
 
 * Drop support for Python 3.9, add support for Python 3.13. Minimum required Python version is now 3.10.
 
+* Added `GeoNuclearData <https://github.com/cristianst85/GeoNuclearData>`__ dataset as `pm.data.GND()`.
+* Added `European Energy Storage Inventory <https://ses.jrc.ec.europa.eu/storage-inventory-maps>`__ dataset as `pm.data.EESI()`.
+* Added `GloHydroRES <https://zenodo.org/records/14526360>`__ dataset as `pm.data.GHR()`.
+* Updated ENTSOE, BEYONDCOAL, JRC, IRENASTAT and the Global Energy Monitor datasets to the latest versions.
+* Fixed the distinction between hydro technologies and between offshore and onshore wind in `pm.data.MASTR()`. Storage technologies are now read in as well.
+* Improved recognition of CHP power plants.
+* In Global Energy Monitor datasets, also read entries below capacity threshold.
+* In `pm.data.GCPT()`, add estimate for coal plant efficiency.
+* Include mothballed gas, oil and coal power plants.
+* Initially, include unit/block name in power plant name before matching.
+* Added option to retain blocks for subsets of fuel types (e.g. `clean_name: fueltypes_with_blocks: ['Nuclear']`).
+* For fully included datasets, add option to only aggregate units included in the matching process (e.g. `aggregate_only_matching_sources: ['MASTR']`).
+* Added option for multiprocessing when aggregating units of non-matched power plants (e.g. `threads_extend_by_non_matched: 16`).
+* Updated the matching logic configuration.
 
 `v0.7.1 <https://github.com/PyPSA/powerplantmatching/releases/tag/v0.7.1>`__ (30th January 2024)
 =================================================================================================

diff --git a/powerplantmatching/cleaning.py b/powerplantmatching/cleaning.py
@@ -76,24 +76,78 @@ def clean_name(df, config=None):
 
     name = df.Name.astype(str).copy().apply(unidecode.unidecode)
 
+    roman_to_arabic = {
+        "I": "1",
+        "II": "2",
+        "III": "3",
+        "IV": "4",
+        "V": "5",
+        "VI": "6",
+        "VII": "7",
+        "VIII": "8",
+        "IX": "9",
+        "X": "10",
+        "XI": "11",
+    }
+    for roman, arabic in roman_to_arabic.items():
+        name = name.str.replace(rf"\b{roman}\b", arabic, regex=True)
+
     replace = config["clean_name"]["replace"]
     replace.setdefault("", [])
 
+    keep_blocks = config["clean_name"].get("fueltypes_with_blocks", [])
+    if len(keep_blocks) > 0:
+        mask = df.Fueltype.isin(keep_blocks)
+
     for key, pattern in replace.items():
         if config["clean_name"]["remove_common_words"] and (key == ""):
             common_words = pd.Series(sum(name.str.split(), [])).value_counts()
             common_words = list(common_words[common_words >= 20].index)
             pattern += common_words
-        if isinstance(pattern, list):
-            # if pattern is a list, concat all entries in a case-insensitive regex
+
+        pattern = np.atleast_1d(pattern)
+
+        # do not remove block numbers for fuel types with blocks; the regular
+        # regex [^a-zA-Z] removes non-alphabetical characters; for fueltypes to
+        # keep, the regex [^a-zA-Z0-9] is used which only removes
+        # non-alphanumerical characters
+        if len(keep_blocks) > 0 and key == " " and "[^a-zA-Z]" in pattern:
+            base = [rf"\b{p}\b" for p in pattern if p != "[^a-zA-Z]"]
+            pattern_keep = r"(?i)" + "|".join(base + [r"[^a-zA-Z0-9]"])
+            pattern_default = r"(?i)" + "|".join(base + [r"[^a-zA-Z]"])
+            name.loc[mask] = name.loc[mask].str.replace(pattern_keep, key, regex=True)
+            name.loc[~mask] = name.loc[~mask].str.replace(
+                pattern_default, key, regex=True
+            )
+
+        # do not remove block letters for fuel types with blocks; the regular
+        # regex \w would remove standalone letters, this one is skipped for
+        # fueltypes in mask
+        elif key == "" and "\w" in pattern:
+            pattern_keep = r"(?i)" + "|".join(
+                [rf"\b{p}\b" for p in pattern if p != "\w"]
+            )
+            pattern_default = r"(?i)" + "|".join([rf"\b{p}\b" for p in pattern])
+            name.loc[mask] = name.loc[mask].str.replace(pattern_keep, key, regex=True)
+            name.loc[~mask] = name.loc[~mask].str.replace(
+                pattern_default, key, regex=True
+            )
+
+        else:
             pattern = r"(?i)" + "|".join([rf"\b{p}\b" for p in pattern])
-        elif not isinstance(pattern, str):
-            raise ValueError(f"Pattern must be string or list, not {type(pattern)}")
-        name = name.str.replace(pattern, key, regex=True)
+            name = name.str.replace(pattern, key, regex=True)
 
+    # remove duplicated words; second pass necessary for edge cases
     if config["clean_name"]["remove_duplicated_words"]:
-        name = name.str.replace(r"\b(\w+)(?:\W\1\b)+", r"\1", regex=True, case=False)
-    name = name.str.strip().str.title().str.replace(r" +", " ", regex=True)
+        name = (
+            name.str.replace(r"\b(\w+)(?:\W\1\b)+", r"\1", regex=True, case=False)
+            .str.strip()
+            .str.replace(r" +", " ", regex=True)
+            .str.title()
+            .str.replace(r"\b(\w+)(?:\W\1\b)+", r"\1", regex=True, case=False)
+        )
+    else:
+        name = name.str.strip().str.title().str.replace(r" +", " ", regex=True)
 
     return df.assign(Name=name).sort_values("Name")
 
@@ -329,7 +383,16 @@ def clean_technology(df, generalize_hydros=False):
         .str.split(", ")
         .apply(lambda x: ", ".join(i.strip() for i in np.unique(x)))
     )
-    tech = tech.replace({"Ccgt": "CCGT", "Ocgt": "OCGT"}, regex=True)
+    ABBREVIATIONS = {
+        "Ccgt": "CCGT",
+        "Ocgt": "OCGT",
+        "Pv": "PV",
+        "Nas": "NaS",
+        "Nicd": "NiCd",
+        "Nanicl": "NaNiCl",
+        "Caes": "CAES",
+    }
+    tech = tech.replace(ABBREVIATIONS, regex=False)
     return df.assign(Technology=tech)
 
 
@@ -367,6 +430,7 @@ def aggregate_units(
     pre_clean_name=False,
     country_wise=True,
     config=None,
+    threads=1,
     **kwargs,
 ):
     """
@@ -385,6 +449,8 @@ def aggregate_units(
         Whether to clean the 'Name'-column before aggregating.
     country_wise : Boolean, default True
         Whether to aggregate only entries with a identical country.
+    threads : int, default 1
+        Number of threads to use
     """
     deprecated_args = {"use_saved_aggregation", "save_aggregation"}
     used_deprecated_args = deprecated_args.intersection(kwargs)
@@ -422,12 +488,27 @@ def aggregate_units(
         df = clean_name(df)
 
     logger.info(f"Aggregating blocks in data source '{ds_name}'.")
+    agg_query = None
+    if ds_name in config.get("aggregate_only_matching_sources", []):
+        for source in config["matching_sources"]:
+            if isinstance(source, dict) and ds_name in source:
+                agg_query = source[ds_name]
+                break
+
+    block_query = None
+    if with_blocks := config["clean_name"].get("fuel_type_with_blocks", []):  # noqa
+        block_query = "Fueltype in @with_blocks"
 
     if country_wise:
         countries = df.Country.unique()
-        duplicates = pd.concat([duke(df.query("Country == @c")) for c in countries])
+        country_query = "Country == @c"
+        query = " and ".join(filter(None, [agg_query, block_query, country_query]))
+        duplicates = pd.concat(
+            [duke(df.query(query), threads=threads) for c in countries]
+        )
     else:
-        duplicates = duke(df)
+        query = " and ".join(filter(None, [agg_query, block_query]))
+        duplicates = duke(df.query(query) if query else df, threads=threads)
 
     df = cliques(df, duplicates)
     df = df.groupby("grouped").agg(props_for_groups)
@@ -445,4 +526,9 @@ def aggregate_units(
         .reindex(columns=cols)
         .pipe(set_column_name, ds_name)
     )
+
+    # Remove zero values from summed non-weighted numeric columns
+    numeric_cols = df.select_dtypes(include="number").columns
+    df[numeric_cols] = df[numeric_cols].where(lambda df: df != 0)
+
     return df
diff --git a/powerplantmatching/collection.py b/powerplantmatching/collection.py
@@ -60,6 +60,10 @@ def df_by_name(name):
         get_df = getattr(data, name)
         df = get_df(config=config)
 
+        for source in config["matching_sources"]:
+            if isinstance(source, dict) and next(iter(source)) == name:
+                df = df.query(source[name])
+
         if not conf.get("aggregated_units", False):
             return aggregate_units(df, dataset_name=name, config=config)
         else:
-Original file line number
+Diff line change
@@ Expand Up / @@ -99,3 +99,6 @@ test.ipynb @@
     # uv
     uv.lock
+    # jupyter
+    *.ipynb