diff --git a/.gitignore b/.gitignore
index e883d2b7..e71eeaf1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -99,3 +99,6 @@ test.ipynb
 
 # uv
 uv.lock
+
+# jupyter
+*.ipynb
diff --git a/doc/release-notes.rst b/doc/release-notes.rst
index 0430a104..4d2e3f33 100644
--- a/doc/release-notes.rst
+++ b/doc/release-notes.rst
@@ -8,6 +8,20 @@ Upcoming Version
 
 * Drop support for Python 3.9, add support for Python 3.13. Minimum required
   Python version is now 3.10.
+* Added the `GeoNuclearData <https://github.com/cristianst85/GeoNuclearData>`__ dataset as `pm.data.GND()`.
+* Added the `European Energy Storage Inventory <https://ses.jrc.ec.europa.eu/storage-inventory-maps>`__ dataset as `pm.data.EESI()`.
+* Added the `GloHydroRES <https://zenodo.org/records/14526360>`__ dataset as `pm.data.GHR()`.
+* Updated the ENTSOE, BEYONDCOAL, JRC, IRENASTAT and Global Energy Monitor datasets to their latest versions.
+* Fixed the distinction of hydro technologies and of offshore versus onshore wind in `pm.data.MASTR()`; storage technologies are now also read in.
+* Improved recognition of CHP power plants.
+* In the Global Energy Monitor datasets, also read entries below the capacity threshold.
+* In `pm.data.GCPT()`, added an estimate of coal plant efficiency.
+* Included mothballed gas, oil and coal power plants.
+* Include the unit/block name in the power plant name before matching.
+* Added an option to retain blocks for subsets of fuel types (e.g. `clean_name: fueltypes_with_blocks: ['Nuclear']`).
+* For fully included datasets, added an option to aggregate only those units included in the matching process (e.g. `aggregate_only_matching_sources: ['MASTR']`).
+* Added an option for multiprocessing when aggregating units of non-matched power plants (e.g. `threads_extend_by_non_matched: 16`).
+* Updated the matching logic configuration.
 
 `v0.7.1 `__ (30th January 2024)
 =================================================================================================
diff --git a/powerplantmatching/cleaning.py b/powerplantmatching/cleaning.py
index 98a73477..3acf4306 100644
--- a/powerplantmatching/cleaning.py
+++ b/powerplantmatching/cleaning.py
@@ -76,24 +76,78 @@ def clean_name(df, config=None):
 
     name = df.Name.astype(str).copy().apply(unidecode.unidecode)
 
+    roman_to_arabic = {
+        "I": "1",
+        "II": "2",
+        "III": "3",
+        "IV": "4",
+        "V": "5",
+        "VI": "6",
+        "VII": "7",
+        "VIII": "8",
+        "IX": "9",
+        "X": "10",
+        "XI": "11",
+    }
+    for roman, arabic in roman_to_arabic.items():
+        name = name.str.replace(rf"\b{roman}\b", arabic, regex=True)
+
     replace = config["clean_name"]["replace"]
     replace.setdefault("", [])
 
+    keep_blocks = config["clean_name"].get("fueltypes_with_blocks", [])
+    if len(keep_blocks) > 0:
+        mask = df.Fueltype.isin(keep_blocks)
+
     for key, pattern in replace.items():
         if config["clean_name"]["remove_common_words"] and (key == ""):
             common_words = pd.Series(sum(name.str.split(), [])).value_counts()
             common_words = list(common_words[common_words >= 20].index)
             pattern += common_words
-        if isinstance(pattern, list):
-            # if pattern is a list, concat all entries in a case-insensitive regex
+
+        pattern = np.atleast_1d(pattern)
+
+        # do not remove block numbers for fuel types with blocks; the regular
+        # regex [^a-zA-Z] removes non-alphabetical characters; for fueltypes to
+        # keep, the regex [^a-zA-Z0-9] is used which only removes
+        # non-alphanumerical characters
+        if len(keep_blocks) > 0 and key == " " and "[^a-zA-Z]" in pattern:
+            base = [rf"\b{p}\b" for p in pattern if p != "[^a-zA-Z]"]
+            pattern_keep = r"(?i)" + "|".join(base + [r"[^a-zA-Z0-9]"])
+            pattern_default = r"(?i)" +
"|".join(base + [r"[^a-zA-Z]"]) + name.loc[mask] = name.loc[mask].str.replace(pattern_keep, key, regex=True) + name.loc[~mask] = name.loc[~mask].str.replace( + pattern_default, key, regex=True + ) + + # do not remove block letters for fuel types with blocks; the regular + # regex \w would remove standalone letters, this one is skipped for + # fueltypes in mask + elif key == "" and "\w" in pattern: + pattern_keep = r"(?i)" + "|".join( + [rf"\b{p}\b" for p in pattern if p != "\w"] + ) + pattern_default = r"(?i)" + "|".join([rf"\b{p}\b" for p in pattern]) + name.loc[mask] = name.loc[mask].str.replace(pattern_keep, key, regex=True) + name.loc[~mask] = name.loc[~mask].str.replace( + pattern_default, key, regex=True + ) + + else: pattern = r"(?i)" + "|".join([rf"\b{p}\b" for p in pattern]) - elif not isinstance(pattern, str): - raise ValueError(f"Pattern must be string or list, not {type(pattern)}") - name = name.str.replace(pattern, key, regex=True) + name = name.str.replace(pattern, key, regex=True) + # remove duplicated words; second pass necessary for edge cases if config["clean_name"]["remove_duplicated_words"]: - name = name.str.replace(r"\b(\w+)(?:\W\1\b)+", r"\1", regex=True, case=False) - name = name.str.strip().str.title().str.replace(r" +", " ", regex=True) + name = ( + name.str.replace(r"\b(\w+)(?:\W\1\b)+", r"\1", regex=True, case=False) + .str.strip() + .str.replace(r" +", " ", regex=True) + .str.title() + .str.replace(r"\b(\w+)(?:\W\1\b)+", r"\1", regex=True, case=False) + ) + else: + name = name.str.strip().str.title().str.replace(r" +", " ", regex=True) return df.assign(Name=name).sort_values("Name") @@ -329,7 +383,16 @@ def clean_technology(df, generalize_hydros=False): .str.split(", ") .apply(lambda x: ", ".join(i.strip() for i in np.unique(x))) ) - tech = tech.replace({"Ccgt": "CCGT", "Ocgt": "OCGT"}, regex=True) + ABBREVIATIONS = { + "Ccgt": "CCGT", + "Ocgt": "OCGT", + "Pv": "PV", + "Nas": "NaS", + "Nicd": "NiCd", + "Nanicl": "NaNiCl", + "Caes": "CAES", + } + tech = tech.replace(ABBREVIATIONS, regex=False) return df.assign(Technology=tech) @@ -367,6 +430,7 @@ def aggregate_units( pre_clean_name=False, country_wise=True, config=None, + threads=1, **kwargs, ): """ @@ -385,6 +449,8 @@ def aggregate_units( Whether to clean the 'Name'-column before aggregating. country_wise : Boolean, default True Whether to aggregate only entries with a identical country. 
+ threads : int, default 1 + Number of threads to use """ deprecated_args = {"use_saved_aggregation", "save_aggregation"} used_deprecated_args = deprecated_args.intersection(kwargs) @@ -422,12 +488,27 @@ def aggregate_units( df = clean_name(df) logger.info(f"Aggregating blocks in data source '{ds_name}'.") + agg_query = None + if ds_name in config.get("aggregate_only_matching_sources", []): + for source in config["matching_sources"]: + if isinstance(source, dict) and ds_name in source: + agg_query = source[ds_name] + break + + block_query = None + if with_blocks := config["clean_name"].get("fuel_type_with_blocks", []): # noqa + block_query = "Fueltype in @with_blocks" if country_wise: countries = df.Country.unique() - duplicates = pd.concat([duke(df.query("Country == @c")) for c in countries]) + country_query = "Country == @c" + query = " and ".join(filter(None, [agg_query, block_query, country_query])) + duplicates = pd.concat( + [duke(df.query(query), threads=threads) for c in countries] + ) else: - duplicates = duke(df) + query = " and ".join(filter(None, [agg_query, block_query])) + duplicates = duke(df.query(query) if query else df, threads=threads) df = cliques(df, duplicates) df = df.groupby("grouped").agg(props_for_groups) @@ -445,4 +526,9 @@ def aggregate_units( .reindex(columns=cols) .pipe(set_column_name, ds_name) ) + + # Remove zero values from summed non-weighted numeric columns + numeric_cols = df.select_dtypes(include="number").columns + df[numeric_cols] = df[numeric_cols].where(lambda df: df != 0) + return df diff --git a/powerplantmatching/collection.py b/powerplantmatching/collection.py index 2405f9fa..8169affa 100644 --- a/powerplantmatching/collection.py +++ b/powerplantmatching/collection.py @@ -60,6 +60,10 @@ def df_by_name(name): get_df = getattr(data, name) df = get_df(config=config) + for source in config["matching_sources"]: + if isinstance(source, dict) and next(iter(source)) == name: + df = df.query(source[name]) + if not conf.get("aggregated_units", False): return aggregate_units(df, dataset_name=name, config=config) else: diff --git a/powerplantmatching/data.py b/powerplantmatching/data.py index 5cdaacb5..c7b0d5c2 100644 --- a/powerplantmatching/data.py +++ b/powerplantmatching/data.py @@ -6,6 +6,7 @@ Collection of power plant data bases and statistical data """ +import json import logging import os from zipfile import ZipFile @@ -24,7 +25,7 @@ gather_specifications, ) from .core import _package_data, get_config -from .heuristics import scale_to_net_capacities +from .heuristics import PLZ_to_LatLon_map, scale_to_net_capacities from .utils import ( config_filter, convert_to_short_name, @@ -57,46 +58,67 @@ def BEYONDCOAL(raw=False, update=False, config=None): config = get_config() if config is None else config fn = get_raw_file("BEYONDCOAL", update=update, config=config) - df = pd.read_excel(fn, sheet_name="Plant", header=[0, 1, 2], skiprows=[3]) + df = pd.read_excel( + fn, sheet_name="Unit", header=0, skiprows=[0, 2, 3], na_values=["unknown"] + ) + + df_plant = pd.read_excel( + fn, + sheet_name="Plant", + header=0, + skiprows=[0, 2, 3], + usecols=["BFF plant ID", "Latitude", "Longitude"], + ).set_index("BFF plant ID") + + df["lat"] = df["BFF plant ID"].map(df_plant.Latitude) + df["lon"] = df["BFF plant ID"].map(df_plant.Longitude) if raw: return df + status_list = config["BEYONDCOAL"].get("status", ["operational"]) # noqa + RENAME_COLUMNS = { - "Plant name": "Name", + "Unit name": "Name", "Fuel type": "Fueltype", - "Latitude": "lat", - "Longitude": "lon", - 
"Commissioning year of first unit": "DateIn", - "(Announced) Retirement year of last unit": "DateOut", - "Coal capacity open": "Capacity", - "Plant status\n(gross)": "status", - "EBC plant ID": "projectID", + "Commissioning year": "DateIn", + "Unit status\n(detailed)": "status", + "BFF unit ID": "projectID", } - phaseout_col = "Covered by country phase-out? [if yes: country phase-out year]" + SET_MAP = { + "chp": "CHP", + "conventional": "PP", + "industrial": "CHP", + "heat": "CHP", + } - df = ( - df["Plant Data"] - .droplevel(1, axis=1) - .rename(columns=RENAME_COLUMNS) - .query('status != "Cancelled"') + with pd.option_context("future.no_silent_downcasting", True): + phaseout_col = "Covered by country phase-out? [if yes: country phase-out year]" + date_out = ( + df["(Announced) Retirement year"] + .replace({"After 2030": np.nan, "By 2030": 2030}) + .astype(float) + .combine_first(pd.to_numeric(df[phaseout_col], errors="coerce")) + ) + + df_final = ( + df.rename(columns=RENAME_COLUMNS) + .query("status in @status_list") .assign( - DateOut=lambda df: df.DateOut.fillna(df[phaseout_col]).where( - lambda ds: ds <= 8000 - ), + DateOut=date_out, projectID=lambda df: "BEYOND-" + df.projectID, - Fueltype=lambda df: df.Fueltype.str.title().replace("Unknown", "Other"), - Set="PP", + Fueltype=lambda df: df.Fueltype.str.title(), + Set=lambda df: df["Unit type"].map(SET_MAP), Technology=np.nan, ) - .pipe(scale_to_net_capacities) .pipe(clean_name) .pipe(convert_to_short_name) .pipe(set_column_name, "BEYONDCOAL") .pipe(config_filter, config) ) - return df + + return df_final def OPSD( @@ -214,6 +236,10 @@ def OPSD( ) +# @deprecated( +# deprecated_in="0.8.0", +# details="Deprecated since data is not maintained. Use GEM instead.", +# ) def GEO(raw=False, update=False, config=None): """ Importer for the GEO database. @@ -286,6 +312,9 @@ def to_year(ds): res = units.join(ppl.set_index("projectID"), "projectID", rsuffix="_ppl") res["DateIn"] = res.DateIn.fillna(res.DateIn_ppl) + res["Name"] = res.Name + res["Unit_Nbr"].fillna("").apply( + lambda x: f" {x}" if x else "" + ) not_included_ppl = ppl.query("projectID not in @res.projectID") res = pd.concat([res, not_included_ppl]).pipe(set_column_name, "GEO") res = scale_to_net_capacities(res) @@ -432,6 +461,11 @@ def set_large_spanish_stores_to_reservoirs(df): .assign( Set=lambda df: np.where(df.Technology == "Run-Of-River", "PP", "Store"), Fueltype="Hydro", + Duration=lambda df: df.Duration.where(df.Duration > 0), + StorageCapacity_MWh=lambda df: df.StorageCapacity_MWh.where( + df.StorageCapacity_MWh > 0 + ), + Volume_Mm3=lambda df: df.Volume_Mm3.where(df.Volume_Mm3 > 0), ) .drop(columns=["pypsa_id", "GEO"]) .powerplant.convert_alpha2_to_country() @@ -585,6 +619,10 @@ def GPD(raw=False, update=False, config=None, filter_other_dbs=True): ) +# @deprecated( +# deprecated_in="0.8.0", +# details="Removed since data is not maintained. Use GNPT instead.", +# ) def WIKIPEDIA(raw=False, update=False, config=None): """ Importer for the WIKIPEDIA nuclear power plant database. 
@@ -618,7 +656,6 @@ def WIKIPEDIA(raw=False, update=False, config=None): df = ( df.rename(columns=RENAME_COLUMNS) - .pipe(clean_name) .pipe(convert_to_short_name) .assign( Fueltype="Nuclear", @@ -627,6 +664,7 @@ def WIKIPEDIA(raw=False, update=False, config=None): # plants which are not yet built are set to 2027 DateIn=lambda df: df.DateIn.where(~df.Status.str.contains("In Bau"), 2027), ) + .pipe(clean_name) .pipe(set_column_name, "WIKIPEDIA") .pipe(config_filter, config) ) @@ -1114,7 +1152,7 @@ def WEPP(raw=False, config=None): @deprecated( deprecated_in="0.5.0", - details="This function is not maintained anymore.", + details="This function is not maintained anymore. Use MASTR instead.", ) def UBA( raw=False, @@ -1237,7 +1275,7 @@ def UBA( @deprecated( deprecated_in="0.5.0", - details="This function is not maintained anymore.", + details="This function is not maintained anymore. Use MASTR instead.", ) def BNETZA( raw=False, @@ -1398,6 +1436,10 @@ def BNETZA( ) +# @deprecated( +# deprecated_in="0.8.0", +# details="Removed since data is not maintained. Use GSPT, GWPT and GHPT instead.", +# ) def OPSD_VRE(raw=False, update=False, config=None): """ Importer for the OPSD (Open Power Systems Data) renewables (VRE) @@ -1447,6 +1489,10 @@ def OPSD_VRE(raw=False, update=False, config=None): ) +# @deprecated( +# deprecated_in="0.8.0", +# details="Removed since data is not maintained. Use GSPT, GWPT and GHPT instead.", +# ) def OPSD_VRE_country(country, raw=False, update=False, config=None): """ Get country specific data from OPSD for renewables, if available. @@ -1596,45 +1642,51 @@ def GBPT(raw=False, update=False, config=None): """ config = get_config() if config is None else config fn = get_raw_file("GBPT", update=update, config=config) - df = pd.read_excel(fn, sheet_name="Data") + large = pd.read_excel(fn, sheet_name="Data") + small = pd.read_excel(fn, sheet_name="Below Threshold") + df = pd.concat([large, small], ignore_index=True) if raw: return df RENAME_COLUMNS = { - "Project name": "Name", + "Project Name": "Name", "Capacity (MW)": "Capacity", - "Fuel 1": "Fueltype", - "Operating status": "Status", + "Fuel": "Fueltype", "Latitude": "lat", "Longitude": "lon", - "Unit start year": "DateIn", - "Retired year": "DateOut", + "Start Year": "DateIn", + "Retired Year": "DateOut", + "Country/Area": "Country", "GEM phase ID": "projectID", } + fueltype_dict = { - "bioenergy - agricultural waste (solids)": "Solid Biomass", - "bioenergy - refuse (municipal and industrial wastes)": "Solid Biomass", - "bioenergy - refuse (syngas)": "Solid Biomass", - "bioenergy - agricultural waste (biogas)": "Biogas", - "bioenergy - wood & other biomass (solids)": "Solid Biomass", - "bioenergy - ethanol": "Solid Biomass", - "bioenergy - paper mill wastes": "Solid Biomass", - "bioenergy - biodiesel": "Solid Biomass", - "bioenergy - unknown": "Solid Biomass", - "bioenergy - wastewater and sewage sludge (solids or biogas)": "Solid Biomass", - "bioenergy - refuse (landfill gas)": "Biogas", - "bioenergy - agricultural waste (unknown)": "Solid Biomass", - "bioenergy - agricultural waste (syngas)": "Solid Biomass", - "bioenergy - wood & other biomass (biocoal)": "Solid Biomass", + # solid biomass + "bioenergy: agricultural waste (solids)": "Solid Biomass", + "bioenergy: agricultural waste (unknown)": "Solid Biomass", + "bioenergy: paper mill wastes": "Solid Biomass", + "bioenergy: unknown": "Solid Biomass", + "bioenergy: wood & other biomass (biocoal)": "Solid Biomass", + "bioenergy: wood & other biomass (solids)": "Solid 
Biomass", + "bioenergy: agricultural waste (syngas)": "Solid Biomass", + # biogas + "bioenergy: agricultural waste (biogas)": "Biogas", + "bioenergy: refuse (landfill gas)": "Biogas", + "bioenergy: wastewater and sewage sludge (solids or biogas)": "Biogas", + # oil + "bioenergy: ethanol": "Oil", + "bioenergy: biodiesel": "Oil", + # waste + "bioenergy: refuse (municipal and industrial wastes)": "Waste", + "bioenergy: refuse (syngas)": "Solid Biomass", } status_list = config["GBPT"].get("status", ["operating"]) # noqa: F841 df = df.rename(columns=RENAME_COLUMNS) df_final = ( - df.pipe(clean_name) - .pipe(set_column_name, "GBPT") + df.pipe(set_column_name, "GBPT") .pipe(convert_to_short_name) .dropna(subset="Capacity") .assign( @@ -1642,12 +1694,15 @@ def GBPT(raw=False, update=False, config=None): DateOut=df["DateOut"].apply(pd.to_numeric, errors="coerce"), lat=df["lat"].apply(pd.to_numeric, errors="coerce"), lon=df["lon"].apply(pd.to_numeric, errors="coerce"), + Fueltype=df["Fueltype"].apply( + lambda v: fueltype_dict[v.split(",")[0].strip()] + ), ) .query("Status in @status_list") .pipe(lambda x: x[df.columns.intersection(config.get("target_columns"))]) - .pipe(lambda x: x.replace({"Fueltype": fueltype_dict})) - .assign(Technology="Steam Turbine") - .assign(Set="PP") + .assign(Technology=np.nan) + .assign(Set=np.nan) + .pipe(clean_name) .pipe(config_filter, config) ) return df_final @@ -1670,7 +1725,7 @@ def GNPT(raw=False, update=False, config=None): """ config = get_config() if config is None else config fn = get_raw_file("GNPT", update=update, config=config) - df = pd.read_excel(fn, sheet_name="Data") + df = pd.read_excel(fn, sheet_name="Data", na_values=["--"]) if raw: return df @@ -1690,11 +1745,12 @@ def GNPT(raw=False, update=False, config=None): df = df.rename(columns=RENAME_COLUMNS) df_final = ( - df.pipe(clean_name) - .pipe(set_column_name, "GNPT") + df.pipe(set_column_name, "GNPT") .pipe(convert_to_short_name) .dropna(subset="Capacity") .assign( + Name=lambda df: df["Name"] + + df["Unit Name"].fillna("").apply(lambda x: f" {x}" if x else ""), DateIn=df["DateIn"].apply(pd.to_numeric, errors="coerce"), DateOut=df["DateOut"].apply(pd.to_numeric, errors="coerce"), lat=df["lat"].apply(pd.to_numeric, errors="coerce"), @@ -1705,6 +1761,7 @@ def GNPT(raw=False, update=False, config=None): .assign(Fueltype="Nuclear") .assign(Technology="Steam Turbine") .assign(Set="PP") + .pipe(clean_name) .pipe(config_filter, config) ) return df_final @@ -1728,7 +1785,7 @@ def GCPT(raw=False, update=False, config=None): config = get_config() if config is None else config fn = get_raw_file("GCPT", update=update, config=config) - df = pd.read_excel(fn, sheet_name="Units") + df = pd.read_excel(fn, sheet_name="Units", na_values=["not found", "-"]) if raw: return df @@ -1746,40 +1803,63 @@ def GCPT(raw=False, update=False, config=None): "GEM unit/phase ID": "projectID", } fueltype_dict = { + "anthracite": "Hard Coal", "bituminous": "Hard Coal", + "bituminous with CCS": "Hard Coal", "lignite": "Lignite", - "unknown": "Hard Coal", - "subbituminous": "Hard Coal", - "waste coal": "Hard Coal", - "anthracite": "Hard Coal", "lignite with CCS": "Lignite", - "bituminous with CCS": "Hard Coal", + "subbituminous": "Hard Coal", "subbituminous with CCS": "Hard Coal", + "unknown": "Hard Coal", "unknown with CCS": "Hard Coal", + "waste coal": "Hard Coal", + } + technology_dict = { + "IGCC": "CCGT", + "subcritical": "Steam Turbine", + "unknown": np.nan, + "supercritical": "Steam Turbine", + "ultra-supercritical": "Steam 
Turbine", } planned_retirement = df["Planned retirement"].apply(pd.to_numeric, errors="coerce") + # conservative assumption that mothballed plants (without fixed retirement + # date) went out of operation in 2024 + mothballed_retirement = df["Status"].apply( + lambda x: 2024 if x == "mothballed" else np.nan + ) + status_list = config["GCPT"].get("status", ["operating"]) # noqa: F841 + BTU_PER_KWH = 3412.14 + df = df.rename(columns=RENAME_COLUMNS) df_final = ( - df.pipe(clean_name) - .pipe(set_column_name, "GCPT") + df.pipe(set_column_name, "GCPT") .pipe(convert_to_short_name) .dropna(subset="Capacity") .assign( + Name=lambda df: df["Name"] + + df["Unit name"].fillna("").apply(lambda x: f" {x}" if x else ""), DateIn=df["DateIn"].apply(pd.to_numeric, errors="coerce"), - DateOut=df["DateOut"].apply(pd.to_numeric, errors="coerce"), + DateOut=df["DateOut"] + .apply(pd.to_numeric, errors="coerce") + .combine_first(planned_retirement) + .combine_first(mothballed_retirement), lat=df["lat"].apply(pd.to_numeric, errors="coerce"), lon=df["lon"].apply(pd.to_numeric, errors="coerce"), + Set=df["CHP"].replace({"yes": "CHP", "no": "PP"}), + Efficiency=BTU_PER_KWH / df["Heat rate (Btu per kWh)"], ) - .assign(DateOut=lambda x: x["DateOut"].combine_first(planned_retirement)) .query("Status in @status_list") .pipe(lambda x: x[df.columns.intersection(config.get("target_columns"))]) - .pipe(lambda x: x.replace({"Fueltype": fueltype_dict})) - .pipe(lambda x: x.assign(Technology="Steam Turbine")) - .pipe(lambda x: x.assign(Set="PP")) + .pipe( + lambda x: x.replace( + {"Fueltype": fueltype_dict, "Technology": technology_dict} + ) + ) + .pipe(clean_name) .pipe(config_filter, config) ) @@ -1810,11 +1890,11 @@ def GGTPT(raw=False, update=False, config=None): RENAME_COLUMNS = { "Project Name": "Name", - "Capacity (MW)": "Capacity", + "Unit Capacity (MW)": "Capacity", "Latitude": "lat", "Longitude": "lon", - "Start year": "DateIn", - "Retired year": "DateOut", + "Start Year": "DateIn", + "Retired Year": "DateOut", "Country/Area": "Country", "GEM unit ID": "projectID", } @@ -1823,8 +1903,7 @@ def GGTPT(raw=False, update=False, config=None): df = df.rename(columns=RENAME_COLUMNS) df_final = ( - df.pipe(clean_name) - .pipe(set_column_name, "GGTPT") + df.pipe(set_column_name, "GGTPT") .pipe(convert_to_short_name) .dropna(subset="Capacity") .assign( @@ -1838,6 +1917,7 @@ def GGTPT(raw=False, update=False, config=None): .assign(Fueltype="Geothermal") .assign(Technology="Steam Turbine") .assign(Set="PP") + .pipe(clean_name) .pipe(config_filter, config) ) return df_final @@ -1861,6 +1941,11 @@ def GWPT(raw=False, update=False, config=None): config = get_config() if config is None else config fn = get_raw_file("GWPT", update=update, config=config) df = pd.read_excel(fn, sheet_name="Data") + df_small = pd.read_excel(fn, sheet_name="Below Threshold") + df = pd.concat([df, df_small], ignore_index=True) + + if raw: + return df RENAME_COLUMNS = { "Project Name": "Name", @@ -1885,8 +1970,7 @@ def GWPT(raw=False, update=False, config=None): df = df.rename(columns=RENAME_COLUMNS) df_final = ( - df.pipe(clean_name) - .pipe(set_column_name, "GWPT") + df.pipe(set_column_name, "GWPT") .pipe(convert_to_short_name) .dropna(subset="Capacity") .assign( @@ -1900,6 +1984,7 @@ def GWPT(raw=False, update=False, config=None): .pipe(lambda x: x.replace({"Technology": technology_dict})) .assign(Fueltype="Wind") .assign(Set="PP") + .pipe(clean_name) .pipe(config_filter, config) ) return df_final @@ -1952,8 +2037,7 @@ def GSPT(raw=False, update=False, 
config=None): df = df.rename(columns=RENAME_COLUMNS) df_final = ( - df.pipe(clean_name) - .pipe(set_column_name, "GSPT") + df.pipe(set_column_name, "GSPT") .pipe(convert_to_short_name) .dropna(subset="Capacity") .assign( @@ -1967,6 +2051,7 @@ def GSPT(raw=False, update=False, config=None): .pipe(lambda x: x.replace({"Technology": technology_dict})) .assign(Fueltype="Solar") .assign(Set="PP") + .pipe(clean_name) .pipe(config_filter, config) ) return df_final @@ -1989,69 +2074,85 @@ def GGPT(raw=False, update=False, config=None): """ config = get_config() if config is None else config fn = get_raw_file("GGPT", update=update, config=config) - df = pd.read_excel(fn, sheet_name="Gas & Oil Units") + df = pd.read_excel(fn, sheet_name="Gas & Oil Units", na_values=["not found"]) + df_small = pd.read_excel( + fn, sheet_name="sub-threshold units", na_values=["not found"] + ) + df = pd.concat([df, df_small], ignore_index=True) if raw: return df RENAME_COLUMNS = { "Plant name": "Name", - "Fuel": "Fueltype", "Capacity (MW)": "Capacity", "Latitude": "lat", "Longitude": "lon", "Start year": "DateIn", "Retired year": "DateOut", "CHP": "Set", - "GEM location ID": "projectID", + "Fuel": "Fueltype", + "GEM unit ID": "projectID", + "Country/Area": "Country", + "Turbine/Engine Technology": "Technology", } + def classify_fuel(s): + if s["Fuel classification?"] in ["Gas only", "LNG only"]: + return "Natural Gas" + elif s["Fuel classification?"] == "Oil only": + return "Oil" + elif s["Fueltype"].startswith("fossil liquids"): + return "Oil" + else: + return "Natural Gas" + technology_dict = { - "GT": "Steam Turbine", - "IC": "Steam Turbine", - "CC": "CCGT", - "GT/IC": "Steam Turbine", + "gas turbine": "Steam Turbine", + "internal combustion": "Steam Turbine", + "combined cycle": "CCGT", "ICCC": "CCGT", "ISCC": "CCGT", - "ST": "Steam Turbine", + "steam turbine": "Steam Turbine", "AFC": "CCGT", + "unknown": np.nan, } set_dict = { - "Y": "CHP", - "N": "PP", - "not found": "PP", + "yes": "CHP", + "no": "PP", } status_list = config["GGPT"].get("status", ["operating"]) # noqa: F841 - gas_fuels = ["NG", "LNG", "BU", "LFG", "BG", "BFG", "COG", "CM", "H", "OG"] df = df.rename(columns=RENAME_COLUMNS) + + # conservative assumption that mothballed plants (without fixed retirement + # date) went out of operation in 2024 + mothballed_retirement = df["Status"].apply( + lambda x: 2024 if x == "mothballed" else np.nan + ) + df_final = ( - df.pipe(clean_name) - .pipe(set_column_name, "GGPT") + df.pipe(set_column_name, "GGPT") .pipe(convert_to_short_name) .dropna(subset="Capacity") - .pipe(lambda x: x.query("Capacity != 'not found'")) .assign( DateIn=df["DateIn"].apply(pd.to_numeric, errors="coerce"), - DateOut=df["DateOut"].apply(pd.to_numeric, errors="coerce"), + DateOut=df["DateOut"] + .apply(pd.to_numeric, errors="coerce") + .combine_first(df["Planned retire"]) + .combine_first(mothballed_retirement), lat=df["lat"].apply(pd.to_numeric, errors="coerce"), lon=df["lon"].apply(pd.to_numeric, errors="coerce"), - Capacity=lambda df: pd.to_numeric(df.Capacity, "coerce"), - Fueltype=df["Fueltype"].apply( - lambda s: ( - "Natural Gas" - if any(sub in gas_fuels for sub in s.split("/")) - else "Oil" - ) - ), + Capacity=df["Capacity"].apply(pd.to_numeric, errors="coerce"), + Fueltype=df.apply(classify_fuel, axis=1), ) .query("Status in @status_list") .pipe(lambda x: x[df.columns.intersection(config.get("target_columns"))]) .pipe(lambda x: x.replace({"Technology": technology_dict})) - .pipe(lambda x: x.replace({"Set": 
set_dict}).fillna({"Set": "PP"})) - .assign(Fueltype="Natural Gas") + .pipe(lambda x: x.replace({"Set": set_dict})) + .pipe(clean_name) .pipe(config_filter, config) ) return df_final @@ -2074,7 +2175,9 @@ def GHPT(raw=False, update=False, config=None): """ config = get_config() if config is None else config fn = get_raw_file("GHPT", update=update, config=config) - df = pd.read_excel(fn, sheet_name="Data") + large = pd.read_excel(fn, sheet_name="Data") + small = pd.read_excel(fn, sheet_name="Below Threshold") + df = pd.concat([large, small], ignore_index=True) if raw: return df @@ -2087,7 +2190,7 @@ def GHPT(raw=False, update=False, config=None): "Start Year": "DateIn", "Retired Year": "DateOut", "GEM unit ID": "projectID", - "Country 1": "Country", + "Country/Area 1": "Country", "Technology Type": "Technology", } technology_dict = { @@ -2095,13 +2198,13 @@ def GHPT(raw=False, update=False, config=None): "pumped storage": "Pumped Storage", "run-of-river": "Run-Of-River", "conventional and pumped storage": "Pumped Storage", - "conventional and run-of-river": "Run-Of-River", + "conventional and run-of-river": "Reservoir", + "unknown": "Run-Of-River", } status_list = config["GHPT"].get("status", ["operating"]) # noqa: F841 df = df.rename(columns=RENAME_COLUMNS) df_final = ( - df.pipe(clean_name) - .pipe(set_column_name, "GHPT") + df.pipe(set_column_name, "GHPT") .pipe(convert_to_short_name) .dropna(subset="Capacity") .assign( @@ -2115,6 +2218,7 @@ def GHPT(raw=False, update=False, config=None): .pipe(lambda x: x.replace({"Technology": technology_dict})) .assign(Fueltype="Hydro") .assign(Set="PP") + .pipe(clean_name) .pipe(config_filter, config) ) return df_final @@ -2162,8 +2266,11 @@ def MASTR( defaults to powerplantmatching.config.get_config() """ + config = get_config() if config is None else config + THRESHOLD_KW = 100 # noqa: F841 + RENAME_COLUMNS = { "EinheitMastrNummer": "projectID", "NameKraftwerk": "Name", @@ -2174,6 +2281,7 @@ def MASTR( "EinheitBetriebsstatus": "Status", "Laengengrad": "lon", "Breitengrad": "lat", + "WEIC": "EIC", } COUNTRY_MAP = { "Deutschland": "Germany", @@ -2187,6 +2295,9 @@ def MASTR( "Energietraeger", "Hauptbrennstoff", "NameStromerzeugungseinheit", + "NameKraftwerksblock", + "NameWindpark", + "Technologie", ] fn = get_raw_file("MASTR", update=update, config=config) @@ -2197,6 +2308,7 @@ def MASTR( "Hydro": "hydro_raw.csv", "Wind": "wind_raw.csv", "Solar": "solar_raw.csv", + "Storage": "bnetza_mastr_storage_raw.csv", } data_frames = [] with ZipFile(fn, "r") as file: @@ -2208,39 +2320,88 @@ def MASTR( "GeplantesInbetriebnahmedatum", "ThermischeNutzleistung", "KwkMastrNummer", + "Batterietechnologie", + "DatumBeginnVoruebergehendeStilllegung", + "DatumWiederaufnahmeBetrieb", + "Postleitzahl", + "Ort", + "Gemeinde", + "Landkreis", + "Lage", ] target_columns = ( target_columns + PARSE_COLUMNS + list(RENAME_COLUMNS.keys()) ) usecols = available_columns.intersection(target_columns) - df = pd.read_csv(file.open(name), usecols=usecols).assign( - Filesuffix=fueltype + df = ( + pd.read_csv(file.open(name), usecols=usecols) + .assign(Filesuffix=fueltype) + .query("Nettonennleistung >= @THRESHOLD_KW") ) data_frames.append(df) break df = pd.concat(data_frames).reset_index(drop=True) + cols = ["NutzbareSpeicherkapazitaet", "VerknuepfteEinheit"] + with ZipFile(fn, "r") as file: + fn_storage_units = ( + "bnetza_open_mastr_2025-02-09/bnetza_mastr_storage_units_raw.csv" + ) + storage_units = pd.read_csv(file.open(fn_storage_units), usecols=cols) + + storage_mwh = ( + 
storage_units.assign( + VerknuepfteEinheit=lambda x: x.VerknuepfteEinheit.str.split(", ") + ) + .assign(n=lambda x: x.VerknuepfteEinheit.str.len()) + .explode("VerknuepfteEinheit") + .assign(NutzbareSpeicherkapazitaet=lambda x: x.NutzbareSpeicherkapazitaet / x.n) + .set_index("VerknuepfteEinheit")["NutzbareSpeicherkapazitaet"] + ) + + df["StorageCapacity_MWh"] = ( + df["EinheitMastrNummer"].map(storage_mwh) / 1000 + ) # kWh to MWh + if raw: return df status_list = config["MASTR"].get("status", ["In Betrieb"]) # noqa: F841 - capacity_threshold_kw = 1000 - df = ( + PLZ_map = PLZ_to_LatLon_map() + df.Postleitzahl = ( + df.Postleitzahl.astype(str).str.replace(r"[^0-9]", "0", regex=True).astype(int) + ) + df["PLZ_lat"] = df.Postleitzahl.map(PLZ_map.lat) + df["PLZ_lon"] = df.Postleitzahl.map(PLZ_map.lon) + + df_processed = ( df.rename(columns=RENAME_COLUMNS) .query("Status in @status_list") - .loc[lambda df: df.Capacity > capacity_threshold_kw] .assign( projectID=lambda df: "MASTR-" + df.projectID, + Name=lambda df: df.Name.combine_first(df.NameWindpark).combine_first( + df.NameStromerzeugungseinheit + ), Country=lambda df: df.Country.map(COUNTRY_MAP), Capacity=lambda df: df.Capacity / 1e3, # kW to MW - DateIn=lambda df: pd.to_datetime(df.DateIn).dt.year, - DateOut=lambda df: pd.to_datetime(df.DateOut).dt.year, - ) - .assign( - DateIn=lambda df: df["DateIn"].combine_first( + DateIn=lambda df: pd.to_datetime(df.DateIn).dt.year.combine_first( pd.to_datetime(df["GeplantesInbetriebnahmedatum"]).dt.year ), + DateOut=lambda df: pd.to_datetime(df.DateOut).dt.year.where( + df.Status != "Vorübergehend stillgelegt", + pd.to_datetime( + df["DatumBeginnVoruebergehendeStilllegung"] + ).dt.year.where( + df["DatumWiederaufnahmeBetrieb"].isna(), + pd.to_datetime(df.DateOut).dt.year, + ), + ), + lat=lambda df: df.lat.combine_first(df.PLZ_lat), + lon=lambda df: df.lon.combine_first(df.PLZ_lon), + Duration=lambda df: df.StorageCapacity_MWh.div( + df.Capacity, fill_value=np.nan + ), ) .pipe( gather_specifications, @@ -2252,12 +2413,96 @@ def MASTR( df["KwkMastrNummer"].isna() & df["ThermischeNutzleistung"].isna(), "CHP" ), ) - .pipe(clean_name) + ) + + psw = df_processed.query( + "Energietraeger == 'Speicher' and Technologie == 'Pumpspeicher'" + ).index + df_processed.loc[psw, ["Fueltype", "Technology"]] = ["Hydro", "Pumped Storage"] + + bat = df_processed.query( + "Energietraeger == 'Speicher' and Technologie == 'Batterie'" + ).index + df_processed.loc[bat, ["Fueltype", "Set"]] = ["Battery", "Store"] + + BATTERY_MAPPING = { + "Blei-Batterie": "Pb", + "Lithium-Batterie": "Li", + "Sonstige Batterie": np.nan, + "Hochtemperaturbatterie": "NaS", + "Nickel-Cadmium- / Nickel-Metallhydridbatterie": "NiCd", + } + df_processed.loc[bat, "Technology"] = df_processed.loc[ + bat, "Batterietechnologie" + ].map(BATTERY_MAPPING) + + WIND_MAPPING = { + "Windkraft auf See": "Offshore", + "Windkraft an Land": "Onshore", + } + wind = df_processed.query("Energietraeger == 'Wind'").index + df_processed.loc[wind, "Technology"] = df_processed.loc[wind, "Lage"].map( + WIND_MAPPING + ) + + sel = df_processed.query( + "Fueltype == 'Natural Gas' and Filesuffix == 'Bioenergy'" + ).index + df_processed.loc[sel, "Fueltype"] = "Biogas" + + # one biogas unit has 'Wind' in name + sel = df_processed.query("Fueltype == 'Wind' and Filesuffix == 'Biomass'").index + df_processed.loc[sel, "Fueltype"] = "Biogas" + + # some combi-units are named wind-solar + sel = df_processed.query( + "Fueltype in ['Wind', 'Waste'] and Filesuffix == 'Solar'" + ).index + 
df_processed.loc[sel, ["Fueltype", "Technology"]] = ["Solar", "PV"] + + # some technologies are wrongly allocated + sel = df_processed.query("Fueltype == 'Biogas' and Technology == 'PV'").index + df_processed.loc[sel, "Technology"] = "Combustion Engine" + sel = df_processed.query( + "Fueltype == 'Hydro' and Technology == 'Steam Turbine'" + ).index + df_processed.loc[sel, "Technology"] = "Run-Of-River" + sel = df_processed.query("Fueltype == 'Solar' and Technology == 'CCGT'").index + df_processed.loc[sel, "Technology"] = "PV" + sel = df_processed.query( + "Fueltype == 'Solar' and Technology == 'OCGT' and Filesuffix == 'Combustion'" + ).index + df_processed.loc[sel, "Fueltype"] = "Natural Gas" + sel = df_processed.query( + "Fueltype == 'Wind' and Technology == 'PV' and Filesuffix == 'Solar'" + ).index + df_processed.loc[sel, "Fueltype"] = "Solar" + sel = df_processed.query( + "Fueltype == 'Wind' and Technology == 'Combustion Engine' and Filesuffix == 'Bioenergy'" + ).index + df_processed.loc[sel, "Fueltype"] = "Biogas" + + mask = df_processed.query( + "Energietraeger in ['Hydro', 'Wind', 'Solar', 'Battery'] and Set in ['Store', 'CHP']" + ).index + df_processed.loc[mask, "Set"] = "PP" + + df_processed["Name"] = df_processed.apply( + lambda x: f"{x.Name} {x.NameKraftwerksblock.replace(x.Name, '').strip()}" + if x.NameKraftwerksblock + and x.NameKraftwerksblock != x.Name + and x.Fueltype in config["clean_name"]["fueltypes_with_blocks"] + else x.Name, + axis=1, + ) + + df_final = ( + df_processed.pipe(clean_name) .pipe(set_column_name, "MASTR") .pipe(config_filter, config) ) - return df + return df_final # deprecated alias for GGPT @@ -2269,6 +2514,247 @@ def GEM_GGPT(*args, **kwargs): return GGPT(*args, **kwargs) +def EESI( + raw=False, + update=False, + config=None, +): + """ + Get the European Energy Storage Inventory (EESI) dataset. + + Provided by the European Commission's Joint Research Centre. Contains + chemical, electrochemical, thermal and mechanical energy storage + technologies in Europe. + + https://ses.jrc.ec.europa.eu/storage-inventory-maps + + https://ses.jrc.ec.europa.eu/storage-inventory-tool/api/projects + + Parameters + ---------- + raw : Boolean, default False + Whether to return the original dataset + update: bool, default False + Whether to update the data from the url. + config : dict, default None + Add custom specific configuration, e.g. 
+ powerplantmatching.config.get_config(target_countries='Italy'), defaults + to powerplantmatching.config.get_config() + """ + + config = get_config() if config is None else config + + fn = get_raw_file("EESI", update=update, config=config) + + with open(fn) as f: + data = json.load(f) + + df = pd.json_normalize(data["projects"], sep="_") + float_cols = ["power", "capacity", "facility_latitude", "facility_longitude"] + df[float_cols] = df[float_cols].astype(float) + + if raw: + return df + + status_list = config["EESI"].get("status", ["Operational"]) # noqa: F841 + + RENAME_COLUMNS = { + "title": "Name", + "power": "Capacity", + "capacity": "StorageCapacity_MWh", + "facility_latitude": "lat", + "facility_longitude": "lon", + "facility_country": "Country", + "id": "projectID", + "technology_name": "Technology", + "status": "Status", + } + + df_processed = ( + df.rename(columns=RENAME_COLUMNS) + .query("Status in @status_list") + .assign( + projectID=lambda df: "EESI-" + df.projectID.astype(str), + StorageCapacity_MWh=lambda df: df.StorageCapacity_MWh.where( + df.StorageCapacity_MWh > 0 + ), + Capacity=lambda df: df.Capacity.where(df.Capacity > 0), + Set="Store", + ) + ) + + sel = df_processed.query("technology_parentName == 'ElectroChemical'").index + df_processed.loc[sel, "Fueltype"] = "Battery" + + sel = df_processed.query("technology_parentName == 'Thermal'").index + df_processed.loc[sel, "Fueltype"] = "Heat Storage" + + sel = df_processed.query("technology_parentName == 'Mechanical'").index + df_processed.loc[sel, "Fueltype"] = "Mechanical Storage" + + sel = df_processed.query("Technology == 'Power to Gas (H2)'").index + df_processed.loc[sel, "Fueltype"] = "Hydrogen Storage" + + sel = df_processed.query("Technology == 'Pumped Hydro Storage (PHS)'").index + df_processed.loc[sel, "Fueltype"] = "Hydro" + + TECHNOLOGY_MAPPING = { + "Power to Gas (H2)": np.nan, + "Lithium-ion batteries": "Li", + "Lead Acid batteries": "Pb", + "Sodium Sulphur batteries": "NaS", + "Redox flow batteries Vanadium": "V", + "Sodium Nickel Chloride batteries": "NaNiCl", + "Lithium-titanate battery (LTO)": "Li", + "Pumped Hydro Storage (PHS)": "Pumped Storage", + "Unespecified Storage - mechanical": np.nan, + "Compressed Air Energy Storage (CAES)": "CAES", + "Flywheel Energy Storage": "Flywheel", + "Unspecific Thermal Storage": np.nan, + "Molten salts (Sensible Thermal Energy Storage (STES))": "Molten Salt", + } + df_processed.Technology = df_processed.Technology.map(TECHNOLOGY_MAPPING) + + df_final = ( + df_processed.pipe(clean_name) + .pipe(set_column_name, "EESI") + .pipe(config_filter, config) + ) + + return df_final + + +def GND( + raw=False, + update=False, + config=None, +): + """ + Get the GeoNuclearData (GND) dataset. + + https://github.com/cristianst85/GeoNuclearData + + Parameters + ---------- + raw : Boolean, default False + Whether to return the original dataset + update: bool, default False + Whether to update the data from the url. + config : dict, default None + Add custom specific configuration, e.g. 
+ powerplantmatching.config.get_config(target_countries='Italy'), defaults + to powerplantmatching.config.get_config() + """ + + config = get_config() if config is None else config + + fn = get_raw_file("GND", update=update, config=config) + + df = pd.read_csv(fn) + + if raw: + return df + + status_list = config["GND"].get("status", ["Operational"]) # noqa: F841 + + RENAME_COLUMNS = { + "Id": "projectID", + "Latitude": "lat", + "Longitude": "lon", + "OperationalFrom": "DateIn", + "OperationalTo": "DateOut", + } + + df_final = ( + df.rename(columns=RENAME_COLUMNS) + .query("Status in @status_list") + .assign( + projectID=lambda df: "GND-" + df.projectID.astype(str), + Capacity=lambda df: df.Capacity.where(df.Capacity > 0), + DateIn=lambda df: pd.to_datetime(df.DateIn).dt.year, + DateOut=lambda df: pd.to_datetime(df.DateOut).dt.year, + Set="PP", + Fueltype="Nuclear", + ) + .pipe(clean_name) + .pipe(set_column_name, "GND") + .pipe(config_filter, config) + ) + + return df_final + + +def GHR( + raw=False, + update=False, + config=None, +): + """ + Get the GloHydroRes (GHR) dataset. + + https://www.nature.com/articles/s41597-025-04975-0 + + https://zenodo.org/records/14526360 + + Parameters + ---------- + raw : Boolean, default False + Whether to return the original dataset + update: bool, default False + Whether to update the data from the url. + config : dict, default None + Add custom specific configuration, e.g. + powerplantmatching.config.get_config(target_countries='Italy'), defaults + to powerplantmatching.config.get_config() + """ + + config = get_config() if config is None else config + + fn = get_raw_file("GHR", update=update, config=config) + + df = pd.read_csv(fn) + + if raw: + return df + + RENAME_COLUMNS = { + "ID": "projectID", + "name": "Name", + "country": "Country", + "Latitude": "plant_lat", + "Longitude": "plant_lon", + "plant_type": "Technology", + "dam_height_m": "DamHeight_m", + "year": "DateIn", + } + TECHNOLOGY_MAP = { + "STO": "Reservoir", + "RTO": "Run-Of-River", + "PHS": "Pumped Hydro", + "canal": np.nan, + } + + df_final = ( + df.rename(columns=RENAME_COLUMNS) + .assign( + projectID=lambda df: "GHR-" + df.projectID.astype(str), + Name=lambda df: df.Name.str.split(" - ").str[0].combine_first(df.dam_name), + DateIn=lambda df: pd.to_datetime(df.DateIn).dt.year, + Technology=lambda df: df.Technology.map(TECHNOLOGY_MAP), + Volume_Mm3=lambda df: df.res_vol_km3 * 1e3, + # StorageCapacity_MWh=lambda df: 9.81 * df.dam_height_m * df.Volume_Mm3 * 0.9 / 3.6, + # Duration=lambda df: df.StorageCapacity_MWh / df.Capacity, + Set="PP", + Fueltype="Hydro", + ) + .pipe(clean_name) + .pipe(set_column_name, "GHR") + .pipe(config_filter, config) + ) + + return df_final + + def EXTERNAL_DATABASE(raw=False, update=True, config=None): """ Importer for external custom databases. 
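# --- Worked example, not part of the patch ---------------------------------------------------
# The efficiency estimate added to GCPT above divides the energy content of one kWh
# (3412.14 Btu) by the reported heat rate in Btu per kWh. The heat-rate value below is
# a hypothetical figure used only for illustration.
BTU_PER_KWH = 3412.14

heat_rate_btu_per_kwh = 9500.0  # hypothetical coal unit
efficiency = BTU_PER_KWH / heat_rate_btu_per_kwh

print(round(efficiency, 3))  # 0.359 -> roughly 36 % net efficiency
# ---------------------------------------------------------------------------------------------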
diff --git a/powerplantmatching/duke.py b/powerplantmatching/duke.py index 3f093690..4c6fff7a 100644 --- a/powerplantmatching/duke.py +++ b/powerplantmatching/duke.py @@ -40,6 +40,7 @@ def duke( showmatches=False, keepfiles=False, showoutput=False, + threads=1, ): """ Run duke in different modes (Deduplication or Record Linkage Mode) to @@ -107,6 +108,7 @@ def duke( "-Dfile.encoding=UTF-8", "no.priv.garshol.duke.Duke", "--linkfile=linkfile.txt", + f"--threads={threads}", ] if singlematch: args.append("--singlematch") diff --git a/powerplantmatching/heuristics.py b/powerplantmatching/heuristics.py index 7a4eca05..08bf2b1f 100644 --- a/powerplantmatching/heuristics.py +++ b/powerplantmatching/heuristics.py @@ -55,6 +55,8 @@ def extend_by_non_matched( if config is None: config = get_config() + threads = config.get("threads_extend_by_non_matched", 1) + if isinstance(extend_by, str): label = extend_by extend_by = getattr(data, extend_by)(config=config) @@ -71,7 +73,7 @@ def extend_by_non_matched( if aggregate_added_data and not extend_by.empty: extend_by = aggregate_units( - extend_by, dataset_name=label, config=config, **aggkwargs + extend_by, dataset_name=label, config=config, threads=threads, **aggkwargs ) extend_by["projectID"] = extend_by.projectID.map(lambda x: {label: x}) else: diff --git a/powerplantmatching/package_data/PLZ_Coords_map.csv b/powerplantmatching/package_data/PLZ_Coords_map.csv index 90203fec..f6fc38cb 100644 --- a/powerplantmatching/package_data/PLZ_Coords_map.csv +++ b/powerplantmatching/package_data/PLZ_Coords_map.csv @@ -8197,3 +8197,11 @@ PLZ,lon,lat 65527,8.29686030496,50.1698531547 32760,8.89250849998,51.9103401848 65529,8.34783843133,50.256587295 +39628,11.6901777,52.6269331 +23769,11.1340848,54.4687375 +64760,8.9928567,49.540722 +78089,8.3637278,48.0748482 +99331,10.8270088,50.7108384 +98694,10.9888104,50.6365371 +19055,11.4375455,53.655925 +81248,11.4023582,48.1497765 diff --git a/powerplantmatching/package_data/config.yaml b/powerplantmatching/package_data/config.yaml index f9b88ebe..6b4f3bf7 100644 --- a/powerplantmatching/package_data/config.yaml +++ b/powerplantmatching/package_data/config.yaml @@ -1,7 +1,7 @@ # ---------------------------------------------------------------------------- # # IO Config # # ---------------------------------------------------------------------------- # -entsoe_token: +entsoe_token: "" google_api_key: # ---------------------------------------------------------------------------- # @@ -16,31 +16,45 @@ main_query: "Name != '' and (lat >= 30 or lat != lat)" matching_sources: # Make individual queries for the datasets as done in `fully_included_sources` # Queries are combined with `main_query` with an `and` operator - - ENTSOE: Fueltype != 'Solar' - - GEO: Fueltype != 'Solar' - - GPD: Fueltype != 'Solar' - - JRC: Fueltype != 'Solar' - - OPSD: Country != "Spain" and Fueltype != 'Hard Coal' and Fueltype != 'Solar' - - BEYONDCOAL: Fueltype != 'Solar' - - WIKIPEDIA: Fueltype != 'Solar' - - GEM - - MASTR + # capacity filters avoid matching of too small units (which is too time-consuming) + # wind is per turbine rather than park in MASTR and unsuitable for matching + - ENTSOE: not (Country == 'Germany' and Fueltype == 'Wind') + # wind in germany is provided by MASTR, other filters are due to large deviations to other datasets + - GEO: Capacity >= 1 and not (Country == 'Germany' and Fueltype == 'Wind') and Fueltype not in ['Oil', 'Nuclear'] and not (Country in ['Bulgaria', 'Slovakia'] and Fueltype == 'Hard Coal') + # wind in germany is 
provided by MASTR, nuclear is not block-wise, other filters are due to large deviations to other datasets + - GPD: Capacity >= 1 and not (Country == 'Germany' and Fueltype == 'Wind') and not (Country in ['Czechia', 'Bulgaria', 'Romania'] and Fueltype == 'Hard Coal') and Fueltype != 'Nuclear' + - JRC: Capacity >= 1 + # wind in germany is provided by MASTR, other filters are due to large deviations to other datasets + - OPSD: not (Country == 'Germany' and Fueltype == 'Wind') and ((Capacity >= 1 and Fueltype != 'Solar') or Capacity >= 3) and not (Country == 'Spain' and Fueltype == 'Hard Coal') and not (Country == 'Italy' and Fueltype == 'Natural Gas') + - BEYONDCOAL + # wind in germany is provided by MASTR + - GEM: Capacity >= 3 and not (Country == 'Germany' and Fueltype == 'Wind') + # do not match units below 1 MW (2 MW for biogas, 3 MW for solar), exclude wind in Germany from any matching + - MASTR: (Fueltype != 'Wind') and ((Fueltype == 'Solar' and Capacity >= 3) or (Fueltype == 'Biogas' and Capacity >= 2) or (Fueltype not in ['Solar', 'Biogas'] and Capacity >= 1)) + - EESI + - GHR -# fully_included_sources, these sources are included even without match to the final dataset +# # fully_included_sources, these sources are included even without match to the final dataset fully_included_sources: # Make individual queries for the datasets - - ENTSOE: (Country not in ['Switzerland', 'Ireland', 'Albania', 'Greece', 'Czech Republic', 'Bulgaria', 'United Kingdom', 'Italy', 'Serbia'] and not (Country == 'Spain' and Fueltype == 'Hydro')) or (Fueltype == 'Geothermal') - - GEO: (Country == 'Spain' and Fueltype == 'Natural Gas') - - GPD: Country in ['Finland', 'Spain'] - - JRC: Country not in ['Switzerland', 'Albania', 'United Kingdom', 'Norway'] - - OPSD: Country not in ['Switzerland', 'Italy', 'Spain', 'Norway', 'Austria'] + # wind and solar in Germany is covered by MASTR + - GEM: not (Country == 'Germany' and Fueltype in ['Solar', 'Wind']) + # battery in Germany is covered by MASTR + - EESI: Fueltype != 'Hydro' and not (Country == 'Germany' and Fueltype == 'Battery') + # exclude units smaller than 100 kW (low total capacity) and take nuclear from other datasets (good matching) + - MASTR: Capacity >= 0.1 and Fueltype != 'Nuclear' + # take small hydro outside Germany from OPSD (highest coverage) + - OPSD: Country != 'Germany' and Capacity < 1 and Capacity >= 0.1 and Fueltype == 'Hydro' - BEYONDCOAL - - GEM: Country != 'Germany' or Fueltype == 'Solar' - - MASTR + # include this selection of countries as they have poorer coverage in all other datasets + - JRC: Country in ['Italy', 'Croatia', 'Serbia', 'Slovakia'] +# these sources skip unit aggregation for fully_included_sources not covered in matching_sources +aggregate_only_matching_sources: + - MASTR # the matching process of very small units is not efficient -parallel_duke_processes: false -process_limit: 4 +parallel_duke_processes: 16 +threads_extend_by_non_matched: 16 matched_data_url: https://raw.githubusercontent.com/PyPSA/powerplantmatching/{tag}/powerplants.csv # ---------------------------------------------------------------------------- # @@ -50,20 +64,22 @@ matched_data_url: https://raw.githubusercontent.com/PyPSA/powerplantmatching/{ta opsd_vres_base_year: 2020 BNETZA: - reliability_score: 2 + reliability_score: 1 fn: Kraftwerksliste_2017_2.xlsx url: 
https://www.bundesnetzagentur.de/SharedDocs/Downloads/DE/Sachgebiete/Energie/Unternehmen_Institutionen/Versorgungssicherheit/Erzeugungskapazitaeten/Kraftwerksliste/Kraftwerksliste_2019_1.xlsx;jsessionid=17E419F28D025C7DD9FC6E2BEB3D088F?__blob=publicationFile&v=2 BEYONDCOAL: net_capacity: false - aggregated_units: true - reliability_score: 6 - fn: Europe_Beyond_Coal-European_Coal_Database_hc5n.xlsx - url: https://beyond-coal.eu/wp-content/uploads/2021/07/2021-04-20_Europe_Beyond_Coal-European_Coal_Database_hc5n.xlsx + aggregated_units: false + reliability_score: 4 + status: ["construction", "operational", "no longer coal", "retired", "standby", "deactivated", "retrofitting"] + fn: 2025-07-24-BeyondFossilFuels-Europe_Coal_Plants_Database.xlsx + url: https://beyondfossilfuels.org/wp-content/uploads/2025/07/2025-07-24-BeyondFossilFuels-Europe_Coal_Plants_Database.xlsx IRENA: net_capacity: true aggregated_units: true - fn: IRENASTAT_capacities_2000-2023.csv - url: https://zenodo.org/records/10952917/files/IRENASTAT_capacities_2000-2023.csv + fn: IRENASTAT_capacities_2000-2024.csv + # compiled from https://pxweb.irena.org/pxweb/en/IRENASTAT/IRENASTAT__Power%20Capacity%20and%20Generation/Country_ELECSTAT_2025_H2_PX.px/ + url: https://tubcloud.tu-berlin.de/s/p2D5E9MLWE8HPHE/download/IRENASTAT_capacities_2000-2024.csv CARMA: net_capacity: false reliability_score: 1 @@ -71,23 +87,23 @@ CARMA: fn: Full_CARMA_2009_Dataset_1.csv ENTSOE: reliability_score: 5 - url: https://raw.githubusercontent.com/pypsa-meets-earth/ppm-data-backup/main/entsoe_powerplants.csv - fn: entsoe_powerplants.csv + url: https://tubcloud.tu-berlin.de/s/QaHLH38J4A7ZF5m/download/entsoe_transparency_platform_20250820.csv + fn: entsoe_transparency_platform_20250820.csv ENTSOE-EIC: - url: https://eepublicdownloads.entsoe.eu/eic-codes-csv/W_eiccodes.csv - fn: entsoe_eic_codes.csv + url: https://eepublicdownloads.blob.core.windows.net/cio-lio/csv/W_eicCodes.csv + fn: W_eicCodes.csv JRC: - reliability_score: 4 + reliability_score: 5 fn: jrc-hydro-power-plant-database.csv - url: https://raw.githubusercontent.com/energy-modelling-toolkit/hydro-power-database/fd7535c/data/jrc-hydro-power-plant-database.csv + url: https://raw.githubusercontent.com/energy-modelling-toolkit/hydro-power-database/27e80f/data/jrc-hydro-power-plant-database.csv GEO: net_capacity: false - reliability_score: 3 + reliability_score: 2 url: https://raw.githubusercontent.com/pypsa-meets-earth/ppm-data-backup/main/global_energy_observatory_power_plants.csv fn: global_energy_observatory_power_plants.csv GEO_units: net_capacity: false - reliability_score: 3 + reliability_score: 2 url: https://raw.githubusercontent.com/pypsa-meets-earth/ppm-data-backup/main/global_energy_observatory_ppl_units.csv fn: global_energy_observatory_ppl_units.csv GPD: @@ -96,19 +112,19 @@ GPD: #if outdated, look at http://datasets.wri.org/dataset/globalpowerplantdatabase url: https://wri-dataportal-prod.s3.amazonaws.com/manual/global_power_plant_database_v_1_3.zip WIKIPEDIA: - reliability_score: 4 + reliability_score: 2 url: https://raw.githubusercontent.com/pypsa-meets-earth/ppm-data-backup/main/nuclear_plants_from_wikipedia.csv fn: nuclear_plants_from_wikipedia.csv IWPDCY: aggregated_units: true - reliability_score: 3 + reliability_score: 2 fn: IWPDCY.csv OPSD_DE: - reliability_score: 4 + reliability_score: 3 fn: conventional_power_plants_DE.csv url: https://raw.githubusercontent.com/pypsa-meets-earth/ppm-data-backup/main/conventional_power_plants_DE.csv OPSD_EU: - reliability_score: 4 + 
reliability_score: 3 fn: conventional_power_plants_EU.csv url: https://raw.githubusercontent.com/pypsa-meets-earth/ppm-data-backup/main/conventional_power_plants_EU.csv OPSD_VRE: @@ -139,78 +155,94 @@ OPSD_VRE_GB: url: https://data.open-power-system-data.org/renewable_power_plants/2020-08-25/renewable_power_plants_UK.csv fn: renewable_power_plants_UK.csv OPSD: - reliability_score: 4 + reliability_score: 3 Capacity_stats: url: https://data.open-power-system-data.org/national_generation_capacity/2020-10-01/national_generation_capacity_stacked.csv fn: national_generation_capacity_stacked.csv UBA: net_capacity: false - reliability_score: 4 + reliability_score: 1 fn: kraftwerke-de-ab-100-mw.xls url: https://www.umweltbundesamt.de/sites/default/files/medien/372/dokumente/kraftwerke_de_ab_100_mw_0.xls WEPP: net_capacity: false - reliability_score: 3 + reliability_score: 1 fn: platts_wepp.csv GGPT: net_capacity: false - reliability_score: 5 + reliability_score: 6 status: ["operating", "retired", "construction"] - fn: Global-Oil-and-Gas-Plant-Tracker-GOGPT-February-2024-v4.xlsx - url: https://tubcloud.tu-berlin.de/s/Be5arQgT9Z9g8Kp/download/Global-Oil-and-Gas-Plant-Tracker-GOGPT-February-2024-v4.xlsx + fn: Global-Oil-and-Gas-Plant-Tracker-GOGPT-August-2025.xlsx + url: https://tubcloud.tu-berlin.de/s/aKrt7dyNgazmgAm/download/Global-Oil-and-Gas-Plant-Tracker-GOGPT-August-2025.xlsx GEM: # combined data set of all GEM trackers net_capacity: true - reliability_score: 5 + reliability_score: 6 GCPT: net_capacity: false - reliability_score: 4 - status: ["operating", "retired", "construction"] - fn: Global-Coal-Plant-Tracker-July-2024.xlsx - url: https://tubcloud.tu-berlin.de/s/FdyKMZtr2ddRJEd/download/Global-Coal-Plant-Tracker-July-2024.xlsx + reliability_score: 6 + status: ["operating", "retired", "construction", "mothballed"] + fn: Global-Coal-Plant-Tracker-July-2025.xlsx + url: https://tubcloud.tu-berlin.de/s/etMB7qawKNwfgnk/download/Global-Coal-Plant-Tracker-July-2025.xlsx GGTPT: net_capacity: false - reliability_score: 4 - status: ["operating", "retired", "construction"] - fn: Geothermal-Power-Tracker-May-2024.xlsx - url: https://tubcloud.tu-berlin.de/s/Hz3ZD7YcKnZTs9t/download/Geothermal-Power-Tracker-May-2024.xlsx + reliability_score: 6 + aggregated_units: false + status: ["operating", "retired", "construction", "mothballed"] + fn: Geothermal-Power-Tracker-March-2025-Final.xlsx + url: https://tubcloud.tu-berlin.de/s/dNoEsLeGtCWDkoc/download/Geothermal-Power-Tracker-March-2025-Final.xlsx GWPT: net_capacity: false - reliability_score: 4 + reliability_score: 6 status: ["operating", "retired", "construction"] - fn: Global-Wind-Power-Tracker-June-2024.xlsx - url: https://tubcloud.tu-berlin.de/s/Z9b3WkAJmSnsrHD/download/Global-Wind-Power-Tracker-June-2024.xlsx + fn: Global-Wind-Power-Tracker-February-2025.xlsx + url: https://tubcloud.tu-berlin.de/s/8NSXSjPmJPXpg4W/download/Global-Wind-Power-Tracker-February-2025.xlsx GSPT: net_capacity: false - reliability_score: 4 - status: ["operating", "construction"] - fn: Global-Solar-Power-Tracker-June-2024.xlsx - url: https://tubcloud.tu-berlin.de/s/tJ5K5rA2e5XaNjM/download/Global-Solar-Power-Tracker-June-2024.xlsx + reliability_score: 6 + status: ["operating", "retired", "construction"] + fn: Global-Solar-Power-Tracker-February-2025.xlsx + url: https://tubcloud.tu-berlin.de/s/7eo4dZXMp6eB3mz/download/Global-Solar-Power-Tracker-February-2025.xlsx GBPT: net_capacity: false - reliability_score: 4 + reliability_score: 6 status: ["operating", "retired", "construction"] 
- fn: Global-Bioenergy-Power-Tracker-GBPT-V1.xlsx - url: https://tubcloud.tu-berlin.de/s/F34bbwcxYHL9ZR4/download/Global-Bioenergy-Power-Tracker-GBPT-V1.xlsx + fn: Global-Bioenergy-Power-Tracker-GBPT-September-2024.xlsx + url: https://tubcloud.tu-berlin.de/s/CzMBKe2rAcsoq7c/download/Global-Bioenergy-Power-Tracker-GBPT-September-2024.xlsx GNPT: net_capacity: false - reliability_score: 4 + reliability_score: 6 status: ["operating", "retired", "mothballed", "construction"] fn: Global-Nuclear-Power-Tracker-July-2024.xlsx url: https://tubcloud.tu-berlin.de/s/gXFim9EciRHrjeQ/download/Global-Nuclear-Power-Tracker-July-2024.xlsx GHPT: net_capacity: false - reliability_score: 4 + reliability_score: 6 status: ["operating", "retired", "construction"] - fn: Global-Hydropower-Tracker-April-2024.xlsx - url: https://tubcloud.tu-berlin.de/s/sEztyBLdJS5sNHY/download/Global-Hydropower-Tracker-April-2024.xlsx - + fn: Global-Hydropower-Tracker-April-2025.xlsx + url: https://tubcloud.tu-berlin.de/s/2xqxRmfP4FKTrLf/download/Global-Hydropower-Tracker-April-2025.xlsx MASTR: net_capacity: true - reliability_score: 8 - status: ["In Betrieb", "In Planung", "Endgültig stillgelegt"] + reliability_score: 7 + status: ["In Betrieb", "In Planung", "Endgültig stillgelegt", "Vorübergehend stillgelegt"] fn: bnetza_open_mastr_2025-02-09.zip url: https://zenodo.org/records/14783581/files/bnetza_open_mastr_2025-02-09.zip +EESI: + net_capacity: true + reliability_score: 5 + status: ["Operational"] # since no start years given + fn: european-energy-storage-inventory-20250817-2245.json + url: https://tubcloud.tu-berlin.de/s/RXWgYbYJpePsWAZ/download/european-energy-storage-inventory-20250817-2245.json +GND: + net_capacity: true + reliability_score: 5 + status: ["Shutdown", "Operational", "Under Construction", "Decommissioning Completed"] + url: https://raw.githubusercontent.com/cristianst85/GeoNuclearData/1bc8b4ac106af236902385b87e46c540b4864815/data/csv/denormalized/nuclear_power_plants.csv + fn: nuclear_power_plants.csv +GHR: + reliability_score: 4 + fn: GloHydroRes_vs1.csv + url: https://zenodo.org/records/14526360/files/GloHydroRes_vs1.csv # ---------------------------------------------------------------------------- # # Data Structure Config # @@ -279,42 +311,136 @@ target_fueltypes: # given by the list. An empty string results in a regex expression containing only the key. # Parsed of representatives at the top may be overwritten by representatives further below. 
@@ -279,42 +311,136 @@ target_fueltypes:
   # given by the list. An empty string results in a regex expression containing only the key.
   # Parsed of representatives at the top may be overwritten by representatives further below.
   Other: ".*"
-  Solid Biomass: [biological, bioenergy, agricultural, wood, biomass, feste biomasse]
-  Biogas: [biogas, biomethan, gasförmige biomasse]
-  Nuclear: [nuclear]
+  Solid Biomass:
+    - biological
+    - bioenergy
+    - agricultural
+    - biomass
+    - feste biomasse
+    - biomasa
+    - biomassa
+    - feste biogene stoffe
+    - pellets
+    - stroh
+    - straw
+  Biogas:
+    - biogas
+    - biogaz
+    - biomethan
+    - gasförmige biomasse
+  Nuclear:
+    - nuclear
+    - kernkraft
+    - atomkraft
+    - nucléaire
+    - atomowa
+    - jądrowa
+    - kjernekraft
+    - atoom
   Natural Gas:
-    [
-      ccgt,
-      gas,
-      natural gas,
-      ocgt,
-      lng,
-      combined cycle,
-      fossil gas,
-      mixed fossil fuels,
-      erdgas,
-      andere gase,
-    ]
+    - ccgt
+    - gas
+    - natural gas
+    - ocgt
+    - lng
+    - combined cycle
+    - fossil gas
+    - mixed fossil fuels
+    - erdgas
+    - andere gase
+    - gaz
+    - gaz naturel
+    - gas natural
+    - naturgass
+    - gaz ziemny
+    - gass
+    - aardgas
+    - flüssiggas
   Hydro:
-    [
-      run-off,
-      run off,
-      run of river,
-      run-of-river,
-      ror,
-      hydro,
-      hydroelectric,
-      wasserkraft,
-      wasser,
-    ]
-  Hard Coal: [coal, coke, steinkohle]
-  Lignite: [brown coal, lignite, peat, braunkohle]
-  Oil: [oil, diesel, mineralölprodukte]
+    - run-off
+    - run off
+    - run of river
+    - run-of-river
+    - ror
+    - hydro
+    - hidro
+    - hydraulique
+    - hydroelectric
+    - wasserkraft
+    - waterkracht
+    - wasser
+    - vannkraft
+    - vattenkraft
+    - wodna
+    - idroelettrica
+    - idraulica
+  Hard Coal:
+    - coal
+    - coke
+    - steinkohle
+    - houille
+    - charbon dur
+    - hulla
+    - carbón duro
+    - carbone duro
+    - antracite
+    - steinkul
+    - węgiel kamienny
+    - steenkool
+  Lignite:
+    - brown coal
+    - lignite
+    - peat
+    - braunkohle
+    - ligni.*
+    - brunatny
+    - brunkul
+    - bruinkool
+  Oil:
+    - oil
+    - diesel
+    - biodiesel
+    - methanol
+    - heizöl
+    - ethanol
+    - mineralölprodukte
+    - öl
+    - fioul
+    - mazout
+    - petrol
+    - olio
+    - olej
+    - carburante
+    - olie
   Geothermal: ""
   Solar: ""
-  Waste: ["abfall.*", "waste"]
+  Waste:
+    - "abfall.*"
+    - waste
+    - mva
+    - müll
+    - afval
+    - affald
+    - energy recovery
+    - incineration
+    - reststoffe
+    - refuse
+    - déchets
+    - ordures
+    - residuos
+    - basura
+    - rifiuti
+    - scarti
+    - odpady
+    - śmieci
+    - abfälle
   Wind: ""
-  Battery: [Electro-chemical, battery]
+  Battery:
+    - Electro-chemical
+    - battery
+  Mechanical Storage: ""
+  Heat Storage: ""
+  Hydrogen Storage: ""
 target_sets:
@@ -322,89 +448,207 @@ target_sets:
   # Provide a mapping of the keys to a list or a regex expression which are used for parsing.
   # A list will be converted to a regex expression matching all words (case-insensitive)
   # given by the list. An empty string results in a regex expression containing only the key.
   # Parsed of representatives at the top may be overwritten by representatives further below.
   PP: ".*"
   CHP:
-    [
-      heizkraftwerk,
-      hkw,
-      kwk,
-      fhkw,
-      gud,
-      hp,
-      bhkw,
-      cogeneration,
-      power and heat,
-      heat and power,
-      chp,
-    ]
-  Store: [battery, storage, store]
+    - heizkraftwerk
+    - hkw
+    - kwk
+    - fhkw
+    - gud
+    - hp
+    - bhkw
+    - cogeneration
+    - power and heat
+    - heat and power
+    - chp
+    - cogen
+    - heat & power
+    - power & heat
+    - cogeneración
+    - cogenerazione
+    - kogeneracja
+    - combinada calor y electricidad
+    - kraftvarmeverk
+    - kraftvarmeværk
+    - samproduktion
+    - samproduksjon
+    - kvv
+    - wkk
+    - warmtekrachtkoppeling
+    - warmte-krachtcentrale
+  Storage:
+    - battery
+    - storage
+    - store
+    - speicher
+    - pumped
 target_technologies:
   # Provide a mapping of the keys to a list or a regex expression which are used for parsing.
   # A list will be converted to a regex expression matching all words (case-insensitive)
   # given by the list. An empty string results in a regex expression containing only the key.
   # Parsed of representatives at the top may be overwritten by representatives further below.
-  CCGT: [ccgt, gas, natural gas, gasturbinen mit abhitzekessel]
-  OCGT: [ocgt, gasturbinen ohne abhitzekessel]
-  Steam Turbine: [steam, turbine, kondensationsmaschine, gegendruckmaschine, dampfmotor]
-  Combustion Engine: [combustion engine, verbrennungsmotor, stirlingmotor]
-  Run-Of-River: [run-off, run off, run of river, run-of-river, ror, laufwasseranlage]
-  Pumped Storage: [pumped hydro, pumped, speicherwasseranlage]
-  Reservoir: ""
+  CCGT:
+    - ccgt
+    - gas
+    - natural gas
+    - gasturbinen mit abhitzekessel
+  OCGT:
+    - ocgt
+    - gasturbinen ohne abhitzekessel
+  Steam Turbine:
+    - steam
+    - turbine
+    - kondensationsmaschine
+    - gegendruckmaschine
+    - dampfmotor
+  Combustion Engine:
+    - combustion engine
+    - verbrennungsmotor
+    - stirlingmotor
+  Run-Of-River:
+    - run-off
+    - run off
+    - run of river
+    - run-of-river
+    - ror
+    - laufwasseranlage
+    - laufwasser
+    - abwasserkraft
+    - trinkwassersystem
+    - brauchwassersystem
+    - pasada
+    - przepływowa
+    - fluente
+    - elvekraft
+    - doorstroom
+    - älvkraft
+  Reservoir:
+    - reservoir
+    - réservoir
+    - impoundment
+    - talsperre
+    - stausee
+    - speicherwasseranlage
+    - speicherwasser
+    - barrage
+    - embalse
+    - bacino
+    - zbiornik
+    - magasinverk
+    - damkraftverk
+    - reguleringsmagasin
+  Pumped Storage:
+    - pumped hydro
+    - pumped
+    - kavernen
+    - bombeo
+    - reversible
+    - reversibel
+    - oberbecken
+    - unterbecken
+    - pompage
+    - pompaggio
+    - pompowa
+    - pumpekraftverk
   Marine: ""
-  Onshore: ""
-  Offshore: ""
-  PV: [pv, photo-voltaic, photo voltaic]
+  PV:
+    - pv
+    - photo-voltaic
+    - photo voltaic
   CSP: ""
+  Onshore:
+    - onshore
+    - an land
+    - terrestre
+    - landvind
+    - på land
+    - op land
+    - lądowy
+    - su terra
+    - en tierra
+    - à terre
+  Offshore:
+    - offshore
+    - nearshore
+    - auf see
+    - en mer
+    - marino
+    - en mar
+    - in mare
+    - morski
+    - havvind
+    - til havs
+    - på havet
+    - op zee
+    - zeewind
 clean_name:
+  fueltypes_with_blocks:
+    - Nuclear
   remove_common_words: false # remove words which appear more that 20 times in all entries
   remove_duplicated_words: true
   replace:
     " ": "[^a-zA-Z]" # non-alphabetical symbols
     "": # This should be a list, if remove_common_words is true.
-      [
-        I,
-        II,
-        III,
-        IV,
-        V,
-        VI,
-        VII,
-        VIII,
-        IX,
-        X,
-        XI,
-        parque,
-        grupo,
-        station,
-        power,
-        plant,
-        unit,
-        kraftwerk,
-        kw,
-        hkw,
-        nuclear,
-        thermal,
-        heizkraftwerk,
-        eolico,
-        project,
-        hydroelectric,
-        pumped,
-        storage,
-        france,
-        austria,
-        sweden,
-        serbia,
-        ukraine,
-        switzerland,
-        slovakia,
-        croatia,
-        poland,
-        slovenia,
-        portugal,
-        bosnia,
-        and,
-        herzegovina,
-        \w, #remove single letters
-      ]
+      - I
+      - II
+      - III
+      - IV
+      - V
+      - VI
+      - VII
+      - VIII
+      - IX
+      - X
+      - XI
+      - parque
+      - grupo
+      - station
+      - power
+      - plant
+      - unit
+      - block
+      - kraftwerk
+      - kernkraftwerk
+      - wehrkraftwerk
+      - rheinkraftwerk
+      - gemeinschaftskernkraftwerk
+      - kernkw
+      - kw
+      - hkw
+      - nuclear
+      - hydro
+      - thermal
+      - heizkraftwerk
+      - eolico
+      - project
+      - hydroelectric
+      - hydropower
+      - hydroelectrique
+      - hydraulique
+      - embassament
+      - pumped
+      - storage
+      - france
+      - austria
+      - sweden
+      - serbia
+      - ukraine
+      - switzerland
+      - slovakia
+      - croatia
+      - poland
+      - slovenia
+      - portugal
+      - bosnia
+      - and
+      - herzegovina
+      - bulgaria
+      - generating
+      - romania
+      - macedonia
+      - latvia
+      - lithuania
+      - hungary
+      - \w #remove single letters
     "ss": "ß"

 # ---------------------------------------------------------------------------- #
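The `target_*` and `clean_name` hunks above rely on the convention stated in their comments: a list of representatives is compiled into a single case-insensitive, whole-word regex, while an empty string matches only the key itself. A hedged sketch of that conversion, using the Lignite representatives from above; the variable names are illustrative, and entries such as `ligni.*` are themselves regex fragments, so they are deliberately not escaped:

    import re

    # Illustrative only: join the representatives into one case-insensitive,
    # whole-word pattern, as described in the config comments above.
    representatives = ["brown coal", "lignite", "peat", "braunkohle", "ligni.*"]
    pattern = r"(?i)" + "|".join(rf"\b{p}\b" for p in representatives)

    assert re.search(pattern, "Braunkohle Kraftwerk Ost")
    assert re.search(pattern, "Lignitos del Norte")  # matched via "ligni.*"
    assert not re.search(pattern, "Hard coal unit 3")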
@@ -455,5 +699,6 @@ fuel_to_color:
   Geothermal: darkgoldenrod
   Battery: purple
   Hydrogen Storage: teal
-  Electro-mechanical: teal
+  Mechanical Storage: darkslategray
+  Heat Storage: darkorange
   Total: gold
diff --git a/powerplantmatching/utils.py b/powerplantmatching/utils.py
index cab6e18a..ee966212 100644
--- a/powerplantmatching/utils.py
+++ b/powerplantmatching/utils.py
@@ -113,16 +113,7 @@ def config_filter(df, config):

     main_query = config.get("main_query", "")

-    # individual filter from config.yaml
-    queries = {}
-    for source in config["matching_sources"]:
-        if isinstance(source, dict):
-            queries.update(source)
-        else:
-            queries[source] = ""
-    ds_query = queries.get(name, "")
-
-    query = " and ".join([q for q in [target_query, main_query, ds_query] if q])
+    query = " and ".join([q for q in [target_query, main_query] if q])

     df = correct_manually(df, name, config=config)

@@ -185,7 +176,6 @@ def set_uncommon_fueltypes_to_other(df, fillna_other=True, config=None, **kwargs
     default = [
         "Mixed fuel types",
         "Electro-mechanical",
-        "Hydrogen Storage",
     ]
     fueltypes = kwargs.get("fueltypes", default)
     df.loc[df.Fueltype.isin(fueltypes), "Fueltype"] = "Other"
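In the `config_filter` hunk above, the per-source queries are dropped and only the target and main queries are combined before filtering. A small sketch of that composition pattern; the column names and values are made up, only the join-and-`DataFrame.query` idea is taken from the changed line:

    import pandas as pd

    target_query = "Country in ['Germany', 'France']"  # hypothetical
    main_query = "Capacity >= 1"                       # hypothetical

    # Empty sub-queries are dropped, the rest are joined with "and".
    query = " and ".join(q for q in [target_query, main_query] if q)

    df = pd.DataFrame({"Country": ["Germany", "Spain"], "Capacity": [120.0, 50.0]})
    print(df.query(query))  # keeps only the German 120 MW row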
@@ -342,7 +332,7 @@ def fun(f, q_in, q_out):
         q_out.put((i, f(x)))


-def parmap(f, arg_list, config=None):
+def parmap(f, arg_list, config=None, threads=None):
     """
     Parallel mapping function. Use this function to parallelly map function
     f onto arguments in arg_list. The maximum number of parallel threads is
@@ -355,11 +345,21 @@
         python function with one argument
     arg_list : list
         list of arguments mapped to f
+    config : dict, default None
+        configuration dictionary
+    threads : int, default None
+        number of parallel threads
     """
     if config is None:
         config = get_config()
-    if config["parallel_duke_processes"]:
-        nprocs = min(multiprocessing.cpu_count(), config["process_limit"])
+
+    if threads is None:
+        threads = config["parallel_duke_processes"]
+        if isinstance(threads, bool):
+            threads = config.get("process_limit", 1)
+
+    if threads > 1:
+        nprocs = min(multiprocessing.cpu_count(), threads)
         logger.info(f"Run process with {nprocs} parallel threads.")
         q_in = multiprocessing.Queue(1)
         q_out = multiprocessing.Queue()
diff --git a/test/test_cleaning.py b/test/test_cleaning.py
index dab6c5c0..b4411075 100644
--- a/test/test_cleaning.py
+++ b/test/test_cleaning.py
@@ -83,7 +83,7 @@ def test_gather_specifications(data):
 def test_clean_name(data):
     res = clean_name(data)
     assert res.Name[0] == "Powerplant"
-    assert res.Name[1] == "An Hydro Powerplant"
+    assert res.Name[1] == "An Powerplant"
     assert res.Name[2] == "Another Powerplant With Whitespaces"
     assert res.Name[3] == "Coalition"
     assert res.Name[4] == "Besonders Chp"
diff --git a/test/test_data.py b/test/test_data.py
index 07d32843..933d0710 100755
--- a/test/test_data.py
+++ b/test/test_data.py
@@ -54,4 +54,7 @@ def test_url_retrieval():


 def test_reduced_retrieval():
-    pm.powerplants(reduced=False)
+    config = pm.get_config()
+    config["matching_sources"] = ["GEO", "GPD"]
+    config["fully_included_sources"] = []
+    pm.powerplants(reduced=False, config=config)
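Finally, a hedged sketch of how the extended `parmap` above might be called. The resolution order (explicit `threads`, otherwise `parallel_duke_processes`, falling back to `process_limit`) follows the added lines; the worker function and the serial fallback for `threads <= 1` are assumptions, not part of the diff:

    import powerplantmatching as pm
    from powerplantmatching.utils import parmap

    def double(x):
        # stand-in worker; any picklable single-argument function works
        return 2 * x

    if __name__ == "__main__":
        config = pm.get_config()

        # explicit thread count takes precedence over the config flags
        print(parmap(double, [1, 2, 3], config=config, threads=2))

        # with threads left at None, parallel_duke_processes / process_limit decide
        print(parmap(double, [1, 2, 3], config=config))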