Binary file added BEIS-UK/GWP/GWP_values.xlsx
Binary file not shown.
1 change: 1 addition & 0 deletions BEIS-UK/README.md
@@ -7,3 +7,4 @@ Steps needed to create the RDF in TTL:
4. install `rdflib`.
5. Morph-kgc produces its output as N-Triples. These files are lengthy, so the script nt_to_ttl transforms them into Turtle files. Usage: `python nt_to_ttl.py source_file.nt target_file.ttl`. The result is the target ttl file.

On Windows, run the `generate_ttl.bat` file to generate the TTL files.
5 changes: 0 additions & 5 deletions BEIS-UK/cf_2016_v2.ini

This file was deleted.

6 changes: 6 additions & 0 deletions BEIS-UK/cf_2016_v3.ini
@@ -0,0 +1,6 @@
[CONFIGURATION]
output_file=graphs/v3/nt/out_cf_2016_v3
number_of_processes=1

[DataSourceCSV]
mappings=mappings/yarrml/v3/yarrrml_mappings_cf_2016_v3.yaml
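Each of these `.ini` files is a standard INI configuration consumed by morph-kgc. The sketch below parses the same structure with the stdlib `configparser`; the commented `morph_kgc.materialize` call is a hedged illustration of morph-kgc's Python API, as an alternative to `python -m morph_kgc`:

```python
import configparser

# Sketch mirroring cf_2016_v3.ini: one [CONFIGURATION] section for the
# output, one [DataSourceCSV] section pointing at the YARRRML mappings.
config = configparser.ConfigParser()
config.read_string("""
[CONFIGURATION]
output_file=graphs/v3/nt/out_cf_2016_v3
number_of_processes=1

[DataSourceCSV]
mappings=mappings/yarrml/v3/yarrrml_mappings_cf_2016_v3.yaml
""")
assert config["DataSourceCSV"]["mappings"].endswith(".yaml")

# With morph-kgc installed, the same file can also be materialized from
# Python instead of via `python -m morph_kgc cf_2016_v3.ini`:
#   import morph_kgc
#   g = morph_kgc.materialize("cf_2016_v3.ini")  # returns an rdflib Graph
```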
5 changes: 0 additions & 5 deletions BEIS-UK/cf_2017_v2.ini

This file was deleted.

6 changes: 6 additions & 0 deletions BEIS-UK/cf_2017_v3.ini
@@ -0,0 +1,6 @@
[CONFIGURATION]
output_file=graphs/v3/nt/out_cf_2017_v3
number_of_processes=1

[DataSourceCSV]
mappings=mappings/yarrml/v3/yarrrml_mappings_cf_2017_v3.yaml
5 changes: 0 additions & 5 deletions BEIS-UK/cf_2018_v2.ini

This file was deleted.

6 changes: 6 additions & 0 deletions BEIS-UK/cf_2018_v3.ini
@@ -0,0 +1,6 @@
[CONFIGURATION]
output_file=graphs/v3/nt/out_cf_2018_v3
number_of_processes=1

[DataSourceCSV]
mappings=mappings/yarrml/v3/yarrrml_mappings_cf_2018_v3.yaml
5 changes: 0 additions & 5 deletions BEIS-UK/cf_2019_v2.ini

This file was deleted.

6 changes: 6 additions & 0 deletions BEIS-UK/cf_2019_v3.ini
@@ -0,0 +1,6 @@
[CONFIGURATION]
output_file=graphs/v3/nt/out_cf_2019_v3
number_of_processes=1

[DataSourceCSV]
mappings=mappings/yarrml/v3/yarrrml_mappings_cf_2019_v3.yaml
5 changes: 0 additions & 5 deletions BEIS-UK/cf_2020_v2.ini

This file was deleted.

6 changes: 6 additions & 0 deletions BEIS-UK/cf_2020_v3.ini
@@ -0,0 +1,6 @@
[CONFIGURATION]
output_file=graphs/v3/nt/out_cf_2020_v3
number_of_processes=1

[DataSourceCSV]
mappings=mappings/yarrml/v3/yarrrml_mappings_cf_2020_v3.yaml
5 changes: 0 additions & 5 deletions BEIS-UK/cf_2021_v2.ini

This file was deleted.

6 changes: 6 additions & 0 deletions BEIS-UK/cf_2021_v3.ini
@@ -0,0 +1,6 @@
[CONFIGURATION]
output_file=graphs/v3/nt/out_cf_2021_v3
number_of_processes=1

[DataSourceCSV]
mappings=mappings/yarrml/v3/yarrrml_mappings_cf_2021_v3.yaml
5 changes: 0 additions & 5 deletions BEIS-UK/cf_2022_v2.ini

This file was deleted.

6 changes: 6 additions & 0 deletions BEIS-UK/cf_2022_v3.ini
@@ -0,0 +1,6 @@
[CONFIGURATION]
output_file=graphs/v3/nt/out_cf_2022_v3
number_of_processes=1

[DataSourceCSV]
mappings=mappings/yarrml/v3/yarrrml_mappings_cf_2022_v3.yaml
4,978 changes: 4,978 additions & 0 deletions BEIS-UK/data/v3/Conversion_Factor_2016_v3.csv

Large diffs are not rendered by default.

6,179 changes: 6,179 additions & 0 deletions BEIS-UK/data/v3/Conversion_Factor_2017_v3.csv

Large diffs are not rendered by default.

6,193 changes: 6,193 additions & 0 deletions BEIS-UK/data/v3/Conversion_Factor_2018_v3.csv

Large diffs are not rendered by default.

6,164 changes: 6,164 additions & 0 deletions BEIS-UK/data/v3/Conversion_Factor_2019_v3.csv

Large diffs are not rendered by default.

6,142 changes: 6,142 additions & 0 deletions BEIS-UK/data/v3/Conversion_Factor_2020_v3.csv

Large diffs are not rendered by default.

6,286 changes: 6,286 additions & 0 deletions BEIS-UK/data/v3/Conversion_Factor_2021_v3.csv

Large diffs are not rendered by default.

6,465 changes: 6,465 additions & 0 deletions BEIS-UK/data/v3/Conversion_Factor_2022_v3.csv

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion BEIS-UK/data_preparation_script.txt
@@ -1,7 +1,7 @@
1. Remove the top row Category
2. Remove the space in column "Scope" so the values look like this: Scope 1 -> Scope1
3. Remove all rows where the GHG Conversion Factor YYYY column value is missing
4. Remove all rows where the GHG Conversion Factor YYYY column value is the string "< 1"
4. Remove all rows where the GHG Conversion Factor YYYY column value is < 1
5. Remove all rows where the GHG column value is other than kg CO2e, kg CH4, kg CO2, kg N2O
6. Add timestamp columns valid_from and valid_to
7. Add id column
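The steps above can be sketched in pandas. This is a sketch only, assuming the flat-format column names; processing.py is the authoritative implementation:

```python
import pandas as pd

def prepare(df: pd.DataFrame, year: int) -> pd.DataFrame:
    """Sketch of steps 2-7 (step 1, dropping the top Category row, is
    assumed to happen when the sheet is read)."""
    factor_col = f"GHG Conversion Factor {year}"
    df = df.copy()
    # 2. Remove the space in "Scope": "Scope 1" -> "Scope1"
    df["Scope"] = df["Scope"].astype(str).str.replace(" ", "", regex=False)
    # 3./4. Drop rows whose conversion factor is missing or the string "< 1"
    df = df[df[factor_col].notna() & (df[factor_col].astype(str).str.strip() != "< 1")]
    # 5. Keep only the listed GHG values
    df = df[df["GHG"].isin(["kg CO2e", "kg CH4", "kg CO2", "kg N2O"])]
    # 6. Add the validity timestamps
    df["valid_from"] = f"{year}-01-01T00:00:00"
    df["valid_to"] = f"{year}-12-31T23:59:59"
    # 7. Add an id column
    df.insert(0, "id", list(range(1, len(df) + 1)))
    return df
```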
203 changes: 203 additions & 0 deletions BEIS-UK/data_processing/processing.py
@@ -0,0 +1,203 @@
import pandas as pd
from datetime import datetime
import os

def get_formula(text):
formulas = text.split(' per ')[0].strip() # Takes the part before 'per'
if len(formulas) > 1:
if 'kg' in formulas:
formulas = formulas.replace('kg', '').strip() # Removes 'kg' if present
return formulas
else:
return formulas[0] # If not, returns the first

# Function to remove net/gross CV to add the Wikidata URL later
def clean_parentheses(text):
if '(' in text:
return text.split('(')[0].strip() # Takes the part before the parentheses and removes spaces
else:
return text.strip() # If there are no parentheses, returns the original text without additional spaces

# Modify the 'region' column based on 'emission_source'
def get_region(emission_source):
if emission_source.startswith('Hotel_stay'):
# Remove the 'Hotel_stay_' prefix and replace '_' with spaces
return emission_source.replace('Hotel_stay_', '').replace('_', ' ')
else:
return 'United Kingdom'

def nomenclature(formula):
if formula is None or pd.isna(formula): # Checks if the value is None or NaN
return None
if 'of' in formula: # Checks if 'of' is in the formula
formula1, formula2 = formula.split(' of ') # Splits the string into two parts
# Perform mapping using the tuplas_nomen dictionary
formula1_mapped = tuplas_nomen.get(formula1.strip(), formula1.strip())
formula2_mapped = tuplas_nomen.get(formula2.strip(), formula2.strip())
result = f"{formula1_mapped} of {formula2_mapped}"
return result
else:
# Maps directly if the formula does not contain 'of'
return tuplas_nomen.get(formula.strip(), formula.strip())

def calculate_gwp(row, year_dataset, value):
# Filter by year
relevant_gwp = df_GWP[(df_GWP['start'] <= year_dataset) & (df_GWP['end'] >= year_dataset)]
# Create a dictionary
gwp_dict = dict(zip(relevant_gwp["emission_source"], relevant_gwp["value"]))
return gwp_dict.get(row["emission_target_formula_aux"], value)

def id_gwp(row, year_dataset, value):
# Filter by year
relevant_gwp = df_GWP[(df_GWP['start'] <= year_dataset) & (df_GWP['end'] >= year_dataset)]
# Create a dictionary
gwp_dict = dict(zip(relevant_gwp["emission_source"], relevant_gwp["id"]))
return gwp_dict.get(row["emission_target_formula_aux"], value)

# Path to the Excel files directory
script_dir = os.path.dirname(os.path.abspath(__file__))
path = os.path.join(script_dir, "../data_raw/ghg-conversion-factors-2022-flat-format.xlsx")
# Read the Excel file
df_raw = pd.read_excel(path, sheet_name='Factors by Category', engine='openpyxl')

# Locate the header row (the one containing 'Scope'), then re-read with it
df_probe = pd.read_excel(path, sheet_name='Factors by Category', engine='openpyxl', header=None)
i = 0
while 'Scope' not in df_probe.iloc[i].tolist():
    i += 1
headers = df_probe.iloc[i].tolist()
df_raw = pd.read_excel(path, sheet_name='Factors by Category', engine='openpyxl', header=i)

# Check if there is no 'ID' column and add it if missing
# (guard against non-string header cells such as NaN)
if not any(isinstance(col, str) and col.lower() == 'id' for col in headers):
    headers.insert(0, 'id')
df_final = pd.DataFrame(columns=headers)
if 'GHG/Unit' in headers:
ghg_label = 'GHG/Unit'
else:
ghg_label = 'GHG'

df_final.pop(ghg_label)

# Populate df_final with data from df_raw
for j in range(len(df_raw)):
row = df_raw.iloc[j].values.tolist()
if 'id' in df_final.columns: # If 'id' was manually added
row.insert(0, None) # Insert None at the beginning to align with 'id'
row = row[:len(headers)] # Ensure the row has the same length as 'headers'
df_final.loc[j, headers[:len(row)]] = row
print("Initial lines", df_final.shape[0])

df_raw[ghg_label] = df_raw[ghg_label].astype(str)
column_name = [col for col in df_raw.columns if 'GHG Conversion Factor' in col][0]
year_dataset = int(column_name.split()[-1])

df_final['Scope'] = df_final['Scope'].astype(str).str.replace(' ', '', regex=False)
df_final.insert(7, "emission_source", None)
df_final['emission_source'] = df_final['Level 2'].astype(str).str.replace(" ", "_", regex=False) + "_" + df_final['Level 3'].astype(str).str.replace(" ", "_", regex=False)

# Load the Excel file with labels and URLs
path_labels = os.path.join(script_dir, "../../auxiliary_op/unique_values_wikidata_urls.xlsx")
df_labels = pd.read_excel(path_labels, engine='openpyxl')
df_labels['label_url'] = df_labels['label_url'].str.strip()

# Create a dictionary of labels and their URLs
tuplas = dict(zip(df_labels['label'], df_labels['label_url']))

# Extract URL values for columns in df_final ending with '_wd'
for col in df_final.columns:
if col.endswith('_wd'):
# Get the base column without '_wd' and replace it with URLs from the dictionary
base_col = col[:-3]
df_final[col] = df_final[base_col].map(tuplas).fillna('-')

# ---------------------- Other transformations ----------------------

df_final['UOM'] = df_final['UOM'].fillna('').astype(str)
df_final['UOM'] = df_final['UOM'].apply(clean_parentheses)
print("Lines 1", df_final.shape[0])

# Create 'UOM_wd' column by mapping values of 'UOM' to the dictionary of labels and URLs (tuplas)
df_final['UOM_wd'] = df_final['UOM'].map(tuplas)
# Reorder columns so 'UOM_wd' appears right after 'UOM'
columns = list(df_final.columns)
index_uom = columns.index('UOM')
columns.insert(index_uom + 1, columns.pop(columns.index('UOM_wd')))
df_final = df_final[columns]

# Extract and clean 'emission_target_formula'
df_final['emission_target_formula'] = df_raw[ghg_label].apply(get_formula)
df_final['emission_target_formula'] = df_final['emission_target_formula'].apply(lambda x: x.replace('kg', '').strip() if isinstance(x, str) else x)
df_final['emission_target_formula'] = df_final['emission_target_formula'].replace('nan', None)
# Clean parentheses values in both columns
df_final['emission_target_formula'] = df_final['emission_target_formula'].str.replace("(Net CV)", "", regex=False).str.replace("(Gross CV)", "", regex=False).str.replace("(net)", "", regex=False).str.replace("kWh ", "kWh", regex=False)
df_final[ghg_label] = df_raw[ghg_label].str.replace("(Net CV)", "", regex=False).str.replace("(Gross CV)", "", regex=False).str.replace("(net)", "", regex=False)

# Load the Excel file with formulas and their corresponding nomenclatures
path_nomen = os.path.join(script_dir, "../../auxiliary_op/formulas.xlsx")
df_nomen = pd.read_excel(path_nomen, engine='openpyxl')

#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
df_final['emission_target_formula_aux'] = df_final['emission_target_formula'].str.split().str[-1]
# Create a dictionary of labels and URLs
tuplas_nomen = dict(zip(df_nomen['formula'], df_nomen['nomenclature']))
df_final['emission_target'] = df_final['emission_target_formula'].apply(nomenclature)
df_final = df_final.dropna(subset=['emission_target_formula'])

df_final['emission_target_wd'] = df_final['emission_target_formula'].map(tuplas)
print("Lines 2", df_final.shape[0])
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Extract 'GHG' from the 'GHG/Unit' field
df_final['GHG'] = df_raw[ghg_label].str.split().str[0]
df_final['GHG'] = df_final['GHG'].replace('nan', None).str.replace(' per unit', '', regex=False)
# Identify rows where df_final['GHG'] is equal to 'kWh'
rows_to_delete = df_final[df_final['GHG'] == 'kWh'].index
# Remove those rows
df_final = df_final.drop(rows_to_delete)
df_final['GHG_wd'] = df_final['GHG'].map(tuplas)
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Set date validity columns
df_final['valid_from'] = datetime(year_dataset, 1, 1, 0, 0, 0).isoformat()
df_final['valid_to'] = datetime(year_dataset, 12, 31, 23, 59, 59).isoformat()
print("Lines 4", df_final.shape[0])
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Apply the function to the 'region' column
df_final['region'] = df_final['emission_source'].apply(get_region)
df_final['region_wd'] = df_final['region'].map(tuplas)
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
df_final = df_final.rename(columns={f'GHG Conversion Factor {year_dataset}': f'Conversion Factor {year_dataset}'})
# Ensure values are of string type to use .str.replace() and convert to numeric
df_final[f'Conversion Factor {year_dataset}'] = df_final[f'Conversion Factor {year_dataset}'].astype(str)
df_final[f'Conversion Factor {year_dataset}'] = pd.to_numeric(df_final[f'Conversion Factor {year_dataset}'].str.replace(',', '', regex=False), errors='coerce')
print("Lines 5", df_final.shape[0])
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Drop rows with NaN in 'Conversion Factor {year_dataset}'
df_final = df_final.dropna(subset=[f'Conversion Factor {year_dataset}'])
print("Lines 6", df_final.shape[0])
# Move 'Conversion Factor {year_dataset}' to the last position
conversion_factor_column = f'Conversion Factor {year_dataset}'
columns = [col for col in df_final.columns if col != conversion_factor_column] + [conversion_factor_column]
df_final = df_final[columns]
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Rename columns and ensure they are stripped of whitespace
df_final.columns = df_final.columns.str.strip()
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# GWP
path_GWP_values = os.path.join(script_dir, "../../GWP/GWP_values.xlsx")
df_GWP = pd.read_excel(path_GWP_values, engine='openpyxl')

df_final['GWP'] = df_final.apply(lambda row: calculate_gwp(row, year_dataset, 1), axis=1)
df_final[f'Value {year_dataset}'] = df_final[f'Conversion Factor {year_dataset}'] / df_final['GWP']
df_final['GWP'] = df_final.apply(lambda row: calculate_gwp(row, year_dataset, ''), axis=1)
df_final['GWP_id'] = df_final.apply(lambda row: id_gwp(row, year_dataset, ''), axis=1)
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# If 'id' was manually added in 'headers', generate ID values
if 'id' in df_final.columns and df_final['id'].isnull().all():
df_final['id'] = range(1, len(df_final) + 1)

# Remove 'emission_target_formula_aux' column
df_final = df_final.drop(columns=['emission_target_formula_aux'])
#------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Save to Excel
final_path = os.path.join(script_dir, f"../data/v3/Conversion_Factor_{year_dataset}_v3.csv")
df_final.to_csv(final_path, index=False)
print("Final lines", df_final.shape[0])
16 changes: 16 additions & 0 deletions BEIS-UK/generate_nt_all_years.sh
@@ -0,0 +1,16 @@
#!/bin/bash

## Remember to first install and activate morph-kgc
echo "Remember to install and activate morph-kgc"

for file in cf_*_v3.ini; do
    if [ -f "$file" ]; then
        echo "Dealing with $file"
        python -m morph_kgc "$file"
        year=${file:3:4}
        out_nt="out_cf_${year}_v3.nt"
        echo "NT file generated: $out_nt"
        mv "$out_nt" "graphs/nt/$out_nt"
        echo "Done!"
    fi
done
42 changes: 42 additions & 0 deletions BEIS-UK/generate_ttl.bat
@@ -0,0 +1,42 @@
@echo off
setlocal EnableDelayedExpansion

echo Remember to install and activate morph-kgc

REM Process the .ini files
for %%f in (cf_*_v3.ini) do (
    if exist "%%f" (
        echo Dealing with %%f
        py -m morph_kgc %%f
        set "year=%%~nf"
        set "year=!year:~3,4!"
        set "out_nt=out_cf_!year!_v3.nt"
        echo NT file generated: !out_nt!
        move "!out_nt!" "graphs\v3\nt\!out_nt!"
        echo .nt Done!
    )
)

REM Process the .nt files
for %%f in (graphs\v3\nt\*.nt) do (
    if exist %%f (
        echo Dealing with %%f
        set "file_name=%%~nf"
        REM Read the year from the correct position in the file name
        set "year=!file_name:~7,4!"
        set "out_ttl=graphs\v3\ttl\out_cf_!year!_v3.ttl"
        REM Build the full output path
        set "full_out_ttl=%cd%\!out_ttl!"

        echo Generating TTL file: !out_ttl!
        py nt_to_ttl.py %%f "!full_out_ttl!"
        echo TTL file generated: !full_out_ttl!
        echo .ttl Done!
    ) else (
        echo Error: %%f not found!
    )
)

endlocal
20 changes: 0 additions & 20 deletions BEIS-UK/generate_ttl_all_years.sh

This file was deleted.
