Commit 833dc3e

fuzzy_compare using cdist and merging on identical cols

1 parent 07cc3c9 commit 833dc3e

2 files changed: +246 −3 lines

dataria/MATH.py

Lines changed: 245 additions & 2 deletions

@@ -4,7 +4,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import upsetplot as up
-from rapidfuzz import fuzz
+from rapidfuzz import fuzz, process
 from .DATA import sparql_to_dataframe, get_token_matrix
 
 def correlation(df=None,
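
Aside: the widened import is what enables the batched scoring used further down. A minimal sketch of how rapidfuzz's process.cdist behaves with a score_cutoff; the names and the threshold of 85 are illustrative, not taken from this commit:

import numpy as np
from rapidfuzz import fuzz, process

names = ["Beethoven", "Beethofen", "Bach"]
# cdist returns a len(queries) x len(choices) matrix of scores;
# entries below score_cutoff come back as 0, which is why the
# new fuzzy_compare can use np.nonzero(row) to collect the hits.
scores = process.cdist(names, names, scorer=fuzz.ratio,
                       workers=-1, score_cutoff=85)
for i, j in zip(*np.nonzero(scores)):
    print(names[i], "~", names[j], scores[i, j])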
@@ -257,12 +257,255 @@ def upset(
 
     return df_final
 
-def fuzzy_compare(df1=None,df2=None,
+
+
+def fuzzy_compare(df1=None, df2=None,
+                  additional_vars_df1=None, additional_vars_df2=None,
+                  endpoint_url=None, query=None,
+                  grouping_var=None, label_var=None, element_var=None,
+                  threshold=95, match_all=False, unique_rows=False,
+                  csv_filename="comparison.csv", verbose=True):
+    """
+    Fuzzy string matching between two DataFrames (or SPARQL query results) based on a common element column.
+
+    Supports optional grouping, label filtering, and aggregation of match statistics.
+
+    Args:
+        df1 (pd.DataFrame, optional): First DataFrame.
+        df2 (pd.DataFrame, optional): Second DataFrame. If not provided, df1 is used.
+        additional_vars_df1 (list, optional): Columns from df1 to aggregate in the result.
+        additional_vars_df2 (list, optional): Columns from df2 to aggregate in the result.
+        endpoint_url (str, optional): SPARQL endpoint (used if df1 is None).
+        query (str, optional): SPARQL query (used if df1 is None).
+        grouping_var (str, optional): Column name used for grouping (must exist in both DataFrames).
+        label_var (str or list, optional): Label conditions, each one of:
+            - str, (col, "identical"), or (col, 100): require an exact match on this column
+            - (col, int < 100): require a fuzzy match on this column with the given threshold
+        element_var (str): Column containing the string values to compare with fuzzy matching.
+        threshold (int, optional): Fuzzy matching threshold for element_var (0–100). Default: 95.
+        match_all (bool, optional): If True, only include groups where all matches exceed the threshold.
+        unique_rows (bool, optional): If True, suppress duplicate pairings in self-joins.
+        csv_filename (str, optional): File path to save the aggregated results. If None, skip saving.
+        verbose (bool, optional): If True, print summary info about the results.
+
+    Returns:
+        pd.DataFrame: Aggregated match statistics between df1 and df2 (or within df1).
+    """
+
+    # Load from SPARQL if needed
+    if df1 is None and endpoint_url and query:
+        try:
+            df1 = sparql_to_dataframe(
+                endpoint_url, query,
+                csv_filename=f"query_{csv_filename}" if csv_filename else None
+            )
+        except Exception as e:
+            raise ValueError(f"Failed to fetch or process SPARQL query results. Error: {e}")
+
+    if df2 is None:
+        df2 = df1
+
+    # Detect self-join (common when df2 is None)
+    self_join = (df1 is df2)
+
+    additional_vars_df1 = [c for c in (additional_vars_df1 or []) if c in df1.columns]
+    additional_vars_df2 = [c for c in (additional_vars_df2 or []) if c in df2.columns] if not self_join else additional_vars_df1
+
+    if element_var not in df1.columns or element_var not in df2.columns:
+        raise ValueError(f"Column '{element_var}' not found in DataFrames.")
+
+    # Normalize label vars
+    plain_labels, identical_vars, fuzzy_vars = [], [], {}
+    if label_var:
+        if isinstance(label_var, str):
+            # Treat a bare string the same as an "identical" condition
+            identical_vars.append(label_var)
+        elif isinstance(label_var, list):
+            for lv in label_var:
+                if isinstance(lv, tuple) and len(lv) == 2:
+                    col, cond = lv
+                    if cond == "identical" or cond == 100:
+                        identical_vars.append(col)
+                    elif isinstance(cond, int) and 0 <= cond < 100:
+                        fuzzy_vars[col] = cond
+                    else:
+                        raise ValueError(f"Unsupported label_var condition: {lv}")
+                elif isinstance(lv, str):
+                    identical_vars.append(lv)
+                else:
+                    plain_labels.append(lv)
+
+    # IDENTICAL FILTER: merge on exact-match columns, then compare within the merge
+    matches = []
+    if identical_vars:
+        # Preserve original row indices for both sides
+        left = df1.reset_index().rename(columns={"index": "_i"})
+        right = df2.reset_index().rename(columns={"index": "_j"})
+        merged = pd.merge(left, right, on=identical_vars, suffixes=("_df1", "_df2"))
+        if not merged.empty:
+            arr1 = merged[f"{element_var}_df1"].astype(str).str.lower().to_numpy()
+            arr2 = merged[f"{element_var}_df2"].astype(str).str.lower().to_numpy()
+            idx_i = merged["_i"].to_numpy()
+            idx_j = merged["_j"].to_numpy()
+
+            if threshold >= 100:
+                # Equality-only check
+                eq = (arr1 == arr2)
+                if self_join:
+                    eq = eq & (idx_i != idx_j)  # drop diagonal
+                for i_in_merged in np.where(eq)[0]:
+                    gi = idx_i[i_in_merged]
+                    gj = idx_j[i_in_merged]
+                    if self_join and unique_rows and gj < gi:
+                        continue
+                    matches.append((gi, gj, 100))
+            else:
+                for k, (s1, s2) in enumerate(zip(arr1, arr2)):
+                    gi = idx_i[k]; gj = idx_j[k]
+                    if self_join and gi == gj:
+                        continue
+                    score = fuzz.ratio(s1, s2)
+                    if score >= threshold:
+                        if not (self_join and unique_rows and gj < gi):
+                            matches.append((gi, gj, score))
+    else:
+        # GLOBAL MATCHING via cdist (batched, with score cutoff)
+        arr1 = df1[element_var].astype(str).str.lower().to_numpy()
+        arr2 = df2[element_var].astype(str).str.lower().to_numpy()
+
+        if threshold >= 100:
+            equal_mask = (arr1[:, None] == arr2[None, :])
+            if self_join:
+                np.fill_diagonal(equal_mask, False)
+            i_idx, j_idx = np.where(equal_mask)
+            for gi, gj in zip(i_idx, j_idx):
+                if self_join and unique_rows and gj < gi:
+                    continue
+                matches.append((gi, gj, 100))
+        else:
+            batch_size = 2000
+            for start in range(0, len(arr1), batch_size):
+                sub1 = arr1[start:start+batch_size]
+                scores = process.cdist(
+                    sub1, arr2,
+                    scorer=fuzz.ratio,
+                    workers=-1,
+                    score_cutoff=threshold
+                )
+                if hasattr(scores, "ndim"):
+                    for i, row in enumerate(scores):
+                        gi = start + i
+                        if self_join:
+                            # Indices with score >= threshold
+                            js = np.nonzero(row)[0]
+                            for gj in js:
+                                if gj == gi:
+                                    continue
+                                if unique_rows and gj < gi:
+                                    continue
+                                matches.append((gi, gj, float(row[gj])))
+                        else:
+                            js = np.nonzero(row)[0]
+                            for gj in js:
+                                matches.append((gi, gj, float(row[gj])))
+                else:
+                    for i, row_dict in enumerate(scores):
+                        gi = start + i
+                        for gj, sc in row_dict.items():
+                            if self_join:
+                                if gj == gi:
+                                    continue
+                                if unique_rows and gj < gi:
+                                    continue
+                            matches.append((gi, gj, float(sc)))
+
+    if not matches:
+        if verbose: print("No matches found (after self-join filtering and threshold).")
+        return pd.DataFrame()
+
+    # Build matches_df
+    matches_df = pd.DataFrame(matches, columns=["i", "j", "score"])
+    df1_reset = df1.reset_index(drop=True).add_suffix("_df1")
+    df2_reset = df2.reset_index(drop=True).add_suffix("_df2")
+
+    matches_df = matches_df.join(df1_reset, on="i")
+    matches_df = matches_df.join(df2_reset, on="j")
+
+    # Fuzzy-label filter (post-match)
+    if fuzzy_vars:
+        ok_mask = np.ones(len(matches_df), dtype=bool)
+        for col, th in fuzzy_vars.items():
+            s1 = matches_df[f"{col}_df1"].fillna("").astype(str).str.lower().to_numpy()
+            s2 = matches_df[f"{col}_df2"].fillna("").astype(str).str.lower().to_numpy()
+            sim = np.fromiter((fuzz.ratio(a, b) for a, b in zip(s1, s2)), dtype=float, count=len(s1))
+            ok_mask &= (sim >= th)
+        matches_df = matches_df[ok_mask]
+
+    if matches_df.empty:
+        if verbose: print("No matches after fuzzy-label filter.")
+        return pd.DataFrame()
+
+    # Grouping AFTER matching
+    if grouping_var and grouping_var in df1.columns and grouping_var in df2.columns:
+        matches_df["group1"] = matches_df[f"{grouping_var}_df1"]
+        matches_df["group2"] = matches_df[f"{grouping_var}_df2"]
+    else:
+        matches_df["group1"] = "all"
+        matches_df["group2"] = "all"
+
+    # Aggregation
+    agg_dict = {
+        'df1_Elements': (f"{element_var}_df1", lambda x: ", ".join(sorted(set(map(str, x))))),
+        'df2_Elements': (f"{element_var}_df2", lambda x: ", ".join(sorted(set(map(str, x))))),
+        'Num_Matches': ('score', 'count'),
+        'Average_Score': ('score', 'mean'),
+        'Min_Score': ('score', 'min'),
+        'Max_Score': ('score', 'max'),
+    }
+
+    # for col in plain_labels + identical_vars + list(fuzzy_vars.keys()):
+    #     agg_dict[col] = (f"{col}_df1", lambda x: ", ".join(sorted(set(map(str, x.dropna())))))
+
+    for col in additional_vars_df1:
+        agg_dict[f'df1_{col}'] = (f"{col}_df1", 'first')
+    for col in additional_vars_df2:
+        agg_dict[f'df2_{col}'] = (f"{col}_df2", 'first')
+
+    aggregated = matches_df.groupby(['group1', 'group2']).agg(**agg_dict).reset_index()
+
+    # unique_rows at group level (optional, for asymmetric string groups)
+    if unique_rows and grouping_var:
+        aggregated = aggregated[aggregated['group1'] <= aggregated['group2']]
+
+    # match_all semantics
+    if match_all:
+        aggregated = aggregated[aggregated['Min_Score'] >= threshold]
+
+    # Always enforce threshold via Max_Score
+    aggregated = aggregated[aggregated['Max_Score'] >= threshold]
+
+    # Save & verbose output
+    if csv_filename:
+        try:
+            aggregated.to_csv(csv_filename, index=False)
+        except Exception as e:
+            print(f"Failed to save CSV file '{csv_filename}': {e}")
+
+    if verbose:
+        aggregated.info()
+        print(aggregated.describe())
+
+    return aggregated
+
+
+def fuzzy_compare_legacy(df1=None,df2=None,
                          additional_vars_df1=None, additional_vars_df2=None,
                          endpoint_url=None,
                          query=None,
                          grouping_var=None, label_var=None, element_var=None, threshold=95, match_all=False, unique_rows=False, csv_filename="comparison.csv", verbose= True):
     """
+    !! Kept here, as the new function has not been tested much yet.
     Fuzzy string matching between two DataFrames (or SPARQL query results) based on a common element column.
 
     Supports optional grouping, label filtering, and aggregation of match statistics.
pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "dataria"
-version = "0.4.1"
+version = "0.4.2"
 description = "DATAria utils"
 authors = [
     { name = "Christoph Sander" }
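
Aside: once this tag is installed, the bump can be checked from the standard library (assuming the package is installed in the current environment):

from importlib.metadata import version
print(version("dataria"))  # expected: 0.4.2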
