|
4 | 4 | import matplotlib.pyplot as plt |
5 | 5 | import numpy as np |
6 | 6 | import upsetplot as up |
7 | | -from rapidfuzz import fuzz |
| 7 | +from rapidfuzz import fuzz, process |
8 | 8 | from .DATA import sparql_to_dataframe, get_token_matrix |
9 | 9 |
|
10 | 10 | def correlation(df=None, |
@@ -257,12 +257,255 @@ def upset( |
257 | 257 |
|
258 | 258 | return df_final |
259 | 259 |
|
260 | | -def fuzzy_compare(df1=None,df2=None, |
| 260 | + |
| 261 | + |
| 262 | +def fuzzy_compare(df1=None, df2=None, |
| 263 | + additional_vars_df1=None, additional_vars_df2=None, |
| 264 | + endpoint_url=None, query=None, |
| 265 | + grouping_var=None, label_var=None, element_var=None, |
| 266 | + threshold=95, match_all=False, unique_rows=False, |
| 267 | + csv_filename="comparison.csv", verbose=True): |
| 268 | + """ |
| 269 | + Fuzzy string matching between two DataFrames (or SPARQL query results) based on a common element column. |
| 270 | +
|
| 271 | + Supports optional grouping, label filtering, and aggregation of match statistics. |
| 272 | +
|
| 273 | + Args: |
| 274 | + df1 (pd.DataFrame, optional): First DataFrame. |
| 275 | + df2 (pd.DataFrame, optional): Second DataFrame. If not provided, df1 is used. |
| 276 | + additional_vars_df1 (list, optional): List of columns from df1 that will be aggregated in the result. |
| 277 | + additional_vars_df2 (list, optional): List of columns from df2 that will be aggregated in the result. |
| 278 | + endpoint_url (str, optional): SPARQL endpoint (used if df1 is None). |
| 279 | + query (str, optional): SPARQL query (used if df1 is None). |
| 280 | + grouping_var (str, optional): Column name used for grouping (must exist in both DataFrames). |
| 281 | + label_var (str or list, optional): Label conditions. Each entry is one of: |
| 282 | + - a bare column name, (col, "identical"), or (col, 100): require an exact match on that column |
| 283 | + - (col, int < 100): require a fuzzy match on that column at the given threshold |
| 284 | + element_var (str): Column containing the string values to compare with fuzzy matching. |
| 285 | + threshold (int, optional): Fuzzy matching threshold for element_var (0–100). Default: 95. |
| 286 | + match_all (bool, optional): If True, keep only group pairs whose minimum match score meets the threshold. |
| 287 | + unique_rows (bool, optional): If True, report each mirrored pairing once (row pairs in self-joins, group pairs when grouping_var is set). |
| 288 | + csv_filename (str, optional): File path to save the aggregated results. If None, skip saving. |
| 289 | + verbose (bool, optional): If True, print debug info and head of results. |
| 290 | +
|
| 291 | + Returns: |
| 292 | + pd.DataFrame: Aggregated match statistics between df1 and df2 (or within df1). |
| 293 | +
|
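|  | + Example (illustrative sketch; toy data, no grouping, so results are |
|  | + aggregated into a single ("all", "all") row): |
|  | + >>> import pandas as pd |
|  | + >>> a = pd.DataFrame({"name": ["Goethe", "Schiller"]}) |
|  | + >>> b = pd.DataFrame({"name": ["Göthe", "Schiller"]}) |
|  | + >>> stats = fuzzy_compare(a, b, element_var="name", threshold=70, |
|  | + ... csv_filename=None, verbose=False) |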
| 294 | + """ |
| 295 | + |
| 296 | + # Load from SPARQL if needed |
| 297 | + if df1 is None and endpoint_url and query: |
| 298 | + try: |
| 299 | + df1 = sparql_to_dataframe( |
| 300 | + endpoint_url, query, |
| 301 | + csv_filename=f"query_{csv_filename}" if csv_filename else None |
| 302 | + ) |
| 303 | + except Exception as e: |
| 304 | + raise ValueError(f"Failed to fetch or process SPARQL query results. Error: {e}") |
| 305 | + |
| 306 | + if df2 is None: |
| 307 | + df2 = df1 |
| 308 | + |
| 309 | + # detect self-join (common when df2 is None) |
| 310 | + self_join = (df1 is df2) |
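|  | + # with unique_rows=True, mirrored self-join pairs (i, j)/(j, i) are |
|  | + # reported only once (filtered below) |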
| 311 | + |
| 312 | + additional_vars_df1 = [c for c in (additional_vars_df1 or []) if c in df1.columns] |
| 313 | + additional_vars_df2 = [c for c in (additional_vars_df2 or []) if c in df2.columns] if not self_join else additional_vars_df1 |
| 314 | + |
| 315 | + if element_var not in df1.columns or element_var not in df2.columns: |
| 316 | + raise ValueError(f"Column '{element_var}' not found in DataFrames.") |
| 317 | + |
| 318 | + # Normalize label vars |
| 319 | + plain_labels, identical_vars, fuzzy_vars = [], [], {} |
| 320 | + if label_var: |
| 321 | + if isinstance(label_var, str): |
| 322 | + # treat string the same as identical condition |
| 323 | + identical_vars.append(label_var) |
| 324 | + elif isinstance(label_var, list): |
| 325 | + for lv in label_var: |
| 326 | + if isinstance(lv, tuple) and len(lv) == 2: |
| 327 | + col, cond = lv |
| 328 | + if cond in ("identical", 100): |
| 329 | + identical_vars.append(col) |
| 330 | + elif isinstance(cond, int) and 0 <= cond < 100: |
| 331 | + fuzzy_vars[col] = cond |
| 332 | + else: |
| 333 | + raise ValueError(f"Unsupported label_var condition: {lv}") |
| 334 | + elif isinstance(lv, str): |
| 335 | + identical_vars.append(lv) |
| 336 | + else: |
| 337 | + plain_labels.append(lv) |
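|  | + # e.g. label_var=[("author", "identical"), ("title", 90)] requires an exact |
|  | + # author match plus a fuzzy title match at >= 90 (illustrative column names) |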
| 338 | + |
| 339 | + # IDENTICAL FILTER |
| 340 | + matches = [] |
| 341 | + if identical_vars: |
| 342 | + # positional row indices for both sides (reset first so _i/_j line up with the positional joins below) |
| 343 | + left = df1.reset_index(drop=True).reset_index().rename(columns={"index": "_i"}) |
| 344 | + right = df2.reset_index(drop=True).reset_index().rename(columns={"index": "_j"}) |
| 345 | + merged = pd.merge(left, right, on=identical_vars, suffixes=("_df1", "_df2")) |
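|  | + # the inner merge keeps only row pairs that already agree on every |
|  | + # exact-match column, so fuzzy scoring runs on far fewer candidates |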
| 346 | + if not merged.empty: |
| 347 | + arr1 = merged[f"{element_var}_df1"].astype(str).str.lower().to_numpy() |
| 348 | + arr2 = merged[f"{element_var}_df2"].astype(str).str.lower().to_numpy() |
| 349 | + idx_i = merged["_i"].to_numpy() |
| 350 | + idx_j = merged["_j"].to_numpy() |
| 351 | + |
| 352 | + if threshold >= 100: |
| 353 | + # equality-only check |
| 354 | + eq = (arr1 == arr2) |
| 355 | + if self_join: |
| 356 | + eq = eq & (idx_i != idx_j) # drop diagonal |
| 357 | + for i_in_merged in np.where(eq)[0]: |
| 358 | + gi = idx_i[i_in_merged] |
| 359 | + gj = idx_j[i_in_merged] |
| 360 | + if self_join and unique_rows and gj < gi: |
| 361 | + continue |
| 362 | + matches.append((gi, gj, 100)) |
| 363 | + else: |
| 364 | + for k, (s1, s2) in enumerate(zip(arr1, arr2)): |
| 365 | + gi = idx_i[k]; gj = idx_j[k] |
| 366 | + if self_join and gi == gj: |
| 367 | + continue |
| 368 | + score = fuzz.ratio(s1, s2) |
| 369 | + if score >= threshold: |
| 370 | + if not (self_join and unique_rows and gj < gi): |
| 371 | + matches.append((gi, gj, score)) |
| 372 | + else: |
| 373 | + # GLOBAL MATCHING via cdist (batch, cutoff) |
| 374 | + arr1 = df1[element_var].astype(str).str.lower().to_numpy() |
| 375 | + arr2 = df2[element_var].astype(str).str.lower().to_numpy() |
| 376 | + |
| 377 | + if threshold >= 100: |
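|  | + # dense len(arr1) x len(arr2) boolean matrix; fine for moderate inputs |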
| 378 | + equal_mask = (arr1[:, None] == arr2[None, :]) |
| 379 | + if self_join: |
| 380 | + np.fill_diagonal(equal_mask, False) |
| 381 | + i_idx, j_idx = np.where(equal_mask) |
| 382 | + for gi, gj in zip(i_idx, j_idx): |
| 383 | + if self_join and unique_rows and gj < gi: |
| 384 | + continue |
| 385 | + matches.append((gi, gj, 100)) |
| 386 | + else: |
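|  | + # batch rows so the dense cdist score matrix stays batch_size x len(arr2); |
|  | + # rapidfuzz returns 0 for scores below score_cutoff |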
| 387 | + batch_size = 2000 |
| 388 | + for start in range(0, len(arr1), batch_size): |
| 389 | + sub1 = arr1[start:start+batch_size] |
| 390 | + scores = process.cdist( |
| 391 | + sub1, arr2, |
| 392 | + scorer=fuzz.ratio, |
| 393 | + workers=-1, |
| 394 | + score_cutoff=threshold |
| 395 | + ) |
| 396 | + if hasattr(scores, "ndim"):  # rapidfuzz returns a dense ndarray here |
| 397 | + for i, row in enumerate(scores): |
| 398 | + gi = start + i |
| 399 | + if self_join: |
| 400 | + # indices with score >= threshold |
| 401 | + js = np.nonzero(row)[0] |
| 402 | + for gj in js: |
| 403 | + if gj == gi: |
| 404 | + continue |
| 405 | + if unique_rows and gj < gi: |
| 406 | + continue |
| 407 | + matches.append((gi, gj, float(row[gj]))) |
| 408 | + else: |
| 409 | + js = np.nonzero(row)[0] |
| 410 | + for gj in js: |
| 411 | + matches.append((gi, gj, float(row[gj]))) |
| 412 | + else: |
| 413 | + for i, row_dict in enumerate(scores): |
| 414 | + gi = start + i |
| 415 | + for gj, sc in row_dict.items(): |
| 416 | + if self_join: |
| 417 | + if gj == gi: |
| 418 | + continue |
| 419 | + if unique_rows and gj < gi: |
| 420 | + continue |
| 421 | + matches.append((gi, gj, float(sc))) |
| 422 | + |
| 423 | + if not matches: |
| 424 | + if verbose: print("No matches found (after self-join filtering and threshold).") |
| 425 | + return pd.DataFrame() |
| 426 | + |
| 427 | + # Build matches_df |
| 428 | + matches_df = pd.DataFrame(matches, columns=["i", "j", "score"]) |
| 429 | + df1_reset = df1.reset_index(drop=True).add_suffix("_df1") |
| 430 | + df2_reset = df2.reset_index(drop=True).add_suffix("_df2") |
| 431 | + |
| 432 | + matches_df = matches_df.join(df1_reset, on="i") |
| 433 | + matches_df = matches_df.join(df2_reset, on="j") |
| 434 | + |
| 435 | + # Fuzzy-label filter (post-match) |
| 436 | + if fuzzy_vars: |
| 437 | + ok_mask = np.ones(len(matches_df), dtype=bool) |
| 438 | + for col, th in fuzzy_vars.items(): |
| 439 | + s1 = matches_df[f"{col}_df1"].fillna("").astype(str).str.lower().to_numpy() |
| 440 | + s2 = matches_df[f"{col}_df2"].fillna("").astype(str).str.lower().to_numpy() |
| 441 | + sim = np.fromiter((fuzz.ratio(a, b) for a, b in zip(s1, s2)), dtype=float, count=len(s1)) |
| 442 | + ok_mask &= (sim >= th) |
| 443 | + matches_df = matches_df[ok_mask] |
| 444 | + |
| 445 | + if matches_df.empty: |
| 446 | + if verbose: print("No matches after fuzzy-label filter.") |
| 447 | + return pd.DataFrame() |
| 448 | + |
| 449 | + # Grouping AFTER matching |
| 450 | + if grouping_var and grouping_var in df1.columns and grouping_var in df2.columns: |
| 451 | + matches_df["group1"] = matches_df[f"{grouping_var}_df1"] |
| 452 | + matches_df["group2"] = matches_df[f"{grouping_var}_df2"] |
| 453 | + else: |
| 454 | + matches_df["group1"] = "all" |
| 455 | + matches_df["group2"] = "all" |
| 456 | + |
| 457 | + # Aggregation |
| 458 | + agg_dict = { |
| 459 | + 'df1_Elements': (f"{element_var}_df1", lambda x: ", ".join(sorted(set(map(str, x))))), |
| 460 | + 'df2_Elements': (f"{element_var}_df2", lambda x: ", ".join(sorted(set(map(str, x))))), |
| 461 | + 'Num_Matches': ('score', 'count'), |
| 462 | + 'Average_Score': ('score', 'mean'), |
| 463 | + 'Min_Score': ('score', 'min'), |
| 464 | + 'Max_Score': ('score', 'max'), |
| 465 | + } |
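|  | + # pandas named aggregation: each entry expands to one output column |
|  | + # in groupby(...).agg(**agg_dict) below |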
| 466 | + |
| 467 | + # for col in plain_labels + identical_vars + list(fuzzy_vars.keys()): |
| 468 | + # agg_dict[col] = (f"{col}_df1", lambda x: ", ".join(sorted(set(map(str, x.dropna()))))) |
| 469 | + |
| 470 | + for col in additional_vars_df1: |
| 471 | + agg_dict[f'df1_{col}'] = (f"{col}_df1", 'first') |
| 472 | + for col in additional_vars_df2: |
| 473 | + agg_dict[f'df2_{col}'] = (f"{col}_df2", 'first') |
| 474 | + |
| 475 | + aggregated = matches_df.groupby(['group1', 'group2']).agg(**agg_dict).reset_index() |
| 476 | + |
| 477 | + # unique_rows at group level (optional, for asymmetric string groups) |
| 478 | + if unique_rows and grouping_var: |
| 479 | + aggregated = aggregated[aggregated['group1'] <= aggregated['group2']] |
| 480 | + |
| 481 | + # match_all: every match in a group pair must meet the threshold |
| 482 | + if match_all: |
| 483 | + aggregated = aggregated[aggregated['Min_Score'] >= threshold] |
| 484 | + |
| 485 | + # baseline: each group pair needs at least one match at/above the threshold |
| 486 | + aggregated = aggregated[aggregated['Max_Score'] >= threshold] |
| 487 | + |
| 488 | + # Save & Verbose |
| 489 | + if csv_filename: |
| 490 | + try: |
| 491 | + aggregated.to_csv(csv_filename, index=False) |
| 492 | + except Exception as e: |
| 493 | + print(f"Failed to save CSV file '{csv_filename}': {e}") |
| 494 | + |
| 495 | + if verbose: |
| 496 | + aggregated.info()  # info() prints directly; wrapping it in print() would emit "None" |
| 497 | + print(aggregated.describe()) |
| 498 | + |
| 499 | + return aggregated |
| 500 | + |
| 501 | + |
| 502 | +def fuzzy_compare_legacy(df1=None,df2=None, |
261 | 503 | additional_vars_df1=None, additional_vars_df2=None, |
262 | 504 | endpoint_url=None, |
263 | 505 | query=None, |
264 | 506 | grouping_var=None, label_var=None, element_var=None, threshold=95, match_all=False, unique_rows=False, csv_filename="comparison.csv", verbose= True): |
265 | 507 | """ |
| 508 | + Legacy implementation, kept until the new fuzzy_compare above has been tested more thoroughly. |
266 | 509 | Fuzzy string matching between two DataFrames (or SPARQL query results) based on a common element column. |
267 | 510 |
|
268 | 511 | Supports optional grouping, label filtering, and aggregation of match statistics. |
|