|
62 | 62 | "from neo4j import GraphDatabase"
|
63 | 63 | ]
|
64 | 64 | },
|
| 65 | + { |
| 66 | + "cell_type": "code", |
| 67 | + "execution_count": null, |
| 68 | + "id": "29b00ea6", |
| 69 | + "metadata": {}, |
| 70 | + "outputs": [], |
| 71 | + "source": [ |
| 72 | + "# Main Colormap\n", |
| 73 | + "main_color_map = 'nipy_spectral'" |
| 74 | + ] |
| 75 | + }, |
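
Note: 'nipy_spectral' names a built-in matplotlib colormap. A minimal sketch of how such a name is typically resolved into per-cluster colors, assuming matplotlib 3.6 or newer (the sampling below is illustrative and not part of this notebook):

    import matplotlib

    # Look up the configured colormap in matplotlib's registry ...
    colormap = matplotlib.colormaps[main_color_map]
    # ... and sample one RGBA color per cluster (10 hypothetical clusters).
    cluster_colors = [colormap(index / 9) for index in range(10)]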
65 | 76 | {
|
66 | 77 | "cell_type": "code",
|
67 | 78 | "execution_count": null,
|
|
171 | 182 | "# TODO run a community detection algorithm co-located in here when \"communityId\" is missing\n",
|
172 | 183 | "# TODO run a centrality algorithm co-located in here when \"centrality\" score is missing\n",
|
173 | 184 | "\n",
|
174 | | - "def create_node_embeddings(cypher_file_name: str, parameters: dict) -> pd.DataFrame: \n", |
| 185 | + "def create_node_embeddings(cypher_file_name: str, parameters: dict, ignore_existing: bool = True) -> pd.DataFrame: \n", |
175 | 186 | " \"\"\"\n",
|
176 | 187 | " Creates an in-memory Graph projection by calling \"create_undirected_projection\", \n",
|
177 | 188 | " runs the cypher Query given as cypherFileName parameter to calculate and stream the node embeddings\n",
|
|
200 | 211 | " empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n",
|
201 | 212 | " return empty_result\n",
|
202 | 213 | "\n",
|
203 | | - "    existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", |
204 | | - "    embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n", |
| 214 | + " if ignore_existing:\n", |
| 215 | + " embeddings = query_cypher_to_data_frame(cypher_file_name, parameters_=parameters)\n", |
| 216 | + " else: \n", |
| 217 | + " existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", |
| 218 | + " embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n", |
205 | 219 | " display(embeddings.head()) # Display the first entries of the table\n",
|
206 | 220 | " return embeddings"
|
207 | 221 | ]
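
The new `ignore_existing` flag defaults to `True`, so embeddings are recalculated unconditionally; only with `ignore_existing=False` is the "Node_Embeddings_0a_Query_Calculated" query consulted first. A usage sketch, using the FastRP query and parameter dictionary that appear later in this notebook:

    # Default: recalculate embeddings even if previously written ones exist.
    embeddings = create_node_embeddings(
        "../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher",
        java_package_embeddings_parameters
    )

    # Reuse embeddings that were already calculated and written, if any exist.
    embeddings = create_node_embeddings(
        "../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher",
        java_package_embeddings_parameters,
        ignore_existing=False
    )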
|
|
273 | 287 | "outputs": [],
|
274 | 288 | "source": [
|
275 | 289 | "def plot_2d_node_embeddings(node_embeddings_for_visualization: pd.DataFrame, title: str):\n",
|
276 | | - "    if embeddings.empty:\n", |
| 290 | + " if node_embeddings_for_visualization.empty:\n", |
277 | 291 | " print(\"No projected data to plot available\")\n",
|
278 | 292 | " return\n",
|
279 | 293 | "\n",
|
|
288 | 302 | " plot.show()"
|
289 | 303 | ]
|
290 | 304 | },
|
| 305 | + { |
| 306 | + "cell_type": "code", |
| 307 | + "execution_count": null, |
| 308 | + "id": "e80a45ec", |
| 309 | + "metadata": {}, |
| 310 | + "outputs": [], |
| 311 | + "source": [ |
| 312 | + "def get_first_2_embedding_dimensions(embeddings: pd.DataFrame) -> pd.DataFrame:\n", |
| 313 | + " \"\"\"\n", |
| 314 | + " Returns the first two dimensions of the node embeddings as a DataFrame.\n", |
| 315 | + " \"\"\"\n", |
| 316 | + " if embeddings.empty: \n", |
| 317 | + " print(\"No projected data for node embeddings dimensionality reduction available\")\n", |
| 318 | + " return embeddings\n", |
| 319 | + " \n", |
| 320 | + " # Create a new DataFrame with the results of the 2 dimensional node embeddings\n", |
| 321 | + " # and the code unit and artifact name of the query above as preparation for the plot\n", |
| 322 | + " get_first_2_embedding_dimensions = embeddings.copy()\n", |
| 323 | + "\n", |
| 324 | + " get_first_2_embedding_dimensions['x'] = [value[0] for value in get_first_2_embedding_dimensions.embedding]\n", |
| 325 | + " get_first_2_embedding_dimensions['y'] = [value[1] for value in get_first_2_embedding_dimensions.embedding]\n", |
| 326 | + " return get_first_2_embedding_dimensions" |
| 327 | + ] |
| 328 | + }, |
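
To illustrate what the helper returns, a toy run on a hypothetical two-row DataFrame shaped like the embedding query results:

    import pandas as pd

    toy_embeddings = pd.DataFrame({
        "codeUnitName": ["org.example.alpha", "org.example.beta"],  # hypothetical packages
        "communityId": [0, 1],
        "embedding": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
    })
    # 'x' and 'y' receive the first two embedding components:
    # (0.1, 0.2) for alpha and (0.4, 0.5) for beta.
    print(get_first_2_embedding_dimensions(toy_embeddings)[["codeUnitName", "x", "y"]])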
291 | 329 | {
|
292 | 330 | "cell_type": "code",
|
293 | 331 | "execution_count": null,
|
|
361 | 399 | " \"dependencies_projection_node\": \"Package\",\n",
|
362 | 400 | " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
|
363 | 401 | " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n",
|
364 | | - "    \"dependencies_projection_embedding_dimension\":\"32\"\n", |
| 402 | + " \"dependencies_projection_embedding_dimension\":\"16\"\n", |
365 | 403 | "}\n",
|
366 | 404 | "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)\n"
|
367 | 405 | ]
|
368 | 406 | },
|
| 407 | + { |
| 408 | + "cell_type": "code", |
| 409 | + "execution_count": null, |
| 410 | + "id": "1709344c", |
| 411 | + "metadata": {}, |
| 412 | + "outputs": [], |
| 413 | + "source": [ |
| 414 | + "plot_2d_node_embeddings(\n", |
| 415 | + " get_first_2_embedding_dimensions(embeddings), \n", |
| 416 | + " \"Java Package positioned by their dependency relationships (FastRP first 2 node embeddings without t-SNE)\"\n", |
| 417 | + ")" |
| 418 | + ] |
| 419 | + }, |
| 420 | + { |
| 421 | + "cell_type": "code", |
| 422 | + "execution_count": null, |
| 423 | + "id": "84642495", |
| 424 | + "metadata": {}, |
| 425 | + "outputs": [], |
| 426 | + "source": [ |
| 427 | + "import numpy.typing as numpy_typing\n", |
| 428 | + "\n", |
| 429 | + "class TunedClusteringResult:\n", |
| 430 | + " def __init__(self, labels : list, probabilities : list):\n", |
| 431 | + " self.labels = labels\n", |
| 432 | + " self.probabilities = probabilities\n", |
| 433 | + " self.cluster_count = len(set(labels)) - (1 if -1 in labels else 0)\n", |
| 434 | + "\n", |
| 435 | + "def tuned_hierarchical_density_based_spatial_clustering(embeddings: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> TunedClusteringResult:\n", |
| 436 | + " \"\"\"\n", |
| 437 | + " Applies the optimized hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n", |
| 438 | + " The parameters are tuned to get results similar to the ones of the community detection algorithm.\n", |
| 439 | + " The result is a list of cluster ids for each node embedding.\n", |
| 440 | + " \"\"\"\n", |
| 441 | + " from sklearn.model_selection import RandomizedSearchCV\n", |
| 442 | + " from sklearn.cluster import HDBSCAN\n", |
| 443 | + " from sklearn.metrics import adjusted_rand_score\n", |
| 444 | + " import numpy as np\n", |
| 445 | + "\n", |
| 446 | + " # specify parameters and distributions to sample from\n", |
| 447 | + " hyper_parameter_distributions = {\n", |
| 448 | + " \"min_samples\": [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 50, 100],\n", |
| 449 | + " \"min_cluster_size\": [4, 5, 6, 7, 10, 20, 30, 50, 100],\n", |
| 450 | + " \"cluster_selection_method\": [\"eom\", \"leaf\"],\n", |
| 451 | + " \"metric\": [\"euclidean\", \"manhattan\"],\n", |
| 452 | + " }\n", |
| 453 | + " \n", |
| 454 | + " def adjusted_rand_scorer_with_penalty_for_community_references(community_references):\n", |
| 455 | + " \"\"\"\n", |
| 456 | + " Creates a custom scoring function based on the Adjusted Rand Index (ARI) that penalizes for high noise ratio in clustering.\n", |
| 457 | + " Input:\n", |
| 458 | + " - community_references: The true labels of the communities for the data points.\n", |
| 459 | + " Output:\n", |
| 460 | + " - A scoring function that can directly be used for e.g. RandomizedSearchCV and that takes an estimator and data (X) and returns the ARI score with a penalty for noise ratio.\n", |
| 461 | + " \"\"\"\n", |
| 462 | + " def ari_score_with_penalty(estimator, embeddings):\n", |
| 463 | + " clustering_result = estimator.fit_predict(embeddings)\n", |
| 464 | + "\n", |
| 465 | + " # Calculate the noise ratio. Noise points are labeled as -1 in HDBSCAN.\n", |
| 466 | + " noise_ratio = np.sum(clustering_result == -1) / len(clustering_result)\n", |
| 467 | + " \n", |
| 468 | + " if np.unique(clustering_result[clustering_result != -1]).size < 2:\n", |
| 469 | + " return -1 # Return worst score if only one cluster is found or all points are noise\n", |
| 470 | + "\n", |
| 471 | + " ari = adjusted_rand_score(community_references[clustering_result != -1], clustering_result[clustering_result != -1])\n", |
| 472 | + " \n", |
| 473 | + " # Penalize for high noise: If 80% of the points are noise, even a perfect ARI of 1.0 gets scaled down to 0.2\n", |
| 474 | + " penalty = 1.0 - noise_ratio \n", |
| 475 | + " \n", |
| 476 | + " return ari * penalty\n", |
| 477 | + " return ari_score_with_penalty\n", |
| 478 | + "\n", |
| 479 | + "\n", |
| 480 | + " # Use custom CV that feeds all data to each fold (no slicing)\n", |
| 481 | + " all_data_without_slicing_cross_validator = [(np.arange(len(embeddings)), np.arange(len(embeddings)))]\n", |
| 482 | + "\n", |
| 483 | + " hdbscan_with_random_search = RandomizedSearchCV(\n", |
| 484 | + " estimator=HDBSCAN(),\n", |
| 485 | + " refit=False, # Without refit, the estimator doesn't need to implement the 'predict' method. Drawback: Only the best parameters are returned, not the best model.\n", |
| 486 | + " param_distributions=hyper_parameter_distributions,\n", |
| 487 | + " n_iter=20,\n", |
| 488 | + " scoring=adjusted_rand_scorer_with_penalty_for_community_references(reference_community_ids),\n", |
| 489 | + " cv=all_data_without_slicing_cross_validator,\n", |
| 490 | + " verbose=1\n", |
| 491 | + " )\n", |
| 492 | + "\n", |
| 493 | + " hdbscan_with_random_search.fit(embeddings)\n", |
| 494 | + "\n", |
| 495 | + " #print(\"Best adjusted rand score with noise penalty:\", hdbscan_with_random_search.best_score_)\n", |
| 496 | + " print(\"Tuned HDBSCAN parameters:\", hdbscan_with_random_search.best_params_)\n", |
| 497 | + "\n", |
| 498 | + " # Run the clustering again with the best parameters\n", |
| 499 | + " cluster_algorithm = HDBSCAN(**hdbscan_with_random_search.best_params_, allow_single_cluster=False)\n", |
| 500 | + " best_model = cluster_algorithm.fit(embeddings)\n", |
| 501 | + "\n", |
| 502 | + " results = TunedClusteringResult(best_model.labels_, best_model.probabilities_)\n", |
| 503 | + " print(f\"Number of HDBSCAN clusters (excluding noise): {results.cluster_count:.0f}\")\n", |
| 504 | + " return results" |
| 505 | + ] |
| 506 | + }, |
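
The scorer above multiplies the ARI by the non-noise fraction, so a clustering that achieves a perfect score by discarding most points as noise is still ranked low. A worked toy example of just that scoring step (all labels made up):

    import numpy as np
    from sklearn.metrics import adjusted_rand_score

    reference = np.array([0, 0, 1, 1, 2])    # hypothetical community ids
    clustering = np.array([5, 5, 7, 7, -1])  # two clusters plus one noise point (-1)

    noise_ratio = np.sum(clustering == -1) / len(clustering)  # 1 / 5 = 0.2
    non_noise = clustering != -1
    # Identical partitions of the non-noise points give a perfect ARI of 1.0 ...
    ari = adjusted_rand_score(reference[non_noise], clustering[non_noise])
    # ... which the noise penalty scales down to 1.0 * (1 - 0.2) = 0.8.
    print(ari * (1.0 - noise_ratio))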
| 507 | + { |
| 508 | + "cell_type": "code", |
| 509 | + "execution_count": null, |
| 510 | + "id": "8e1f0227", |
| 511 | + "metadata": {}, |
| 512 | + "outputs": [], |
| 513 | + "source": [ |
| 514 | + "import numpy.typing as numpy_typing\n", |
| 515 | + "\n", |
| 516 | + "class CommunityComparingScores:\n", |
| 517 | + " def __init__(self, adjusted_rand_index: float, normalized_mutual_information: float):\n", |
| 518 | + " self.adjusted_rand_index = adjusted_rand_index\n", |
| 519 | + " self.normalized_mutual_information = normalized_mutual_information\n", |
| 520 | + " self.scores = {\n", |
| 521 | + " \"Adjusted Rand Index\": adjusted_rand_index,\n", |
| 522 | + " \"Normalized Mutual Information\": normalized_mutual_information\n", |
| 523 | + " }\n", |
| 524 | + " def __repr__(self):\n", |
| 525 | + " return f\"CommunityComparingScores(adjusted_rand_index={self.adjusted_rand_index}, normalized_mutual_information={self.normalized_mutual_information})\"\n", |
| 526 | + "\n", |
| 527 | + "def get_community_comparing_scores(cluster_labels: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> CommunityComparingScores:\n", |
| 528 | + " \"\"\"\n", |
| 529 | + " Returns a DataFrame with the scores of the clustering algorithm compared to the community detection algorithm.\n", |
| 530 | + " The scores are calculated using the adjusted rand index (ARI) and the normalized mutual information (NMI).\n", |
| 531 | + " \"\"\"\n", |
| 532 | + " from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score\n", |
| 533 | + "\n", |
| 534 | + " # Create a mask to filter out noise points. In HDBSCAN, noise points are labeled as -1\n", |
| 535 | + " mask = cluster_labels != -1\n", |
| 536 | + " ari = adjusted_rand_score(reference_community_ids[mask], cluster_labels[mask])\n", |
| 537 | + " nmi = normalized_mutual_info_score(reference_community_ids[mask], cluster_labels[mask])\n", |
| 538 | + "\n", |
| 539 | + " return CommunityComparingScores(ari, nmi)" |
| 540 | + ] |
| 541 | + }, |
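
Both scores are invariant to the concrete label values, which is what makes them usable here: HDBSCAN labels and Neo4j community ids never share a numbering. A quick sanity check with made-up labels:

    import numpy as np

    community_ids = np.array([10, 10, 42, 42])  # hypothetical Neo4j community ids
    cluster_labels = np.array([0, 0, 1, 1])     # HDBSCAN labels forming the same groups

    # Identical partitions under different label names: ARI and NMI are both 1.0.
    print(get_community_comparing_scores(cluster_labels, community_ids))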
| 542 | + { |
| 543 | + "cell_type": "code", |
| 544 | + "execution_count": null, |
| 545 | + "id": "3c4e8821", |
| 546 | + "metadata": {}, |
| 547 | + "outputs": [], |
| 548 | + "source": [ |
| 549 | + "def add_clustering_results_to_embeddings(embeddings: pd.DataFrame, clustering_result: TunedClusteringResult, clustering_name: str) -> pd.DataFrame:\n", |
| 550 | + " \"\"\"\n", |
| 551 | + " Adds the clustering results to the embeddings DataFrame.\n", |
| 552 | + " \"\"\"\n", |
| 553 | + " embeddings['clustering' + clustering_name + 'Label'] = clustering_result.labels\n", |
| 554 | + " embeddings['clustering' + clustering_name + 'Probability'] = clustering_result.probabilities\n", |
| 555 | + " return embeddings\n", |
| 556 | + "\n", |
| 557 | + "def get_clustering_results_distribution(embeddings: pd.DataFrame, clustering_name: str) -> pd.DataFrame:\n", |
| 558 | + " \"\"\"\n", |
| 559 | + " Returns the clustering results distribution for the given clustering name.\n", |
| 560 | + " \"\"\"\n", |
| 561 | + " return embeddings.groupby('clustering' + clustering_name + 'Label').aggregate(\n", |
| 562 | + " probability=('clustering' + clustering_name + 'Probability', 'mean'),\n", |
| 563 | + " count=('codeUnitName', 'count'),\n", |
| 564 | + " communityIds=('communityId', lambda x: list(set(x))),\n", |
| 565 | + " codeUnitNames=('codeUnitName', lambda x: list(set(x))),\n", |
| 566 | + " ).reset_index().sort_values(by='count', ascending=False)" |
| 567 | + ] |
| 568 | + }, |
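
A toy run of the two helpers on a hypothetical three-row DataFrame where one point ends up as noise:

    import pandas as pd

    toy = pd.DataFrame({
        "codeUnitName": ["a", "b", "c"],  # hypothetical code units
        "communityId": [1, 1, 2],
    })
    toy = add_clustering_results_to_embeddings(
        toy, TunedClusteringResult(labels=[0, 0, -1], probabilities=[0.9, 0.8, 0.0]), "Hdbscan"
    )
    # One row per label, largest cluster first:
    # label 0 with count 2 and mean probability 0.85, then noise label -1 with count 1.
    display(get_clustering_results_distribution(toy, "Hdbscan"))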
| 569 | + { |
| 570 | + "cell_type": "code", |
| 571 | + "execution_count": null, |
| 572 | + "id": "6580301e", |
| 573 | + "metadata": {}, |
| 574 | + "outputs": [], |
| 575 | + "source": [ |
| 576 | + "import numpy as np\n", |
| 577 | + "\n", |
| 578 | + "embeddings_values = np.array(embeddings.embedding.tolist())\n", |
| 579 | + "community_reference_ids = np.array(embeddings.communityId.tolist())\n", |
| 580 | + "\n", |
| 581 | + "results = tuned_hierarchical_density_based_spatial_clustering(embeddings_values, community_reference_ids)\n", |
| 582 | + "\n", |
| 583 | + "community_comparing_scores = get_community_comparing_scores(results.labels, community_reference_ids)\n", |
| 584 | + "print(community_comparing_scores)\n", |
| 585 | + "\n", |
| 586 | + "embeddings = add_clustering_results_to_embeddings(embeddings, results, \"Hdbscan\")\n", |
| 587 | + "display(get_clustering_results_distribution(embeddings, \"Hdbscan\")) " |
| 588 | + ] |
| 589 | + }, |
369 | 590 | {
|
370 | 591 | "cell_type": "markdown",
|
371 | 592 | "id": "76d8bca1",
|
|
429 | 650 | " \"dependencies_projection_node\": \"Package\",\n",
|
430 | 651 | " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
|
431 | 652 | " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n",
|
432 | | - "    \"dependencies_projection_embedding_dimension\":\"64\"\n", |
| 653 | + " \"dependencies_projection_embedding_dimension\":\"2\"\n", |
433 | 654 | "}\n",
|
434 | 655 | "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", java_package_embeddings_parameters)\n",
|
| 656 | + "plot_2d_node_embeddings(\n", |
| 657 | + " get_first_2_embedding_dimensions(embeddings), \n", |
| 658 | + " \"Java Package positioned by their dependency relationships (HashGNN first 2 node embeddings without t-SNE)\"\n", |
| 659 | + ")\n", |
435 | 660 | "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
|
436 | 661 | "plot_2d_node_embeddings(\n",
|
437 | 662 | " node_embeddings_for_visualization, \n",
|
|
459 | 684 | " \"dependencies_projection_node\": \"Package\",\n",
|
460 | 685 | " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
|
461 | 686 | " \"dependencies_projection_write_property\": \"embeddingsNode2Vec\",\n",
|
462 | | - "    \"dependencies_projection_embedding_dimension\":\"32\"\n", |
| 687 | + " \"dependencies_projection_embedding_dimension\":\"2\"\n", |
463 | 688 | "}\n",
|
464 | 689 | "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", java_package_embeddings_parameters)\n",
|
| 690 | + "plot_2d_node_embeddings(\n", |
| 691 | + " get_first_2_embedding_dimensions(embeddings), \n", |
| 692 | + " \"Java Package positioned by their dependency relationships (node2vec first 2 node embeddings without t-SNE)\"\n", |
| 693 | + ")\n", |
465 | 694 | "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
|
466 | 695 | "plot_2d_node_embeddings(\n",
|
467 | 696 | " node_embeddings_for_visualization, \n",
|
|