Commit 2a7a4f4

Add hyperparameter tuning to node embeddings
1 parent 839aa5b commit 2a7a4f4

File tree

1 file changed

+236 -7 lines changed


jupyter/NodeEmbeddingsJava.ipynb

Lines changed: 236 additions & 7 deletions
@@ -62,6 +62,17 @@
 "from neo4j import GraphDatabase"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "29b00ea6",
+"metadata": {},
+"outputs": [],
+"source": [
+"# Main Colormap\n",
+"main_color_map = 'nipy_spectral'"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -171,7 +182,7 @@
 "# TODO run a community detection algorithm co-located in here when \"communityId\" is missing\n",
 "# TODO run a centrality algorithm co-located in here when \"centrality\" score is missing\n",
 "\n",
-"def create_node_embeddings(cypher_file_name: str, parameters: dict) -> pd.DataFrame: \n",
+"def create_node_embeddings(cypher_file_name: str, parameters: dict, ignore_existing: bool = True) -> pd.DataFrame: \n",
 " \"\"\"\n",
 " Creates an in-memory Graph projection by calling \"create_undirected_projection\", \n",
 " runs the cypher Query given as cypherFileName parameter to calculate and stream the node embeddings\n",
@@ -200,8 +211,11 @@
 " empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n",
 " return empty_result\n",
 "\n",
-" existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
-" embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n",
+" if ignore_existing:\n",
+" embeddings = query_cypher_to_data_frame(cypher_file_name, parameters_=parameters)\n",
+" else: \n",
+" existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
+" embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n",
 " display(embeddings.head()) # Display the first entries of the table\n",
 " return embeddings"
 ]
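The ignore_existing flag introduced above defaults to True, so the embeddings are always recalculated by running the given Cypher query directly; only with ignore_existing=False does the function first look for already calculated embeddings via Node_Embeddings_0a_Query_Calculated.cypher. A minimal, hypothetical usage sketch, assuming the notebook's create_node_embeddings function and the java_package_embeddings_parameters dictionary shown further down in this diff:

    # Sketch: reuse embeddings that were already written to the graph instead of recomputing them.
    embeddings = create_node_embeddings(
        "../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher",
        java_package_embeddings_parameters,
        ignore_existing=False,
    )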
@@ -273,7 +287,7 @@
 "outputs": [],
 "source": [
 "def plot_2d_node_embeddings(node_embeddings_for_visualization: pd.DataFrame, title: str):\n",
-" if embeddings.empty:\n",
+" if node_embeddings_for_visualization.empty:\n",
 " print(\"No projected data to plot available\")\n",
 " return\n",
 "\n",
@@ -288,6 +302,30 @@
 " plot.show()"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "e80a45ec",
+"metadata": {},
+"outputs": [],
+"source": [
+"def get_first_2_embedding_dimensions(embeddings: pd.DataFrame) -> pd.DataFrame:\n",
+" \"\"\"\n",
+" Returns the first two dimensions of the node embeddings as a DataFrame.\n",
+" \"\"\"\n",
+" if embeddings.empty: \n",
+" print(\"No projected data for node embeddings dimensionality reduction available\")\n",
+" return embeddings\n",
+" \n",
+" # Create a new DataFrame with the results of the 2 dimensional node embeddings\n",
+" # and the code unit and artifact name of the query above as preparation for the plot\n",
+" get_first_2_embedding_dimensions = embeddings.copy()\n",
+"\n",
+" get_first_2_embedding_dimensions['x'] = [value[0] for value in get_first_2_embedding_dimensions.embedding]\n",
+" get_first_2_embedding_dimensions['y'] = [value[1] for value in get_first_2_embedding_dimensions.embedding]\n",
+" return get_first_2_embedding_dimensions"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -361,11 +399,194 @@
 " \"dependencies_projection_node\": \"Package\",\n",
 " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
 " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n",
-" \"dependencies_projection_embedding_dimension\":\"32\"\n",
+" \"dependencies_projection_embedding_dimension\":\"16\"\n",
 "}\n",
 "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)\n"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "1709344c",
+"metadata": {},
+"outputs": [],
+"source": [
+"plot_2d_node_embeddings(\n",
+" get_first_2_embedding_dimensions(embeddings), \n",
+" \"Java Package positioned by their dependency relationships (FastRP first 2 node embeddings without t-SNE)\"\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "84642495",
+"metadata": {},
+"outputs": [],
+"source": [
+"import numpy.typing as numpy_typing\n",
+"\n",
+"class TunedClusteringResult:\n",
+" def __init__(self, labels : list, probabilities : list):\n",
+" self.labels = labels\n",
+" self.probabilities = probabilities\n",
+" self.cluster_count = len(set(labels)) - (1 if -1 in labels else 0)\n",
+"\n",
+"def tuned_hierarchical_density_based_spatial_clustering(embeddings: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> TunedClusteringResult:\n",
+" \"\"\"\n",
+" Applies the optimized hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n",
+" The parameters are tuned to get results similar to the ones of the community detection algorithm.\n",
+" The result is a list of cluster ids for each node embedding.\n",
+" \"\"\"\n",
+" from sklearn.model_selection import RandomizedSearchCV\n",
+" from sklearn.cluster import HDBSCAN\n",
+" from sklearn.metrics import adjusted_rand_score\n",
+" import numpy as np\n",
+"\n",
+" # specify parameters and distributions to sample from\n",
+" hyper_parameter_distributions = {\n",
+" \"min_samples\": [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 50, 100],\n",
+" \"min_cluster_size\": [4, 5, 6, 7, 10, 20, 30, 50, 100],\n",
+" \"cluster_selection_method\": [\"eom\", \"leaf\"],\n",
+" \"metric\": [\"euclidean\", \"manhattan\"],\n",
+" }\n",
+" \n",
+" def adjusted_rand_scorer_with_penalty_for_community_references(community_references):\n",
+" \"\"\"\n",
+" Creates a custom scoring function based on the Adjusted Rand Index (ARI) that penalizes for high noise ratio in clustering.\n",
+" Input:\n",
+" - community_references: The true labels of the communities for the data points.\n",
+" Output:\n",
+" - A scoring function that can directly be used for e.g. RandomizedSearchCV and that takes an estimator and data (X) and returns the ARI score with a penalty for noise ratio.\n",
+" \"\"\"\n",
+" def ari_score_with_penalty(estimator, embeddings):\n",
+" clustering_result = estimator.fit_predict(embeddings)\n",
+"\n",
+" # Calculate the noise ratio. Noise points are labeled as -1 in HDBSCAN.\n",
+" noise_ratio = np.sum(clustering_result == -1) / len(clustering_result)\n",
+" \n",
+" if np.unique(clustering_result[clustering_result != -1]).size < 2:\n",
+" return -1 # Return worst score if only one cluster is found or all points are noise\n",
+"\n",
+" ari = adjusted_rand_score(community_references[clustering_result != -1], clustering_result[clustering_result != -1])\n",
+" \n",
+" # Penalize for high noise: If 80% of the points are noise, even a perfect ARI of 1.0 gets scaled down to 0.2\n",
+" penalty = 1.0 - noise_ratio \n",
+" \n",
+" return ari * penalty\n",
+" return ari_score_with_penalty\n",
+"\n",
+"\n",
+" # Use custom CV that feeds all data to each fold (no slicing)\n",
+" all_data_without_slicing_cross_validator = [(np.arange(len(embeddings)), np.arange(len(embeddings)))]\n",
+"\n",
+" hdbscan_with_random_search = RandomizedSearchCV(\n",
+" estimator=HDBSCAN(),\n",
+" refit=False, # Without refit, the estimator doesn't need to implement the 'predict' method. Drawback: Only the best parameters are returned, not the best model.\n",
+" param_distributions=hyper_parameter_distributions,\n",
+" n_iter=20,\n",
+" scoring=adjusted_rand_scorer_with_penalty_for_community_references(reference_community_ids),\n",
+" cv=all_data_without_slicing_cross_validator,\n",
+" verbose=1\n",
+" )\n",
+"\n",
+" hdbscan_with_random_search.fit(embeddings)\n",
+"\n",
+" #print(\"Best adjusted rand score with noise penalty:\", hdbscan_with_random_search.best_score_)\n",
+" print(\"Tuned HDBSCAN parameters:\", hdbscan_with_random_search.best_params_)\n",
+"\n",
+" # Run the clustering again with the best parameters\n",
+" cluster_algorithm = HDBSCAN(**hdbscan_with_random_search.best_params_, allow_single_cluster=False)\n",
+" best_model = cluster_algorithm.fit(embeddings)\n",
+"\n",
+" results = TunedClusteringResult(best_model.labels_, best_model.probabilities_)\n",
+" print(f\"Number of HDBSCAN clusters (excluding noise): {results.cluster_count:.0f}\")\n",
+" return results"
+]
+},
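The scoring function in the cell above trades agreement with the reference communities (Adjusted Rand Index) against the fraction of points HDBSCAN labels as noise. A small self-contained sketch of that calculation, with made-up labels purely for illustration:

    import numpy as np
    from sklearn.metrics import adjusted_rand_score

    reference_communities = np.array([0, 0, 1, 1, 2, 2])
    cluster_labels = np.array([0, 0, 1, 1, -1, -1])  # two points flagged as noise (-1)

    # Noise ratio as in the scorer above: 2 of 6 points are noise.
    noise_ratio = np.sum(cluster_labels == -1) / len(cluster_labels)

    # ARI is computed on the non-noise points only; for this toy example it is 1.0.
    non_noise = cluster_labels != -1
    ari = adjusted_rand_score(reference_communities[non_noise], cluster_labels[non_noise])

    # The perfect ARI gets scaled down by the noise penalty: 1.0 * (1 - 2/6) = 0.666...
    print(ari * (1.0 - noise_ratio))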
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "8e1f0227",
+"metadata": {},
+"outputs": [],
+"source": [
+"import numpy.typing as numpy_typing\n",
+"\n",
+"class CommunityComparingScores:\n",
+" def __init__(self, adjusted_rand_index: float, normalized_mutual_information: float):\n",
+" self.adjusted_rand_index = adjusted_rand_index\n",
+" self.normalized_mutual_information = normalized_mutual_information\n",
+" self.scores = {\n",
+" \"Adjusted Rand Index\": adjusted_rand_index,\n",
+" \"Normalized Mutual Information\": normalized_mutual_information\n",
+" }\n",
+" def __repr__(self):\n",
+" return f\"CommunityComparingScores(adjusted_rand_index={self.adjusted_rand_index}, normalized_mutual_information={self.normalized_mutual_information})\"\n",
+"\n",
+"def get_community_comparing_scores(cluster_labels: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> CommunityComparingScores:\n",
+" \"\"\"\n",
+" Returns a DataFrame with the scores of the clustering algorithm compared to the community detection algorithm.\n",
+" The scores are calculated using the adjusted rand index (ARI) and the normalized mutual information (NMI).\n",
+" \"\"\"\n",
+" from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score\n",
+"\n",
+" # Create a mask to filter out noise points. In HDBSCAN, noise points are labeled as -1\n",
+" mask = cluster_labels != -1\n",
+" ari = adjusted_rand_score(reference_community_ids[mask], cluster_labels[mask])\n",
+" nmi = normalized_mutual_info_score(reference_community_ids[mask], cluster_labels[mask])\n",
+"\n",
+" return CommunityComparingScores(ari, nmi)"
+]
+},
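Both scores computed above compare two partitions independently of the concrete label values, so the HDBSCAN cluster labels do not need to match the communityId values numerically. A tiny illustration with invented labels (not taken from the notebook's data):

    from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

    community_ids = [0, 0, 0, 1, 1, 1]
    cluster_ids = [5, 5, 5, 9, 9, 9]  # same grouping, different label values

    print(adjusted_rand_score(community_ids, cluster_ids))           # 1.0
    print(normalized_mutual_info_score(community_ids, cluster_ids))  # 1.0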
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "3c4e8821",
+"metadata": {},
+"outputs": [],
+"source": [
+"def add_clustering_results_to_embeddings(embeddings: pd.DataFrame, clustering_result: TunedClusteringResult, clustering_name: str) -> pd.DataFrame:\n",
+" \"\"\"\n",
+" Adds the clustering results to the embeddings DataFrame.\n",
+" \"\"\"\n",
+" embeddings['clustering' + clustering_name + 'Label'] = clustering_result.labels\n",
+" embeddings['clustering' + clustering_name + 'Probability'] = clustering_result.probabilities\n",
+" return embeddings\n",
+"\n",
+"def get_clustering_results_distribution(embeddings: pd.DataFrame, clustering_name: str) -> pd.DataFrame:\n",
+" \"\"\"\n",
+" Returns the clustering results distribution for the given clustering name.\n",
+" \"\"\"\n",
+" return embeddings.groupby('clustering' + clustering_name + 'Label').aggregate(\n",
+" probability=('clustering' + clustering_name + 'Probability', 'mean'),\n",
+" count=('codeUnitName', 'count'),\n",
+" communityIds=('communityId', lambda x: list(set(x))),\n",
+" codeUnitNames=('codeUnitName', lambda x: list(set(x))),\n",
+" ).reset_index().sort_values(by='count', ascending=False)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "6580301e",
+"metadata": {},
+"outputs": [],
+"source": [
+"import numpy as np\n",
+"\n",
+"embeddings_values = np.array(embeddings.embedding.tolist())\n",
+"community_reference_ids = np.array(embeddings.communityId.tolist())\n",
+"\n",
+"results = tuned_hierarchical_density_based_spatial_clustering(embeddings_values, community_reference_ids)\n",
+"\n",
+"community_comparing_scores = get_community_comparing_scores(results.labels, community_reference_ids)\n",
+"print(community_comparing_scores)\n",
+"\n",
+"embeddings = add_clustering_results_to_embeddings(embeddings, results, \"Hdbscan\")\n",
+"display(get_clustering_results_distribution(embeddings, \"Hdbscan\")) "
+]
+},
 {
 "cell_type": "markdown",
 "id": "76d8bca1",
@@ -429,9 +650,13 @@
 " \"dependencies_projection_node\": \"Package\",\n",
 " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
 " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n",
-" \"dependencies_projection_embedding_dimension\":\"64\"\n",
+" \"dependencies_projection_embedding_dimension\":\"2\"\n",
 "}\n",
 "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", java_package_embeddings_parameters)\n",
+"plot_2d_node_embeddings(\n",
+" get_first_2_embedding_dimensions(embeddings), \n",
+" \"Java Package positioned by their dependency relationships (HashGNN first 2 node embeddings without t-SNE)\"\n",
+")\n",
 "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
 "plot_2d_node_embeddings(\n",
 " node_embeddings_for_visualization, \n",
@@ -459,9 +684,13 @@
 " \"dependencies_projection_node\": \"Package\",\n",
 " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
 " \"dependencies_projection_write_property\": \"embeddingsNode2Vec\",\n",
-" \"dependencies_projection_embedding_dimension\":\"32\"\n",
+" \"dependencies_projection_embedding_dimension\":\"2\"\n",
 "}\n",
 "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", java_package_embeddings_parameters)\n",
+"plot_2d_node_embeddings(\n",
+" get_first_2_embedding_dimensions(embeddings), \n",
+" \"Java Package positioned by their dependency relationships (node2vec first 2 node embeddings without t-SNE)\"\n",
+")\n",
 "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
 "plot_2d_node_embeddings(\n",
 " node_embeddings_for_visualization, \n",
