
Commit 29ebbcb

Add hyperparameter tuning to node embeddings
1 parent bcfac5d commit 29ebbcb

File tree

2 files changed: +255 −15 lines changed


jupyter/NodeEmbeddingsJava.ipynb

Lines changed: 253 additions & 10 deletions
@@ -62,6 +62,17 @@
 "from neo4j import GraphDatabase"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "29b00ea6",
+"metadata": {},
+"outputs": [],
+"source": [
+"# Main Colormap\n",
+"main_color_map = 'nipy_spectral'"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -171,7 +182,7 @@
 "# TODO run a community detection algorithm co-located in here when \"communityId\" is missing\n",
 "# TODO run a centrality algorithm co-located in here when \"centrality\" score is missing\n",
 "\n",
-"def create_node_embeddings(cypher_file_name: str, parameters: dict) -> pd.DataFrame: \n",
+"def create_node_embeddings(cypher_file_name: str, parameters: dict, ignore_existing: bool = True) -> pd.DataFrame: \n",
 " \"\"\"\n",
 " Creates an in-memory Graph projection by calling \"create_undirected_projection\", \n",
 " runs the cypher Query given as cypherFileName parameter to calculate and stream the node embeddings\n",
@@ -197,11 +208,14 @@
 " \n",
 " if not is_data_available:\n",
 " print(\"No projected data for node embeddings calculation available\")\n",
-" empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n",
+" empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'nodeElementId', 'communityId', 'centrality', 'embedding'])\n",
 " return empty_result\n",
 "\n",
-" existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
-" embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n",
+" if ignore_existing:\n",
+" embeddings = query_cypher_to_data_frame(cypher_file_name, parameters_=parameters)\n",
+" else: \n",
+" existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
+" embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n",
 " display(embeddings.head()) # Display the first entries of the table\n",
 " return embeddings"
 ]
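The new ignore_existing flag decides whether embeddings are always recalculated or whether previously calculated ones stored in the graph are reused. A minimal sketch of that branching (not part of the commit; the notebook's query helpers are stubbed out here, and load_embeddings is a hypothetical name):

import pandas as pd

# Stubs standing in for the notebook's Cypher query helpers (assumption: both return a pandas DataFrame).
def query_cypher_to_data_frame(file_name: str, parameters_: dict) -> pd.DataFrame:
    return pd.DataFrame({"embedding": [[0.1, 0.2]]})  # always recalculates

def query_first_non_empty_cypher_to_data_frame(*file_names: str, parameters: dict) -> pd.DataFrame:
    return pd.DataFrame({"embedding": [[0.3, 0.4]]})  # returns the first query that yields rows

def load_embeddings(cypher_file_name: str, parameters: dict, ignore_existing: bool = True) -> pd.DataFrame:
    if ignore_existing:
        # Always run the embedding algorithm, even if results were written to the graph before.
        return query_cypher_to_data_frame(cypher_file_name, parameters_=parameters)
    # Otherwise prefer embeddings that were already calculated and stored; fall back to recalculating.
    existing_query = "../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher"
    return query_first_non_empty_cypher_to_data_frame(existing_query, cypher_file_name, parameters=parameters)

print(load_embeddings("Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher", {}, ignore_existing=False))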
@@ -255,7 +269,9 @@
 " node_embeddings_for_visualization = pd.DataFrame(data = {\n",
 " \"codeUnit\": embeddings.codeUnitName,\n",
 " \"artifact\": embeddings.projectName,\n",
+" \"nodeElementId\": embeddings.nodeElementId,\n",
 " \"communityId\": embeddings.communityId,\n",
+" \"clusteringTunedHDBSCANLabel\": embeddings.clusteringTunedHDBSCANLabel,\n",
 " \"centrality\": embeddings.centrality,\n",
 " \"x\": [value[0] for value in two_dimension_node_embeddings],\n",
 " \"y\": [value[1] for value in two_dimension_node_embeddings]\n",
@@ -273,19 +289,42 @@
 "outputs": [],
 "source": [
 "def plot_2d_node_embeddings(node_embeddings_for_visualization: pd.DataFrame, title: str):\n",
-" if embeddings.empty:\n",
+" if node_embeddings_for_visualization.empty:\n",
 " print(\"No projected data to plot available\")\n",
 " return\n",
+" \n",
+" figure, (top, bottom) = plot.subplots(nrows=2, ncols=1, figsize=(10, 15))\n",
+" figure.suptitle(title)\n",
+" figure.subplots_adjust(top=0.92, hspace=0.2)\n",
+"\n",
+" node_embeddings_non_noise_cluster = node_embeddings_for_visualization[node_embeddings_for_visualization.clusteringTunedHDBSCANLabel != -1]\n",
+" node_embeddings_noise_cluster = node_embeddings_for_visualization[node_embeddings_for_visualization.clusteringTunedHDBSCANLabel == -1]\n",
 "\n",
-" plot.scatter(\n",
+" # Print the graph communities as a reference in the top plot\n",
+" top.set_title(\"Leiden Community Detection\")\n",
+" top.scatter(\n",
 " x=node_embeddings_for_visualization.x,\n",
 " y=node_embeddings_for_visualization.y,\n",
 " s=node_embeddings_for_visualization.centrality * 300,\n",
 " c=node_embeddings_for_visualization.communityId,\n",
 " cmap=main_color_map,\n",
 " )\n",
-" plot.title(title)\n",
-" plot.show()"
+"\n",
+" # Print the clustering results based on the node embeddings in the bottom plot\n",
+" bottom.set_title(\"HDBSCAN Clustering\")\n",
+" bottom.scatter(\n",
+" x=node_embeddings_non_noise_cluster.x,\n",
+" y=node_embeddings_non_noise_cluster.y,\n",
+" s=node_embeddings_non_noise_cluster.centrality * 300,\n",
+" c=node_embeddings_non_noise_cluster.clusteringTunedHDBSCANLabel,\n",
+" cmap=main_color_map,\n",
+" )\n",
+" bottom.scatter(\n",
+" x=node_embeddings_noise_cluster.x,\n",
+" y=node_embeddings_noise_cluster.y,\n",
+" s=node_embeddings_noise_cluster.centrality * 300,\n",
+" c='lightgrey'\n",
+" )"
 ]
 },
 {
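The reworked plot stacks two panels: the Leiden communities on top as the reference, and the tuned HDBSCAN clusters below with noise points (label -1) drawn in light grey. A self-contained sketch of the same split on synthetic data (not part of the commit; assumes scikit-learn >= 1.3 for sklearn.cluster.HDBSCAN):

import matplotlib.pyplot as plot
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs

# Synthetic stand-in for the 2D-reduced node embeddings and their reference communities.
points, reference_labels = make_blobs(n_samples=300, centers=4, random_state=42)
cluster_labels = HDBSCAN(min_cluster_size=10).fit_predict(points)

figure, (top, bottom) = plot.subplots(nrows=2, ncols=1, figsize=(6, 9))
figure.suptitle("Reference communities vs. HDBSCAN clustering")

top.set_title("Reference labels")
top.scatter(points[:, 0], points[:, 1], c=reference_labels, cmap='nipy_spectral')

is_noise = cluster_labels == -1  # HDBSCAN marks noise with -1
bottom.set_title("HDBSCAN clusters (noise in grey)")
bottom.scatter(points[~is_noise, 0], points[~is_noise, 1], c=cluster_labels[~is_noise], cmap='nipy_spectral')
bottom.scatter(points[is_noise, 0], points[is_noise, 1], c='lightgrey')

plot.show()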
@@ -363,7 +402,208 @@
 " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n",
 " \"dependencies_projection_embedding_dimension\":\"32\"\n",
 "}\n",
-"embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)\n"
+"embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "84642495",
+"metadata": {},
+"outputs": [],
+"source": [
+"import numpy.typing as numpy_typing\n",
+"\n",
+"class TunedClusteringResult:\n",
+" def __init__(self, labels : list, probabilities : list):\n",
+" self.labels = labels\n",
+" self.probabilities = probabilities\n",
+" self.cluster_count = len(set(labels)) - (1 if -1 in labels else 0)\n",
+" self.noise_count = np.sum(labels == -1)\n",
+" self.noise_ratio = self.noise_count / len(labels) if len(labels) > 0 else 0\n",
+" def __repr__(self):\n",
+" return f\"TunedClusteringResult(cluster_count={self.cluster_count}, noise_count={self.noise_count}, noise_ratio={self.noise_ratio}, labels=[...], probabilities=[...], )\"\n",
+"\n",
+"def tuned_hierarchical_density_based_spatial_clustering(embeddings: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> TunedClusteringResult:\n",
+" \"\"\"\n",
+" Applies the optimized hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n",
+" The parameters are tuned to get results similar to the ones of the community detection algorithm.\n",
+" The result is a list of cluster ids for each node embedding.\n",
+" \"\"\"\n",
+" from sklearn.model_selection import GridSearchCV\n",
+" from sklearn.cluster import HDBSCAN\n",
+" from sklearn.metrics import adjusted_rand_score\n",
+" import numpy as np\n",
+"\n",
+" # specify parameters and distributions to sample from\n",
+" hyper_parameter_distributions = {\n",
+" \"min_samples\": [2, 3, 4, 5, 6, 7, 10, 20, 30, 50, 100],\n",
+" \"min_cluster_size\": [4, 5, 6, 7, 10, 20, 30, 50, 100],\n",
+" \"cluster_selection_method\": [\"eom\", \"leaf\"],\n",
+" \"metric\": [\"euclidean\", \"manhattan\"],\n",
+" }\n",
+" \n",
+" def adjusted_rand_scorer_with_penalty_for_community_references(community_references):\n",
+" \"\"\"\n",
+" Creates a custom scoring function based on the Adjusted Rand Index (ARI) that penalizes for high noise ratio in clustering.\n",
+" Input:\n",
+" - community_references: The true labels of the communities for the data points.\n",
+" Output:\n",
+" - A scoring function that can directly be used for e.g. RandomizedSearchCV and that takes an estimator and data (X) and returns the ARI score with a penalty for noise ratio.\n",
+" \"\"\"\n",
+" def ari_score_with_penalty(estimator, embeddings):\n",
+" clustering_result = estimator.fit_predict(embeddings)\n",
+" \n",
+" if np.unique(clustering_result[clustering_result != -1]).size < 2:\n",
+" return -1 # Return worst score if only one cluster is found or all points are noise\n",
+" \n",
+" # Calculate the noise ratio. Noise points are labeled as -1 in HDBSCAN.\n",
+" noise_ratio = np.sum(clustering_result == -1) / len(clustering_result)\n",
+"\n",
+" if noise_ratio > 0.50:\n",
+" return -1 # Return worst score if more than 50% percent of the points are unlabeled noise\n",
+"\n",
+" ari = adjusted_rand_score(community_references[clustering_result != -1], clustering_result[clustering_result != -1])\n",
+"\n",
+" # Penalize for high noise: If 80% of the points are noise, even a perfect ARI of 1.0 gets scaled down to 0.2\n",
+" penalty = 1.0 - noise_ratio \n",
+" \n",
+" return ari * penalty\n",
+" return ari_score_with_penalty\n",
+"\n",
+"\n",
+" # Use custom CV that feeds all data to each fold (no slicing)\n",
+" all_data_without_slicing_cross_validator = [(np.arange(len(embeddings)), np.arange(len(embeddings)))]\n",
+"\n",
+" tuned_hdbscan = GridSearchCV(\n",
+" estimator=HDBSCAN(),\n",
+" refit=False, # Without refit, the estimator doesn't need to implement the 'predict' method. Drawback: Only the best parameters are returned, not the best model.\n",
+" param_grid=hyper_parameter_distributions,\n",
+" n_jobs=4,\n",
+" scoring=adjusted_rand_scorer_with_penalty_for_community_references(reference_community_ids),\n",
+" cv=all_data_without_slicing_cross_validator,\n",
+" verbose=1\n",
+" )\n",
+"\n",
+" tuned_hdbscan.fit(embeddings)\n",
+"\n",
+" #print(\"Best adjusted rand score with noise penalty:\", tuned_hdbscan.best_score_)\n",
+" print(\"Tuned HDBSCAN parameters:\", tuned_hdbscan.best_params_)\n",
+"\n",
+" # Run the clustering again with the best parameters\n",
+" cluster_algorithm = HDBSCAN(**tuned_hdbscan.best_params_, allow_single_cluster=False)\n",
+" best_model = cluster_algorithm.fit(embeddings)\n",
+"\n",
+" results = TunedClusteringResult(best_model.labels_, best_model.probabilities_)\n",
+" print(f\"Number of HDBSCAN clusters (excluding noise): {results.cluster_count:.0f}\")\n",
+" return results"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "8e1f0227",
+"metadata": {},
+"outputs": [],
+"source": [
+"import numpy.typing as numpy_typing\n",
+"\n",
+"class CommunityComparingScores:\n",
+" def __init__(self, adjusted_rand_index: float, normalized_mutual_information: float):\n",
+" self.adjusted_rand_index = adjusted_rand_index\n",
+" self.normalized_mutual_information = normalized_mutual_information\n",
+" self.scores = {\n",
+" \"Adjusted Rand Index\": adjusted_rand_index,\n",
+" \"Normalized Mutual Information\": normalized_mutual_information\n",
+" }\n",
+" def __repr__(self):\n",
+" return f\"CommunityComparingScores(adjusted_rand_index={self.adjusted_rand_index}, normalized_mutual_information={self.normalized_mutual_information})\"\n",
+"\n",
+"def get_community_comparing_scores(cluster_labels: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> CommunityComparingScores:\n",
+" \"\"\"\n",
+" Returns a DataFrame with the scores of the clustering algorithm compared to the community detection algorithm.\n",
+" The scores are calculated using the adjusted rand index (ARI) and the normalized mutual information (NMI).\n",
+" \"\"\"\n",
+" from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score\n",
+"\n",
+" # Create a mask to filter out noise points. In HDBSCAN, noise points are labeled as -1\n",
+" mask = cluster_labels != -1\n",
+" ari = adjusted_rand_score(reference_community_ids[mask], cluster_labels[mask])\n",
+" nmi = normalized_mutual_info_score(reference_community_ids[mask], cluster_labels[mask])\n",
+"\n",
+" return CommunityComparingScores(ari, nmi)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "3c4e8821",
+"metadata": {},
+"outputs": [],
+"source": [
+"def add_clustering_results_to_embeddings(embeddings: pd.DataFrame, clustering_result: TunedClusteringResult, clustering_name: str) -> pd.DataFrame:\n",
+" \"\"\"\n",
+" Adds the clustering results to the embeddings DataFrame.\n",
+" \"\"\"\n",
+" embeddings['clustering' + clustering_name + 'Label'] = clustering_result.labels\n",
+" embeddings['clustering' + clustering_name + 'Probability'] = clustering_result.probabilities\n",
+" return embeddings\n",
+"\n",
+"def get_clustering_results_distribution(embeddings: pd.DataFrame, clustering_name: str) -> pd.DataFrame:\n",
+" \"\"\"\n",
+" Returns the clustering results distribution for the given clustering name.\n",
+" \"\"\"\n",
+" return embeddings.groupby('clustering' + clustering_name + 'Label').aggregate(\n",
+" probability=('clustering' + clustering_name + 'Probability', 'mean'),\n",
+" count=('codeUnitName', 'count'),\n",
+" communityIds=('communityId', lambda x: list(set(x))),\n",
+" codeUnitNames=('codeUnitName', lambda x: list(set(x))),\n",
+" ).reset_index().sort_values(by='count', ascending=False)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "c27ec0ec",
+"metadata": {},
+"outputs": [],
+"source": [
+"def add_tuned_hierarchical_density_based_spatial_clustering(embeddings: pd.DataFrame) -> pd.DataFrame:\n",
+" \"\"\"\n",
+" Applies the tuned hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n",
+" The parameters are tuned to get results similar to the ones of the community detection algorithm.\n",
+" The result is the input DataFrame with the clustering results added.\n",
+" \"\"\"\n",
+" # Apply the tuned HDBSCAN clustering algorithm\n",
+" embeddings_values = np.array(embeddings.embedding.tolist())\n",
+" community_reference_ids = np.array(embeddings.communityId.tolist())\n",
+" \n",
+" clustering_result = tuned_hierarchical_density_based_spatial_clustering(embeddings_values, community_reference_ids)\n",
+" print(clustering_result)\n",
+" \n",
+" community_comparing_scores = get_community_comparing_scores(clustering_result.labels, community_reference_ids)\n",
+" print(community_comparing_scores)\n",
+"\n",
+" # Add the clustering results to the embeddings DataFrame\n",
+" embeddings = add_clustering_results_to_embeddings(embeddings, clustering_result, \"TunedHDBSCAN\")\n",
+" \n",
+" # Get the clustering results distribution\n",
+" clustering_results_distribution = get_clustering_results_distribution(embeddings, \"TunedHDBSCAN\")\n",
+" \n",
+" # Display the clustering results distribution\n",
+" display(clustering_results_distribution)\n",
+" \n",
+" return embeddings"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "0b42ed2a",
+"metadata": {},
+"outputs": [],
+"source": [
+"embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings)"
 ]
 },
 {
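The core of the commit is this hyperparameter search: GridSearchCV drives sklearn.cluster.HDBSCAN with a custom scorer that computes the Adjusted Rand Index against the Leiden communityId references and scales it down by the noise ratio; a single "fold" containing all indices avoids train/test slicing, and refit=False means the estimator never has to implement predict (only best_params_ is kept). A condensed, runnable sketch of the same technique on synthetic data (not the notebook's exact code; assumes scikit-learn >= 1.3):

import numpy as np
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import GridSearchCV

# Synthetic stand-ins for the node embeddings and their reference community ids.
points, reference_labels = make_blobs(n_samples=300, centers=5, random_state=42)

def ari_with_noise_penalty(estimator, data):
    """Scorer called by GridSearchCV as scorer(estimator, X) since no y is passed to fit."""
    labels = estimator.fit_predict(data)
    if np.unique(labels[labels != -1]).size < 2:
        return -1  # degenerate result: a single cluster or all points are noise
    noise_ratio = np.sum(labels == -1) / len(labels)
    if noise_ratio > 0.5:
        return -1  # mostly noise: treat as worst score
    ari = adjusted_rand_score(reference_labels[labels != -1], labels[labels != -1])
    return ari * (1.0 - noise_ratio)  # high noise scales down even a perfect ARI

# One fold that uses all points for both "training" and scoring, so nothing gets sliced away.
all_points_fold = [(np.arange(len(points)), np.arange(len(points)))]

search = GridSearchCV(
    estimator=HDBSCAN(),
    param_grid={"min_cluster_size": [5, 10, 20], "cluster_selection_method": ["eom", "leaf"]},
    scoring=ari_with_noise_penalty,
    cv=all_points_fold,
    refit=False,  # HDBSCAN has no predict; only best_params_/best_score_ are needed
)
search.fit(points)
print(search.best_params_, search.best_score_)

# Re-run the clustering once with the winning parameters to obtain labels and probabilities.
best = HDBSCAN(**search.best_params_).fit(points)
print("clusters:", len(set(best.labels_)) - (1 if -1 in best.labels_ else 0))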
@@ -429,9 +669,11 @@
 " \"dependencies_projection_node\": \"Package\",\n",
 " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
 " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n",
-" \"dependencies_projection_embedding_dimension\":\"64\"\n",
+" \"dependencies_projection_embedding_dimension\":\"128\"\n",
 "}\n",
 "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", java_package_embeddings_parameters)\n",
+"embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings)\n",
+"\n",
 "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
 "plot_2d_node_embeddings(\n",
 " node_embeddings_for_visualization, \n",
@@ -462,6 +704,7 @@
 " \"dependencies_projection_embedding_dimension\":\"32\"\n",
 "}\n",
 "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", java_package_embeddings_parameters)\n",
+"embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings)\n",
 "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
 "plot_2d_node_embeddings(\n",
 " node_embeddings_for_visualization, \n",

scripts/reports/CommunityCsv.sh

Lines changed: 2 additions & 5 deletions
@@ -251,17 +251,14 @@ detectCommunitiesWithKCoreDecomposition() {
 # Label of the nodes that will be used for the projection. Example: "Package"
 # - dependencies_projection_weight_property=...
 # Name of the node property that contains the dependency weight. Example: "weight"
-# - dependencies_projection_embedding_dimension=...
-# Number of the dimensions and therefore size of the resulting array of floating point numbers
 nodeEmbeddingsWithFastRandomProjectionForHDBSCAN() {
-local PROJECTION_CYPHER_DIR="${CYPHER_DIR}/Dependencies_Projection"
 local NODE_EMBEDDINGS_CYPHER_DIR="${CYPHER_DIR}/Node_Embeddings"
 local mutatePropertyName="dependencies_projection_write_property=embeddingsFastRandomProjection"
 local embeddingsDimension="dependencies_projection_embedding_dimension=2"

 # Statistics
-execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1a_Fast_Random_Projection_Estimate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension}
-execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1b_Fast_Random_Projection_Statistics.cypher" "${@}" ${embeddingsDimension}
+# execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1a_Fast_Random_Projection_Estimate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension}
+# execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1b_Fast_Random_Projection_Statistics.cypher" "${@}" ${embeddingsDimension}

 # Run the algorithm and write the result into the in-memory projection ("mutate")
 execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1c_Fast_Random_Projection_Mutate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension}
