
Commit 29ebbcb

Add hyperparameter tuning to node embeddings
1 parent bcfac5d commit 29ebbcb

File tree

2 files changed: +255 −15 lines changed


jupyter/NodeEmbeddingsJava.ipynb

Lines changed: 253 additions & 10 deletions
@@ -62,6 +62,17 @@
 "from neo4j import GraphDatabase"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "29b00ea6",
+"metadata": {},
+"outputs": [],
+"source": [
+"# Main Colormap\n",
+"main_color_map = 'nipy_spectral'"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -171,7 +182,7 @@
 "# TODO run a community detection algorithm co-located in here when \"communityId\" is missing\n",
 "# TODO run a centrality algorithm co-located in here when \"centrality\" score is missing\n",
 "\n",
-"def create_node_embeddings(cypher_file_name: str, parameters: dict) -> pd.DataFrame: \n",
+"def create_node_embeddings(cypher_file_name: str, parameters: dict, ignore_existing: bool = True) -> pd.DataFrame: \n",
 " \"\"\"\n",
 " Creates an in-memory Graph projection by calling \"create_undirected_projection\", \n",
 " runs the cypher Query given as cypherFileName parameter to calculate and stream the node embeddings\n",
@@ -197,11 +208,14 @@
 " \n",
 " if not is_data_available:\n",
 " print(\"No projected data for node embeddings calculation available\")\n",
-" empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n",
+" empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'nodeElementId', 'communityId', 'centrality', 'embedding'])\n",
 " return empty_result\n",
 "\n",
-" existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
-" embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n",
+" if ignore_existing:\n",
+" embeddings = query_cypher_to_data_frame(cypher_file_name, parameters_=parameters)\n",
+" else: \n",
+" existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
+" embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n",
 " display(embeddings.head()) # Display the first entries of the table\n",
 " return embeddings"
 ]
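The new ignore_existing flag decides whether embeddings are always recalculated or whether previously calculated ones stored in the graph are reused. A minimal sketch of that branching (not part of the commit; the notebook's query helpers are stubbed out here, and load_embeddings is a hypothetical name):

import pandas as pd

# Stubs standing in for the notebook's Cypher query helpers (assumption: both return a pandas DataFrame).
def query_cypher_to_data_frame(file_name: str, parameters_: dict) -> pd.DataFrame:
    return pd.DataFrame({"embedding": [[0.1, 0.2]]})  # always recalculates

def query_first_non_empty_cypher_to_data_frame(*file_names: str, parameters: dict) -> pd.DataFrame:
    return pd.DataFrame({"embedding": [[0.3, 0.4]]})  # returns the first query that yields rows

def load_embeddings(cypher_file_name: str, parameters: dict, ignore_existing: bool = True) -> pd.DataFrame:
    if ignore_existing:
        # Always run the embedding algorithm, even if results were written to the graph before.
        return query_cypher_to_data_frame(cypher_file_name, parameters_=parameters)
    # Otherwise prefer embeddings that were already calculated and stored; fall back to recalculating.
    existing_query = "../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher"
    return query_first_non_empty_cypher_to_data_frame(existing_query, cypher_file_name, parameters=parameters)

print(load_embeddings("Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher", {}, ignore_existing=False))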
@@ -255,7 +269,9 @@
 " node_embeddings_for_visualization = pd.DataFrame(data = {\n",
 " \"codeUnit\": embeddings.codeUnitName,\n",
 " \"artifact\": embeddings.projectName,\n",
+" \"nodeElementId\": embeddings.nodeElementId,\n",
 " \"communityId\": embeddings.communityId,\n",
+" \"clusteringTunedHDBSCANLabel\": embeddings.clusteringTunedHDBSCANLabel,\n",
 " \"centrality\": embeddings.centrality,\n",
 " \"x\": [value[0] for value in two_dimension_node_embeddings],\n",
 " \"y\": [value[1] for value in two_dimension_node_embeddings]\n",
@@ -273,19 +289,42 @@
 "outputs": [],
 "source": [
 "def plot_2d_node_embeddings(node_embeddings_for_visualization: pd.DataFrame, title: str):\n",
-" if embeddings.empty:\n",
+" if node_embeddings_for_visualization.empty:\n",
 " print(\"No projected data to plot available\")\n",
 " return\n",
+" \n",
+" figure, (top, bottom) = plot.subplots(nrows=2, ncols=1, figsize=(10, 15))\n",
+" figure.suptitle(title)\n",
+" figure.subplots_adjust(top=0.92, hspace=0.2)\n",
+"\n",
+" node_embeddings_non_noise_cluster = node_embeddings_for_visualization[node_embeddings_for_visualization.clusteringTunedHDBSCANLabel != -1]\n",
+" node_embeddings_noise_cluster = node_embeddings_for_visualization[node_embeddings_for_visualization.clusteringTunedHDBSCANLabel == -1]\n",
 "\n",
-" plot.scatter(\n",
+" # Print the graph communities as a reference in the top plot\n",
+" top.set_title(\"Leiden Community Detection\")\n",
+" top.scatter(\n",
 " x=node_embeddings_for_visualization.x,\n",
 " y=node_embeddings_for_visualization.y,\n",
 " s=node_embeddings_for_visualization.centrality * 300,\n",
 " c=node_embeddings_for_visualization.communityId,\n",
 " cmap=main_color_map,\n",
 " )\n",
-" plot.title(title)\n",
-" plot.show()"
+"\n",
+" # Print the clustering results based on the node embeddings in the bottom plot\n",
+" bottom.set_title(\"HDBSCAN Clustering\")\n",
+" bottom.scatter(\n",
+" x=node_embeddings_non_noise_cluster.x,\n",
+" y=node_embeddings_non_noise_cluster.y,\n",
+" s=node_embeddings_non_noise_cluster.centrality * 300,\n",
+" c=node_embeddings_non_noise_cluster.clusteringTunedHDBSCANLabel,\n",
+" cmap=main_color_map,\n",
+" )\n",
+" bottom.scatter(\n",
+" x=node_embeddings_noise_cluster.x,\n",
+" y=node_embeddings_noise_cluster.y,\n",
+" s=node_embeddings_noise_cluster.centrality * 300,\n",
+" c='lightgrey'\n",
+" )"
 ]
 },
 {
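The reworked plot stacks two panels: the Leiden communities on top as the reference, and the tuned HDBSCAN clusters below with noise points (label -1) drawn in light grey. A self-contained sketch of the same split on synthetic data (not part of the commit; assumes scikit-learn >= 1.3 for sklearn.cluster.HDBSCAN):

import matplotlib.pyplot as plot
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs

# Synthetic stand-in for the 2D-reduced node embeddings and their reference communities.
points, reference_labels = make_blobs(n_samples=300, centers=4, random_state=42)
cluster_labels = HDBSCAN(min_cluster_size=10).fit_predict(points)

figure, (top, bottom) = plot.subplots(nrows=2, ncols=1, figsize=(6, 9))
figure.suptitle("Reference communities vs. HDBSCAN clustering")

top.set_title("Reference labels")
top.scatter(points[:, 0], points[:, 1], c=reference_labels, cmap='nipy_spectral')

is_noise = cluster_labels == -1  # HDBSCAN marks noise with -1
bottom.set_title("HDBSCAN clusters (noise in grey)")
bottom.scatter(points[~is_noise, 0], points[~is_noise, 1], c=cluster_labels[~is_noise], cmap='nipy_spectral')
bottom.scatter(points[is_noise, 0], points[is_noise, 1], c='lightgrey')

plot.show()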
@@ -363,7 +402,208 @@
 " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n",
 " \"dependencies_projection_embedding_dimension\":\"32\"\n",
 "}\n",
-"embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)\n"
+"embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "84642495",
+"metadata": {},
+"outputs": [],
+"source": [
+"import numpy.typing as numpy_typing\n",
+"\n",
+"class TunedClusteringResult:\n",
+" def __init__(self, labels : list, probabilities : list):\n",
+" self.labels = labels\n",
+" self.probabilities = probabilities\n",
+" self.cluster_count = len(set(labels)) - (1 if -1 in labels else 0)\n",
+" self.noise_count = np.sum(labels == -1)\n",
+" self.noise_ratio = self.noise_count / len(labels) if len(labels) > 0 else 0\n",
+" def __repr__(self):\n",
+" return f\"TunedClusteringResult(cluster_count={self.cluster_count}, noise_count={self.noise_count}, noise_ratio={self.noise_ratio}, labels=[...], probabilities=[...], )\"\n",
+"\n",
+"def tuned_hierarchical_density_based_spatial_clustering(embeddings: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> TunedClusteringResult:\n",
+" \"\"\"\n",
+" Applies the optimized hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n",
+" The parameters are tuned to get results similar to the ones of the community detection algorithm.\n",
+" The result is a list of cluster ids for each node embedding.\n",
+" \"\"\"\n",
+" from sklearn.model_selection import GridSearchCV\n",
+" from sklearn.cluster import HDBSCAN\n",
+" from sklearn.metrics import adjusted_rand_score\n",
+" import numpy as np\n",
+"\n",
+" # specify parameters and distributions to sample from\n",
+" hyper_parameter_distributions = {\n",
+" \"min_samples\": [2, 3, 4, 5, 6, 7, 10, 20, 30, 50, 100],\n",
+" \"min_cluster_size\": [4, 5, 6, 7, 10, 20, 30, 50, 100],\n",
+" \"cluster_selection_method\": [\"eom\", \"leaf\"],\n",
+" \"metric\": [\"euclidean\", \"manhattan\"],\n",
+" }\n",
+" \n",
+" def adjusted_rand_scorer_with_penalty_for_community_references(community_references):\n",
+" \"\"\"\n",
+" Creates a custom scoring function based on the Adjusted Rand Index (ARI) that penalizes for high noise ratio in clustering.\n",
+" Input:\n",
+" - community_references: The true labels of the communities for the data points.\n",
+" Output:\n",
+" - A scoring function that can directly be used for e.g. RandomizedSearchCV and that takes an estimator and data (X) and returns the ARI score with a penalty for noise ratio.\n",
+" \"\"\"\n",
+" def ari_score_with_penalty(estimator, embeddings):\n",
+" clustering_result = estimator.fit_predict(embeddings)\n",
+" \n",
+" if np.unique(clustering_result[clustering_result != -1]).size < 2:\n",
+" return -1 # Return worst score if only one cluster is found or all points are noise\n",
+" \n",
+" # Calculate the noise ratio. Noise points are labeled as -1 in HDBSCAN.\n",
+" noise_ratio = np.sum(clustering_result == -1) / len(clustering_result)\n",
+"\n",
+" if noise_ratio > 0.50:\n",
+" return -1 # Return worst score if more than 50% percent of the points are unlabeled noise\n",
+"\n",
+" ari = adjusted_rand_score(community_references[clustering_result != -1], clustering_result[clustering_result != -1])\n",
+"\n",
+" # Penalize for high noise: If 80% of the points are noise, even a perfect ARI of 1.0 gets scaled down to 0.2\n",
+" penalty = 1.0 - noise_ratio \n",
+" \n",
+" return ari * penalty\n",
+" return ari_score_with_penalty\n",
+"\n",
+"\n",
+" # Use custom CV that feeds all data to each fold (no slicing)\n",
+" all_data_without_slicing_cross_validator = [(np.arange(len(embeddings)), np.arange(len(embeddings)))]\n",
+"\n",
+" tuned_hdbscan = GridSearchCV(\n",
+" estimator=HDBSCAN(),\n",
+" refit=False, # Without refit, the estimator doesn't need to implement the 'predict' method. Drawback: Only the best parameters are returned, not the best model.\n",
+" param_grid=hyper_parameter_distributions,\n",
+" n_jobs=4,\n",
+" scoring=adjusted_rand_scorer_with_penalty_for_community_references(reference_community_ids),\n",
+" cv=all_data_without_slicing_cross_validator,\n",
+" verbose=1\n",
+" )\n",
+"\n",
+" tuned_hdbscan.fit(embeddings)\n",
+"\n",
+" #print(\"Best adjusted rand score with noise penalty:\", tuned_hdbscan.best_score_)\n",
+" print(\"Tuned HDBSCAN parameters:\", tuned_hdbscan.best_params_)\n",
+"\n",
+" # Run the clustering again with the best parameters\n",
+" cluster_algorithm = HDBSCAN(**tuned_hdbscan.best_params_, allow_single_cluster=False)\n",
+" best_model = cluster_algorithm.fit(embeddings)\n",
+"\n",
+" results = TunedClusteringResult(best_model.labels_, best_model.probabilities_)\n",
+" print(f\"Number of HDBSCAN clusters (excluding noise): {results.cluster_count:.0f}\")\n",
+" return results"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "8e1f0227",
+"metadata": {},
+"outputs": [],
+"source": [
+"import numpy.typing as numpy_typing\n",
+"\n",
+"class CommunityComparingScores:\n",
+" def __init__(self, adjusted_rand_index: float, normalized_mutual_information: float):\n",
+" self.adjusted_rand_index = adjusted_rand_index\n",
+" self.normalized_mutual_information = normalized_mutual_information\n",
+" self.scores = {\n",
+" \"Adjusted Rand Index\": adjusted_rand_index,\n",
+" \"Normalized Mutual Information\": normalized_mutual_information\n",
+" }\n",
+" def __repr__(self):\n",
+" return f\"CommunityComparingScores(adjusted_rand_index={self.adjusted_rand_index}, normalized_mutual_information={self.normalized_mutual_information})\"\n",
+"\n",
+"def get_community_comparing_scores(cluster_labels: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> CommunityComparingScores:\n",
+" \"\"\"\n",
+" Returns a DataFrame with the scores of the clustering algorithm compared to the community detection algorithm.\n",
+" The scores are calculated using the adjusted rand index (ARI) and the normalized mutual information (NMI).\n",
+" \"\"\"\n",
+" from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score\n",
+"\n",
+" # Create a mask to filter out noise points. In HDBSCAN, noise points are labeled as -1\n",
+" mask = cluster_labels != -1\n",
+" ari = adjusted_rand_score(reference_community_ids[mask], cluster_labels[mask])\n",
+" nmi = normalized_mutual_info_score(reference_community_ids[mask], cluster_labels[mask])\n",
+"\n",
+" return CommunityComparingScores(ari, nmi)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "3c4e8821",
+"metadata": {},
+"outputs": [],
+"source": [
+"def add_clustering_results_to_embeddings(embeddings: pd.DataFrame, clustering_result: TunedClusteringResult, clustering_name: str) -> pd.DataFrame:\n",
+" \"\"\"\n",
+" Adds the clustering results to the embeddings DataFrame.\n",
+" \"\"\"\n",
+" embeddings['clustering' + clustering_name + 'Label'] = clustering_result.labels\n",
+" embeddings['clustering' + clustering_name + 'Probability'] = clustering_result.probabilities\n",
+" return embeddings\n",
+"\n",
+"def get_clustering_results_distribution(embeddings: pd.DataFrame, clustering_name: str) -> pd.DataFrame:\n",
+" \"\"\"\n",
+" Returns the clustering results distribution for the given clustering name.\n",
+" \"\"\"\n",
+" return embeddings.groupby('clustering' + clustering_name + 'Label').aggregate(\n",
+" probability=('clustering' + clustering_name + 'Probability', 'mean'),\n",
+" count=('codeUnitName', 'count'),\n",
+" communityIds=('communityId', lambda x: list(set(x))),\n",
+" codeUnitNames=('codeUnitName', lambda x: list(set(x))),\n",
+" ).reset_index().sort_values(by='count', ascending=False)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "c27ec0ec",
+"metadata": {},
+"outputs": [],
+"source": [
+"def add_tuned_hierarchical_density_based_spatial_clustering(embeddings: pd.DataFrame) -> pd.DataFrame:\n",
+" \"\"\"\n",
+" Applies the tuned hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n",
+" The parameters are tuned to get results similar to the ones of the community detection algorithm.\n",
+" The result is the input DataFrame with the clustering results added.\n",
+" \"\"\"\n",
+" # Apply the tuned HDBSCAN clustering algorithm\n",
+" embeddings_values = np.array(embeddings.embedding.tolist())\n",
+" community_reference_ids = np.array(embeddings.communityId.tolist())\n",
+" \n",
+" clustering_result = tuned_hierarchical_density_based_spatial_clustering(embeddings_values, community_reference_ids)\n",
+" print(clustering_result)\n",
+" \n",
+" community_comparing_scores = get_community_comparing_scores(clustering_result.labels, community_reference_ids)\n",
+" print(community_comparing_scores)\n",
+"\n",
+" # Add the clustering results to the embeddings DataFrame\n",
+" embeddings = add_clustering_results_to_embeddings(embeddings, clustering_result, \"TunedHDBSCAN\")\n",
+" \n",
+" # Get the clustering results distribution\n",
+" clustering_results_distribution = get_clustering_results_distribution(embeddings, \"TunedHDBSCAN\")\n",
+" \n",
+" # Display the clustering results distribution\n",
+" display(clustering_results_distribution)\n",
+" \n",
+" return embeddings"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "0b42ed2a",
+"metadata": {},
+"outputs": [],
+"source": [
+"embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings)"
 ]
 },
 {
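The core of the commit is this hyperparameter search: GridSearchCV drives sklearn.cluster.HDBSCAN with a custom scorer that computes the Adjusted Rand Index against the Leiden communityId references and scales it down by the noise ratio; a single "fold" containing all indices avoids train/test slicing, and refit=False means the estimator never has to implement predict (only best_params_ is kept). A condensed, runnable sketch of the same technique on synthetic data (not the notebook's exact code; assumes scikit-learn >= 1.3):

import numpy as np
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import GridSearchCV

# Synthetic stand-ins for the node embeddings and their reference community ids.
points, reference_labels = make_blobs(n_samples=300, centers=5, random_state=42)

def ari_with_noise_penalty(estimator, data):
    """Scorer called by GridSearchCV as scorer(estimator, X) since no y is passed to fit."""
    labels = estimator.fit_predict(data)
    if np.unique(labels[labels != -1]).size < 2:
        return -1  # degenerate result: a single cluster or all points are noise
    noise_ratio = np.sum(labels == -1) / len(labels)
    if noise_ratio > 0.5:
        return -1  # mostly noise: treat as worst score
    ari = adjusted_rand_score(reference_labels[labels != -1], labels[labels != -1])
    return ari * (1.0 - noise_ratio)  # high noise scales down even a perfect ARI

# One fold that uses all points for both "training" and scoring, so nothing gets sliced away.
all_points_fold = [(np.arange(len(points)), np.arange(len(points)))]

search = GridSearchCV(
    estimator=HDBSCAN(),
    param_grid={"min_cluster_size": [5, 10, 20], "cluster_selection_method": ["eom", "leaf"]},
    scoring=ari_with_noise_penalty,
    cv=all_points_fold,
    refit=False,  # HDBSCAN has no predict; only best_params_/best_score_ are needed
)
search.fit(points)
print(search.best_params_, search.best_score_)

# Re-run the clustering once with the winning parameters to obtain labels and probabilities.
best = HDBSCAN(**search.best_params_).fit(points)
print("clusters:", len(set(best.labels_)) - (1 if -1 in best.labels_ else 0))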
@@ -429,9 +669,11 @@
 " \"dependencies_projection_node\": \"Package\",\n",
 " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
 " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n",
-" \"dependencies_projection_embedding_dimension\":\"64\"\n",
+" \"dependencies_projection_embedding_dimension\":\"128\"\n",
 "}\n",
 "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", java_package_embeddings_parameters)\n",
+"embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings)\n",
+"\n",
 "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
 "plot_2d_node_embeddings(\n",
 " node_embeddings_for_visualization, \n",
@@ -462,6 +704,7 @@
 " \"dependencies_projection_embedding_dimension\":\"32\"\n",
 "}\n",
 "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", java_package_embeddings_parameters)\n",
+"embeddings = add_tuned_hierarchical_density_based_spatial_clustering(embeddings)\n",
 "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
 "plot_2d_node_embeddings(\n",
 " node_embeddings_for_visualization, \n",

scripts/reports/CommunityCsv.sh

Lines changed: 2 additions & 5 deletions
@@ -251,17 +251,14 @@ detectCommunitiesWithKCoreDecomposition() {
 # Label of the nodes that will be used for the projection. Example: "Package"
 # - dependencies_projection_weight_property=...
 # Name of the node property that contains the dependency weight. Example: "weight"
-# - dependencies_projection_embedding_dimension=...
-# Number of the dimensions and therefore size of the resulting array of floating point numbers
 nodeEmbeddingsWithFastRandomProjectionForHDBSCAN() {
-local PROJECTION_CYPHER_DIR="${CYPHER_DIR}/Dependencies_Projection"
 local NODE_EMBEDDINGS_CYPHER_DIR="${CYPHER_DIR}/Node_Embeddings"
 local mutatePropertyName="dependencies_projection_write_property=embeddingsFastRandomProjection"
 local embeddingsDimension="dependencies_projection_embedding_dimension=2"

 # Statistics
-execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1a_Fast_Random_Projection_Estimate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension}
-execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1b_Fast_Random_Projection_Statistics.cypher" "${@}" ${embeddingsDimension}
+# execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1a_Fast_Random_Projection_Estimate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension}
+# execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1b_Fast_Random_Projection_Statistics.cypher" "${@}" ${embeddingsDimension}

 # Run the algorithm and write the result into the in-memory projection ("mutate")
 execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1c_Fast_Random_Projection_Mutate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension}
