Commit 2a7a4f4

Add hyperparameter tuning to node embeddings
1 parent 839aa5b commit 2a7a4f4

File tree

1 file changed

+236 -7 lines changed


jupyter/NodeEmbeddingsJava.ipynb

Lines changed: 236 additions & 7 deletions
@@ -62,6 +62,17 @@
 "from neo4j import GraphDatabase"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "29b00ea6",
+"metadata": {},
+"outputs": [],
+"source": [
+"# Main Colormap\n",
+"main_color_map = 'nipy_spectral'"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -171,7 +182,7 @@
 "# TODO run a community detection algorithm co-located in here when \"communityId\" is missing\n",
 "# TODO run a centrality algorithm co-located in here when \"centrality\" score is missing\n",
 "\n",
-"def create_node_embeddings(cypher_file_name: str, parameters: dict) -> pd.DataFrame: \n",
+"def create_node_embeddings(cypher_file_name: str, parameters: dict, ignore_existing: bool = True) -> pd.DataFrame: \n",
 " \"\"\"\n",
 " Creates an in-memory Graph projection by calling \"create_undirected_projection\", \n",
 " runs the cypher Query given as cypherFileName parameter to calculate and stream the node embeddings\n",
@@ -200,8 +211,11 @@
 " empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n",
 " return empty_result\n",
 "\n",
-" existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
-" embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n",
+" if ignore_existing:\n",
+" embeddings = query_cypher_to_data_frame(cypher_file_name, parameters_=parameters)\n",
+" else: \n",
+" existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
+" embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n",
 " display(embeddings.head()) # Display the first entries of the table\n",
 " return embeddings"
 ]
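The ignore_existing flag introduced above defaults to True, so the embeddings are always recalculated by running the given Cypher query directly; only with ignore_existing=False does the function first look for already calculated embeddings via Node_Embeddings_0a_Query_Calculated.cypher. A minimal, hypothetical usage sketch, assuming the notebook's create_node_embeddings function and the java_package_embeddings_parameters dictionary shown further down in this diff:

    # Sketch: reuse embeddings that were already written to the graph instead of recomputing them.
    embeddings = create_node_embeddings(
        "../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher",
        java_package_embeddings_parameters,
        ignore_existing=False,
    )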
@@ -273,7 +287,7 @@
 "outputs": [],
 "source": [
 "def plot_2d_node_embeddings(node_embeddings_for_visualization: pd.DataFrame, title: str):\n",
-" if embeddings.empty:\n",
+" if node_embeddings_for_visualization.empty:\n",
 " print(\"No projected data to plot available\")\n",
 " return\n",
 "\n",
@@ -288,6 +302,30 @@
 " plot.show()"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "e80a45ec",
+"metadata": {},
+"outputs": [],
+"source": [
+"def get_first_2_embedding_dimensions(embeddings: pd.DataFrame) -> pd.DataFrame:\n",
+" \"\"\"\n",
+" Returns the first two dimensions of the node embeddings as a DataFrame.\n",
+" \"\"\"\n",
+" if embeddings.empty: \n",
+" print(\"No projected data for node embeddings dimensionality reduction available\")\n",
+" return embeddings\n",
+" \n",
+" # Create a new DataFrame with the results of the 2 dimensional node embeddings\n",
+" # and the code unit and artifact name of the query above as preparation for the plot\n",
+" get_first_2_embedding_dimensions = embeddings.copy()\n",
+"\n",
+" get_first_2_embedding_dimensions['x'] = [value[0] for value in get_first_2_embedding_dimensions.embedding]\n",
+" get_first_2_embedding_dimensions['y'] = [value[1] for value in get_first_2_embedding_dimensions.embedding]\n",
+" return get_first_2_embedding_dimensions"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -361,11 +399,194 @@
 " \"dependencies_projection_node\": \"Package\",\n",
 " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
 " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n",
-" \"dependencies_projection_embedding_dimension\":\"32\"\n",
+" \"dependencies_projection_embedding_dimension\":\"16\"\n",
 "}\n",
 "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)\n"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "1709344c",
+"metadata": {},
+"outputs": [],
+"source": [
+"plot_2d_node_embeddings(\n",
+" get_first_2_embedding_dimensions(embeddings), \n",
+" \"Java Package positioned by their dependency relationships (FastRP first 2 node embeddings without t-SNE)\"\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "84642495",
+"metadata": {},
+"outputs": [],
+"source": [
+"import numpy.typing as numpy_typing\n",
+"\n",
+"class TunedClusteringResult:\n",
+" def __init__(self, labels : list, probabilities : list):\n",
+" self.labels = labels\n",
+" self.probabilities = probabilities\n",
+" self.cluster_count = len(set(labels)) - (1 if -1 in labels else 0)\n",
+"\n",
+"def tuned_hierarchical_density_based_spatial_clustering(embeddings: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> TunedClusteringResult:\n",
+" \"\"\"\n",
+" Applies the optimized hierarchical density-based spatial clustering algorithm (HDBSCAN) to the given node embeddings.\n",
+" The parameters are tuned to get results similar to the ones of the community detection algorithm.\n",
+" The result is a list of cluster ids for each node embedding.\n",
+" \"\"\"\n",
+" from sklearn.model_selection import RandomizedSearchCV\n",
+" from sklearn.cluster import HDBSCAN\n",
+" from sklearn.metrics import adjusted_rand_score\n",
+" import numpy as np\n",
+"\n",
+" # specify parameters and distributions to sample from\n",
+" hyper_parameter_distributions = {\n",
+" \"min_samples\": [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 50, 100],\n",
+" \"min_cluster_size\": [4, 5, 6, 7, 10, 20, 30, 50, 100],\n",
+" \"cluster_selection_method\": [\"eom\", \"leaf\"],\n",
+" \"metric\": [\"euclidean\", \"manhattan\"],\n",
+" }\n",
+" \n",
+" def adjusted_rand_scorer_with_penalty_for_community_references(community_references):\n",
+" \"\"\"\n",
+" Creates a custom scoring function based on the Adjusted Rand Index (ARI) that penalizes for high noise ratio in clustering.\n",
+" Input:\n",
+" - community_references: The true labels of the communities for the data points.\n",
+" Output:\n",
+" - A scoring function that can directly be used for e.g. RandomizedSearchCV and that takes an estimator and data (X) and returns the ARI score with a penalty for noise ratio.\n",
+" \"\"\"\n",
+" def ari_score_with_penalty(estimator, embeddings):\n",
+" clustering_result = estimator.fit_predict(embeddings)\n",
+"\n",
+" # Calculate the noise ratio. Noise points are labeled as -1 in HDBSCAN.\n",
+" noise_ratio = np.sum(clustering_result == -1) / len(clustering_result)\n",
+" \n",
+" if np.unique(clustering_result[clustering_result != -1]).size < 2:\n",
+" return -1 # Return worst score if only one cluster is found or all points are noise\n",
+"\n",
+" ari = adjusted_rand_score(community_references[clustering_result != -1], clustering_result[clustering_result != -1])\n",
+" \n",
+" # Penalize for high noise: If 80% of the points are noise, even a perfect ARI of 1.0 gets scaled down to 0.2\n",
+" penalty = 1.0 - noise_ratio \n",
+" \n",
+" return ari * penalty\n",
+" return ari_score_with_penalty\n",
+"\n",
+"\n",
+" # Use custom CV that feeds all data to each fold (no slicing)\n",
+" all_data_without_slicing_cross_validator = [(np.arange(len(embeddings)), np.arange(len(embeddings)))]\n",
+"\n",
+" hdbscan_with_random_search = RandomizedSearchCV(\n",
+" estimator=HDBSCAN(),\n",
+" refit=False, # Without refit, the estimator doesn't need to implement the 'predict' method. Drawback: Only the best parameters are returned, not the best model.\n",
+" param_distributions=hyper_parameter_distributions,\n",
+" n_iter=20,\n",
+" scoring=adjusted_rand_scorer_with_penalty_for_community_references(reference_community_ids),\n",
+" cv=all_data_without_slicing_cross_validator,\n",
+" verbose=1\n",
+" )\n",
+"\n",
+" hdbscan_with_random_search.fit(embeddings)\n",
+"\n",
+" #print(\"Best adjusted rand score with noise penalty:\", hdbscan_with_random_search.best_score_)\n",
+" print(\"Tuned HDBSCAN parameters:\", hdbscan_with_random_search.best_params_)\n",
+"\n",
+" # Run the clustering again with the best parameters\n",
+" cluster_algorithm = HDBSCAN(**hdbscan_with_random_search.best_params_, allow_single_cluster=False)\n",
+" best_model = cluster_algorithm.fit(embeddings)\n",
+"\n",
+" results = TunedClusteringResult(best_model.labels_, best_model.probabilities_)\n",
+" print(f\"Number of HDBSCAN clusters (excluding noise): {results.cluster_count:.0f}\")\n",
+" return results"
+]
+},
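The scoring function in the cell above trades agreement with the reference communities (Adjusted Rand Index) against the fraction of points HDBSCAN labels as noise. A small self-contained sketch of that calculation, with made-up labels purely for illustration:

    import numpy as np
    from sklearn.metrics import adjusted_rand_score

    reference_communities = np.array([0, 0, 1, 1, 2, 2])
    cluster_labels = np.array([0, 0, 1, 1, -1, -1])  # two points flagged as noise (-1)

    # Noise ratio as in the scorer above: 2 of 6 points are noise.
    noise_ratio = np.sum(cluster_labels == -1) / len(cluster_labels)

    # ARI is computed on the non-noise points only; for this toy example it is 1.0.
    non_noise = cluster_labels != -1
    ari = adjusted_rand_score(reference_communities[non_noise], cluster_labels[non_noise])

    # The perfect ARI gets scaled down by the noise penalty: 1.0 * (1 - 2/6) = 0.666...
    print(ari * (1.0 - noise_ratio))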
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "8e1f0227",
+"metadata": {},
+"outputs": [],
+"source": [
+"import numpy.typing as numpy_typing\n",
+"\n",
+"class CommunityComparingScores:\n",
+" def __init__(self, adjusted_rand_index: float, normalized_mutual_information: float):\n",
+" self.adjusted_rand_index = adjusted_rand_index\n",
+" self.normalized_mutual_information = normalized_mutual_information\n",
+" self.scores = {\n",
+" \"Adjusted Rand Index\": adjusted_rand_index,\n",
+" \"Normalized Mutual Information\": normalized_mutual_information\n",
+" }\n",
+" def __repr__(self):\n",
+" return f\"CommunityComparingScores(adjusted_rand_index={self.adjusted_rand_index}, normalized_mutual_information={self.normalized_mutual_information})\"\n",
+"\n",
+"def get_community_comparing_scores(cluster_labels: numpy_typing.NDArray, reference_community_ids: numpy_typing.NDArray) -> CommunityComparingScores:\n",
+" \"\"\"\n",
+" Returns a DataFrame with the scores of the clustering algorithm compared to the community detection algorithm.\n",
+" The scores are calculated using the adjusted rand index (ARI) and the normalized mutual information (NMI).\n",
+" \"\"\"\n",
+" from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score\n",
+"\n",
+" # Create a mask to filter out noise points. In HDBSCAN, noise points are labeled as -1\n",
+" mask = cluster_labels != -1\n",
+" ari = adjusted_rand_score(reference_community_ids[mask], cluster_labels[mask])\n",
+" nmi = normalized_mutual_info_score(reference_community_ids[mask], cluster_labels[mask])\n",
+"\n",
+" return CommunityComparingScores(ari, nmi)"
+]
+},
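Both scores computed above compare two partitions independently of the concrete label values, so the HDBSCAN cluster labels do not need to match the communityId values numerically. A tiny illustration with invented labels (not taken from the notebook's data):

    from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

    community_ids = [0, 0, 0, 1, 1, 1]
    cluster_ids = [5, 5, 5, 9, 9, 9]  # same grouping, different label values

    print(adjusted_rand_score(community_ids, cluster_ids))           # 1.0
    print(normalized_mutual_info_score(community_ids, cluster_ids))  # 1.0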
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "3c4e8821",
+"metadata": {},
+"outputs": [],
+"source": [
+"def add_clustering_results_to_embeddings(embeddings: pd.DataFrame, clustering_result: TunedClusteringResult, clustering_name: str) -> pd.DataFrame:\n",
+" \"\"\"\n",
+" Adds the clustering results to the embeddings DataFrame.\n",
+" \"\"\"\n",
+" embeddings['clustering' + clustering_name + 'Label'] = clustering_result.labels\n",
+" embeddings['clustering' + clustering_name + 'Probability'] = clustering_result.probabilities\n",
+" return embeddings\n",
+"\n",
+"def get_clustering_results_distribution(embeddings: pd.DataFrame, clustering_name: str) -> pd.DataFrame:\n",
+" \"\"\"\n",
+" Returns the clustering results distribution for the given clustering name.\n",
+" \"\"\"\n",
+" return embeddings.groupby('clustering' + clustering_name + 'Label').aggregate(\n",
+" probability=('clustering' + clustering_name + 'Probability', 'mean'),\n",
+" count=('codeUnitName', 'count'),\n",
+" communityIds=('communityId', lambda x: list(set(x))),\n",
+" codeUnitNames=('codeUnitName', lambda x: list(set(x))),\n",
+" ).reset_index().sort_values(by='count', ascending=False)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "6580301e",
+"metadata": {},
+"outputs": [],
+"source": [
+"import numpy as np\n",
+"\n",
+"embeddings_values = np.array(embeddings.embedding.tolist())\n",
+"community_reference_ids = np.array(embeddings.communityId.tolist())\n",
+"\n",
+"results = tuned_hierarchical_density_based_spatial_clustering(embeddings_values, community_reference_ids)\n",
+"\n",
+"community_comparing_scores = get_community_comparing_scores(results.labels, community_reference_ids)\n",
+"print(community_comparing_scores)\n",
+"\n",
+"embeddings = add_clustering_results_to_embeddings(embeddings, results, \"Hdbscan\")\n",
+"display(get_clustering_results_distribution(embeddings, \"Hdbscan\")) "
+]
+},
 {
 "cell_type": "markdown",
 "id": "76d8bca1",
@@ -429,9 +650,13 @@
 " \"dependencies_projection_node\": \"Package\",\n",
 " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
 " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n",
-" \"dependencies_projection_embedding_dimension\":\"64\"\n",
+" \"dependencies_projection_embedding_dimension\":\"2\"\n",
 "}\n",
 "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", java_package_embeddings_parameters)\n",
+"plot_2d_node_embeddings(\n",
+" get_first_2_embedding_dimensions(embeddings), \n",
+" \"Java Package positioned by their dependency relationships (HashGNN first 2 node embeddings without t-SNE)\"\n",
+")\n",
 "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
 "plot_2d_node_embeddings(\n",
 " node_embeddings_for_visualization, \n",
@@ -459,9 +684,13 @@
 " \"dependencies_projection_node\": \"Package\",\n",
 " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
 " \"dependencies_projection_write_property\": \"embeddingsNode2Vec\",\n",
-" \"dependencies_projection_embedding_dimension\":\"32\"\n",
+" \"dependencies_projection_embedding_dimension\":\"2\"\n",
 "}\n",
 "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", java_package_embeddings_parameters)\n",
+"plot_2d_node_embeddings(\n",
+" get_first_2_embedding_dimensions(embeddings), \n",
+" \"Java Package positioned by their dependency relationships (node2vec first 2 node embeddings without t-SNE)\"\n",
+")\n",
 "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n",
 "plot_2d_node_embeddings(\n",
 " node_embeddings_for_visualization, \n",
