class SpectralLifting(Graph2HypergraphLifting):
    r"""Lift a graph to a hypergraph by spectral clustering of its nodes.

    Every cluster found by spectral clustering [Ng, Jordan, and Weiss (2002)
    [[1]](https://proceedings.neurips.cc/paper/2001/hash/801272ee79cfde7fa5960571fee36b9b-Abstract.html)]
    becomes one hyperedge containing the nodes assigned to that cluster.

    Parameters
    ----------
    n_c : int, optional
        The number of clusters. Default is None, in which case the number of
        clusters is estimated from the Laplacian spectrum of every graph.
    cluster_alg : str, optional
        The clustering algorithm to use after spectral projection. Default is KMeans.
    eps : float, optional
        Lower bound applied to the node degrees before inverting them, and to
        the row norms before dividing by them, to avoid division by 0.
        Default is 1e-9.
    **kwargs : optional
        Additional arguments for the class.
    """

    def __init__(
        self,
        n_c=None,
        cluster_alg="KMeans",
        eps=1e-9,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.n_c = n_c
        # Remember that n_c was not supplied: it then has to be re-estimated
        # for every graph instead of being frozen to the first graph's value.
        self._estimate_n_c = not n_c
        self.eps = eps
        self.init_clust_alg(cluster_alg)

    def init_clust_alg(self, cluster_alg: str):
        """Select the clustering class, falling back to KMeans with a warning.

        Parameters
        ----------
        cluster_alg : str
            Name of the clustering algorithm. Only "KMeans" is registered.
        """
        cluster_algs = {"KMeans": KMeans}
        if cluster_alg not in cluster_algs:
            # List the accepted *names* (not the class objects) so the message
            # tells the user what to put in the configuration.
            warnings.warn(
                f"KMeans will be used since the algorithm {cluster_alg} was not recognized. Available algorithms: {list(cluster_algs)}",
                stacklevel=2,
            )
        self.cluster_alg = cluster_algs.get(cluster_alg, KMeans)

    def get_degree_matrix(self, W: torch.Tensor):
        """Return the diagonal matrix holding the row sums of ``W``."""
        return torch.diag(W.sum(1))

    def get_sqrt_inv(self, D: torch.Tensor):
        """Return ``D ** (-1/2)`` for a diagonal matrix ``D``.

        The diagonal is clamped to ``self.eps`` from below first, so rows with
        zero degree do not produce infinities.
        """
        degrees = torch.clamp(torch.diag(D), min=self.eps)
        return torch.diag(degrees ** (-0.5))

    def to_dense(self, m: torch.Tensor):
        """Build a dense, symmetric 0/1 adjacency matrix from an edge list.

        Parameters
        ----------
        m : torch.Tensor
            Edge list whose row 0 / row 1 hold the two endpoints of each edge
            (indexed as ``m[0, i]`` and ``m[1, i]``).

        Returns
        -------
        torch.Tensor
            Matrix of shape ``(self.num_nodes, self.num_nodes)``.
        """
        M = torch.zeros(self.num_nodes, self.num_nodes)
        src, dst = m[0].long(), m[1].long()
        # One indexed assignment per direction replaces the per-edge Python loop.
        M[src, dst] = 1
        M[dst, src] = 1
        return M

    def get_normalized_laplacian(self, W: torch.Tensor):
        """Return the symmetric normalized Laplacian ``D^-1/2 (D - W) D^-1/2``.

        Parameters
        ----------
        W : torch.Tensor
            Edge list, converted to a dense adjacency matrix with ``to_dense``.
        """
        # Retrieve dense representation of W
        W = self.to_dense(W)

        # Compute degree matrix
        D = self.get_degree_matrix(W)

        # Compute unnormalized Laplacian
        L = D - W

        # Compute the square root of the inverse of the degree matrix
        Dinv = self.get_sqrt_inv(D)

        # Return normalized Laplacian
        return Dinv @ L @ Dinv

    def find_gaps(self, v: torch.Tensor, a_max: int = 6, a_min: int = 2):
        """Estimate the number of clusters from sorted eigenvalues.

        For ``a`` going from ``a_max`` down to ``a_min`` the first position
        ``k`` with ``v[k] > mean(v[:k]) + a * std(v[:k])`` is searched and
        ``k - 1`` is taken as the estimate. If that fails, the position of the
        largest difference between consecutive values is used instead.

        Parameters
        ----------
        v : torch.Tensor
            Eigenvalues in ascending order.
        a_max : int, optional
            Largest multiplier of the standard deviation that is tried.
        a_min : int, optional
            Smallest multiplier of the standard deviation that is tried.

        Returns
        -------
        int
            The estimated number of clusters, never smaller than 2.
        """

        def find_gap(a):
            """Finds index largest gap using the eigengap heuristic"""
            # Start at k = 2: the standard deviation of a single value is NaN,
            # so k = 1 can never satisfy the comparison.
            for k in range(2, len(v)):
                head = v[:k]
                if v[k] > head.mean() + head.std() * a:
                    return k - 1
            return None

        def find_k_largest_diff():
            """Finds index largest gap using absolute differences"""
            max_diff = 0
            max_index = 0
            for i in range(1, len(v)):
                diff = abs(v[i] - v[i - 1])
                if diff > max_diff:
                    max_diff = diff
                    max_index = i
            return max_index - 1

        # Attempt to find index largest gap using eigengap heuristic.
        k = None  # stays None when the range below is empty (a_max < a_min)
        for a in range(a_max, a_min - 1, -1):
            k = find_gap(a)
            if k and k > 1:
                break

        if k is None or k == 1:
            # Fall back to using largest absolute difference in case gap heuristic did not work
            warnings.warn(
                "Unable to confidently determine the number of clusters n_c. The largest difference between consecutive eigenvalues was used to determine the number of clusters. Please provide n_c.",
                stacklevel=2,
            )
            k = find_k_largest_diff()

        if k < 2:
            # The fallback can yield 1, 0 or -1; none of them is a usable
            # number of clusters, so enforce the documented minimum of 2.
            warnings.warn(
                "Please provide the number of clusters n_c. The heuristics identified a single cluster and n_c was set to 2.",
                stacklevel=2,
            )
            k = 2
        return k

    def get_first_eigenvectors(self, Lsym: torch.Tensor):
        """Return the eigenvectors of ``Lsym`` with the smallest eigenvalues.

        When no number of clusters was configured, ``self.n_c`` is (re)estimated
        here from the eigenvalues of the current graph.

        Parameters
        ----------
        Lsym : torch.Tensor
            Symmetric normalized Laplacian.

        Returns
        -------
        torch.Tensor
            The first ``self.n_c + 1`` eigenvectors, one per column.
        """
        # Lsym is real and symmetric, so eigh applies: it returns real
        # eigenvalues in ascending order and real eigenvectors. The signs of
        # the eigenvector entries separate the clusters and must be kept.
        Lambdas, V = torch.linalg.eigh(Lsym)

        # Filter eigenvectors
        if getattr(self, "_estimate_n_c", False) or not self.n_c:
            self.n_c = self.find_gaps(Lambdas)

        # NOTE(review): n_c + 1 columns are kept, as in the original
        # implementation; Ng et al. use exactly n_c. Confirm this is intended.
        return V[:, : self.n_c + 1]

    def normalize_rows(self, U: torch.Tensor):
        """Scale every row of ``U`` to unit Euclidean norm.

        Row norms are clamped to ``self.eps`` from below so that an all-zero
        row stays zero instead of turning into NaN.
        """
        norms = (U**2).sum(1, keepdim=True).sqrt()
        return U / torch.clamp(norms, min=self.eps)

    def build_clusters(self, U: torch.Tensor):
        """Cluster the rows of ``U`` and return one cluster label per row."""
        # Hand a plain NumPy array to the clustering class.
        points = U.detach().cpu().numpy()
        return torch.tensor(self.cluster_alg(n_clusters=self.n_c).fit(points).labels_)

    def build_incidence_hyperedges(self, indices_clusters):
        """Build the cluster-by-node 0/1 membership matrix.

        Parameters
        ----------
        indices_clusters : torch.Tensor
            Cluster label of every node.

        Returns
        -------
        torch.Tensor
            Matrix of shape ``(self.n_c, self.num_nodes)`` whose entry
            ``[c, i]`` is 1 when node ``i`` belongs to cluster ``c``.
        """
        A = torch.zeros(self.n_c, self.num_nodes)
        labels = torch.as_tensor(indices_clusters).long()
        A[labels, torch.arange(labels.numel())] = 1
        return A

    def transform(self, data: torch_geometric.data.Data) -> torch.Tensor:
        """Compute the dense node-by-hyperedge incidence matrix of ``data``.

        Parameters
        ----------
        data : torch_geometric.data.Data
            The input graph; ``data.x`` and ``data.edge_index`` are read.

        Returns
        -------
        torch.Tensor
            Matrix of shape ``(num_nodes, self.n_c)``.
        """
        # Compute normalized Laplacian
        self.num_nodes = data.x.shape[0]
        Lsym = self.get_normalized_laplacian(data.edge_index)

        # Compute eigenvectors matrix
        U = self.get_first_eigenvectors(Lsym)

        # Normalize rows
        U = self.normalize_rows(U)

        # Build clusters
        indices_clusters = self.build_clusters(U)

        # Return incidence_hyperedges matrix
        return self.build_incidence_hyperedges(indices_clusters=indices_clusters).T

    def lift_topology(self, data: torch_geometric.data.Data) -> dict:
        r"""Lifts graphs to hypergraph domain by finding clusters using spectral clustering [Ng, Jordan, and Weiss (2002) [[1]](https://proceedings.neurips.cc/paper/2001/hash/801272ee79cfde7fa5960571fee36b9b-Abstract.html)]

        Parameters
        ----------
        data : torch_geometric.data.Data
            The input data to be lifted.

        Returns
        -------
        dict
            The lifted topology.
        """

        # NOTE(review): this overwrites ``data.pos`` on the caller's object and
        # nothing in this class reads it afterwards; kept for compatibility.
        data.pos = data.x
        incidence_1 = self.transform(data).to_sparse_coo()
        return {
            "incidence_hyperedges": incidence_1,
            "num_hyperedges": self.n_c,
            "x_0": data.x,
        }
a/tutorials/graph2hypergraph/spectral_lifting.ipynb b/tutorials/graph2hypergraph/spectral_lifting.ipynb new file mode 100644 index 00000000..804fa0a9 --- /dev/null +++ b/tutorials/graph2hypergraph/spectral_lifting.ipynb @@ -0,0 +1,336 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Graph-to-Hypergraph Spectral Lifting Tutorial" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***\n", + "This notebook shows how to import a dataset, with the desired lifting, and how to run a neural network using the loaded data.\n", + "\n", + "The notebook is divided into sections:\n", + "\n", + "- [Loading the dataset](#loading-the-dataset) loads the config files for the data and the desired transformation, creates a dataset object and visualizes it.\n", + "- [Loading and applying the lifting](#loading-and-applying-the-lifting) defines a simple neural network to test that the lifting creates the expected incidence matrices.\n", + "- [Create and run a simplicial nn model](#create-and-run-a-simplicial-nn-model) simply runs a forward pass of the model to check that everything is working as expected.\n", + "\n", + "***\n", + "***\n", + "\n", + "Note that for simplicity the notebook is set up to use a simple graph. 
However, there is a set of available datasets that you can play with.\n", + "\n", + "To switch to one of the available datasets, simply change the *dataset_name* variable in [Dataset config](#dataset-config) to one of the following names:\n", + "\n", + "* cocitation_cora\n", + "* cocitation_citeseer\n", + "* cocitation_pubmed\n", + "* MUTAG\n", + "* NCI1\n", + "* NCI109\n", + "* PROTEINS_TU\n", + "* AQSOL\n", + "* ZINC\n", + "***" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Imports and utilities" + ] + }, + { + "cell_type": "code", + "execution_count": 236, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "# With this cell any imported module is reloaded before each cell execution\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "from modules.data.load.loaders import GraphLoader\n", + "from modules.data.preprocess.preprocessor import PreProcessor\n", + "from modules.utils.utils import (\n", + " describe_data,\n", + " load_dataset_config,\n", + " load_model_config,\n", + " load_transform_config,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we just need to specify the name of the available dataset that we want to load. 
First, the dataset config is read from the corresponding yaml file (located in the `/configs/datasets/` directory), and then the data is loaded via the implemented `Loaders`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 237, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Dataset configuration for manual_dataset:\n", + "\n", + "{'data_domain': 'graph',\n", + " 'data_type': 'toy_dataset',\n", + " 'data_name': 'manual',\n", + " 'data_dir': 'datasets/graph/toy_dataset',\n", + " 'num_features': 1,\n", + " 'num_classes': 2,\n", + " 'task': 'classification',\n", + " 'loss_type': 'cross_entropy',\n", + " 'monitor_metric': 'accuracy',\n", + " 'task_level': 'node'}\n" + ] + } + ], + "source": [ + "dataset_name = \"manual_dataset\"\n", + "dataset_config = load_dataset_config(dataset_name)\n", + "loader = GraphLoader(dataset_config)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then access the data through the `load()` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 238, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Dataset only contains 1 sample:\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " - Graph with 8 vertices and 13 edges.\n", + " - Features dimensions: [1, 0]\n", + " - There are 0 isolated nodes.\n", + "\n" + ] + } + ], + "source": [ + "dataset = loader.load()\n", + "describe_data(dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading and Applying the Lifting" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section we will instantiate the lifting we want to apply to the data. For this example, the spectral lifting was chosen. The lifting algorithm is inspired by the spectral clustering method proposed by Ng, Jordan, and Weiss (2002) [[1]](https://proceedings.neurips.cc/paper/2001/hash/801272ee79cfde7fa5960571fee36b9b-Abstract.html) and maps the clusters identified by the algorithm to hyperedges. The method is based on computing a powerful graph representation leveraging the row-normalized eigenvectors of the normalized Laplacian of the adjacency matrix. The current implementation clusters the projected data points using KMeans, following the original method. However, any hard or soft clustering method can be used after the implemented projection. The number of clusters can be automatically determined using eigengap heuristics, or provided by setting the n_c parameter of the configuration dictionary.\n", + "***\n", + "[[1]](https://proceedings.neurips.cc/paper/2001/hash/801272ee79cfde7fa5960571fee36b9b-Abstract.html) Ng, Andrew, Michael Jordan, and Yair Weiss. 
\"On spectral clustering: Analysis and an algorithm.\" Advances in neural information processing systems 14 (2001).\n", + "***\n", + "\n", + "\n", + "For hypergraphs creating a lifting involves creating the `incidence_hyperedges` matrix.\n", + "\n", + "Similarly to before, we can specify the transformation we want to apply through its type and id --the corresponding config files are located at `/configs/transforms`. \n", + "\n", + "Note that the *transform_config* dictionary generated below can contain a sequence of transforms if it is needed.\n", + "\n", + "This can also be used to explore liftings from one topological domain to another, for example using two liftings it is possible to achieve a sequence such as: graph -> simplicial complex -> hypergraph. " + ] + }, + { + "cell_type": "code", + "execution_count": 239, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Transform configuration for graph2hypergraph/spectral_lifting:\n", + "\n", + "{'transform_type': 'lifting',\n", + " 'transform_name': 'SpectralLifting',\n", + " 'n_c': None,\n", + " 'cluster_alg': 'KMeans',\n", + " 'eps': 1e-09,\n", + " 'feature_lifting': 'ProjectionSum'}\n" + ] + } + ], + "source": [ + "# Define transformation type and id\n", + "transform_type = \"liftings\"\n", + "# If the transform is a topological lifting, it should include both the type of the lifting and the identifier\n", + "transform_id = \"graph2hypergraph/spectral_lifting\"\n", + "\n", + "# Read yaml file\n", + "transform_config = {\n", + " \"lifting\": load_transform_config(transform_type, transform_id)\n", + " # other transforms (e.g. 
data manipulations, feature liftings) can be added here\n", + "}\n", + "# [Optional] specify the number of hyperedges/clusters\n", + "transform_config[\"lifting\"][\"n_c\"] = 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then apply the transform via our `PreProcessor`:" + ] + }, + { + "cell_type": "code", + "execution_count": 240, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing...\n", + "Done!\n" + ] + } + ], + "source": [ + "lifted_dataset = PreProcessor(dataset, transform_config, loader.data_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create and Run a Simplicial NN Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section a simple model is created to test that the used lifting works as intended. In this case the model uses the `incidence_hyperedges` matrix so the lifting should make sure to add it to the data." + ] + }, + { + "cell_type": "code", + "execution_count": 233, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Model configuration for hypergraph UNIGCN:\n", + "\n", + "{'in_channels': None,\n", + " 'hidden_channels': 32,\n", + " 'out_channels': None,\n", + " 'n_layers': 2}\n" + ] + } + ], + "source": [ + "from modules.models.hypergraph.unigcn import UniGCNModel\n", + "\n", + "model_type = \"hypergraph\"\n", + "model_id = \"unigcn\"\n", + "model_config = load_model_config(model_type, model_id)\n", + "\n", + "model = UniGCNModel(model_config, dataset_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 234, + "metadata": {}, + "outputs": [], + "source": [ + "y_hat = model(lifted_dataset.get(0))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If everything is correct the cell above should execute without errors. 
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv_topox", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}