From ba3e5ddc1e1813b0c991d0fcdee9bc51fb3d388f Mon Sep 17 00:00:00 2001 From: Gleb Bazhenov Date: Sun, 14 Sep 2025 14:05:37 +0300 Subject: [PATCH 01/13] add graphland source and tests --- CHANGELOG.md | 1 + test/datasets/test_graphland.py | 100 ++++ torch_geometric/datasets/__init__.py | 2 + torch_geometric/datasets/graphland.py | 695 ++++++++++++++++++++++++++ 4 files changed, 798 insertions(+) create mode 100644 test/datasets/test_graphland.py create mode 100644 torch_geometric/datasets/graphland.py diff --git a/CHANGELOG.md b/CHANGELOG.md index efc3c0eca274..75ccb50fb0f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added GraphLand benchmark via `GraphLandDataset` - Added `torch_geometric.llm` and its examples ([#10436](https://github.com/pyg-team/pytorch_geometric/pull/10436)) - Added support for negative weights in `sparse_cross_entropy` ([#10432](https://github.com/pyg-team/pytorch_geometric/pull/10432)) - Added `connected_components()` method to `Data` and `HeterData` ([#10388](https://github.com/pyg-team/pytorch_geometric/pull/10388)) diff --git a/test/datasets/test_graphland.py b/test/datasets/test_graphland.py new file mode 100644 index 000000000000..5d81ad9a8c42 --- /dev/null +++ b/test/datasets/test_graphland.py @@ -0,0 +1,100 @@ +import pytest +import torch + +from torch_geometric.testing import withPackage, onlyOnline +from torch_geometric.datasets import GraphLandDataset + + +@onlyOnline +@withPackage('pandas', 'sklearn') +@pytest.mark.parametrize('name', [ + 'hm-categories', + 'pokec-regions', + 'web-topics', + 'tolokers-2', + 'city-reviews', + 'artnet-exp', + 'web-fraud', + 'hm-prices', + 'avazu-ctr', + 'city-roads-M', + 'city-roads-L', + 'twitch-views', + 'artnet-views', + 'web-traffic', +]) +def test_transductive_graphland(name: str): + dataset = GraphLandDataset( + root='./datasets', + split='RL', + name=name, + to_undirected=True, + ) + 
assert len(dataset) == 1 + + data = dataset[0] + assert data.num_nodes == data.x.shape[0] == data.y.shape[0] + + assert not ( + data.train_mask & + data.val_mask & + data.test_mask + ).any().item() + + labeled_mask = data.train_mask | data.val_mask | data.test_mask + assert not torch.isnan(data.y[labeled_mask]).any().item() + assert not torch.isnan(data.x).any().item() + + assert not ( + data.x_numerical_mask & + data.x_fraction_mask & + data.x_categorical_mask + ).any().item() + + assert ( + data.x_numerical_mask | + data.x_fraction_mask | + data.x_categorical_mask + ).all().item() + + +@onlyOnline +@withPackage('pandas', 'sklearn') +@pytest.mark.parametrize('name', [ + 'hm-categories', + 'pokec-regions', + 'web-topics', + 'tolokers-2', + 'artnet-exp', + 'web-fraud', + 'hm-prices', + 'avazu-ctr', + 'twitch-views', + 'artnet-views', +]) +def test_inductive_graphland(name: str): + base_data = GraphLandDataset( + root='./datasets', + split='TH', + name=name, + to_undirected=True, + )[0] + num_nodes = base_data.num_nodes + num_edges = base_data.num_edges + del base_data + + dataset = GraphLandDataset( + root='./datasets', + split='THI', + name=name, + to_undirected=True, + ) + assert len(dataset) == 3 + + train_data, val_data, test_data = dataset + assert num_nodes == test_data.num_nodes == test_data.node_id.shape[0] + assert num_edges == test_data.num_edges + + assert not torch.isnan(train_data.y[train_data.mask]).any().item() + assert not torch.isnan(val_data.y[val_data.mask]).any().item() + assert not torch.isnan(test_data.y[test_data.mask]).any().item() diff --git a/torch_geometric/datasets/__init__.py b/torch_geometric/datasets/__init__.py index 3b819e4a7099..1eeb39df27db 100644 --- a/torch_geometric/datasets/__init__.py +++ b/torch_geometric/datasets/__init__.py @@ -85,6 +85,7 @@ from .tag_dataset import TAGDataset from .city import CityNetwork from .teeth3ds import Teeth3DS +from .graphland import GraphLandDataset from .dbp15k import DBP15K from .aminer import 
AMiner @@ -207,6 +208,7 @@ 'TAGDataset', 'CityNetwork', 'Teeth3DS', + 'GraphLandDataset', ] hetero_datasets = [ diff --git a/torch_geometric/datasets/graphland.py b/torch_geometric/datasets/graphland.py new file mode 100644 index 000000000000..f62af62eaff7 --- /dev/null +++ b/torch_geometric/datasets/graphland.py @@ -0,0 +1,695 @@ +import os +import os.path as osp +import yaml +from functools import partial +from typing import Callable, Optional + +import numpy as np +import pandas as pd +import torch +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import ( + OneHotEncoder, + MinMaxScaler, + QuantileTransformer, + StandardScaler, +) +from torch_geometric.data import ( + Data, + InMemoryDataset, + download_url, + extract_zip, +) +from torch_geometric.transforms import ToUndirected +from torch_geometric.utils import subgraph + + +def _load_yaml(path: str) -> dict: + with open(path, 'r') as f: + return yaml.safe_load(f) + + +class GraphLandDataset(InMemoryDataset): + r"""The graph datasets from the `"GraphLand: Evaluating + Graph Machine Learning Models on Diverse Industrial Data" + `_ paper. + + Args: + root (str): Root directory where the dataset should be saved. + name (str): The name of the dataset (:obj:`"hm-categories"`, + :obj:`"pokec-regions"`, :obj:`"web-topics"`, :obj:`"tolokers-2"`, + :obj:`"city-reviews"`, :obj:`"artnet-exp"`, :obj:`"web-fraud"`, + :obj:`"hm-prices"`, :obj:`"avazu-ctr"`, :obj:`"city-roads-M"`, + :obj:`"city-roads-L"`, :obj:`"twitch-views"`, + :obj:`"artnet-views"`, :obj:`"web-traffic"`). + split (str): The type of dataset split/setting (:obj:`"RL"`, + :obj:`"RH"`, :obj:`"TH"`, :obj:`"THI"`). + :obj:`"RL"` is for "random low" split — a 10%/10%/80% random + stratified train/val/test split. + :obj:`"RH"` is for "random high" split — a 50%/25%/25% random + stratified train/val/test split. + :obj:`"TH"` is for "temporal high" split — a 50%/25%/25% temporal + train/val/test split. 
+ :obj:`"THI"` is for "temporal high" split with the inductive + setting, which means that val and test nodes are not seen at train + time, and test nodes are not seen at val time. In contrast to the + previous three splits that will result in a dataset with a single + graph, setting the split to :obj:`"THI"` will result in a dataset + with three graphs corresponding to the train, val, and test + snapshots of an evolving network. + :obj:`"TH"` and :obj:`"THI"` splits are not available for the + following datasets: :obj:`"city-reviews"`, :obj:`"city-roads-M"`, + :obj:`"city-roads-L"`, :obj:`"web-traffic"`. + numerical_features_transform (str, optional): A transform applied to + numerical features (:obj:`None`, :obj:`"standard_scaler"`, + :obj:`"min_max_scaler"`, :obj:`"quantile_transform_normal"`, + :obj:`"quantile_transform_uniform"`). Since numerical features can + have widely different scales and distributions, it is typically + useful to apply some transform to them before passing them to a + neural model. This transform is applied to all numerical features + except for those that are also categorized as fraction features. + (default :obj:`"quantile_transform_normal"`) + fraction_features_transform (str, optional): A transform applied to + fraction features (:obj:`None`, :obj:`"standard_scaler"`, + :obj:`"min_max_scaler"`, :obj:`"quantile_transform_normal"`, + :obj:`"quantile_transform_uniform"`). Fraction features are a + subset of numerical features that have the meaning of fractions + and are thus always in :obj:`[0, 1]` range. Since their range is + bounded, it is not neccessary but may still be useful to apply + some transform to them before passing them to a neural model. + (default :obj:`None`) + categorical_features_transform (str, optional): A transform applied to + categorical features (:obj:`None`, :obj:`"one_hot_encoding"`). + It is most often useful to apply one-hot encoding to categorical + features before passing them to a neural model. 
+ (default :obj:`one_hot_encoding`) + regression_targets_transform (str, optional): A transform applied to + regression targets (:obj:`None`, :obj:`"standard_scaler"`, + :obj:`"min_max_scaler"`). Depending on their range, it may or may + not be useful to apply a transform to regression targets before + fitting a neural model to them. This argument does not affect + classification datasets. (default :obj:`"standard_scaler"`) + numerical_features_nan_imputation_strategy (str, optional): Defines + which value to fill NaNs in numerical features with + (:obj:`None`, :obj:`"mean"`, :obj:`"median"`, + :obj:`"most_frequent"`). This imputation strategy is applied to + all numerical features except for those that are also categorized + as fraction features. (default :obj:`"most_frequent"`) + fraction_features_nan_imputation_strategy (str, optional): Defines + which value to fill NaNs in fraction features with (:obj:`None`, + :obj:`"mean"`, :obj:`"median"`, :obj:`"most_frequent"`). + (default :obj:`"most_frequent"`) + to_undirected (bool, optional): Whether to convert a directed graph + to an undirected one. Does not affect undirected graphs. + (default: :obj:`False`) + transform (callable, optional): A function/transform that takes in an + :obj:`torch_geometric.data.Data` object and returns a transformed + version. The data object will be transformed before every access. + (default: :obj:`None`) + pre_transform (callable, optional): A function/transform that takes in + an :obj:`torch_geometric.data.Data` object and returns a + transformed version. The data object will be transformed before + being saved to disk. (default: :obj:`None`) + force_reload (bool, optional): Whether to re-process the dataset. + (default: :obj:`False`) + + **STATS:** + + .. 
list-table:: + :widths: 14 10 10 10 14 + :header-rows: 1 + + * - Name + - #nodes + - #edges + - is directed + - task + * - :obj:`hm-categories` + - 46,563 + - 21,461,990 + - False + - multiclass + * - :obj:`pokec-regions` + - 1,632,803 + - 30,622,564 + - True + - multiclass + * - :obj:`web-topics` + - 2,890,331 + - 12,895,369 + - True + - multiclass + * - :obj:`tolokers-2` + - 11,758 + - 1,038,000 + - False + - binclass + * - :obj:`city-reviews` + - 148,801 + - 2,330,830 + - False + - binclass + * - :obj:`artnet-exp` + - 50,405 + - 560,696 + - False + - binclass + * - :obj:`web-fraud` + - 2,890,331 + - 12,895,369 + - True + - binclass + * - :obj:`hm-prices` + - 46,563 + - 21,461,990 + - False + - regression + * - :obj:`avazu-ctr` + - 76,269 + - 21,968,154 + - False + - regression + * - :obj:`city-roads-M` + - 57,073 + - 132,571 + - True + - regression + * - :obj:`city-roads-L` + - 142,257 + - 279,062 + - True + - regression + * - :obj:`twitch-views` + - 168,114 + - 13,595,114 + - False + - regression + * - :obj:`artnet-views` + - 50,405 + - 560,696 + - False + - regression + * - :obj:`web-traffic` + - 2,890,331 + - 12,895,369 + - True + - regression + """ + _url = 'https://zenodo.org/records/16895532' + _transforms = { + 'standard_scaler': partial(StandardScaler, copy=False), + 'min_max_scaler': partial(MinMaxScaler, clip=False, copy=False), + 'quantile_transform_normal': partial( + QuantileTransformer, + output_distribution='normal', + subsample=None, + random_state=0, + copy=False, + ), + 'quantile_transform_uniform': partial( + QuantileTransformer, + output_distribution='uniform', + subsample=None, + random_state=0, + copy=False, + ), + 'one_hot_encoding': partial( + OneHotEncoder, + drop='if_binary', + sparse_output=False, + handle_unknown='ignore', + ), + } + + def __init__( + self, + root: str, + name: str, + split: str, + numerical_features_transform: Optional[str] = + 'quantile_transform_normal', + fraction_features_transform: Optional[str] = None, + 
categorical_features_transform: Optional[str] = 'one_hot_encoding',
+        regression_targets_transform: Optional[str] = 'standard_scaler',
+        numerical_features_nan_imputation_strategy: Optional[str] =
+        'most_frequent',
+        fraction_features_nan_imputation_strategy: Optional[str] =
+        'most_frequent',
+        to_undirected: bool = False,
+        transform: Optional[Callable] = None,
+        pre_transform: Optional[Callable] = None,
+        force_reload: bool = False,
+    ) -> None:
+        assert name in [
+            'hm-categories',
+            'pokec-regions',
+            'web-topics',
+            'tolokers-2',
+            'city-reviews',
+            'artnet-exp',
+            'web-fraud',
+            'hm-prices',
+            'avazu-ctr',
+            'city-roads-M',
+            'city-roads-L',
+            'twitch-views',
+            'artnet-views',
+            'web-traffic',
+        ], f'Unsupported dataset name: {name}'
+
+        assert split in ['RL', 'RH', 'TH', 'THI'], \
+            f'Unsupported split name: {split}'
+        if split in ['TH', 'THI']:
+            assert name not in [
+                'city-reviews',
+                'city-roads-M',
+                'city-roads-L',
+                'web-traffic',
+            ], (
+                'Temporal split is not available for city-reviews, '
+                'city-roads-M, city-roads-L, web-traffic.'
+ ) + + if numerical_features_transform is not None: + assert numerical_features_transform in [ + 'standard_scaler', + 'min_max_scaler', + 'quantile_transform_normal', + 'quantile_transform_uniform', + ], ( + 'Unsupported numerical features transform: ' + f'{numerical_features_transform}' + ) + + if fraction_features_transform is not None: + assert fraction_features_transform in [ + 'standard_scaler', + 'min_max_scaler', + 'quantile_transform_normal', + 'quantile_transform_uniform', + ], ( + 'Unsupported fraction features transform: ' + f'{fraction_features_transform}' + ) + + if categorical_features_transform is not None: + assert categorical_features_transform == 'one_hot_encoding', ( + 'Unsupported categorical features transform: ' + f'{categorical_features_transform}' + ) + + if regression_targets_transform is not None: + assert regression_targets_transform in [ + 'standard_scaler', + 'min_max_scaler' + ], ( + 'Unsupported regression targets transform:' + f'{regression_targets_transform}' + ) + + self.name = name + self.split = split + self._num_transform = numerical_features_transform + self._frac_transform = fraction_features_transform + self._cat_transform = categorical_features_transform + self._reg_transform = regression_targets_transform + self._num_imputation = numerical_features_nan_imputation_strategy + self._frac_imputation = fraction_features_nan_imputation_strategy + self._to_undirected = to_undirected + + super().__init__( + root, + transform, + pre_transform, + force_reload=force_reload + ) + self.load(self.processed_paths[0]) + + @property + def raw_dir(self) -> str: + return osp.join(self.root, self.name, 'raw') + + @property + def processed_dir(self) -> str: + specs = ''.join(f'__{str(arg).lower()}' for arg in [ + self.split, + self._num_transform, + self._frac_transform, + self._cat_transform, + self._reg_transform, + self._num_imputation, + self._frac_imputation, + self._to_undirected, + ]) + return osp.join(self.root, self.name, 'processed', 
specs) + + @property + def raw_file_names(self) -> str: + return self.name + + @property + def processed_file_names(self) -> str: + return 'data.pt' + + def download(self) -> None: + zip_url = osp.join(self._url, 'files', f'{self.name}.zip') + path = download_url(zip_url, self.raw_dir) + extract_zip(path, self.raw_dir) + os.unlink(path) + + def _get_raw_data(self) -> dict: + raw_data_dir = osp.join(self.raw_dir, self.name) + info = _load_yaml(osp.join(raw_data_dir, 'info.yaml')) + + features_df = pd.read_csv( + osp.join(raw_data_dir, 'features.csv'), + index_col=0, + ) + num_features_names = [ + name for name in info['numerical_features_names'] + if name not in info['fraction_features_names'] + ] + num_features = features_df[num_features_names].values + + cat_features_names = info['categorical_features_names'] + cat_features = features_df[cat_features_names].values + + frac_features_names = info['fraction_features_names'] + frac_features = features_df[frac_features_names].values + + targets_df = pd.read_csv( + osp.join(raw_data_dir, 'targets.csv'), + index_col=0, + ) + targets = targets_df[info['target_name']].values + + masks_df = pd.read_csv( + osp.join(raw_data_dir, f'split_masks_{self.split[:2]}.csv'), + index_col=0, + ) + masks = { + k: np.array(v, dtype=bool) + for k, v in masks_df.to_dict('list').items() + } + + edges_df = pd.read_csv(osp.join(raw_data_dir, 'edgelist.csv')) + edges = edges_df.values + + return { + 'info': info, + 'num_features': num_features, + 'cat_features': cat_features, + 'frac_features': frac_features, + 'targets': targets, + 'masks': masks, + 'edges': edges, + } + + def _get_transductive_data(self) -> list[Data]: + raw_data = self._get_raw_data() + + # >>> process targets + targets = raw_data['targets'] + labeled_mask = ~np.isnan(targets) + if ( + raw_data['info']['task'] == 'regression' and + self._reg_transform is not None + ): + targets = targets.reshape(-1, 1) + transform = self._transforms[self._reg_transform]() + 
transform.fit(targets[raw_data['masks']['train']]) + targets = transform.transform(targets).reshape(-1) + targets = torch.tensor(targets, dtype=torch.float) + + # >>> process numerical features + num_features = raw_data['num_features'] + if num_features.size > 0: + if self._num_transform is not None: + transform = self._transforms[self._num_transform]() + transform.fit(num_features) + + num_features = SimpleImputer( + missing_values=np.nan, + strategy=self._num_imputation, + copy=False + ).fit_transform(num_features) + + if self._num_transform is not None: + num_features = transform.transform(num_features) + + # >>> process fraction features + frac_features = raw_data['frac_features'] + if frac_features.size > 0: + if self._frac_transform is not None: + transform = self._transforms[self._frac_transform]() + transform.fit(frac_features) + + frac_features = SimpleImputer( + missing_values=np.nan, + strategy=self._frac_imputation, + copy=False + ).fit_transform(frac_features) + + if self._frac_transform is not None: + frac_features = transform.transform(frac_features) + + # >>> process categorical features + cat_features = raw_data['cat_features'] + if cat_features.size > 0 and self._cat_transform is not None: + cat_features = ( + self._transforms[self._cat_transform]() + .fit_transform(cat_features) + ) + + # >>> concatenate features and make features mask + features = np.concatenate( + [num_features, frac_features, cat_features], + axis=1, + ) + features = torch.tensor(features, dtype=torch.float) + + num_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + num_mask[:num_features.shape[1]] = True + num_mask = torch.tensor(num_mask, dtype=torch.bool) + + frac_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + frac_mask[num_features.shape[1]:-cat_features.shape[1]] = True + frac_mask = torch.tensor(frac_mask, dtype=torch.bool) + + cat_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + cat_mask[-cat_features.shape[1]:] = True + cat_mask = 
torch.tensor(cat_mask, dtype=torch.bool) + + # >>> update split masks + train_mask = raw_data['masks']['train'] & labeled_mask + train_mask = torch.tensor(train_mask, dtype=torch.bool) + + val_mask = raw_data['masks']['val'] & labeled_mask + val_mask = torch.tensor(val_mask, dtype=torch.bool) + + test_mask = raw_data['masks']['test'] & labeled_mask + test_mask = torch.tensor(test_mask, dtype=torch.bool) + + # >>> make edge index + edge_index = raw_data['edges'].T + edge_index = torch.tensor(edge_index, dtype=torch.long) + + # >>> construct Data object + data = Data( + edge_index=edge_index, + x=features, + y=targets, + train_mask=train_mask, + val_mask=val_mask, + test_mask=test_mask, + x_numerical_mask=num_mask, + x_fraction_mask=frac_mask, + x_categorical_mask=cat_mask, + ) + return [data] + + def _get_inductive_data(self) -> list[Data]: + raw_data = self._get_raw_data() + transform_mask = raw_data['masks']['train'] + + # >>> process targets + targets = raw_data['targets'] + labeled_mask = ~np.isnan(targets) + if ( + raw_data['info']['task'] == 'regression' and + self._reg_transform is not None + ): + targets = targets.reshape(-1, 1) + transform = self._transforms[self._reg_transform]() + transform.fit(targets[transform_mask]) + targets = transform.transform(targets).reshape(-1) + targets = torch.tensor(targets, dtype=torch.float) + + # >>> process numerical features + num_features = raw_data['num_features'] + if num_features.size > 0: + if self._num_transform is not None: + transform = self._transforms[self._num_transform]() + transform.fit(num_features[transform_mask]) + + imputer = SimpleImputer( + missing_values=np.nan, + strategy=self._num_imputation, + copy=False + ) + imputer.fit(num_features[transform_mask]) + num_features = imputer.transform(num_features) + + if self._num_transform is not None: + num_features = transform.transform(num_features) + + # >>> process fraction features + frac_features = raw_data['frac_features'] + if frac_features.size > 0: + 
if self._frac_transform is not None: + transform = self._transforms[self._frac_transform]() + transform.fit(frac_features[transform_mask]) + + imputer = SimpleImputer( + missing_values=np.nan, + strategy=self._frac_imputation, + copy=False + ) + imputer.fit(frac_features[transform_mask]) + frac_features = imputer.transform(frac_features) + + if self._frac_transform is not None: + frac_features = transform.transform(frac_features) + + # >>> process categorical features + cat_features = raw_data['cat_features'] + if cat_features.size > 0 and self._cat_transform is not None: + transform = self._transforms[self._cat_transform]() + transform.fit(cat_features[transform_mask]) + cat_features = transform.transform(cat_features) + + # >>> concatenate features and make features mask + features = np.concatenate( + [num_features, frac_features, cat_features], + axis=1, + ) + features = torch.tensor(features, dtype=torch.float) + + num_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + num_mask[:num_features.shape[1]] = True + num_mask = torch.tensor(num_mask, dtype=torch.bool) + + frac_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + frac_mask[num_features.shape[1]:-cat_features.shape[1]] = True + frac_mask = torch.tensor(frac_mask, dtype=torch.bool) + + cat_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + cat_mask[-cat_features.shape[1]:] = True + cat_mask = torch.tensor(cat_mask, dtype=torch.bool) + + # >>> construct Data objects + edge_index = raw_data['edges'].T + edge_index = torch.tensor(edge_index, dtype=torch.long) + + # --- train + train_graph_mask = raw_data['masks']['train'] + train_graph_mask = torch.tensor(train_graph_mask, dtype=torch.bool) + + train_label_mask = raw_data['masks']['train'] & labeled_mask + train_label_mask = torch.tensor(train_label_mask, dtype=torch.bool) + + train_node_id = np.where(train_graph_mask)[0] + train_node_id = torch.tensor(train_node_id, dtype=torch.long) + + train_edge_index, _ = subgraph( + 
train_graph_mask, + edge_index, + relabel_nodes=True, + ) + train_data = Data( + edge_index=train_edge_index, + x=features[train_graph_mask], + y=targets[train_graph_mask], + mask=train_label_mask[train_graph_mask], + x_numerical_mask=num_mask, + x_fraction_mask=frac_mask, + x_categorical_mask=cat_mask, + node_id=train_node_id, + ) + + # --- val + val_graph_mask = ( + raw_data['masks']['train'] | + raw_data['masks']['val'] + ) + val_graph_mask = torch.tensor(val_graph_mask, dtype=torch.bool) + + val_label_mask = raw_data['masks']['val'] & labeled_mask + val_label_mask = torch.tensor(val_label_mask, dtype=torch.bool) + + val_node_id = np.where(val_graph_mask)[0] + val_node_id = torch.tensor(val_node_id, dtype=torch.long) + + val_edge_index, _ = subgraph( + val_graph_mask, + edge_index, + relabel_nodes=True, + ) + val_data = Data( + edge_index=val_edge_index, + x=features[val_graph_mask], + y=targets[val_graph_mask], + mask=val_label_mask[val_graph_mask], + x_numerical_mask=num_mask, + x_fraction_mask=frac_mask, + x_categorical_mask=cat_mask, + node_id=val_node_id, + ) + + # --- test + test_graph_mask = ( + raw_data['masks']['train'] | + raw_data['masks']['val'] | + raw_data['masks']['test'] + ) + test_graph_mask = torch.tensor(test_graph_mask, dtype=torch.bool) + + test_label_mask = raw_data['masks']['test'] & labeled_mask + test_label_mask = torch.tensor(test_label_mask, dtype=torch.bool) + + test_node_id = np.where(test_graph_mask)[0] + test_node_id = torch.tensor(test_node_id, dtype=torch.long) + + test_edge_index, _ = subgraph( + test_graph_mask, + edge_index, + relabel_nodes=True, + ) + test_data = Data( + edge_index=test_edge_index, + x=features[test_graph_mask], + y=targets[test_graph_mask], + mask=test_label_mask[test_graph_mask], + x_numerical_mask=num_mask, + x_fraction_mask=frac_mask, + x_categorical_mask=cat_mask, + node_id=test_node_id, + ) + + return [train_data, val_data, test_data] + + def process(self) -> None: + data = ( + 
self._get_transductive_data() + if self.split in ['RL', 'RH', 'TH'] + else self._get_inductive_data() + ) + if self._to_undirected: + transform = ToUndirected() + for idx, d in enumerate(data): + data[idx] = transform(d) + + self.save(data, self.processed_paths[0]) + + def __repr__(self) -> str: + return f'{self.__class__.__name__}(name={self.name})' From 7f10551c912dd9a91d6eeebdea2d67bed2559158 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:12:54 +0000 Subject: [PATCH 02/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/datasets/test_graphland.py | 22 ++-- torch_geometric/datasets/graphland.py | 149 ++++++++++---------------- 2 files changed, 65 insertions(+), 106 deletions(-) diff --git a/test/datasets/test_graphland.py b/test/datasets/test_graphland.py index 5d81ad9a8c42..b9e9bd036ee4 100644 --- a/test/datasets/test_graphland.py +++ b/test/datasets/test_graphland.py @@ -1,8 +1,8 @@ import pytest import torch -from torch_geometric.testing import withPackage, onlyOnline from torch_geometric.datasets import GraphLandDataset +from torch_geometric.testing import onlyOnline, withPackage @onlyOnline @@ -35,27 +35,17 @@ def test_transductive_graphland(name: str): data = dataset[0] assert data.num_nodes == data.x.shape[0] == data.y.shape[0] - assert not ( - data.train_mask & - data.val_mask & - data.test_mask - ).any().item() + assert not (data.train_mask & data.val_mask & data.test_mask).any().item() labeled_mask = data.train_mask | data.val_mask | data.test_mask assert not torch.isnan(data.y[labeled_mask]).any().item() assert not torch.isnan(data.x).any().item() - assert not ( - data.x_numerical_mask & - data.x_fraction_mask & - data.x_categorical_mask - ).any().item() + assert not (data.x_numerical_mask & data.x_fraction_mask + & data.x_categorical_mask).any().item() - assert ( - data.x_numerical_mask | - 
data.x_fraction_mask | - data.x_categorical_mask - ).all().item() + assert (data.x_numerical_mask | data.x_fraction_mask + | data.x_categorical_mask).all().item() @onlyOnline diff --git a/torch_geometric/datasets/graphland.py b/torch_geometric/datasets/graphland.py index f62af62eaff7..a2ac95b47e1c 100644 --- a/torch_geometric/datasets/graphland.py +++ b/torch_geometric/datasets/graphland.py @@ -1,19 +1,20 @@ import os import os.path as osp -import yaml from functools import partial from typing import Callable, Optional import numpy as np import pandas as pd import torch +import yaml from sklearn.impute import SimpleImputer from sklearn.preprocessing import ( - OneHotEncoder, MinMaxScaler, + OneHotEncoder, QuantileTransformer, StandardScaler, ) + from torch_geometric.data import ( Data, InMemoryDataset, @@ -25,7 +26,7 @@ def _load_yaml(path: str) -> dict: - with open(path, 'r') as f: + with open(path) as f: return yaml.safe_load(f) @@ -197,23 +198,28 @@ class GraphLandDataset(InMemoryDataset): """ _url = 'https://zenodo.org/records/16895532' _transforms = { - 'standard_scaler': partial(StandardScaler, copy=False), - 'min_max_scaler': partial(MinMaxScaler, clip=False, copy=False), - 'quantile_transform_normal': partial( + 'standard_scaler': + partial(StandardScaler, copy=False), + 'min_max_scaler': + partial(MinMaxScaler, clip=False, copy=False), + 'quantile_transform_normal': + partial( QuantileTransformer, output_distribution='normal', subsample=None, random_state=0, copy=False, ), - 'quantile_transform_uniform': partial( + 'quantile_transform_uniform': + partial( QuantileTransformer, output_distribution='uniform', subsample=None, random_state=0, copy=False, ), - 'one_hot_encoding': partial( + 'one_hot_encoding': + partial( OneHotEncoder, drop='if_binary', sparse_output=False, @@ -226,15 +232,15 @@ def __init__( root: str, name: str, split: str, - numerical_features_transform: Optional[str] = - 'quantile_transform_normal', + numerical_features_transform: Optional[ 
+            str] = 'quantile_transform_normal',
         fraction_features_transform: Optional[str] = None,
         categorical_features_transform: Optional[str] = 'one_hot_encoding',
         regression_targets_transform: Optional[str] = 'standard_scaler',
-        numerical_features_nan_imputation_strategy: Optional[str] =
-        'most_frequent',
-        fraction_features_nan_imputation_strategy: Optional[str] =
-        'most_frequent',
+        numerical_features_nan_imputation_strategy: Optional[
+            str] = 'most_frequent',
+        fraction_features_nan_imputation_strategy: Optional[
+            str] = 'most_frequent',
         to_undirected: bool = False,
         transform: Optional[Callable] = None,
         pre_transform: Optional[Callable] = None,
@@ -265,10 +271,8 @@ def __init__(
                 'city-roads-M',
                 'city-roads-L',
                 'web-traffic',
-            ], (
-                'Temporal split is not available for city-reviews, '
-                'city-roads-M, city-roads-L, web-traffic.'
-            )
+            ], ('Temporal split is not available for city-reviews, '
+                'city-roads-M, city-roads-L, web-traffic.')
 
         if numerical_features_transform is not None:
             assert numerical_features_transform in [
@@ -276,10 +280,8 @@ def __init__(
                 'min_max_scaler',
                 'quantile_transform_normal',
                 'quantile_transform_uniform',
-            ], (
-                'Unsupported numerical features transform: '
-                f'{numerical_features_transform}'
-            )
+            ], ('Unsupported numerical features transform: '
+                f'{numerical_features_transform}')
 
         if fraction_features_transform is not None:
             assert fraction_features_transform in [
@@ -287,25 +289,19 @@ def __init__(
                 'min_max_scaler',
                 'quantile_transform_normal',
                 'quantile_transform_uniform',
-            ], (
-                'Unsupported fraction features transform: '
-                f'{fraction_features_transform}'
-            )
+            ], ('Unsupported fraction features transform: '
+                f'{fraction_features_transform}')
 
         if categorical_features_transform is not None:
             assert categorical_features_transform == 'one_hot_encoding', (
                 'Unsupported categorical features transform: '
-                f'{categorical_features_transform}'
-            )
+                f'{categorical_features_transform}')
 
         if regression_targets_transform is not None:
             assert 
regression_targets_transform in [ - 'standard_scaler', - 'min_max_scaler' - ], ( - 'Unsupported regression targets transform:' - f'{regression_targets_transform}' - ) + 'standard_scaler', 'min_max_scaler' + ], ('Unsupported regression targets transform:' + f'{regression_targets_transform}') self.name = name self.split = split @@ -317,12 +313,8 @@ def __init__( self._frac_imputation = fraction_features_nan_imputation_strategy self._to_undirected = to_undirected - super().__init__( - root, - transform, - pre_transform, - force_reload=force_reload - ) + super().__init__(root, transform, pre_transform, + force_reload=force_reload) self.load(self.processed_paths[0]) @property @@ -411,10 +403,8 @@ def _get_transductive_data(self) -> list[Data]: # >>> process targets targets = raw_data['targets'] labeled_mask = ~np.isnan(targets) - if ( - raw_data['info']['task'] == 'regression' and - self._reg_transform is not None - ): + if (raw_data['info']['task'] == 'regression' + and self._reg_transform is not None): targets = targets.reshape(-1, 1) transform = self._transforms[self._reg_transform]() transform.fit(targets[raw_data['masks']['train']]) @@ -429,10 +419,8 @@ def _get_transductive_data(self) -> list[Data]: transform.fit(num_features) num_features = SimpleImputer( - missing_values=np.nan, - strategy=self._num_imputation, - copy=False - ).fit_transform(num_features) + missing_values=np.nan, strategy=self._num_imputation, + copy=False).fit_transform(num_features) if self._num_transform is not None: num_features = transform.transform(num_features) @@ -445,10 +433,8 @@ def _get_transductive_data(self) -> list[Data]: transform.fit(frac_features) frac_features = SimpleImputer( - missing_values=np.nan, - strategy=self._frac_imputation, - copy=False - ).fit_transform(frac_features) + missing_values=np.nan, strategy=self._frac_imputation, + copy=False).fit_transform(frac_features) if self._frac_transform is not None: frac_features = transform.transform(frac_features) @@ -456,10 
+442,8 @@ def _get_transductive_data(self) -> list[Data]: # >>> process categorical features cat_features = raw_data['cat_features'] if cat_features.size > 0 and self._cat_transform is not None: - cat_features = ( - self._transforms[self._cat_transform]() - .fit_transform(cat_features) - ) + cat_features = (self._transforms[self._cat_transform] + ().fit_transform(cat_features)) # >>> concatenate features and make features mask features = np.concatenate( @@ -468,15 +452,15 @@ def _get_transductive_data(self) -> list[Data]: ) features = torch.tensor(features, dtype=torch.float) - num_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + num_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) num_mask[:num_features.shape[1]] = True num_mask = torch.tensor(num_mask, dtype=torch.bool) - frac_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + frac_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) frac_mask[num_features.shape[1]:-cat_features.shape[1]] = True frac_mask = torch.tensor(frac_mask, dtype=torch.bool) - cat_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + cat_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) cat_mask[-cat_features.shape[1]:] = True cat_mask = torch.tensor(cat_mask, dtype=torch.bool) @@ -515,10 +499,8 @@ def _get_inductive_data(self) -> list[Data]: # >>> process targets targets = raw_data['targets'] labeled_mask = ~np.isnan(targets) - if ( - raw_data['info']['task'] == 'regression' and - self._reg_transform is not None - ): + if (raw_data['info']['task'] == 'regression' + and self._reg_transform is not None): targets = targets.reshape(-1, 1) transform = self._transforms[self._reg_transform]() transform.fit(targets[transform_mask]) @@ -532,11 +514,8 @@ def _get_inductive_data(self) -> list[Data]: transform = self._transforms[self._num_transform]() transform.fit(num_features[transform_mask]) - imputer = SimpleImputer( - missing_values=np.nan, - strategy=self._num_imputation, - copy=False - ) + imputer = 
SimpleImputer(missing_values=np.nan, + strategy=self._num_imputation, copy=False) imputer.fit(num_features[transform_mask]) num_features = imputer.transform(num_features) @@ -550,11 +529,8 @@ def _get_inductive_data(self) -> list[Data]: transform = self._transforms[self._frac_transform]() transform.fit(frac_features[transform_mask]) - imputer = SimpleImputer( - missing_values=np.nan, - strategy=self._frac_imputation, - copy=False - ) + imputer = SimpleImputer(missing_values=np.nan, + strategy=self._frac_imputation, copy=False) imputer.fit(frac_features[transform_mask]) frac_features = imputer.transform(frac_features) @@ -575,15 +551,15 @@ def _get_inductive_data(self) -> list[Data]: ) features = torch.tensor(features, dtype=torch.float) - num_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + num_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) num_mask[:num_features.shape[1]] = True num_mask = torch.tensor(num_mask, dtype=torch.bool) - frac_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + frac_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) frac_mask[num_features.shape[1]:-cat_features.shape[1]] = True frac_mask = torch.tensor(frac_mask, dtype=torch.bool) - cat_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + cat_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) cat_mask[-cat_features.shape[1]:] = True cat_mask = torch.tensor(cat_mask, dtype=torch.bool) @@ -618,10 +594,8 @@ def _get_inductive_data(self) -> list[Data]: ) # --- val - val_graph_mask = ( - raw_data['masks']['train'] | - raw_data['masks']['val'] - ) + val_graph_mask = (raw_data['masks']['train'] + | raw_data['masks']['val']) val_graph_mask = torch.tensor(val_graph_mask, dtype=torch.bool) val_label_mask = raw_data['masks']['val'] & labeled_mask @@ -647,11 +621,9 @@ def _get_inductive_data(self) -> list[Data]: ) # --- test - test_graph_mask = ( - raw_data['masks']['train'] | - raw_data['masks']['val'] | - raw_data['masks']['test'] - ) + test_graph_mask = 
(raw_data['masks']['train'] + | raw_data['masks']['val'] + | raw_data['masks']['test']) test_graph_mask = torch.tensor(test_graph_mask, dtype=torch.bool) test_label_mask = raw_data['masks']['test'] & labeled_mask @@ -679,11 +651,8 @@ def _get_inductive_data(self) -> list[Data]: return [train_data, val_data, test_data] def process(self) -> None: - data = ( - self._get_transductive_data() - if self.split in ['RL', 'RH', 'TH'] - else self._get_inductive_data() - ) + data = (self._get_transductive_data() if self.split + in ['RL', 'RH', 'TH'] else self._get_inductive_data()) if self._to_undirected: transform = ToUndirected() for idx, d in enumerate(data): From 3acac128fd770382134877e3e898a728cd89a747 Mon Sep 17 00:00:00 2001 From: Gleb Bazhenov Date: Tue, 16 Sep 2025 18:19:14 +0300 Subject: [PATCH 03/13] try to fix dataset class and test --- test/datasets/test_graphland.py | 26 ++--- torch_geometric/datasets/graphland.py | 152 +++++++++++--------------- 2 files changed, 69 insertions(+), 109 deletions(-) diff --git a/test/datasets/test_graphland.py b/test/datasets/test_graphland.py index 5d81ad9a8c42..0d455a46fab3 100644 --- a/test/datasets/test_graphland.py +++ b/test/datasets/test_graphland.py @@ -1,12 +1,12 @@ import pytest import torch -from torch_geometric.testing import withPackage, onlyOnline from torch_geometric.datasets import GraphLandDataset +from torch_geometric.testing import onlyOnline, withPackage @onlyOnline -@withPackage('pandas', 'sklearn') +@withPackage('pandas', 'sklearn', 'yaml') @pytest.mark.parametrize('name', [ 'hm-categories', 'pokec-regions', @@ -35,31 +35,21 @@ def test_transductive_graphland(name: str): data = dataset[0] assert data.num_nodes == data.x.shape[0] == data.y.shape[0] - assert not ( - data.train_mask & - data.val_mask & - data.test_mask - ).any().item() + assert not (data.train_mask & data.val_mask & data.test_mask).any().item() labeled_mask = data.train_mask | data.val_mask | data.test_mask assert not 
torch.isnan(data.y[labeled_mask]).any().item() assert not torch.isnan(data.x).any().item() - assert not ( - data.x_numerical_mask & - data.x_fraction_mask & - data.x_categorical_mask - ).any().item() + assert not (data.x_numerical_mask & data.x_fraction_mask + & data.x_categorical_mask).any().item() - assert ( - data.x_numerical_mask | - data.x_fraction_mask | - data.x_categorical_mask - ).all().item() + assert (data.x_numerical_mask | data.x_fraction_mask + | data.x_categorical_mask).all().item() @onlyOnline -@withPackage('pandas', 'sklearn') +@withPackage('pandas', 'sklearn', 'yaml') @pytest.mark.parametrize('name', [ 'hm-categories', 'pokec-regions', diff --git a/torch_geometric/datasets/graphland.py b/torch_geometric/datasets/graphland.py index f62af62eaff7..928a4d1a7d35 100644 --- a/torch_geometric/datasets/graphland.py +++ b/torch_geometric/datasets/graphland.py @@ -1,19 +1,19 @@ import os import os.path as osp -import yaml from functools import partial from typing import Callable, Optional import numpy as np -import pandas as pd import torch +import yaml from sklearn.impute import SimpleImputer from sklearn.preprocessing import ( - OneHotEncoder, MinMaxScaler, + OneHotEncoder, QuantileTransformer, StandardScaler, ) + from torch_geometric.data import ( Data, InMemoryDataset, @@ -25,7 +25,7 @@ def _load_yaml(path: str) -> dict: - with open(path, 'r') as f: + with open(path) as f: return yaml.safe_load(f) @@ -197,23 +197,28 @@ class GraphLandDataset(InMemoryDataset): """ _url = 'https://zenodo.org/records/16895532' _transforms = { - 'standard_scaler': partial(StandardScaler, copy=False), - 'min_max_scaler': partial(MinMaxScaler, clip=False, copy=False), - 'quantile_transform_normal': partial( + 'standard_scaler': + partial(StandardScaler, copy=False), + 'min_max_scaler': + partial(MinMaxScaler, clip=False, copy=False), + 'quantile_transform_normal': + partial( QuantileTransformer, output_distribution='normal', subsample=None, random_state=0, copy=False, ), - 
'quantile_transform_uniform': partial( + 'quantile_transform_uniform': + partial( QuantileTransformer, output_distribution='uniform', subsample=None, random_state=0, copy=False, ), - 'one_hot_encoding': partial( + 'one_hot_encoding': + partial( OneHotEncoder, drop='if_binary', sparse_output=False, @@ -226,15 +231,15 @@ def __init__( root: str, name: str, split: str, - numerical_features_transform: Optional[str] = - 'quantile_transform_normal', + numerical_features_transform: Optional[ + str] = 'quantile_transform_normal', fraction_features_transform: Optional[str] = None, categorical_features_transform: Optional[str] = 'one_hot_encoding', regression_targets_transform: Optional[str] = 'standard_scaler', - numerical_features_nan_imputation_strategy: Optional[str] = - 'most_frequent', - fraction_features_nan_imputation_strategy: Optional[str] = - 'most_frequent', + numerical_features_nan_imputation_strategy: Optional[ + str] = 'most_frequent', + fraction_features_nan_imputation_strategy: Optional[ + str] = 'most_frequent', to_undirected: bool = False, transform: Optional[Callable] = None, pre_transform: Optional[Callable] = None, @@ -265,10 +270,8 @@ def __init__( 'city-roads-M', 'city-roads-L', 'web-trafic', - ], ( - 'Temporal split is not available for city-reviews, ' - 'city-roads-M, city-roads-L, web-trafic.' 
- ) + ], ('Temporal split is not available for city-reviews, ' + 'city-roads-M, city-roads-L, web-trafic.') if numerical_features_transform is not None: assert numerical_features_transform in [ @@ -276,10 +279,8 @@ def __init__( 'min_max_scaler', 'quantile_transform_normal', 'quantile_transform_uniform', - ], ( - 'Unsupported numerical features transform: ' - f'{numerical_features_transform}' - ) + ], ('Unsupported numerical features transform: ' + f'{numerical_features_transform}') if fraction_features_transform is not None: assert fraction_features_transform in [ @@ -287,25 +288,19 @@ def __init__( 'min_max_scaler', 'quantile_transform_normal', 'quantile_transform_uniform', - ], ( - 'Unsupported fraction features transform: ' - f'{fraction_features_transform}' - ) + ], ('Unsupported fraction features transform: ' + f'{fraction_features_transform}') if categorical_features_transform is not None: assert categorical_features_transform == 'one_hot_encoding', ( 'Unsupported categorical features transform: ' - f'{categorical_features_transform}' - ) + f'{categorical_features_transform}') if regression_targets_transform is not None: assert regression_targets_transform in [ - 'standard_scaler', - 'min_max_scaler' - ], ( - 'Unsupported regression targets transform:' - f'{regression_targets_transform}' - ) + 'standard_scaler', 'min_max_scaler' + ], ('Unsupported regression targets transform:' + f'{regression_targets_transform}') self.name = name self.split = split @@ -317,12 +312,8 @@ def __init__( self._frac_imputation = fraction_features_nan_imputation_strategy self._to_undirected = to_undirected - super().__init__( - root, - transform, - pre_transform, - force_reload=force_reload - ) + super().__init__(root, transform, pre_transform, + force_reload=force_reload) self.load(self.processed_paths[0]) @property @@ -358,6 +349,8 @@ def download(self) -> None: os.unlink(path) def _get_raw_data(self) -> dict: + import pandas as pd + raw_data_dir = osp.join(self.raw_dir, 
self.name) info = _load_yaml(osp.join(raw_data_dir, 'info.yaml')) @@ -411,10 +404,8 @@ def _get_transductive_data(self) -> list[Data]: # >>> process targets targets = raw_data['targets'] labeled_mask = ~np.isnan(targets) - if ( - raw_data['info']['task'] == 'regression' and - self._reg_transform is not None - ): + if (raw_data['info']['task'] == 'regression' + and self._reg_transform is not None): targets = targets.reshape(-1, 1) transform = self._transforms[self._reg_transform]() transform.fit(targets[raw_data['masks']['train']]) @@ -429,10 +420,8 @@ def _get_transductive_data(self) -> list[Data]: transform.fit(num_features) num_features = SimpleImputer( - missing_values=np.nan, - strategy=self._num_imputation, - copy=False - ).fit_transform(num_features) + missing_values=np.nan, strategy=self._num_imputation, + copy=False).fit_transform(num_features) if self._num_transform is not None: num_features = transform.transform(num_features) @@ -445,10 +434,8 @@ def _get_transductive_data(self) -> list[Data]: transform.fit(frac_features) frac_features = SimpleImputer( - missing_values=np.nan, - strategy=self._frac_imputation, - copy=False - ).fit_transform(frac_features) + missing_values=np.nan, strategy=self._frac_imputation, + copy=False).fit_transform(frac_features) if self._frac_transform is not None: frac_features = transform.transform(frac_features) @@ -456,10 +443,8 @@ def _get_transductive_data(self) -> list[Data]: # >>> process categorical features cat_features = raw_data['cat_features'] if cat_features.size > 0 and self._cat_transform is not None: - cat_features = ( - self._transforms[self._cat_transform]() - .fit_transform(cat_features) - ) + cat_features = (self._transforms[self._cat_transform] + ().fit_transform(cat_features)) # >>> concatenate features and make features mask features = np.concatenate( @@ -468,15 +453,15 @@ def _get_transductive_data(self) -> list[Data]: ) features = torch.tensor(features, dtype=torch.float) - num_mask = 
np.zeros(shape=(features.shape[1],), dtype=bool) + num_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) num_mask[:num_features.shape[1]] = True num_mask = torch.tensor(num_mask, dtype=torch.bool) - frac_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + frac_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) frac_mask[num_features.shape[1]:-cat_features.shape[1]] = True frac_mask = torch.tensor(frac_mask, dtype=torch.bool) - cat_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + cat_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) cat_mask[-cat_features.shape[1]:] = True cat_mask = torch.tensor(cat_mask, dtype=torch.bool) @@ -515,10 +500,8 @@ def _get_inductive_data(self) -> list[Data]: # >>> process targets targets = raw_data['targets'] labeled_mask = ~np.isnan(targets) - if ( - raw_data['info']['task'] == 'regression' and - self._reg_transform is not None - ): + if (raw_data['info']['task'] == 'regression' + and self._reg_transform is not None): targets = targets.reshape(-1, 1) transform = self._transforms[self._reg_transform]() transform.fit(targets[transform_mask]) @@ -532,11 +515,8 @@ def _get_inductive_data(self) -> list[Data]: transform = self._transforms[self._num_transform]() transform.fit(num_features[transform_mask]) - imputer = SimpleImputer( - missing_values=np.nan, - strategy=self._num_imputation, - copy=False - ) + imputer = SimpleImputer(missing_values=np.nan, + strategy=self._num_imputation, copy=False) imputer.fit(num_features[transform_mask]) num_features = imputer.transform(num_features) @@ -550,11 +530,8 @@ def _get_inductive_data(self) -> list[Data]: transform = self._transforms[self._frac_transform]() transform.fit(frac_features[transform_mask]) - imputer = SimpleImputer( - missing_values=np.nan, - strategy=self._frac_imputation, - copy=False - ) + imputer = SimpleImputer(missing_values=np.nan, + strategy=self._frac_imputation, copy=False) imputer.fit(frac_features[transform_mask]) frac_features = 
imputer.transform(frac_features) @@ -575,15 +552,15 @@ def _get_inductive_data(self) -> list[Data]: ) features = torch.tensor(features, dtype=torch.float) - num_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + num_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) num_mask[:num_features.shape[1]] = True num_mask = torch.tensor(num_mask, dtype=torch.bool) - frac_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + frac_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) frac_mask[num_features.shape[1]:-cat_features.shape[1]] = True frac_mask = torch.tensor(frac_mask, dtype=torch.bool) - cat_mask = np.zeros(shape=(features.shape[1],), dtype=bool) + cat_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) cat_mask[-cat_features.shape[1]:] = True cat_mask = torch.tensor(cat_mask, dtype=torch.bool) @@ -618,10 +595,8 @@ def _get_inductive_data(self) -> list[Data]: ) # --- val - val_graph_mask = ( - raw_data['masks']['train'] | - raw_data['masks']['val'] - ) + val_graph_mask = (raw_data['masks']['train'] + | raw_data['masks']['val']) val_graph_mask = torch.tensor(val_graph_mask, dtype=torch.bool) val_label_mask = raw_data['masks']['val'] & labeled_mask @@ -647,11 +622,9 @@ def _get_inductive_data(self) -> list[Data]: ) # --- test - test_graph_mask = ( - raw_data['masks']['train'] | - raw_data['masks']['val'] | - raw_data['masks']['test'] - ) + test_graph_mask = (raw_data['masks']['train'] + | raw_data['masks']['val'] + | raw_data['masks']['test']) test_graph_mask = torch.tensor(test_graph_mask, dtype=torch.bool) test_label_mask = raw_data['masks']['test'] & labeled_mask @@ -679,11 +652,8 @@ def _get_inductive_data(self) -> list[Data]: return [train_data, val_data, test_data] def process(self) -> None: - data = ( - self._get_transductive_data() - if self.split in ['RL', 'RH', 'TH'] - else self._get_inductive_data() - ) + data = (self._get_transductive_data() if self.split + in ['RL', 'RH', 'TH'] else self._get_inductive_data()) if 
self._to_undirected: transform = ToUndirected() for idx, d in enumerate(data): From d7a49517e0e9b672c46cd77ca308346cfda9b6e3 Mon Sep 17 00:00:00 2001 From: Gleb Bazhenov Date: Tue, 16 Sep 2025 18:22:13 +0300 Subject: [PATCH 04/13] try to fix imports --- torch_geometric/datasets/graphland.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch_geometric/datasets/graphland.py b/torch_geometric/datasets/graphland.py index 928a4d1a7d35..8b77375b13f5 100644 --- a/torch_geometric/datasets/graphland.py +++ b/torch_geometric/datasets/graphland.py @@ -5,7 +5,6 @@ import numpy as np import torch -import yaml from sklearn.impute import SimpleImputer from sklearn.preprocessing import ( MinMaxScaler, @@ -25,6 +24,7 @@ def _load_yaml(path: str) -> dict: + import yaml with open(path) as f: return yaml.safe_load(f) From 4d93e687c66cfe8a82faf5c0eacd89a293f3ebea Mon Sep 17 00:00:00 2001 From: Gleb Bazhenov Date: Tue, 16 Sep 2025 18:22:13 +0300 Subject: [PATCH 05/13] try to fix imports --- torch_geometric/datasets/graphland.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/torch_geometric/datasets/graphland.py b/torch_geometric/datasets/graphland.py index 928a4d1a7d35..eef848be8854 100644 --- a/torch_geometric/datasets/graphland.py +++ b/torch_geometric/datasets/graphland.py @@ -5,14 +5,6 @@ import numpy as np import torch -import yaml -from sklearn.impute import SimpleImputer -from sklearn.preprocessing import ( - MinMaxScaler, - OneHotEncoder, - QuantileTransformer, - StandardScaler, -) from torch_geometric.data import ( Data, @@ -25,6 +17,7 @@ def _load_yaml(path: str) -> dict: + import yaml with open(path) as f: return yaml.safe_load(f) @@ -195,6 +188,13 @@ class GraphLandDataset(InMemoryDataset): - True - regression """ + from sklearn.impute import SimpleImputer + from sklearn.preprocessing import ( + MinMaxScaler, + OneHotEncoder, + QuantileTransformer, + StandardScaler, + ) _url = 
'https://zenodo.org/records/16895532' _transforms = { 'standard_scaler': @@ -225,6 +225,7 @@ class GraphLandDataset(InMemoryDataset): handle_unknown='ignore', ), } + _imputer = SimpleImputer def __init__( self, @@ -419,7 +420,7 @@ def _get_transductive_data(self) -> list[Data]: transform = self._transforms[self._num_transform]() transform.fit(num_features) - num_features = SimpleImputer( + num_features = self._imputer( missing_values=np.nan, strategy=self._num_imputation, copy=False).fit_transform(num_features) @@ -433,7 +434,7 @@ def _get_transductive_data(self) -> list[Data]: transform = self._transforms[self._frac_transform]() transform.fit(frac_features) - frac_features = SimpleImputer( + frac_features = self._imputer( missing_values=np.nan, strategy=self._frac_imputation, copy=False).fit_transform(frac_features) @@ -515,7 +516,7 @@ def _get_inductive_data(self) -> list[Data]: transform = self._transforms[self._num_transform]() transform.fit(num_features[transform_mask]) - imputer = SimpleImputer(missing_values=np.nan, + imputer = self._imputer(missing_values=np.nan, strategy=self._num_imputation, copy=False) imputer.fit(num_features[transform_mask]) num_features = imputer.transform(num_features) @@ -530,7 +531,7 @@ def _get_inductive_data(self) -> list[Data]: transform = self._transforms[self._frac_transform]() transform.fit(frac_features[transform_mask]) - imputer = SimpleImputer(missing_values=np.nan, + imputer = self._imputer(missing_values=np.nan, strategy=self._frac_imputation, copy=False) imputer.fit(frac_features[transform_mask]) frac_features = imputer.transform(frac_features) From 4241b84215ce7ad3c6de993a6a54c0f8c860fbaf Mon Sep 17 00:00:00 2001 From: Gleb Bazhenov Date: Wed, 17 Sep 2025 13:42:36 +0300 Subject: [PATCH 06/13] fix dtypes --- torch_geometric/datasets/graphland.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch_geometric/datasets/graphland.py b/torch_geometric/datasets/graphland.py index eef848be8854..849bf5b72120 100644 --- 
a/torch_geometric/datasets/graphland.py +++ b/torch_geometric/datasets/graphland.py @@ -223,6 +223,7 @@ class GraphLandDataset(InMemoryDataset): drop='if_binary', sparse_output=False, handle_unknown='ignore', + dtype=np.float32, ), } _imputer = SimpleImputer @@ -364,18 +365,22 @@ def _get_raw_data(self) -> dict: if name not in info['fraction_features_names'] ] num_features = features_df[num_features_names].values + num_features = num_features.astype(np.float32) cat_features_names = info['categorical_features_names'] cat_features = features_df[cat_features_names].values + cat_features = cat_features.astype(np.int32) frac_features_names = info['fraction_features_names'] frac_features = features_df[frac_features_names].values + frac_features = frac_features.astype(np.float32) targets_df = pd.read_csv( osp.join(raw_data_dir, 'targets.csv'), index_col=0, ) targets = targets_df[info['target_name']].values + targets = targets.astype(np.float32) masks_df = pd.read_csv( osp.join(raw_data_dir, f'split_masks_{self.split[:2]}.csv'), From 09839e37cbeb2cd9426845bccba1e3391ba84199 Mon Sep 17 00:00:00 2001 From: Gleb Bazhenov Date: Wed, 17 Sep 2025 13:49:57 +0300 Subject: [PATCH 07/13] try to move imports into functions --- torch_geometric/datasets/graphland.py | 77 ++++++++++++++------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/torch_geometric/datasets/graphland.py b/torch_geometric/datasets/graphland.py index 849bf5b72120..8c4e399dc223 100644 --- a/torch_geometric/datasets/graphland.py +++ b/torch_geometric/datasets/graphland.py @@ -188,45 +188,7 @@ class GraphLandDataset(InMemoryDataset): - True - regression """ - from sklearn.impute import SimpleImputer - from sklearn.preprocessing import ( - MinMaxScaler, - OneHotEncoder, - QuantileTransformer, - StandardScaler, - ) _url = 'https://zenodo.org/records/16895532' - _transforms = { - 'standard_scaler': - partial(StandardScaler, copy=False), - 'min_max_scaler': - partial(MinMaxScaler, clip=False, 
copy=False), - 'quantile_transform_normal': - partial( - QuantileTransformer, - output_distribution='normal', - subsample=None, - random_state=0, - copy=False, - ), - 'quantile_transform_uniform': - partial( - QuantileTransformer, - output_distribution='uniform', - subsample=None, - random_state=0, - copy=False, - ), - 'one_hot_encoding': - partial( - OneHotEncoder, - drop='if_binary', - sparse_output=False, - handle_unknown='ignore', - dtype=np.float32, - ), - } - _imputer = SimpleImputer def __init__( self, @@ -314,6 +276,45 @@ def __init__( self._frac_imputation = fraction_features_nan_imputation_strategy self._to_undirected = to_undirected + from sklearn.impute import SimpleImputer + from sklearn.preprocessing import ( + MinMaxScaler, + OneHotEncoder, + QuantileTransformer, + StandardScaler, + ) + self._transforms = { + 'standard_scaler': + partial(StandardScaler, copy=False), + 'min_max_scaler': + partial(MinMaxScaler, clip=False, copy=False), + 'quantile_transform_normal': + partial( + QuantileTransformer, + output_distribution='normal', + subsample=None, + random_state=0, + copy=False, + ), + 'quantile_transform_uniform': + partial( + QuantileTransformer, + output_distribution='uniform', + subsample=None, + random_state=0, + copy=False, + ), + 'one_hot_encoding': + partial( + OneHotEncoder, + drop='if_binary', + sparse_output=False, + handle_unknown='ignore', + dtype=np.float32, + ), + } + self._imputer = SimpleImputer + super().__init__(root, transform, pre_transform, force_reload=force_reload) self.load(self.processed_paths[0]) From d2ae7072e2691932cc92116a10c429882858a5d3 Mon Sep 17 00:00:00 2001 From: Gleb Bazhenov Date: Wed, 17 Sep 2025 14:14:46 +0300 Subject: [PATCH 08/13] reduce number of tests --- test/datasets/test_graphland.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/test/datasets/test_graphland.py b/test/datasets/test_graphland.py index 0d455a46fab3..458156e9d9f2 100644 --- a/test/datasets/test_graphland.py +++ 
b/test/datasets/test_graphland.py @@ -9,19 +9,8 @@ @withPackage('pandas', 'sklearn', 'yaml') @pytest.mark.parametrize('name', [ 'hm-categories', - 'pokec-regions', - 'web-topics', 'tolokers-2', - 'city-reviews', - 'artnet-exp', - 'web-fraud', - 'hm-prices', 'avazu-ctr', - 'city-roads-M', - 'city-roads-L', - 'twitch-views', - 'artnet-views', - 'web-traffic', ]) def test_transductive_graphland(name: str): dataset = GraphLandDataset( @@ -52,15 +41,8 @@ def test_transductive_graphland(name: str): @withPackage('pandas', 'sklearn', 'yaml') @pytest.mark.parametrize('name', [ 'hm-categories', - 'pokec-regions', - 'web-topics', 'tolokers-2', - 'artnet-exp', - 'web-fraud', - 'hm-prices', 'avazu-ctr', - 'twitch-views', - 'artnet-views', ]) def test_inductive_graphland(name: str): base_data = GraphLandDataset( From 7bd1d679d20dd260db388680c7942b125b87d395 Mon Sep 17 00:00:00 2001 From: Gleb Bazhenov Date: Wed, 17 Sep 2025 14:31:58 +0300 Subject: [PATCH 09/13] add pull request link to changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 75ccb50fb0f5..ddf0aaed3b63 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added -- Added GraphLand benchmark via `GraphLandDataset` +- Added GraphLand benchmark via `GraphLandDataset` ([#10458](https://github.com/pyg-team/pytorch_geometric/pull/10458)) - Added `torch_geometric.llm` and its examples ([#10436](https://github.com/pyg-team/pytorch_geometric/pull/10436)) - Added support for negative weights in `sparse_cross_entropy` ([#10432](https://github.com/pyg-team/pytorch_geometric/pull/10432)) - Added `connected_components()` method to `Data` and `HeterData` ([#10388](https://github.com/pyg-team/pytorch_geometric/pull/10388)) From e5a5e36e125c3e4f6a907bd1ecf6f4b5a8620de5 Mon Sep 17 00:00:00 2001 From: Gleb Bazhenov Date: Mon, 6 Oct 2025 18:28:26 +0300 Subject: [PATCH 10/13] try to fix linter issues --- torch_geometric/datasets/graphland.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/torch_geometric/datasets/graphland.py b/torch_geometric/datasets/graphland.py index 8c4e399dc223..edc7d1a20f36 100644 --- a/torch_geometric/datasets/graphland.py +++ b/torch_geometric/datasets/graphland.py @@ -460,17 +460,14 @@ def _get_transductive_data(self) -> list[Data]: ) features = torch.tensor(features, dtype=torch.float) - num_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) + num_mask = torch.zeros(features.shape[1], dtype=torch.bool) num_mask[:num_features.shape[1]] = True - num_mask = torch.tensor(num_mask, dtype=torch.bool) - frac_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) + frac_mask = torch.zeros(features.shape[1], dtype=torch.bool) frac_mask[num_features.shape[1]:-cat_features.shape[1]] = True - frac_mask = torch.tensor(frac_mask, dtype=torch.bool) - cat_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) + cat_mask = torch.zeros(features.shape[1], dtype=torch.bool) cat_mask[-cat_features.shape[1]:] = True - cat_mask = torch.tensor(cat_mask, dtype=torch.bool) # >>> update split masks train_mask = raw_data['masks']['train'] & labeled_mask @@ -559,17 +556,14 @@ 
def _get_inductive_data(self) -> list[Data]: ) features = torch.tensor(features, dtype=torch.float) - num_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) + num_mask = torch.zeros(features.shape[1], dtype=torch.bool) num_mask[:num_features.shape[1]] = True - num_mask = torch.tensor(num_mask, dtype=torch.bool) - frac_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) + frac_mask = torch.zeros(features.shape[1], dtype=torch.bool) frac_mask[num_features.shape[1]:-cat_features.shape[1]] = True - frac_mask = torch.tensor(frac_mask, dtype=torch.bool) - cat_mask = np.zeros(shape=(features.shape[1], ), dtype=bool) + cat_mask = torch.zeros(features.shape[1], dtype=torch.bool) cat_mask[-cat_features.shape[1]:] = True - cat_mask = torch.tensor(cat_mask, dtype=torch.bool) # >>> construct Data objects edge_index = raw_data['edges'].T @@ -583,7 +577,8 @@ def _get_inductive_data(self) -> list[Data]: train_label_mask = torch.tensor(train_label_mask, dtype=torch.bool) train_node_id = np.where(train_graph_mask)[0] - train_node_id = torch.tensor(train_node_id, dtype=torch.long) + train_node_id = torch.tensor(train_node_id, + dtype=torch.long) # type: ignore train_edge_index, _ = subgraph( train_graph_mask, @@ -610,7 +605,8 @@ def _get_inductive_data(self) -> list[Data]: val_label_mask = torch.tensor(val_label_mask, dtype=torch.bool) val_node_id = np.where(val_graph_mask)[0] - val_node_id = torch.tensor(val_node_id, dtype=torch.long) + val_node_id = torch.tensor(val_node_id, + dtype=torch.long) # type: ignore val_edge_index, _ = subgraph( val_graph_mask, @@ -638,7 +634,8 @@ def _get_inductive_data(self) -> list[Data]: test_label_mask = torch.tensor(test_label_mask, dtype=torch.bool) test_node_id = np.where(test_graph_mask)[0] - test_node_id = torch.tensor(test_node_id, dtype=torch.long) + test_node_id = torch.tensor(test_node_id, + dtype=torch.long) # type: ignore test_edge_index, _ = subgraph( test_graph_mask, From 5db89fc2f517bc20e06a7c449ba8d9493f3403fd Mon Sep 17 
00:00:00 2001 From: Gleb Bazhenov Date: Mon, 6 Oct 2025 22:08:29 +0300 Subject: [PATCH 11/13] add example --- examples/README.md | 2 + examples/graphland.py | 157 ++++++++++++++++++++++++++ test/datasets/test_graphland.py | 12 +- torch_geometric/datasets/graphland.py | 37 +++--- 4 files changed, 185 insertions(+), 23 deletions(-) create mode 100644 examples/graphland.py diff --git a/examples/README.md b/examples/README.md index 2efce7068990..fe69ed6e3ec8 100644 --- a/examples/README.md +++ b/examples/README.md @@ -24,6 +24,8 @@ For examples on [Open Graph Benchmark](https://ogb.stanford.edu/) datasets, see For an example on [Relational Deep Learning](https://arxiv.org/abs/2312.04615) with the [RelBench datasets](https://relbench.stanford.edu/), see [`rdl.py`](./rdl.py). +For an example on using [GraphLand datasets](https://arxiv.org/abs/2409.14500) for node property prediction, see [`graphland.py`](./graphland.py). + For examples on using `torch.compile`, see the examples under [`examples/compile`](./compile). For examples on scaling PyG up via multi-GPUs, see the examples under [`examples/multi_gpu`](./multi_gpu). 
diff --git a/examples/graphland.py b/examples/graphland.py new file mode 100644 index 000000000000..fc02ae84d4d0 --- /dev/null +++ b/examples/graphland.py @@ -0,0 +1,157 @@ +import argparse + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from sklearn.metrics import accuracy_score, average_precision_score, r2_score +from tqdm import tqdm + +import torch_geometric.nn as pygnn +from torch_geometric.datasets import GraphLandDataset, graphland + +GRAPHLAND_DATASETS = [ + 'hm-categories', + 'pokec-regions', + 'web-topics', + 'tolokers-2', + 'city-reviews', + 'artnet-exp', + 'web-fraud', + 'hm-prices', + 'avazu-ctr', + 'city-roads-M', + 'city-roads-L', + 'twitch-views', + 'artnet-views', + 'web-traffic', +] + + +class Model(torch.nn.Module): + def __init__(self, in_channels: int, hidden_channels: int, + out_channels: int): + super().__init__() + self.conv = pygnn.GCNConv(in_channels, hidden_channels) + self.head = nn.Sequential( + nn.ReLU(), + nn.Linear(hidden_channels, out_channels), + ) + + def forward(self, x: torch.Tensor, + edge_index: torch.Tensor) -> torch.Tensor: + return self.head(self.conv(x, edge_index)) + + +def _get_num_classes(dataset: GraphLandDataset) -> int: + assert dataset.task != 'regression' + targets = torch.cat([data.y for data in dataset], dim=0) + return len(torch.unique(targets[~torch.isnan(targets)])) + + +def _get_model(dataset: GraphLandDataset) -> nn.Module: + return Model( + in_channels=dataset[0].x.shape[1], + hidden_channels=256, + out_channels=(_get_num_classes(dataset) + if dataset.task != 'regression' else 1), + ) + + +def _get_optimizer(model: nn.Module) -> optim.Optimizer: + return optim.Adam(model.parameters(), lr=1e-3) + + +def _train_step(model: nn.Module, dataset: GraphLandDataset, + optimizer: optim.Optimizer) -> float: + def _compute_loss(outputs: torch.Tensor, + targets: torch.Tensor) -> torch.Tensor: + if dataset.task == 'regression': + return 
F.mse_loss(outputs, targets) + else: + return F.cross_entropy(outputs, targets.long()) + + data = dataset[0] + mask = data.train_mask if dataset.split != 'THI' else data.mask + + outputs = model(data.x, data.edge_index).squeeze() + loss = _compute_loss(outputs[mask], data.y[mask]) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + return loss.detach().cpu().item() + + +def _eval_step(model: nn.Module, + dataset: GraphLandDataset) -> dict[str, float]: + def _compute_metric(outputs: np.ndarray, targets: np.ndarray) -> float: + if dataset.task == 'regression': + return float(r2_score(targets, outputs)) + + elif dataset.task == 'binary_classification': + predictions = outputs[:, 1] + return float(average_precision_score(targets, predictions)) + + else: + predictions = np.argmax(outputs, axis=1) + return float(accuracy_score(targets, predictions)) + + metrics = dict() + for idx, part in enumerate(['train', 'val', 'test']): + if dataset.split == 'THI': + data = dataset[idx] + mask = data.mask + else: + data = dataset[0] + mask = getattr(data, f'{part}_mask') + + outputs = model(data.x, data.edge_index).squeeze() + metrics[part] = _compute_metric( + outputs[mask].detach().cpu().numpy(), + data.y[mask].cpu().numpy(), + ) + return metrics + + +def _format_metrics(metrics: dict[str, float]) -> str: + return ', '.join(f'{part}={metrics[part] * 100.0:.2f}' + for part in ['train', 'val', 'test']) + + +def run_experiment(name: str, split: str) -> None: + n_steps = 100 + dataset = GraphLandDataset( + root='./datasets', + split=split, + name=name, + to_undirected=True, + ) + model = _get_model(dataset) + model = model.cuda() + dataset = dataset.copy().cuda() + optimizer = _get_optimizer(model) + + best_metrics = {part: -float('inf') for part in ['train', 'val', 'test']} + pbar = tqdm(range(n_steps)) + for _ in pbar: + loss = _train_step(model, dataset, optimizer) + curr_metrics = _eval_step(model, dataset) + description = f'loss={loss:.4f}, ' + 
_format_metrics(curr_metrics) + pbar.set_postfix_str(description) + if curr_metrics['val'] > best_metrics['val']: + best_metrics = curr_metrics + + print('Best metrics: ' + _format_metrics(best_metrics)) + return best_metrics + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--name', choices=graphland.GRAPHLAND_DATASETS, + help='The name of dataset.', required=True) + parser.add_argument('--split', choices=['RL', 'RH', 'TH', 'THI'], + help='The type of data split.', required=True) + args = parser.parse_args() + run_experiment(args.name, args.split) diff --git a/test/datasets/test_graphland.py b/test/datasets/test_graphland.py index 458156e9d9f2..d8afb8e99b8e 100644 --- a/test/datasets/test_graphland.py +++ b/test/datasets/test_graphland.py @@ -63,10 +63,10 @@ def test_inductive_graphland(name: str): ) assert len(dataset) == 3 - train_data, val_data, test_data = dataset - assert num_nodes == test_data.num_nodes == test_data.node_id.shape[0] - assert num_edges == test_data.num_edges + data_train, data_val, data_test = dataset + assert num_nodes == data_test.num_nodes == data_test.node_id.shape[0] + assert num_edges == data_test.num_edges - assert not torch.isnan(train_data.y[train_data.mask]).any().item() - assert not torch.isnan(val_data.y[val_data.mask]).any().item() - assert not torch.isnan(test_data.y[test_data.mask]).any().item() + assert not torch.isnan(data_train.y[data_train.mask]).any().item() + assert not torch.isnan(data_val.y[data_val.mask]).any().item() + assert not torch.isnan(data_test.y[data_test.mask]).any().item() diff --git a/torch_geometric/datasets/graphland.py b/torch_geometric/datasets/graphland.py index edc7d1a20f36..5075afc46b1c 100644 --- a/torch_geometric/datasets/graphland.py +++ b/torch_geometric/datasets/graphland.py @@ -15,9 +15,26 @@ from torch_geometric.transforms import ToUndirected from torch_geometric.utils import subgraph +GRAPHLAND_DATASETS = { + 'hm-categories': 
'multiclass_classification', + 'pokec-regions': 'multiclass_classification', + 'web-topics': 'multiclass_classification', + 'tolokers-2': 'binary_classification', + 'city-reviews': 'binary_classification', + 'artnet-exp': 'binary_classification', + 'web-fraud': 'binary_classification', + 'hm-prices': 'regression', + 'avazu-ctr': 'regression', + 'city-roads-M': 'regression', + 'city-roads-L': 'regression', + 'twitch-views': 'regression', + 'artnet-views': 'regression', + 'web-traffic': 'regression', +} + def _load_yaml(path: str) -> dict: - import yaml + import yaml # type: ignore with open(path) as f: return yaml.safe_load(f) @@ -209,22 +226,7 @@ def __init__( pre_transform: Optional[Callable] = None, force_reload: bool = False, ) -> None: - assert name in [ - 'hm-categories', - 'pokec-regions', - 'web-topics', - 'tolokers-2', - 'city-reviews', - 'artnet-exp', - 'web-fraud', - 'hm-prices', - 'avazu-ctr', - 'city-roads-M', - 'city-roads-L', - 'twitch-views', - 'artnet-views', - 'web-traffic', - ], f'Unsupported dataset name: {name}' + assert name in GRAPHLAND_DATASETS, f'Unsupported dataset name: {name}' assert split in ['RL', 'RH', 'TH', 'THI'], \ f'Unsupported split name: {split}' @@ -268,6 +270,7 @@ def __init__( self.name = name self.split = split + self.task = GRAPHLAND_DATASETS[name] self._num_transform = numerical_features_transform self._frac_transform = fraction_features_transform self._cat_transform = categorical_features_transform From d5926bc5a773ba74ef428ae74d630c282ce2658e Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 10 Oct 2025 17:59:30 +0000 Subject: [PATCH 12/13] update --- examples/graphland.py | 117 ++++++++++++-------------- torch_geometric/datasets/graphland.py | 38 ++++----- 2 files changed, 73 insertions(+), 82 deletions(-) diff --git a/examples/graphland.py b/examples/graphland.py index fc02ae84d4d0..e006cc6054a9 100644 --- a/examples/graphland.py +++ b/examples/graphland.py @@ -8,39 +8,29 @@ from sklearn.metrics import 
accuracy_score, average_precision_score, r2_score from tqdm import tqdm -import torch_geometric.nn as pygnn -from torch_geometric.datasets import GraphLandDataset, graphland - -GRAPHLAND_DATASETS = [ - 'hm-categories', - 'pokec-regions', - 'web-topics', - 'tolokers-2', - 'city-reviews', - 'artnet-exp', - 'web-fraud', - 'hm-prices', - 'avazu-ctr', - 'city-roads-M', - 'city-roads-L', - 'twitch-views', - 'artnet-views', - 'web-traffic', -] +from torch_geometric.datasets import GraphLandDataset +from torch_geometric.nn import GCNConv class Model(torch.nn.Module): - def __init__(self, in_channels: int, hidden_channels: int, - out_channels: int): + def __init__( + self, + in_channels: int, + hidden_channels: int, + out_channels: int, + ) -> None: super().__init__() - self.conv = pygnn.GCNConv(in_channels, hidden_channels) + self.conv = GCNConv(in_channels, hidden_channels) self.head = nn.Sequential( nn.ReLU(), nn.Linear(hidden_channels, out_channels), ) - def forward(self, x: torch.Tensor, - edge_index: torch.Tensor) -> torch.Tensor: + def forward( + self, + x: torch.Tensor, + edge_index: torch.Tensor, + ) -> torch.Tensor: return self.head(self.conv(x, edge_index)) @@ -50,42 +40,30 @@ def _get_num_classes(dataset: GraphLandDataset) -> int: return len(torch.unique(targets[~torch.isnan(targets)])) -def _get_model(dataset: GraphLandDataset) -> nn.Module: - return Model( - in_channels=dataset[0].x.shape[1], - hidden_channels=256, - out_channels=(_get_num_classes(dataset) - if dataset.task != 'regression' else 1), - ) - - -def _get_optimizer(model: nn.Module) -> optim.Optimizer: - return optim.Adam(model.parameters(), lr=1e-3) - - -def _train_step(model: nn.Module, dataset: GraphLandDataset, - optimizer: optim.Optimizer) -> float: - def _compute_loss(outputs: torch.Tensor, - targets: torch.Tensor) -> torch.Tensor: - if dataset.task == 'regression': - return F.mse_loss(outputs, targets) - else: - return F.cross_entropy(outputs, targets.long()) - +def _train_step( + model: 
nn.Module, + dataset: GraphLandDataset, + optimizer: optim.Optimizer, +) -> torch.Tensor: data = dataset[0] mask = data.train_mask if dataset.split != 'THI' else data.mask - + optimizer.zero_grad() outputs = model(data.x, data.edge_index).squeeze() - loss = _compute_loss(outputs[mask], data.y[mask]) - optimizer.zero_grad() + if dataset.task == 'regression': + loss = F.mse_loss(outputs[mask], data.y[mask]) + else: + loss = F.cross_entropy(outputs[mask], data.y[mask].long()) + loss.backward() optimizer.step() - return loss.detach().cpu().item() + return loss -def _eval_step(model: nn.Module, - dataset: GraphLandDataset) -> dict[str, float]: +def _eval_step( + model: nn.Module, + dataset: GraphLandDataset, +) -> dict[str, float]: def _compute_metric(outputs: np.ndarray, targets: np.ndarray) -> float: if dataset.task == 'regression': return float(r2_score(targets, outputs)) @@ -121,25 +99,30 @@ def _format_metrics(metrics: dict[str, float]) -> str: def run_experiment(name: str, split: str) -> None: + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') n_steps = 100 dataset = GraphLandDataset( - root='./datasets', + root='./data', split=split, name=name, to_undirected=True, ) - model = _get_model(dataset) - model = model.cuda() - dataset = dataset.copy().cuda() - optimizer = _get_optimizer(model) + model = Model( + in_channels=dataset[0].x.shape[1], + hidden_channels=256, + out_channels=(_get_num_classes(dataset) + if dataset.task != 'regression' else 1), + ).to(device) + dataset = dataset.to(device) + optimizer = optim.Adam(model.parameters(), lr=1e-3) best_metrics = {part: -float('inf') for part in ['train', 'val', 'test']} pbar = tqdm(range(n_steps)) for _ in pbar: loss = _train_step(model, dataset, optimizer) curr_metrics = _eval_step(model, dataset) - description = f'loss={loss:.4f}, ' + _format_metrics(curr_metrics) - pbar.set_postfix_str(description) + pbar.set_postfix_str(f'loss={loss.detach().cpu().item():.4f}, ' + + 
_format_metrics(curr_metrics)) if curr_metrics['val'] > best_metrics['val']: best_metrics = curr_metrics @@ -149,9 +132,17 @@ def run_experiment(name: str, split: str) -> None: if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--name', choices=graphland.GRAPHLAND_DATASETS, - help='The name of dataset.', required=True) - parser.add_argument('--split', choices=['RL', 'RH', 'TH', 'THI'], - help='The type of data split.', required=True) + parser.add_argument( + '--name', + choices=list(GraphLandDataset.GRAPHLAND_DATASETS.keys()), + help='The name of dataset.', + required=True, + ) + parser.add_argument( + '--split', + choices=['RL', 'RH', 'TH', 'THI'], + help='The type of data split.', + required=True, + ) args = parser.parse_args() run_experiment(args.name, args.split) diff --git a/torch_geometric/datasets/graphland.py b/torch_geometric/datasets/graphland.py index 5075afc46b1c..b1ad2e3efbf3 100644 --- a/torch_geometric/datasets/graphland.py +++ b/torch_geometric/datasets/graphland.py @@ -15,23 +15,6 @@ from torch_geometric.transforms import ToUndirected from torch_geometric.utils import subgraph -GRAPHLAND_DATASETS = { - 'hm-categories': 'multiclass_classification', - 'pokec-regions': 'multiclass_classification', - 'web-topics': 'multiclass_classification', - 'tolokers-2': 'binary_classification', - 'city-reviews': 'binary_classification', - 'artnet-exp': 'binary_classification', - 'web-fraud': 'binary_classification', - 'hm-prices': 'regression', - 'avazu-ctr': 'regression', - 'city-roads-M': 'regression', - 'city-roads-L': 'regression', - 'twitch-views': 'regression', - 'artnet-views': 'regression', - 'web-traffic': 'regression', -} - def _load_yaml(path: str) -> dict: import yaml # type: ignore @@ -206,6 +189,22 @@ class GraphLandDataset(InMemoryDataset): - regression """ _url = 'https://zenodo.org/records/16895532' + GRAPHLAND_DATASETS = { + 'hm-categories': 'multiclass_classification', + 'pokec-regions': 
'multiclass_classification', + 'web-topics': 'multiclass_classification', + 'tolokers-2': 'binary_classification', + 'city-reviews': 'binary_classification', + 'artnet-exp': 'binary_classification', + 'web-fraud': 'binary_classification', + 'hm-prices': 'regression', + 'avazu-ctr': 'regression', + 'city-roads-M': 'regression', + 'city-roads-L': 'regression', + 'twitch-views': 'regression', + 'artnet-views': 'regression', + 'web-traffic': 'regression', + } def __init__( self, @@ -226,7 +225,8 @@ def __init__( pre_transform: Optional[Callable] = None, force_reload: bool = False, ) -> None: - assert name in GRAPHLAND_DATASETS, f'Unsupported dataset name: {name}' + assert name in self.GRAPHLAND_DATASETS, ( + f'Unsupported dataset name: {name}') assert split in ['RL', 'RH', 'TH', 'THI'], \ f'Unsupported split name: {split}' @@ -270,7 +270,7 @@ def __init__( self.name = name self.split = split - self.task = GRAPHLAND_DATASETS[name] + self.task = self.GRAPHLAND_DATASETS[name] self._num_transform = numerical_features_transform self._frac_transform = fraction_features_transform self._cat_transform = categorical_features_transform From 920ec01031aab84de61201763a5595f1d16f4720 Mon Sep 17 00:00:00 2001 From: Gleb Bazhenov Date: Mon, 13 Oct 2025 17:58:37 +0300 Subject: [PATCH 13/13] address review comments --- examples/graphland.py | 3 +-- torch_geometric/datasets/graphland.py | 39 +++++++++++++-------------- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/examples/graphland.py b/examples/graphland.py index e006cc6054a9..a22a8f6891bb 100644 --- a/examples/graphland.py +++ b/examples/graphland.py @@ -106,14 +106,13 @@ def run_experiment(name: str, split: str) -> None: split=split, name=name, to_undirected=True, - ) + ).to(device) model = Model( in_channels=dataset[0].x.shape[1], hidden_channels=256, out_channels=(_get_num_classes(dataset) if dataset.task != 'regression' else 1), ).to(device) - dataset = dataset.to(device) optimizer = 
optim.Adam(model.parameters(), lr=1e-3) best_metrics = {part: -float('inf') for part in ['train', 'val', 'test']} diff --git a/torch_geometric/datasets/graphland.py b/torch_geometric/datasets/graphland.py index b1ad2e3efbf3..869a4abc515c 100644 --- a/torch_geometric/datasets/graphland.py +++ b/torch_geometric/datasets/graphland.py @@ -420,7 +420,7 @@ def _get_transductive_data(self) -> list[Data]: transform = self._transforms[self._reg_transform]() transform.fit(targets[raw_data['masks']['train']]) targets = transform.transform(targets).reshape(-1) - targets = torch.tensor(targets, dtype=torch.float) + targets = torch.from_numpy(targets).float() # >>> process numerical features num_features = raw_data['num_features'] @@ -461,7 +461,7 @@ def _get_transductive_data(self) -> list[Data]: [num_features, frac_features, cat_features], axis=1, ) - features = torch.tensor(features, dtype=torch.float) + features = torch.from_numpy(features).float() num_mask = torch.zeros(features.shape[1], dtype=torch.bool) num_mask[:num_features.shape[1]] = True @@ -474,17 +474,17 @@ def _get_transductive_data(self) -> list[Data]: # >>> update split masks train_mask = raw_data['masks']['train'] & labeled_mask - train_mask = torch.tensor(train_mask, dtype=torch.bool) + train_mask = torch.from_numpy(train_mask).bool() val_mask = raw_data['masks']['val'] & labeled_mask - val_mask = torch.tensor(val_mask, dtype=torch.bool) + val_mask = torch.from_numpy(val_mask).bool() test_mask = raw_data['masks']['test'] & labeled_mask - test_mask = torch.tensor(test_mask, dtype=torch.bool) + test_mask = torch.from_numpy(test_mask).bool() # >>> make edge index edge_index = raw_data['edges'].T - edge_index = torch.tensor(edge_index, dtype=torch.long) + edge_index = torch.from_numpy(edge_index).long() # >>> construct Data object data = Data( @@ -513,7 +513,7 @@ def _get_inductive_data(self) -> list[Data]: transform = self._transforms[self._reg_transform]() transform.fit(targets[transform_mask]) targets = 
transform.transform(targets).reshape(-1) - targets = torch.tensor(targets, dtype=torch.float) + targets = torch.from_numpy(targets).float() # >>> process numerical features num_features = raw_data['num_features'] @@ -557,7 +557,7 @@ def _get_inductive_data(self) -> list[Data]: [num_features, frac_features, cat_features], axis=1, ) - features = torch.tensor(features, dtype=torch.float) + features = torch.from_numpy(features).float() num_mask = torch.zeros(features.shape[1], dtype=torch.bool) num_mask[:num_features.shape[1]] = True @@ -570,18 +570,17 @@ def _get_inductive_data(self) -> list[Data]: # >>> construct Data objects edge_index = raw_data['edges'].T - edge_index = torch.tensor(edge_index, dtype=torch.long) + edge_index = torch.from_numpy(edge_index).long() # --- train train_graph_mask = raw_data['masks']['train'] - train_graph_mask = torch.tensor(train_graph_mask, dtype=torch.bool) + train_graph_mask = torch.from_numpy(train_graph_mask).bool() train_label_mask = raw_data['masks']['train'] & labeled_mask - train_label_mask = torch.tensor(train_label_mask, dtype=torch.bool) + train_label_mask = torch.from_numpy(train_label_mask).bool() train_node_id = np.where(train_graph_mask)[0] - train_node_id = torch.tensor(train_node_id, - dtype=torch.long) # type: ignore + train_node_id = torch.from_numpy(train_node_id).long() # type: ignore train_edge_index, _ = subgraph( train_graph_mask, @@ -602,14 +601,13 @@ def _get_inductive_data(self) -> list[Data]: # --- val val_graph_mask = (raw_data['masks']['train'] | raw_data['masks']['val']) - val_graph_mask = torch.tensor(val_graph_mask, dtype=torch.bool) + val_graph_mask = torch.from_numpy(val_graph_mask).bool() val_label_mask = raw_data['masks']['val'] & labeled_mask - val_label_mask = torch.tensor(val_label_mask, dtype=torch.bool) + val_label_mask = torch.from_numpy(val_label_mask).bool() val_node_id = np.where(val_graph_mask)[0] - val_node_id = torch.tensor(val_node_id, - dtype=torch.long) # type: ignore + val_node_id = 
torch.from_numpy(val_node_id).long() # type: ignore val_edge_index, _ = subgraph( val_graph_mask, @@ -631,14 +629,13 @@ def _get_inductive_data(self) -> list[Data]: test_graph_mask = (raw_data['masks']['train'] | raw_data['masks']['val'] | raw_data['masks']['test']) - test_graph_mask = torch.tensor(test_graph_mask, dtype=torch.bool) + test_graph_mask = torch.from_numpy(test_graph_mask).bool() test_label_mask = raw_data['masks']['test'] & labeled_mask - test_label_mask = torch.tensor(test_label_mask, dtype=torch.bool) + test_label_mask = torch.from_numpy(test_label_mask).bool() test_node_id = np.where(test_graph_mask)[0] - test_node_id = torch.tensor(test_node_id, - dtype=torch.long) # type: ignore + test_node_id = torch.from_numpy(test_node_id).long() # type: ignore test_edge_index, _ = subgraph( test_graph_mask,