From 08ae309cc0bbfe1105e12cc09de7bd32f14a876e Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 30 Jun 2021 12:04:25 -0400 Subject: [PATCH 01/21] Add n option to get_studies_by_coordinate --- nimare/dataset.py | 42 ++++++++++++++++++++++++++++-------- nimare/tests/test_dataset.py | 11 +++++++++- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/nimare/dataset.py b/nimare/dataset.py index dc7d20f5e..f9d61fb85 100755 --- a/nimare/dataset.py +++ b/nimare/dataset.py @@ -548,7 +548,8 @@ def get_studies_by_label(self, labels=None, label_threshold=0.001): the labels above the threshold, it will be returned. Default is None. label_threshold : :obj:`float`, optional - Default is 0.5. + Default is 0.001, which corresponds to terms appearing at least once in abstracts + for the commonly-used Neurosynth TF-IDF feature set. Returns ------- @@ -600,27 +601,50 @@ def get_studies_by_mask(self, mask): found_ids = list(self.coordinates.loc[distances, "id"].unique()) return found_ids - def get_studies_by_coordinate(self, xyz, r=20): + def get_studies_by_coordinate(self, xyz, r=None, n=None): """Extract list of studies with at least one focus within radius of requested coordinates. + .. versionchanged:: 0.0.9 + + New option (n) supported to identify closest n studies to coordinate. + Default value for r changed to None. + Parameters ---------- xyz : (X x 3) array_like List of coordinates against which to find studies. r : :obj:`float`, optional - Radius (in mm) within which to find studies. Default is 20mm. + Radius (in mm) within which to find studies. Mutually exclusive with ``n``. + Default is None. + n : :obj:`int`, optional + Number of closest studies to identify. Mutually exclusive with ``r``. + Default is None. Returns ------- found_ids : :obj:`list` - A list of IDs from the Dataset with at least one focus within - radius r of requested coordinates. + A list of IDs from the Dataset matching the search criterion. 
""" + + if n and r: + raise ValueError("Only one of 'r' and 'n' may be provided.") + elif not n and not r: + raise ValueError("Either 'r' or 'n' must be provided.") + from scipy.spatial.distance import cdist - xyz = np.array(xyz) + temp_coords = self.coordinates.copy() + + xyz = np.atleast_2d(xyz) assert xyz.shape[1] == 3 and xyz.ndim == 2 - distances = cdist(xyz, self.coordinates[["x", "y", "z"]].values) - distances = np.any(distances <= r, axis=0) - found_ids = list(self.coordinates.loc[distances, "id"].unique()) + distances = cdist(xyz, temp_coords[["x", "y", "z"]].values) + distances = np.min(distances, axis=0) + temp_coords["distance"] = distances + min_dist_ser = temp_coords.groupby("id")["distance"].min() + + if r: + found_ids = min_dist_ser.loc[min_dist_ser <= r].index.tolist() + elif n: + found_ids = min_dist_ser.nsmallest(n).index.tolist() + return found_ids diff --git a/nimare/tests/test_dataset.py b/nimare/tests/test_dataset.py index c41073f93..8195916f5 100644 --- a/nimare/tests/test_dataset.py +++ b/nimare/tests/test_dataset.py @@ -3,6 +3,7 @@ import nibabel as nib import numpy as np +import pytest import nimare from nimare import dataset @@ -23,7 +24,15 @@ def test_dataset_smoke(): assert isinstance(dset.get_images(imtype="beta"), list) assert isinstance(dset.get_metadata(field="sample_sizes"), list) assert isinstance(dset.get_studies_by_label("cogat_cognitive_control"), list) - assert isinstance(dset.get_studies_by_coordinate(np.array([[20, 20, 20]])), list) + assert isinstance(dset.get_studies_by_coordinate([20, 20, 20], r=20), list) + assert len(dset.get_studies_by_coordinate([20, 20, 20], n=2)) == 2 + assert len(dset.get_studies_by_coordinate([20, 20, 20], n=3)) == 3 + with pytest.raises(ValueError): + dset.get_studies_by_coordinate([20, 20, 20]) + + with pytest.raises(ValueError): + dset.get_studies_by_coordinate([20, 20, 20], r=20, n=4) + mask_data = np.zeros(dset.masker.mask_img.shape, int) mask_data[40, 40, 40] = 1 mask_img = nib.Nifti1Image(mask_data, dset.masker.mask_img.affine) From 88d5c05f4f9436f2fe8f4b6e079084adfc05af7b Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 30 Jun 2021 12:44:48 -0400 Subject: [PATCH 02/21] Mock up maybe passable CoordCBP class. --- nimare/parcellate.py | 134 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 nimare/parcellate.py diff --git a/nimare/parcellate.py b/nimare/parcellate.py new file mode 100644 index 000000000..3eb553f87 --- /dev/null +++ b/nimare/parcellate.py @@ -0,0 +1,134 @@ +"""Parcellation tools.""" +import inspect +import logging + +import numpy as np +from sklearn.cluster import KMeans + +from .base import NiMAREBase +from .meta.base import CBMAEstimator +from .results import MetaResult +from .utils import add_metadata_to_dataframe, check_type, listify, use_memmap, vox2mm + +LGR = logging.getLogger(__name__) + + +class CoordCBP(NiMAREBase): + """Base class for parcellators in :mod:`nimare.parcellate`. + + .. 
versionadded:: 0.0.10 + + """ + + _required_inputs = {"coordinates": ("coordinates", None)} + + def __init__( + self, + target_mask, + meta_estimator, + target_image, + n_clusters, + r=None, + n=None, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + # Get kernel transformer + meta_estimator = check_type(meta_estimator, CBMAEstimator) + self.meta_estimator = meta_estimator + self.target_image = target_image + self.n_clusters = listify(n_clusters) + + if r and n: + raise ValueError("Only one of 'r' and 'n' may be provided.") + elif not r and not n: + raise ValueError("Either 'r' or 'n' must be provided.") + self.r = r + self.n = n + + def _preprocess_input(self, dataset): + """Mask required input images using either the dataset's mask or the estimator's. + + Also, insert required metadata into coordinates DataFrame. + """ + super()._preprocess_input(dataset) + + # All extra (non-ijk) parameters for a kernel should be overrideable as + # parameters to __init__, so we can access them with get_params() + kt_args = list(self.kernel_transformer.get_params().keys()) + + # Integrate "sample_size" from metadata into DataFrame so that + # kernel_transformer can access it. + if "sample_size" in kt_args: + self.inputs_["coordinates"] = add_metadata_to_dataframe( + dataset, + self.inputs_["coordinates"], + metadata_field="sample_sizes", + target_column="sample_size", + filter_func=np.mean, + ) + + @use_memmap(LGR, n_files=1) + def _fit(self, dataset): + """ + Perform coordinate-based meta-analysis on dataset. + + Parameters + ---------- + dataset : :obj:`nimare.dataset.Dataset` + Dataset to analyze. + """ + self.dataset = dataset + self.masker = self.masker or dataset.masker + self.null_distributions_ = {} + + # Loop through voxels in target_mask, selecting studies for each and running MACMs (no MCC) + target_ijk = np.vstack(np.where(self.target_mask.get_fdata())) + target_xyz = vox2mm(target_ijk, self.masker.mask_img.affine) + for i_coord in target_xyz.shape[1]: + xyz = target_xyz[:, i_coord] + macm_ids = dataset.get_studies_by_coordinate(xyz, r=self.r, n=self.n) + coord_dset = dataset.slice(macm_ids) + + # This seems like a somewhat inelegant solution + # Check if the meta method is a pairwise estimator + if "dataset2" in inspect.getfullargspec(self.meta_estimator.fit).args: + unselected_ids = sorted(list(set(dataset.ids) - set(macm_ids))) + unselected_dset = dataset.slice(unselected_ids) + self.meta_estimator.fit(coord_dset, unselected_dset) + else: + self.meta_estimator.fit(coord_dset) + macm_data = self.meta_estimator.results.get_map(self.target_image, return_type="array") + if i_coord == 0: + data = np.zeros((target_xyz.shape[1], len(macm_data))) + data[i_coord, :] = macm_data + + # Correlate voxel-wise MACM results + data = np.corrcoef(data) + + # Convert voxel-wise correlation matrix to distances + data = 1 - data + + # Perform clustering + labels = np.zeros((len(self.n_clusters), len(macm_data)), dtype=int) + for i_cluster, cluster_count in enumerate(self.n_clusters): + kmeans = KMeans(n_clusters=cluster_count, random_state=0).fit(data) + labels[i_cluster, :] = kmeans.labels_ + + images = {"labels": labels} + return images + + def fit(self, dataset, drop_invalid=True): + self._validate_input(dataset, drop_invalid=drop_invalid) + self._preprocess_input(dataset) + maps = self._fit(dataset) + + if hasattr(self, "masker") and self.masker is not None: + masker = self.masker + else: + masker = dataset.masker + + self.results = MetaResult(self, masker, maps) + return self.results From 
3ce90f1dfb9c7ed4a4dd5434808476390df049f8 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 30 Jun 2021 16:18:50 -0400 Subject: [PATCH 03/21] Fix docstrings. --- nimare/parcellate.py | 52 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index 3eb553f87..88a9049db 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -7,6 +7,7 @@ from .base import NiMAREBase from .meta.base import CBMAEstimator +from .meta.cbma.ale import ALE from .results import MetaResult from .utils import add_metadata_to_dataframe, check_type, listify, use_memmap, vox2mm @@ -14,10 +15,29 @@ class CoordCBP(NiMAREBase): - """Base class for parcellators in :mod:`nimare.parcellate`. + """Perform coordinate-based coactivation-based parcellation. .. versionadded:: 0.0.10 + Parameters + ---------- + target_mask : :obj:`nibabel.Nifti1.Nifti1Image` + Mask of target of parcellation. + Currently must be in same space/resolution as Dataset mask. + n_clusters : :obj:`list` of :obj:`int` + Number of clusters to evaluate in clustering. + r : :obj:`float` or None, optional + Radius (in mm) within which to find studies. Mutually exclusive with ``n``. + Default is None. + n : :obj:`int` or None, optional + Number of closest studies to identify. Mutually exclusive with ``r``. + Default is None. + meta_estimator : :obj:`nimare.meta.base.CBMAEstimator`, optional + CBMA Estimator with which to run the MACMs. + Default is :obj:`nimare.meta.cbma.ale.ALE`. + target_image : :obj:`str`, optional + Name of meta-analysis results image to use for clustering. + Default is "ale", which is specific to the ALE estimator. """ _required_inputs = {"coordinates": ("coordinates", None)} @@ -25,26 +45,29 @@ class CoordCBP(NiMAREBase): def __init__( self, target_mask, - meta_estimator, - target_image, n_clusters, r=None, n=None, + meta_estimator=None, + target_image="ale", *args, **kwargs, ): super().__init__(*args, **kwargs) - # Get kernel transformer - meta_estimator = check_type(meta_estimator, CBMAEstimator) - self.meta_estimator = meta_estimator - self.target_image = target_image - self.n_clusters = listify(n_clusters) + if meta_estimator is None: + meta_estimator = ALE() + else: + meta_estimator = check_type(meta_estimator, CBMAEstimator) if r and n: raise ValueError("Only one of 'r' and 'n' may be provided.") elif not r and not n: raise ValueError("Either 'r' or 'n' must be provided.") + + self.meta_estimator = meta_estimator + self.target_image = target_image + self.n_clusters = listify(n_clusters) self.r = r self.n = n @@ -72,8 +95,7 @@ def _preprocess_input(self, dataset): @use_memmap(LGR, n_files=1) def _fit(self, dataset): - """ - Perform coordinate-based meta-analysis on dataset. + """Perform coordinate-based coactivation-based parcellation on dataset. Parameters ---------- @@ -121,6 +143,16 @@ def _fit(self, dataset): return images def fit(self, dataset, drop_invalid=True): + """Perform coordinate-based coactivation-based parcellation on dataset. + + Parameters + ---------- + dataset : :obj:`nimare.dataset.Dataset` + Dataset to analyze. + drop_invalid : :obj:`bool`, optional + Whether to automatically ignore any studies without the required data or not. + Default is True. 
+ """ self._validate_input(dataset, drop_invalid=drop_invalid) self._preprocess_input(dataset) maps = self._fit(dataset) From 40f4585f4d0a0d6a44bbb101ab0f2022f79c5517 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 1 Jul 2021 10:58:36 -0400 Subject: [PATCH 04/21] Don't calculate distances. --- nimare/parcellate.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index 88a9049db..02ad4291f 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -122,16 +122,12 @@ def _fit(self, dataset): self.meta_estimator.fit(coord_dset, unselected_dset) else: self.meta_estimator.fit(coord_dset) + macm_data = self.meta_estimator.results.get_map(self.target_image, return_type="array") if i_coord == 0: data = np.zeros((target_xyz.shape[1], len(macm_data))) - data[i_coord, :] = macm_data - # Correlate voxel-wise MACM results - data = np.corrcoef(data) - - # Convert voxel-wise correlation matrix to distances - data = 1 - data + data[i_coord, :] = macm_data # Perform clustering labels = np.zeros((len(self.n_clusters), len(macm_data)), dtype=int) From 530048f66eb09c6f6f5cbae56774324a5d8de57d Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 1 Jul 2021 10:59:55 -0400 Subject: [PATCH 05/21] Remove extra newline. --- nimare/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nimare/dataset.py b/nimare/dataset.py index f9d61fb85..704cc0ec1 100755 --- a/nimare/dataset.py +++ b/nimare/dataset.py @@ -625,7 +625,6 @@ def get_studies_by_coordinate(self, xyz, r=None, n=None): found_ids : :obj:`list` A list of IDs from the Dataset matching the search criterion. """ - if n and r: raise ValueError("Only one of 'r' and 'n' may be provided.") elif not n and not r: From 838270439e1b546f32d1a58f8c15a74ba41efeb7 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 1 Jul 2021 12:35:23 -0400 Subject: [PATCH 06/21] Clustering should be fairly good. Still need to implement metrics. --- nimare/parcellate.py | 113 +++++++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 35 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index 02ad4291f..13425e4cb 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -1,11 +1,15 @@ """Parcellation tools.""" +import datetime import inspect import logging +import os +from tempfile import mkstemp import numpy as np from sklearn.cluster import KMeans from .base import NiMAREBase +from .extract.utils import _get_dataset_dir from .meta.base import CBMAEstimator from .meta.cbma.ale import ALE from .results import MetaResult @@ -26,12 +30,18 @@ class CoordCBP(NiMAREBase): Currently must be in same space/resolution as Dataset mask. n_clusters : :obj:`list` of :obj:`int` Number of clusters to evaluate in clustering. - r : :obj:`float` or None, optional - Radius (in mm) within which to find studies. Mutually exclusive with ``n``. - Default is None. - n : :obj:`int` or None, optional - Number of closest studies to identify. Mutually exclusive with ``r``. - Default is None. + Metrics will be calculated for each cluster-count in the list, to allow users to select + the optimal cluster solution. + r : :obj:`float` or :obj:`list` of :obj:`float` or None, optional + Radius (in mm) within which to find studies. + If a list of values is provided, then MACMs and clustering will be performed across all + values, and a selection procedure will be performed to identify the optimal ``r``. + Mutually exclusive with ``n``. Default is None. 
+ n : :obj:`int` or :obj:`list` of :obj:`int` or None, optional + Number of closest studies to identify. + If a list of values is provided, then MACMs and clustering will be performed across all + values, and a selection procedure will be performed to identify the optimal ``n``. + Mutually exclusive with ``r``. Default is None. meta_estimator : :obj:`nimare.meta.base.CBMAEstimator`, optional CBMA Estimator with which to run the MACMs. Default is :obj:`nimare.meta.cbma.ale.ALE`. @@ -68,8 +78,10 @@ def __init__( self.meta_estimator = meta_estimator self.target_image = target_image self.n_clusters = listify(n_clusters) - self.r = r - self.n = n + self.filter_selection = isinstance(r, list) or isinstance(n, list) + self.filter_type = "r" if r else "n" + self.r = listify(r) + self.n = listify(n) def _preprocess_input(self, dataset): """Mask required input images using either the dataset's mask or the estimator's. @@ -80,7 +92,7 @@ def _preprocess_input(self, dataset): # All extra (non-ijk) parameters for a kernel should be overrideable as # parameters to __init__, so we can access them with get_params() - kt_args = list(self.kernel_transformer.get_params().keys()) + kt_args = list(self.meta_estimator.kernel_transformer.get_params().keys()) # Integrate "sample_size" from metadata into DataFrame so that # kernel_transformer can access it. @@ -104,36 +116,67 @@ def _fit(self, dataset): """ self.dataset = dataset self.masker = self.masker or dataset.masker - self.null_distributions_ = {} # Loop through voxels in target_mask, selecting studies for each and running MACMs (no MCC) target_ijk = np.vstack(np.where(self.target_mask.get_fdata())) target_xyz = vox2mm(target_ijk, self.masker.mask_img.affine) - for i_coord in target_xyz.shape[1]: - xyz = target_xyz[:, i_coord] - macm_ids = dataset.get_studies_by_coordinate(xyz, r=self.r, n=self.n) - coord_dset = dataset.slice(macm_ids) - - # This seems like a somewhat inelegant solution - # Check if the meta method is a pairwise estimator - if "dataset2" in inspect.getfullargspec(self.meta_estimator.fit).args: - unselected_ids = sorted(list(set(dataset.ids) - set(macm_ids))) - unselected_dset = dataset.slice(unselected_ids) - self.meta_estimator.fit(coord_dset, unselected_dset) - else: - self.meta_estimator.fit(coord_dset) - - macm_data = self.meta_estimator.results.get_map(self.target_image, return_type="array") - if i_coord == 0: - data = np.zeros((target_xyz.shape[1], len(macm_data))) - - data[i_coord, :] = macm_data - - # Perform clustering - labels = np.zeros((len(self.n_clusters), len(macm_data)), dtype=int) - for i_cluster, cluster_count in enumerate(self.n_clusters): - kmeans = KMeans(n_clusters=cluster_count, random_state=0).fit(data) - labels[i_cluster, :] = kmeans.labels_ + n_target_voxels = target_xyz.shape[1] + n_mask_voxels = self.masker.transform(self.masker.mask_img).shape[0] + + n_filters = len(getattr(self, self.filter_type)) + labels = np.zeros((n_filters, len(self.n_clusters), n_target_voxels), dtype=int) + kwargs = {"r": None, "n": None} + + # Use a memmapped 2D array + start_time = datetime.datetime.now().strftime("%Y%m%dT%H%M%S") + dataset_dir = _get_dataset_dir("temporary_files", data_dir=None) + _, memmap_filename = mkstemp( + prefix=self.__class__.__name__, + suffix=start_time, + dir=dataset_dir, + ) + data = np.memmap( + memmap_filename, + dtype=float, + mode="w+", + shape=(n_target_voxels, n_mask_voxels), + ) + + for i_filter in range(n_filters): + kwargs[self.filter_type] = getattr(self, self.filter_type)[i_filter] + for j_coord 
in n_target_voxels: + xyz = target_xyz[:, j_coord] + macm_ids = dataset.get_studies_by_coordinate(xyz, **kwargs) + coord_dset = dataset.slice(macm_ids) + + # This seems like a somewhat inelegant solution + # Check if the meta method is a pairwise estimator + if "dataset2" in inspect.getfullargspec(self.meta_estimator.fit).args: + unselected_ids = sorted(list(set(dataset.ids) - set(macm_ids))) + unselected_dset = dataset.slice(unselected_ids) + self.meta_estimator.fit(coord_dset, unselected_dset) + else: + self.meta_estimator.fit(coord_dset) + + data[j_coord, :] = self.meta_estimator.results.get_map( + self.target_image, + return_type="array", + ) # data is overwritten across filters + + # Perform clustering + for j_cluster, cluster_count in enumerate(self.n_clusters): + kmeans = KMeans( + n_clusters=cluster_count, + init="k-means++", + n_init=10, + random_state=0, + algorithm="elkan", + ).fit(data) + labels[i_filter, j_cluster, :] = kmeans.labels_ + + # Clean up MACM data memmap + LGR.info(f"Removing temporary file: {memmap_filename}") + os.remove(memmap_filename) images = {"labels": labels} return images From c79b251e83b54a22327d7a63ab1f916a456e2f10 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 1 Jul 2021 12:54:03 -0400 Subject: [PATCH 07/21] Add empty methods for the different metrics. --- nimare/parcellate.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index 13425e4cb..091c04c78 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -181,6 +181,24 @@ def _fit(self, dataset): images = {"labels": labels} return images + def _filter_selection(self): + pass + + def _silhouette(self): + pass + + def _voxel_misclassification(self): + pass + + def _variation_of_information(self): + pass + + def _nondominant_voxel_percentage(self): + pass + + def _cluster_distance_ratio(self): + pass + def fit(self, dataset, drop_invalid=True): """Perform coordinate-based coactivation-based parcellation on dataset. From 3e2d580fac55859c3ef55ae8fef15db436a9a8cb Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 1 Jul 2021 13:41:56 -0400 Subject: [PATCH 08/21] Work on filter-selection metric. Far from done. --- nimare/parcellate.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index 091c04c78..fe13ad9b4 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -181,8 +181,21 @@ def _fit(self, dataset): images = {"labels": labels} return images - def _filter_selection(self): - pass + def _filter_selection(self, labels): + from scipy import stats + n_filters, n_clusters, n_voxels = labels.shape + deviant_proportions = np.zeros((n_filters, n_clusters)) + for i_cluster in range(n_clusters): + cluster_labels = labels[:, i_cluster, :] + cluster_labels_mode = stats.mode(cluster_labels, axis=0)[0] + is_deviant = cluster_labels != cluster_labels_mode + deviant_proportion = is_deviant.mean(axis=1) + assert deviant_proportion.size == n_filters + deviant_proportions[:, i_cluster] = deviant_proportion + # Z-score within each cluster solution + deviant_z = stats.zscore(deviant_proportions, axis=0) + filter_deviant_z = deviant_z.sum(axis=1) + min_deviants_filter = np.where(filter_deviant_z == np.min(filter_deviant_z))[0] def _silhouette(self): pass From 3365d5c4a998f4a0ef85c8412a9b5e5652ba9b6c Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 1 Jul 2021 15:22:57 -0400 Subject: [PATCH 09/21] Add a couple of metrics. 
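Two metrics land here: the average silhouette and variation of information. The VI function carries its own doctest; for the silhouette, here is a minimal standalone sketch of how the metric behaves (the toy data and variable names are illustrative only, not NiMARE code; note that sklearn's silhouette_score already returns the mean over samples, which matches the "averaged across voxels" criterion):

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    rng = np.random.default_rng(0)
    # Toy stand-in for the (n_target_voxels, n_mask_voxels) MACM data matrix
    data = np.vstack([rng.normal(0, 1, (50, 10)), rng.normal(5, 1, (50, 10))])

    kmeans = KMeans(n_clusters=2, init="k-means++", n_init=10, random_state=0).fit(data)
    # silhouette_score averages the per-voxel silhouette values internally
    print(silhouette_score(data, kmeans.labels_, metric="euclidean"))  # close to 1 for well-separated blobs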
--- nimare/parcellate.py | 50 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index fe13ad9b4..41f844237 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -48,6 +48,12 @@ class CoordCBP(NiMAREBase): target_image : :obj:`str`, optional Name of meta-analysis results image to use for clustering. Default is "ale", which is specific to the ALE estimator. + + Notes + ----- + This approach deviates in a number of respects from the method described in + Chase et al. (2020), including the fact that the scikit-learn implementation of the K-means + clustering algorithm only supports Euclidean distance, so correlation distance is not used. """ _required_inputs = {"coordinates": ("coordinates", None)} @@ -59,7 +65,7 @@ def __init__( r=None, n=None, meta_estimator=None, - target_image="ale", + target_image="stat", *args, **kwargs, ): @@ -125,6 +131,7 @@ def _fit(self, dataset): n_filters = len(getattr(self, self.filter_type)) labels = np.zeros((n_filters, len(self.n_clusters), n_target_voxels), dtype=int) + silhouettes = np.zeros((n_filters, len(self.n_clusters)), dtype=float) kwargs = {"r": None, "n": None} # Use a memmapped 2D array @@ -173,6 +180,7 @@ def _fit(self, dataset): algorithm="elkan", ).fit(data) labels[i_filter, j_cluster, :] = kmeans.labels_ + silhouettes[i_filter, j_cluster] = self._silhouette(data, kmeans.labels_) # Clean up MACM data memmap LGR.info(f"Removing temporary file: {memmap_filename}") @@ -197,14 +205,46 @@ def _filter_selection(self, labels): filter_deviant_z = deviant_z.sum(axis=1) min_deviants_filter = np.where(filter_deviant_z == np.min(filter_deviant_z))[0] - def _silhouette(self): - pass + def _silhouette(self, data, labels): + """Calculate silhouette score.""" + from sklearn.metrics import silhouette_score + silhouette = silhouette_score(data, labels, metric="euclidean", random_state=None) + return silhouette def _voxel_misclassification(self): pass - def _variation_of_information(self): - pass + def _variation_of_information(self, X, Y): + """Calculate variation of information metric. + + Parameters + ---------- + X, Y : :obj:`list` of :obj:`list` + Partitions to compare + + Notes + ----- + From https://gist.github.com/jwcarr/626cbc80e0006b526688 + + Examples + -------- + >>> X = [[1, 2, 3], [4, 5, 6, 7]] + >>> Y = [[1, 2, 3, 4], [5, 6, 7]] + >>> variation_of_information(X, Y) + 0.9271749993818661 + """ + from math import log + + n = float(sum([len(x) for x in X])) + sigma = 0.0 + for x in X: + p = len(x) / n + for y in Y: + q = len(y) / n + r = len(set(x) & set(y)) / n + if r > 0.0: + sigma += r * (log(r / p, 2) + log(r / q, 2)) + return abs(sigma) def _nondominant_voxel_percentage(self): pass From 708160433750e9d73fc27deb95424dad65005c42 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Fri, 2 Jul 2021 11:19:54 -0400 Subject: [PATCH 10/21] Improve metric documentation. --- nimare/parcellate.py | 68 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 6 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index 41f844237..8b86652b6 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -190,6 +190,13 @@ def _fit(self, dataset): return images def _filter_selection(self, labels): + """Select a range of optimal filter values based on consistency of cluster assignment. 
+ + Parameters + ---------- + labels : :obj:`numpy.ndarray` of shape (n_filters, nclusters, n_voxels) + Labeling results from a range of KMeans clustering runs. + """ from scipy import stats n_filters, n_clusters, n_voxels = labels.shape deviant_proportions = np.zeros((n_filters, n_clusters)) @@ -205,13 +212,23 @@ def _filter_selection(self, labels): filter_deviant_z = deviant_z.sum(axis=1) min_deviants_filter = np.where(filter_deviant_z == np.min(filter_deviant_z))[0] - def _silhouette(self, data, labels): - """Calculate silhouette score.""" - from sklearn.metrics import silhouette_score - silhouette = silhouette_score(data, labels, metric="euclidean", random_state=None) - return silhouette + # This is not the end + return min_deviants_filter def _voxel_misclassification(self): + """Calculate voxel misclassification metric. + + Notes + ----- + From Chase et al. (2020): + First, misclassified voxels (deviants) were examined as a topological criterion, + with optimal K parcellations being those in which the percentage of deviants was not + significantly increased compared to the K − 1 solution and but where the K + 1 was + associated with significantly increased deviants. + + TS: Deviants are presumably calculated only from the range of filters selected in the + filter selection step. + """ pass def _variation_of_information(self, X, Y): @@ -224,7 +241,12 @@ def _variation_of_information(self, X, Y): Notes ----- - From https://gist.github.com/jwcarr/626cbc80e0006b526688 + Implementation adapted from https://gist.github.com/jwcarr/626cbc80e0006b526688. + + From Chase et al. (2020): + Second, the variation of information (VI) metric was employed as an information-theoretic + criterion to assess the similarity of cluster assignments for each filter size between + the current solution and the neighboring (K − 1 and K + 1) solutions. Examples -------- @@ -246,10 +268,44 @@ def _variation_of_information(self, X, Y): sigma += r * (log(r / p, 2) + log(r / q, 2)) return abs(sigma) + def _average_silhouette(self, data, labels): + """Calculate average silhouette score. + + Notes + ----- + From Chase et al. (2020): + Third, the silhouette value averaged across voxels for each filter size was considered a + cluster separation criterion. + """ + from sklearn.metrics import silhouette_score + silhouette = silhouette_score(data, labels, metric="euclidean", random_state=None) + # Average across voxels + return silhouette + def _nondominant_voxel_percentage(self): + """Calculate percentage-of-voxels-not-with-parent metric. + + Notes + ----- + From Chase et al. (2020): + Fourth, we assessed the percentage of voxels not related to the dominant parent cluster + compared with the K − 1 solution as a second topological criterion. + This measure corresponds to the percentage voxels that are not present in hierarchy, K, + compared with the previous K − 1 solution, and is related to the hierarchy index. + """ pass def _cluster_distance_ratio(self): + """Calculate change-in-inter/intra-cluster-distance metric. + + Notes + ----- + From Chase et al. (2020): + Finally, the change in inter- versus intra-cluster distance ratio was computed as a second + cluster separation criterion. + This ratio is the first derivative of the ratio between the average distance of a given + voxel to its own cluster center and the average distance between the cluster centers. 
+ """ pass def fit(self, dataset, drop_invalid=True): From 12cddec571d62d97baaf54cc2ba82e66ede70223 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Fri, 2 Jul 2021 11:54:56 -0400 Subject: [PATCH 11/21] VI should be fairly good now. --- nimare/parcellate.py | 77 ++++++++++++++++++++++++++++++-------------- nimare/utils.py | 33 +++++++++++++++++++ 2 files changed, 86 insertions(+), 24 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index 8b86652b6..39ddfcff8 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -198,6 +198,7 @@ def _filter_selection(self, labels): Labeling results from a range of KMeans clustering runs. """ from scipy import stats + n_filters, n_clusters, n_voxels = labels.shape deviant_proportions = np.zeros((n_filters, n_clusters)) for i_cluster in range(n_clusters): @@ -231,42 +232,69 @@ def _voxel_misclassification(self): """ pass - def _variation_of_information(self, X, Y): + def _variation_of_information(self, labels, cluster_counts): """Calculate variation of information metric. Parameters ---------- - X, Y : :obj:`list` of :obj:`list` - Partitions to compare + labels : :obj:`numpy.ndarray` of shape (n_filters, n_clusters, n_voxels) + cluster_counts : :obj:`list` of :obj:`int` + The set of K values tested. Must be n_clusters items long. + + Returns + ------- + vi_values : :obj:`numpy.ndarray` of shape (n_filters, n_clusters, 2) + Variation of information values for each filter and each cluster, where the last + dimension has two values: the VI value for K vs. K - 1 and the value for K vs. K + 1. + K values with no K - 1 or K + 1 have a NaN in the associated cell. Notes ----- - Implementation adapted from https://gist.github.com/jwcarr/626cbc80e0006b526688. - From Chase et al. (2020): Second, the variation of information (VI) metric was employed as an information-theoretic criterion to assess the similarity of cluster assignments for each filter size between the current solution and the neighboring (K − 1 and K + 1) solutions. 
- - Examples - -------- - >>> X = [[1, 2, 3], [4, 5, 6, 7]] - >>> Y = [[1, 2, 3, 4], [5, 6, 7]] - >>> variation_of_information(X, Y) - 0.9271749993818661 """ - from math import log - - n = float(sum([len(x) for x in X])) - sigma = 0.0 - for x in X: - p = len(x) / n - for y in Y: - q = len(y) / n - r = len(set(x) & set(y)) / n - if r > 0.0: - sigma += r * (log(r / p, 2) + log(r / q, 2)) - return abs(sigma) + from .utils import variation_of_information + + n_filters, n_clusters, _ = labels.shape + assert len(cluster_counts) == n_clusters + + vi_values = np.empty((n_filters, n_clusters, 2)) + for i_filter in range(n_filters): + filter_labels = labels[i_filter, :, :] + for j_cluster, cluster_count in enumerate(cluster_counts): + cluster_partition = [ + np.where(filter_labels[j_cluster, :] == k_cluster_num)[0] + for k_cluster_num in range(cluster_count) + ] + if j_cluster > 0: + cluster_m1_partition = [ + np.where(filter_labels[j_cluster - 1, :] == k_cluster_num)[0] + for k_cluster_num in range(cluster_counts[j_cluster - 1]) + ] + # Calculate VI between K and K - 1 + vi_values[i_filter, j_cluster, 0] = variation_of_information( + cluster_partition, + cluster_m1_partition, + ) + else: + vi_values[i_filter, j_cluster, 0] = np.nan + + if j_cluster < (labels.shape[1] - 1): + cluster_p1_partition = [ + np.where(filter_labels[j_cluster + 1, :] == k_cluster_num)[0] + for k_cluster_num in range(cluster_counts[j_cluster + 1]) + ] + # Calculate VI between K and K + 1 + vi_values[i_filter, j_cluster, 1] = variation_of_information( + cluster_partition, + cluster_p1_partition, + ) + else: + vi_values[i_filter, j_cluster, 1] = np.nan + + return vi_values def _average_silhouette(self, data, labels): """Calculate average silhouette score. @@ -278,6 +306,7 @@ def _average_silhouette(self, data, labels): cluster separation criterion. """ from sklearn.metrics import silhouette_score + silhouette = silhouette_score(data, labels, metric="euclidean", random_state=None) # Average across voxels return silhouette diff --git a/nimare/utils.py b/nimare/utils.py index 6d4f4a6f1..7592f9e63 100755 --- a/nimare/utils.py +++ b/nimare/utils.py @@ -789,3 +789,36 @@ def mni2tal(coords): if use_dim == 1: out_coords = out_coords.transpose() return out_coords + + +def variation_of_information(X, Y): + """Calculate variation of information metric. + + Parameters + ---------- + X, Y : :obj:`list` of :obj:`list` + Partitions to compare + + Notes + ----- + Implementation adapted from https://gist.github.com/jwcarr/626cbc80e0006b526688. + + Examples + -------- + >>> X = [[1, 2, 3], [4, 5, 6, 7]] + >>> Y = [[1, 2, 3, 4], [5, 6, 7]] + >>> variation_of_information(X, Y) + 0.9271749993818661 + """ + from math import log + + n = float(sum([len(x) for x in X])) + sigma = 0.0 + for x in X: + p = len(x) / n + for y in Y: + q = len(y) / n + r = len(set(x) & set(y)) / n + if r > 0.0: + sigma += r * (log(r / p, 2) + log(r / q, 2)) + return abs(sigma) From f37751aa18d6ed5b90509806a2cf1bfd54802afd Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Fri, 9 Jul 2021 17:16:05 -0400 Subject: [PATCH 12/21] Draft voxel misclassification metric. 
---
 nimare/parcellate.py | 90 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 85 insertions(+), 5 deletions(-)

diff --git a/nimare/parcellate.py b/nimare/parcellate.py
index 39ddfcff8..7342fb923 100644
--- a/nimare/parcellate.py
+++ b/nimare/parcellate.py
@@ -196,6 +196,18 @@ def _filter_selection(self, labels):
         ----------
         labels : :obj:`numpy.ndarray` of shape (n_filters, n_clusters, n_voxels)
             Labeling results from a range of KMeans clustering runs.
+
+        Notes
+        -----
+        From Chase et al. (2020):
+        We implemented a two-step procedure that involved a decision on those filter sizes to be
+        included in the final analysis and subsequently a decision on the optimal cluster solution.
+        In the first step, we examined the consistency of the cluster assignment for the individual
+        voxels across the cluster solutions of the co-occurrence maps performed at different filter
+        sizes. We selected a filter range with the lowest number of deviants, that is, number of
+        voxels that were assigned differently compared with the solution from the majority of
+        filters. In other words, we identified those filter sizes which produced solutions most
+        similar to the consensus-solution across all filter sizes.
         """
         from scipy import stats

@@ -213,12 +225,18 @@ def _filter_selection(self, labels):
         filter_deviant_z = deviant_z.sum(axis=1)
         min_deviants_filter = np.where(filter_deviant_z == np.min(filter_deviant_z))[0]

-        # This is not the end
+        # NOTE: This is not the end, but I don't know how filters were selected based on this array
         return min_deviants_filter

-    def _voxel_misclassification(self):
+    def _voxel_misclassification(self, selected_filter_labels, alpha=0.05):
         """Calculate voxel misclassification metric.

+        Parameters
+        ----------
+        selected_filter_labels : :obj:`numpy.ndarray` of shape (n_selected_filters, n_clusters, n_voxels)
+            Labeling results from a range of KMeans clustering runs, limited to those filters
+            that survived the filter selection procedure.
+
         Notes
         -----
         From Chase et al. (2020):
         First, misclassified voxels (deviants) were examined as a topological criterion,
         with optimal K parcellations being those in which the percentage of deviants was not
         significantly increased compared to the K − 1 solution but where the K + 1 was
         associated with significantly increased deviants.

         TS: Deviants are presumably calculated only from the range of filters selected in the
         filter selection step.
+ + TS: To measure significance, we could do a series of pair-wise proportion z-tests + Left side null hypothesis: proportion from K is <= K - 1 + Left side alternative: K > K -1 + Accept on null + z, p = proportions_ztest([k, k-1], [n, n], alternative="larger") + p >= alpha --> YAY + Right side null hypothesis: proportion from K is >= K + 1 + Right side alternative: K < K + 1 + Accept on rejected null + z, p = proportions_ztest([k, k+1], [n, n], alternative="smaller") + p < alpha --> YAY """ - pass + from scipy import stats + from statsmodels.stats.proportion import proportions_ztest + + n_filters, n_clusters, n_voxels = selected_filter_labels.shape + count = n_voxels * n_filters + deviant_counts = np.zeros((n_filters, n_clusters)) + for i_cluster in range(n_clusters): + cluster_labels = selected_filter_labels[:, i_cluster, :] + cluster_labels_mode = stats.mode(cluster_labels, axis=0)[0] + is_deviant = cluster_labels != cluster_labels_mode + deviant_count = is_deviant.sum(axis=1) + assert deviant_count.size == n_filters + deviant_counts[:, i_cluster] = deviant_count + + decision_criteria = np.empty((n_clusters, 2), dtype=bool) + + for i_cluster in range(n_clusters): + cluster_deviant_count = np.sum(deviant_counts[:, i_cluster]) + + # Compare to k - 1 + if i_cluster > 0: + m1_cluster_deviant_count = np.sum(deviant_counts[:, i_cluster - 1]) + _, p = proportions_ztest( + count=[cluster_deviant_count, m1_cluster_deviant_count], + nobs=[count, count], + alternative="larger", + ) + decision_criteria[i_cluster, 0] = p >= alpha + else: + decision_criteria[i_cluster, 0] = np.nan + + if i_cluster < (n_clusters - 1): + p1_cluster_deviant_count = np.sum(deviant_counts[:, i_cluster + 1]) + _, p = proportions_ztest( + count=[cluster_deviant_count, p1_cluster_deviant_count], + nobs=[count, count], + alternative="smaller", + ) + decision_criteria[i_cluster, 1] = p < alpha + else: + decision_criteria[i_cluster, 1] = np.nan + + decision_results = np.all(decision_criteria, axis=1) + good_cluster_idx = np.where(decision_results)[0] + if not good_cluster_idx: + LGR.warning("No cluster solution found according to VM metric.") + + return good_cluster_idx def _variation_of_information(self, labels, cluster_counts): """Calculate variation of information metric. @@ -268,12 +345,14 @@ def _variation_of_information(self, labels, cluster_counts): np.where(filter_labels[j_cluster, :] == k_cluster_num)[0] for k_cluster_num in range(cluster_count) ] + + # Calculate VI between K and K - 1 if j_cluster > 0: cluster_m1_partition = [ np.where(filter_labels[j_cluster - 1, :] == k_cluster_num)[0] for k_cluster_num in range(cluster_counts[j_cluster - 1]) ] - # Calculate VI between K and K - 1 + vi_values[i_filter, j_cluster, 0] = variation_of_information( cluster_partition, cluster_m1_partition, @@ -281,12 +360,13 @@ def _variation_of_information(self, labels, cluster_counts): else: vi_values[i_filter, j_cluster, 0] = np.nan + # Calculate VI between K and K + 1 if j_cluster < (labels.shape[1] - 1): cluster_p1_partition = [ np.where(filter_labels[j_cluster + 1, :] == k_cluster_num)[0] for k_cluster_num in range(cluster_counts[j_cluster + 1]) ] - # Calculate VI between K and K + 1 + vi_values[i_filter, j_cluster, 1] = variation_of_information( cluster_partition, cluster_p1_partition, From 0896745f258dbc38a8143c95792d3a3cd537ee29 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 14 Jul 2021 12:52:03 -0400 Subject: [PATCH 13/21] Draft another metric. 
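The quantity being drafted is the ratio between the average distance of each voxel to its own cluster center and the average distance between cluster centers. A rough standalone sketch with toy data (illustrative only, not the final implementation; note that scipy's cdist requires 2-D inputs, so each center must be reshaped):

    import numpy as np
    from scipy.spatial import distance
    from sklearn.cluster import KMeans

    rng = np.random.default_rng(0)
    data = rng.normal(size=(100, 10))  # hypothetical (n_voxels, n_features) matrix
    n_clusters = 3
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit(data)

    # Average inter-center distance: mean of pairwise distances between centers
    inter = distance.pdist(kmeans.cluster_centers_, metric="euclidean").mean()

    # Average distance of each voxel to its own cluster center
    total = 0.0
    for k in range(n_clusters):
        members = data[kmeans.labels_ == k]
        center = kmeans.cluster_centers_[k][None, :]  # cdist needs 2-D arrays
        total += distance.cdist(center, members, metric="euclidean").sum()
    intra = total / data.shape[0]

    print(intra / inter)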
Will move most of the work to the main fit loop, because this metric requires too much information. --- nimare/parcellate.py | 47 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index 7342fb923..89299851b 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -131,6 +131,10 @@ def _fit(self, dataset): n_filters = len(getattr(self, self.filter_type)) labels = np.zeros((n_filters, len(self.n_clusters), n_target_voxels), dtype=int) + cluster_centers = np.zeros( + (n_filters, len(self.n_clusters), np.max(self.n_clusters), n_mask_voxels), + dtype=float, + ) silhouettes = np.zeros((n_filters, len(self.n_clusters)), dtype=float) kwargs = {"r": None, "n": None} @@ -180,6 +184,7 @@ def _fit(self, dataset): algorithm="elkan", ).fit(data) labels[i_filter, j_cluster, :] = kmeans.labels_ + cluster_centers[i_filter, j_cluster, :cluster_count, :] = kmeans.cluster_centers_ silhouettes[i_filter, j_cluster] = self._silhouette(data, kmeans.labels_) # Clean up MACM data memmap @@ -404,9 +409,18 @@ def _nondominant_voxel_percentage(self): """ pass - def _cluster_distance_ratio(self): + def _cluster_distance_ratio(self, labels, cluster_centers, cluster_counts, data): """Calculate change-in-inter/intra-cluster-distance metric. + Parameters + ---------- + labels : :obj:`numpy.ndarray` of shape (n_filters, n_clusters, n_target_voxels) + cluster_centers : :obj:`numpy.ndarray` of shape + (n_filters, n_clusters, max_cluster_size, n_feature_voxels) + Cluster centers. + cluster_counts : :obj:`list` of :obj:`int` + The set of K values tested. Must be n_clusters items long. + Notes ----- From Chase et al. (2020): @@ -415,7 +429,36 @@ def _cluster_distance_ratio(self): This ratio is the first derivative of the ratio between the average distance of a given voxel to its own cluster center and the average distance between the cluster centers. """ - pass + from scipy.spatial import distance + + n_filters, n_clusters, n_target_voxels = labels.shape + _, _, max_cluster_size, n_feature_voxels = cluster_centers.shape + assert n_filters == cluster_centers.shape[0] + assert n_clusters == cluster_centers.shape[1] + ratios = np.zeros((n_filters, n_clusters), dtype=float) + for i_filter in range(n_filters): + for j_cluster in range(n_clusters): + cluster_count = cluster_counts[j_cluster] + solution_labels = labels[i_filter, j_cluster, :] + solution_centers = cluster_centers[i_filter, j_cluster, :cluster_count, :] + avg_center_distance = np.mean(distance.pdist(solution_centers)) + total_voxel_center_distance = 0 + for k_cluster_val in range(cluster_count): + cluster_val_voxels = np.where(solution_labels == k_cluster_val)[0] + cluster_val_center = solution_centers[k_cluster_val, :] + cluster_val_features = data[cluster_val_voxels, :] + cluster_val_distances = distance.cdist( + cluster_val_center, + cluster_val_features, + ) + total_voxel_center_distance += np.sum(cluster_val_distances) + avg_voxel_center_distance = total_voxel_center_distance / n_target_voxels + ratio = avg_voxel_center_distance / avg_center_distance + ratios[i_filter, j_cluster] = ratio + + # TODO: Calculate first derivative? + + return ratios def fit(self, dataset, drop_invalid=True): """Perform coordinate-based coactivation-based parcellation on dataset. From f9611dbedbfcd5861f89cc59c7c7420beef3f57d Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 14 Jul 2021 12:53:44 -0400 Subject: [PATCH 14/21] Little cleanup. 
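One detail still open after this cleanup: the Chase et al. quote calls for the first derivative of the ratio across neighboring cluster solutions. Assuming a (n_filters, n_cluster_solutions) ratios array, that step could be as simple as a discrete difference along the solutions axis (a sketch only, not yet wired in):

    import numpy as np

    # Hypothetical ratios for 2 filters x 4 cluster solutions
    ratios = np.array(
        [[1.00, 0.80, 0.70, 0.65],
         [1.10, 0.90, 0.70, 0.60]]
    )
    ratio_derivs = np.diff(ratios, n=1, axis=1)  # change from K to K + 1
    print(ratio_derivs.shape)  # (2, 3)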
--- nimare/parcellate.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index 89299851b..97ccafa8d 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -441,7 +441,12 @@ def _cluster_distance_ratio(self, labels, cluster_centers, cluster_counts, data) cluster_count = cluster_counts[j_cluster] solution_labels = labels[i_filter, j_cluster, :] solution_centers = cluster_centers[i_filter, j_cluster, :cluster_count, :] - avg_center_distance = np.mean(distance.pdist(solution_centers)) + avg_intracenter_distance = np.mean( + distance.pdist( + solution_centers, + metric="euclidean", + ) + ) total_voxel_center_distance = 0 for k_cluster_val in range(cluster_count): cluster_val_voxels = np.where(solution_labels == k_cluster_val)[0] @@ -450,10 +455,11 @@ def _cluster_distance_ratio(self, labels, cluster_centers, cluster_counts, data) cluster_val_distances = distance.cdist( cluster_val_center, cluster_val_features, + metric="euclidean", ) total_voxel_center_distance += np.sum(cluster_val_distances) avg_voxel_center_distance = total_voxel_center_distance / n_target_voxels - ratio = avg_voxel_center_distance / avg_center_distance + ratio = avg_voxel_center_distance / avg_intracenter_distance ratios[i_filter, j_cluster] = ratio # TODO: Calculate first derivative? From 909c82243f04f7bd59eff405f92dbd8e97f91784 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 14 Jul 2021 13:26:59 -0400 Subject: [PATCH 15/21] Move ratio calculations to main fit method. --- nimare/parcellate.py | 65 ++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index 97ccafa8d..00a0fd194 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -6,6 +6,7 @@ from tempfile import mkstemp import numpy as np +from scipy.spatial import distance from sklearn.cluster import KMeans from .base import NiMAREBase @@ -131,10 +132,6 @@ def _fit(self, dataset): n_filters = len(getattr(self, self.filter_type)) labels = np.zeros((n_filters, len(self.n_clusters), n_target_voxels), dtype=int) - cluster_centers = np.zeros( - (n_filters, len(self.n_clusters), np.max(self.n_clusters), n_mask_voxels), - dtype=float, - ) silhouettes = np.zeros((n_filters, len(self.n_clusters)), dtype=float) kwargs = {"r": None, "n": None} @@ -152,6 +149,7 @@ def _fit(self, dataset): mode="w+", shape=(n_target_voxels, n_mask_voxels), ) + ratios = np.zeros((n_filters, len(self.n_clusters)), dtype=float) for i_filter in range(n_filters): kwargs[self.filter_type] = getattr(self, self.filter_type)[i_filter] @@ -184,9 +182,31 @@ def _fit(self, dataset): algorithm="elkan", ).fit(data) labels[i_filter, j_cluster, :] = kmeans.labels_ - cluster_centers[i_filter, j_cluster, :cluster_count, :] = kmeans.cluster_centers_ + silhouettes[i_filter, j_cluster] = self._silhouette(data, kmeans.labels_) + # Necessary calculations for ratios metric + avg_intracenter_distance = np.mean( + distance.pdist( + kmeans.cluster_centers_, + metric="euclidean", + ) + ) + total_voxel_center_distance = 0 + for k_cluster_val in range(cluster_count): + cluster_val_voxels = np.where(kmeans.labels_ == k_cluster_val)[0] + cluster_val_center = kmeans.cluster_centers_[k_cluster_val, :] + cluster_val_features = data[cluster_val_voxels, :] + cluster_val_distances = distance.cdist( + cluster_val_center, + cluster_val_features, + metric="euclidean", + ) + total_voxel_center_distance += np.sum(cluster_val_distances) + 
avg_voxel_center_distance = total_voxel_center_distance / n_target_voxels + ratio = avg_voxel_center_distance / avg_intracenter_distance + ratios[i_filter, j_cluster] = ratio + # Clean up MACM data memmap LGR.info(f"Removing temporary file: {memmap_filename}") os.remove(memmap_filename) @@ -409,7 +429,7 @@ def _nondominant_voxel_percentage(self): """ pass - def _cluster_distance_ratio(self, labels, cluster_centers, cluster_counts, data): + def _cluster_distance_ratio(self, ratios): """Calculate change-in-inter/intra-cluster-distance metric. Parameters @@ -429,39 +449,6 @@ def _cluster_distance_ratio(self, labels, cluster_centers, cluster_counts, data) This ratio is the first derivative of the ratio between the average distance of a given voxel to its own cluster center and the average distance between the cluster centers. """ - from scipy.spatial import distance - - n_filters, n_clusters, n_target_voxels = labels.shape - _, _, max_cluster_size, n_feature_voxels = cluster_centers.shape - assert n_filters == cluster_centers.shape[0] - assert n_clusters == cluster_centers.shape[1] - ratios = np.zeros((n_filters, n_clusters), dtype=float) - for i_filter in range(n_filters): - for j_cluster in range(n_clusters): - cluster_count = cluster_counts[j_cluster] - solution_labels = labels[i_filter, j_cluster, :] - solution_centers = cluster_centers[i_filter, j_cluster, :cluster_count, :] - avg_intracenter_distance = np.mean( - distance.pdist( - solution_centers, - metric="euclidean", - ) - ) - total_voxel_center_distance = 0 - for k_cluster_val in range(cluster_count): - cluster_val_voxels = np.where(solution_labels == k_cluster_val)[0] - cluster_val_center = solution_centers[k_cluster_val, :] - cluster_val_features = data[cluster_val_voxels, :] - cluster_val_distances = distance.cdist( - cluster_val_center, - cluster_val_features, - metric="euclidean", - ) - total_voxel_center_distance += np.sum(cluster_val_distances) - avg_voxel_center_distance = total_voxel_center_distance / n_target_voxels - ratio = avg_voxel_center_distance / avg_intracenter_distance - ratios[i_filter, j_cluster] = ratio - # TODO: Calculate first derivative? return ratios From 65857d1d718d9d2704ea90fffb07c0f4eb2cf022 Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 14 Jul 2021 13:47:28 -0400 Subject: [PATCH 16/21] More cleanup. --- nimare/parcellate.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index 00a0fd194..8d12e17be 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -87,6 +87,7 @@ def __init__( self.n_clusters = listify(n_clusters) self.filter_selection = isinstance(r, list) or isinstance(n, list) self.filter_type = "r" if r else "n" + self.alpha = 0.05 self.r = listify(r) self.n = listify(n) @@ -207,6 +208,24 @@ def _fit(self, dataset): ratio = avg_voxel_center_distance / avg_intracenter_distance ratios[i_filter, j_cluster] = ratio + # Identify range of optimal filters + retained_filter_idx = self._filter_selection(labels) + labels = labels[retained_filter_idx, ...] + silhouettes = silhouettes[retained_filter_idx, ...] + ratios = ratios[retained_filter_idx, ...] + + # Calculate metrics + vm_results = self._voxel_misclassification(labels, alpha=self.alpha) # boolx2 + vi_results = self._variation_of_information(labels, self.n_clusters) # floatx2 + s_results = silhouettes # float + nvp_results = self._nondominant_voxel_percentage() # ? 
+ cdr_results = self._cluster_distance_ratio(ratios) # float + vote = np.nansum( + np.dstack((vm_results, vi_results, s_results, nvp_results, cdr_results)), + axis=2, + ) + assert vote.shape == (n_filters, len(self.n_clusters)) + # Clean up MACM data memmap LGR.info(f"Removing temporary file: {memmap_filename}") os.remove(memmap_filename) @@ -434,12 +453,7 @@ def _cluster_distance_ratio(self, ratios): Parameters ---------- - labels : :obj:`numpy.ndarray` of shape (n_filters, n_clusters, n_target_voxels) - cluster_centers : :obj:`numpy.ndarray` of shape - (n_filters, n_clusters, max_cluster_size, n_feature_voxels) - Cluster centers. - cluster_counts : :obj:`list` of :obj:`int` - The set of K values tested. Must be n_clusters items long. + ratios Notes ----- From 65b967513fbd89466475590418ba101c48e5e8df Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Wed, 14 Jul 2021 15:58:39 -0400 Subject: [PATCH 17/21] More work and debugging. --- nimare/parcellate.py | 60 ++++++++++++++++++++++++++------------------ nimare/utils.py | 2 +- 2 files changed, 36 insertions(+), 26 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index 8d12e17be..c62c8b1a9 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -6,13 +6,14 @@ from tempfile import mkstemp import numpy as np +from nilearn._utils import load_niimg from scipy.spatial import distance from sklearn.cluster import KMeans from .base import NiMAREBase from .extract.utils import _get_dataset_dir -from .meta.base import CBMAEstimator from .meta.cbma.ale import ALE +from .meta.cbma.base import CBMAEstimator from .results import MetaResult from .utils import add_metadata_to_dataframe, check_type, listify, use_memmap, vox2mm @@ -48,7 +49,7 @@ class CoordCBP(NiMAREBase): Default is :obj:`nimare.meta.cbma.ale.ALE`. target_image : :obj:`str`, optional Name of meta-analysis results image to use for clustering. - Default is "ale", which is specific to the ALE estimator. + Default is "stat". Notes ----- @@ -82,6 +83,7 @@ def __init__( elif not r and not n: raise ValueError("Either 'r' or 'n' must be provided.") + self.target_mask = load_niimg(target_mask) self.meta_estimator = meta_estimator self.target_image = target_image self.n_clusters = listify(n_clusters) @@ -90,6 +92,7 @@ def __init__( self.alpha = 0.05 self.r = listify(r) self.n = listify(n) + self.masker = None def _preprocess_input(self, dataset): """Mask required input images using either the dataset's mask or the estimator's. 
@@ -126,10 +129,11 @@ def _fit(self, dataset): self.masker = self.masker or dataset.masker # Loop through voxels in target_mask, selecting studies for each and running MACMs (no MCC) - target_ijk = np.vstack(np.where(self.target_mask.get_fdata())) + target_ijk = np.vstack(np.where(self.target_mask.get_fdata())).T + target_ijk = target_ijk[:20, :] # limit to first 20 for testing target_xyz = vox2mm(target_ijk, self.masker.mask_img.affine) - n_target_voxels = target_xyz.shape[1] - n_mask_voxels = self.masker.transform(self.masker.mask_img).shape[0] + n_target_voxels = target_xyz.shape[0] + n_mask_voxels = self.masker.transform(self.masker.mask_img).size n_filters = len(getattr(self, self.filter_type)) labels = np.zeros((n_filters, len(self.n_clusters), n_target_voxels), dtype=int) @@ -154,8 +158,8 @@ def _fit(self, dataset): for i_filter in range(n_filters): kwargs[self.filter_type] = getattr(self, self.filter_type)[i_filter] - for j_coord in n_target_voxels: - xyz = target_xyz[:, j_coord] + for j_coord in range(n_target_voxels): + xyz = target_xyz[j_coord, :] macm_ids = dataset.get_studies_by_coordinate(xyz, **kwargs) coord_dset = dataset.slice(macm_ids) @@ -184,10 +188,10 @@ def _fit(self, dataset): ).fit(data) labels[i_filter, j_cluster, :] = kmeans.labels_ - silhouettes[i_filter, j_cluster] = self._silhouette(data, kmeans.labels_) + silhouettes[i_filter, j_cluster] = self._average_silhouette(data, kmeans.labels_) # Necessary calculations for ratios metric - avg_intracenter_distance = np.mean( + avg_intercenter_distance = np.mean( distance.pdist( kmeans.cluster_centers_, metric="euclidean", @@ -199,24 +203,28 @@ def _fit(self, dataset): cluster_val_center = kmeans.cluster_centers_[k_cluster_val, :] cluster_val_features = data[cluster_val_voxels, :] cluster_val_distances = distance.cdist( - cluster_val_center, + cluster_val_center[None, :], cluster_val_features, metric="euclidean", ) total_voxel_center_distance += np.sum(cluster_val_distances) - avg_voxel_center_distance = total_voxel_center_distance / n_target_voxels - ratio = avg_voxel_center_distance / avg_intracenter_distance + avg_intracluster_distance = total_voxel_center_distance / n_target_voxels + ratio = avg_intercenter_distance / avg_intracluster_distance ratios[i_filter, j_cluster] = ratio + # return labels, silhouettes, ratios # Identify range of optimal filters - retained_filter_idx = self._filter_selection(labels) - labels = labels[retained_filter_idx, ...] - silhouettes = silhouettes[retained_filter_idx, ...] - ratios = ratios[retained_filter_idx, ...] + # retained_filter_idx = self._filter_selection(labels) + # labels = labels[retained_filter_idx, ...] + # silhouettes = silhouettes[retained_filter_idx, ...] + # ratios = ratios[retained_filter_idx, ...] # Calculate metrics - vm_results = self._voxel_misclassification(labels, alpha=self.alpha) # boolx2 + vm_decision_criteria, deviant_props = self._voxel_misclassification( + labels, alpha=self.alpha) # boolx2 vi_results = self._variation_of_information(labels, self.n_clusters) # floatx2 + + return labels, silhouettes, ratios, deviant_props, vi_results s_results = silhouettes # float nvp_results = self._nondominant_voxel_percentage() # ? 
cdr_results = self._cluster_distance_ratio(ratios) # float @@ -348,10 +356,12 @@ def _voxel_misclassification(self, selected_filter_labels, alpha=0.05): decision_results = np.all(decision_criteria, axis=1) good_cluster_idx = np.where(decision_results)[0] - if not good_cluster_idx: + if not good_cluster_idx.size: LGR.warning("No cluster solution found according to VM metric.") - return good_cluster_idx + deviant_props = deviant_counts / n_voxels + + return decision_criteria, deviant_props def _variation_of_information(self, labels, cluster_counts): """Calculate variation of information metric. @@ -453,7 +463,7 @@ def _cluster_distance_ratio(self, ratios): Parameters ---------- - ratios + ratios : (n_filters, n_cluster_solutions) Notes ----- @@ -464,8 +474,8 @@ def _cluster_distance_ratio(self, ratios): voxel to its own cluster center and the average distance between the cluster centers. """ # TODO: Calculate first derivative? - - return ratios + ratio_derivs = np.diff(ratios, n=1, axis=1) + return ratio_derivs def fit(self, dataset, drop_invalid=True): """Perform coordinate-based coactivation-based parcellation on dataset. @@ -478,14 +488,14 @@ def fit(self, dataset, drop_invalid=True): Whether to automatically ignore any studies without the required data or not. Default is True. """ - self._validate_input(dataset, drop_invalid=drop_invalid) - self._preprocess_input(dataset) + # self._validate_input(dataset, drop_invalid=drop_invalid) + # self._preprocess_input(dataset) maps = self._fit(dataset) if hasattr(self, "masker") and self.masker is not None: masker = self.masker else: masker = dataset.masker - + return maps self.results = MetaResult(self, masker, maps) return self.results diff --git a/nimare/utils.py b/nimare/utils.py index 7940c583c..fab94452d 100755 --- a/nimare/utils.py +++ b/nimare/utils.py @@ -232,7 +232,7 @@ def validate_images_df(image_df): # Get parent *directory* if shared path includes common prefix. if not shared_path.endswith(op.sep): shared_path = op.dirname(shared_path) + op.sep - LGR.info(f"Shared path detected: '{shared_path}'") + # LGR.info(f"Shared path detected: '{shared_path}'") image_df_out = image_df.copy() # To avoid SettingWithCopyWarning for abs_col in abs_cols: From 118e7189fb452a15afc2b36d51ee95c4d5ed863f Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Thu, 15 Jul 2021 12:18:21 -0400 Subject: [PATCH 18/21] Mention re-labeling (not implemented). --- nimare/parcellate.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/nimare/parcellate.py b/nimare/parcellate.py index c62c8b1a9..b584ab1dc 100644 --- a/nimare/parcellate.py +++ b/nimare/parcellate.py @@ -212,6 +212,8 @@ def _fit(self, dataset): ratio = avg_intercenter_distance / avg_intracluster_distance ratios[i_filter, j_cluster] = ratio + # TODO: Implement re-labeling procedure across filters, for each cluster solution + # return labels, silhouettes, ratios # Identify range of optimal filters # retained_filter_idx = self._filter_selection(labels) @@ -221,7 +223,9 @@ def _fit(self, dataset): # Calculate metrics vm_decision_criteria, deviant_props = self._voxel_misclassification( - labels, alpha=self.alpha) # boolx2 + labels, + alpha=self.alpha, + ) # boolx2 vi_results = self._variation_of_information(labels, self.n_clusters) # floatx2 return labels, silhouettes, ratios, deviant_props, vi_results @@ -229,7 +233,7 @@ def _fit(self, dataset): nvp_results = self._nondominant_voxel_percentage() # ? 
cdr_results = self._cluster_distance_ratio(ratios) # float vote = np.nansum( - np.dstack((vm_results, vi_results, s_results, nvp_results, cdr_results)), + np.dstack((vm_decision_criteria, vi_results, s_results, nvp_results, cdr_results)), axis=2, ) assert vote.shape == (n_filters, len(self.n_clusters)) From 530a4de1cd4438a66116f81c0156c811626ea40a Mon Sep 17 00:00:00 2001 From: Taylor Salo Date: Tue, 21 Dec 2021 13:10:07 -0500 Subject: [PATCH 19/21] Remove added line. --- nimare/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nimare/utils.py b/nimare/utils.py index d6494d487..bdab5edc4 100755 --- a/nimare/utils.py +++ b/nimare/utils.py @@ -956,7 +956,6 @@ def _run_shell_command(command, env=None): shell=True, env=merged_env, ) - while True: line = process.stdout.readline() line = str(line, "utf-8")[:-1] From 1c352f798cff34fddd8851ab4ee6b0af90fd4670 Mon Sep 17 00:00:00 2001 From: JulioAPeraza Date: Tue, 4 Oct 2022 13:31:22 -0400 Subject: [PATCH 20/21] Update test_dataset.py --- nimare/tests/test_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nimare/tests/test_dataset.py b/nimare/tests/test_dataset.py index 96604630d..31a2d79f1 100644 --- a/nimare/tests/test_dataset.py +++ b/nimare/tests/test_dataset.py @@ -32,7 +32,6 @@ def test_dataset_smoke(): assert isinstance(dset.get_metadata(field="sample_sizes"), list) assert isinstance(dset.get_studies_by_label("cogat_cognitive_control"), list) assert isinstance(dset.get_studies_by_coordinate([20, 20, 20], r=20), list) - assert isinstance(dset.get_studies_by_coordinate(np.array([[20, 20, 20]])), list) assert len(dset.get_studies_by_coordinate([20, 20, 20], n=2)) == 2 assert len(dset.get_studies_by_coordinate([20, 20, 20], n=3)) == 3 From 0c60dd5e1fdefae51e01bcd5e3bc257ec7b4d802 Mon Sep 17 00:00:00 2001 From: JulioAPeraza Date: Tue, 4 Oct 2022 13:44:34 -0400 Subject: [PATCH 21/21] Update utils.py --- nimare/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nimare/utils.py b/nimare/utils.py index 24e233c29..7c388ced1 100644 --- a/nimare/utils.py +++ b/nimare/utils.py @@ -1159,7 +1159,7 @@ def _get_cluster_coms(labeled_cluster_arr): return cluster_coms - + def variation_of_information(X, Y): """Calculate variation of information metric.