Merge pull request #54 from HiDiHlabs/restructure

niklasmueboe · web-flow · commit 381c33bd5f5e · 2025-06-18T13:41:53.000Z
Small improvements/refactorings
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -15,14 +15,14 @@ repos:
       - id: no-commit-to-branch
         args: [--branch=main]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.4
+    rev: v0.11.12
     hooks:
       # Linter
-      - id: ruff
+      - id: ruff-check
       # Formatter
       - id: ruff-format
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.15.0
+    rev: v1.16.0
     hooks:
       - id: mypy
         additional_dependencies:
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -1,6 +1,6 @@
 version: 2
 build:
-  os: ubuntu-22.04
+  os: ubuntu-24.04
   tools:
     python: "3.12"
 sphinx:
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,7 @@
 MIT License
 
-Copyright (c) 2023 sebastiantiesmeyer
+Copyright (c) 2025 Sebastian Tiesmeyer, Niklas Müller-Bötticher, Naveed Ishaque,
+Roland Eils, Berlin Institute of Health @ Charité
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ Much of spatial biology uses microscopic tissue slices to study the spatial dist
 ![3D slice visualization](docs/resources/cell_overlap_visualization.jpg)
 
 Ovrl.py is a quality-control tool for spatial transcriptomics data that can help analysts find sources of vertical signal inconsistency in their data.
-It is works with imaging-based spatial transcriptomics data, such as 10x genomics' Xenium or vizgen's MERFISH platforms.
+It works with imaging-based spatial transcriptomics data, such as 10x Genomics' Xenium or Vizgen's MERSCOPE platforms.
 The main feature of the tool is the production of 'signal integrity maps' that can help analysts identify sources of signal inconsistency in their data.
 Users can also use the built-in 3D visualisation tool to explore regions of signal inconsistency in their data on a molecular level.
 
@@ -38,7 +38,7 @@ import pandas as pd
 import ovrlpy
 
 # define ovrlpy analysis parameters
-n_components = 20
+n_components = 20 # number of PCA components
 
 # load the data
 coordinate_df = pd.read_csv('path/to/coordinate_file.csv')
@@ -90,7 +90,7 @@ doublet_to_show = 0
 
 x, y = doublets["x", "y"].row(doublet_to_show)
 
-fig = ovrlpy.plot_region_of_interest(dataset, x, y, window_size=window_size)
+fig = ovrlpy.plot_region_of_interest(dataset, x, y, window_size=50)
 ```
 
 ![plot_region_of_interest output](docs/resources/plot_roi.png)
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -9,7 +9,7 @@ Introduction
 In spatial biology, tissue slices are commonly used to study the spatial distribution of cells and molecules. However, since these slices represent 3D structures in 2D, overlapping structures in the vertical dimension can lead to artefacts and inconsistencies in the data.
 
 **ovrlpy** is a quality-control tool for spatial transcriptomics data that can help analysts find sources of vertical signal inconsistency in their data.
-It is works with imaging-based spatial transcriptomics data, such as 10x Genomics' Xenium or Vizgen's MERFISH platforms.
+It works with imaging-based spatial transcriptomics data, such as 10x Genomics' Xenium or Vizgen's MERSCOPE platforms.
 The main feature of the tool is the production of 'signal integrity maps' that can help analysts identify sources of signal inconsistency in their data.
 Users can also use the built-in 3D visualisation tool to explore regions of signal inconsistency in their data on a molecular level.
 
diff --git a/docs/source/tutorials/vizgen_liver.ipynb b/docs/source/tutorials/vizgen_liver.ipynb
@@ -5,9 +5,9 @@
    "id": "8ef9e021-1b2e-4086-868a-86d7e77b6f04",
    "metadata": {},
    "source": [
-    "# MERFISH mouse liver\n",
+    "# MERSCOPE mouse liver\n",
     "\n",
-    "In this notebook, we will use ovrlpy to investigate the [Vizgen MERFISH's mouse liver dataset](https://info.vizgen.com/mouse-liver-data).\n",
+    "In this notebook, we will use ovrlpy to investigate the [Vizgen MERSCOPE's mouse liver dataset](https://info.vizgen.com/mouse-liver-data).\n",
     "\n",
     "We want to create a signal embedding of the transcriptome, and a vertical signal incoherence map to identify locations with a high risk of containing spatial doublets."
    ]
@@ -78,7 +78,7 @@
     }
    ],
    "source": [
-    "coordinate_df = ovrlpy.io.read_MERFISH(data_path / \"detected_transcripts.csv\")\n",
+    "coordinate_df = ovrlpy.io.read_MERSCOPE(data_path / \"detected_transcripts.csv\")\n",
     "\n",
     "print(f\"Number of transcripts: {len(coordinate_df):,}\")"
    ]
diff --git a/ovrlpy/_ovrlp.py b/ovrlpy/_ovrlp.py
@@ -84,13 +84,13 @@ class Ovrlp:
         The center of gravity of each celltype in the 2D embedding, used for UMAP annotation.
     celltype_assignments : numpy.ndarray
         The assignments of the cell types.
-    pca_2d : sklearn.decomposition.PCA
-        The PCA object used for the 2D embedding.
-    embedder_2d : umap.UMAP
+    pca : sklearn.decomposition.PCA
+        The PCA object used for the 2D embedding and calculating the VSI score.
+    umap_2d : umap.UMAP
         The UMAP object used for the 2D embedding.
-    pca_3d : sklearn.decomposition.PCA
+    pca_rgb : sklearn.decomposition.PCA
         The PCA object used for the 3D RGB embedding.
-    embedder_3d : umap.UMAP
+    umap_rgb : umap.UMAP
         The UMAP object used for the 3D RGB embedding.
     genes : list
         A list of genes to utilize in the model.
@@ -147,10 +147,10 @@ def __init__(
             n_jobs = n_workers if cumap_kwargs.get("random_state") is None else 1
             cumap_kwargs["n_jobs"] = n_jobs
 
-        self.pca_2d = PCA(n_components=n_components, random_state=random_state)
-        self.embedder_2d = UMAP(**(umap_kwargs | {"n_components": 2}))
-        self.pca_3d = PCA(n_components=3, random_state=random_state)
-        self.embedder_3d = UMAP(**(cumap_kwargs | {"n_components": 3}))
+        self.pca = PCA(n_components=n_components, random_state=random_state)
+        self.umap_2d = UMAP(**(umap_kwargs | {"n_components": 2}))
+        self.pca_rgb = PCA(n_components=3, random_state=random_state)
+        self.umap_rgb = UMAP(**(cumap_kwargs | {"n_components": 3}))
 
     def process_coordinates(self, gridsize: float = 1, **kwargs):
         """
@@ -225,19 +225,19 @@ def fit_pseudocells(self, pseudocells: AnnData, *, fit_umap: bool = True):
 
         self.pseudocells = pseudocells
         X = pseudocells[:, self.genes].X
-        self.pca_2d.fit(X)
+        self.pca.fit(X)
 
         if fit_umap:
-            factors = self.pca_2d.transform(X)
+            factors = self.pca.transform(X)
 
             print(f"Modeling {factors.shape[1]} pseudo-celltype clusters;")
 
-            self.pseudocells.obsm["2D_UMAP"] = self.embedder_2d.fit_transform(factors)
+            self.pseudocells.obsm["2D_UMAP"] = self.umap_2d.fit_transform(factors)
 
-            embedding_color = self.embedder_3d.fit_transform(
+            embedding_color = self.umap_rgb.fit_transform(
                 factors / norm(factors, axis=1, keepdims=True)
             )
-            embedding_color = _fill_color_axes(embedding_color, self.pca_3d, fit=True)
+            embedding_color = _fill_color_axes(embedding_color, self.pca_rgb, fit=True)
 
             self._colors_min_max = (
                 embedding_color.min(axis=0),
@@ -405,7 +405,7 @@ def compute_VSI(self, *, min_transcripts: float = 2):
                                 _calculate_embedding,
                                 gene_queue,
                                 patch_mask,
-                                self.pca_2d.components_,
+                                self.pca.components_,
                                 bandwidth=self.KDE_bandwidth,
                                 dtype=self.dtype,
                             )
@@ -602,11 +602,11 @@ def transform_pseudocells(
 
         embedding, embedding_color = _transform_embeddings(
             pseudocells.to_numpy(),
-            self.pca_2d,
-            embedder_2d=self.embedder_2d,
-            embedder_3d=self.embedder_3d,
+            self.pca,
+            umap_2d=self.umap_2d,
+            umap_rgb=self.umap_rgb,
         )
-        embedding_color = _fill_color_axes(embedding_color, self.pca_3d)
+        embedding_color = _fill_color_axes(embedding_color, self.pca_rgb)
         color_min, color_max = self._colors_min_max
         embedding_color = (embedding_color - color_min) / (color_max - color_min)
         embedding_color = np.clip(embedding_color, 0, 1)
diff --git a/ovrlpy/_plotting.py b/ovrlpy/_plotting.py
@@ -29,6 +29,8 @@
     ][::-1],
 )
 
+VSI = "vertical signal integrity"
+
 
 def _plot_scalebar(ax: Axes, dx: float = 1, units="um", **kwargs):
     ax.add_artist(ScaleBar(dx, units=units, **kwargs))
@@ -323,15 +325,14 @@ def plot_signal_integrity(
             bars = ax_hist.barh(bins[1:-1], vals[1:], height=0.01)
             for i, bar in enumerate(bars):
                 bar.set_color(colors[i])
-            ax_hist.set(ylim=(0, 1), ylabel="signal integrity")
+            ax_hist.set(ylim=(0, 1), ylabel=VSI, xticks=[])
             ax_hist.yaxis.tick_right()
             ax_hist.yaxis.set_label_position("right")
-            ax_hist.set_xticks([], [])
             ax_hist.invert_xaxis()
             ax_hist.spines[["top", "bottom", "left"]].set_visible(False)
 
         else:
-            fig.colorbar(img)
+            fig.colorbar(img, label=VSI)
 
     return fig
 
@@ -402,7 +403,7 @@ def plot_region_of_interest(
 
     ax_integrity.set_title("ROI, signal integrity")
     ax_integrity.invert_yaxis()
-    fig.colorbar(img)
+    fig.colorbar(img, label=VSI)
 
     ax_integrity.set_xlim(x - window_size, x + window_size)
     ax_integrity.set_ylim(y - window_size, y + window_size)
diff --git a/ovrlpy/_utils.py b/ovrlpy/_utils.py
@@ -78,14 +78,12 @@ def _minmax_scaling(x: np.ndarray):
     return (x - x_min) / (x_max - x_min)
 
 
-def _transform_embeddings(expression, pca: PCA, embedder_2d: UMAP, embedder_3d: UMAP):
+def _transform_embeddings(expression, pca: PCA, umap_2d: UMAP, umap_rgb: UMAP):
     """fit the expression data into the umap embeddings after PCA transformation"""
     factors = pca.transform(expression)
 
-    embedding = embedder_2d.transform(factors)
-    embedding_color = embedder_3d.transform(
-        factors / norm(factors, axis=1, keepdims=True)
-    )
+    embedding = umap_2d.transform(factors)
+    embedding_color = umap_rgb.transform(factors / norm(factors, axis=1, keepdims=True))
 
     return embedding, embedding_color
 
diff --git a/ovrlpy/io.py b/ovrlpy/io.py
@@ -41,6 +41,7 @@ def read_Xenium(
     *,
     min_qv: float | None = None,
     remove_features: Collection[str] = XENIUM_CTRLS,
+    additional_columns: Collection[str] = [],
     n_threads: int | None = None,
 ) -> pl.DataFrame:
     """
@@ -56,6 +57,8 @@ def read_Xenium(
     remove_features : collections.abc.Collection[str], optional
         List of regex patterns to filter the 'feature_name' column,
         :py:attr:`ovrlpy.io.XENIUM_CTRLS` by default.
+    additional_columns : collections.abc.Collection[str], optional
+        Additional columns to load from the transcripts file.
     n_threads : int | None, optional
         Number of threads used for parsing the input file.
         If None, will default to number of available CPUs.
@@ -65,7 +68,7 @@ def read_Xenium(
     polars.DataFrame
     """
     filepath = Path(filepath)
-    columns = list(_XENIUM_COLUMNS.keys())
+    columns = list(set(_XENIUM_COLUMNS.keys()) | set(additional_columns))
 
     if filepath.suffix == ".parquet":
         transcripts = pl.scan_parquet(filepath)
@@ -87,7 +90,7 @@ def read_Xenium(
             )
 
     else:
-        if min_qv is not None:
+        if min_qv is not None and "qv" not in additional_columns:
             columns.append("qv")
         transcripts = pl.read_csv(
             filepath,
@@ -97,26 +100,29 @@ def read_Xenium(
         )
 
         if min_qv is not None:
-            transcripts = transcripts.filter(pl.col("qv") >= min_qv).drop("qv")
+            transcripts = transcripts.filter(pl.col("qv") >= min_qv)
+            if "qv" not in additional_columns:
+                transcripts = transcripts.drop("qv")
 
     transcripts = transcripts.rename(_XENIUM_COLUMNS)
     transcripts = _filter_genes(transcripts, remove_features)
 
     return transcripts
 
 
-# Vizgen MERFISH
-_MERFISH_COLUMNS = {"gene": "gene", "global_x": "x", "global_y": "y", "global_z": "z"}
+# Vizgen MERSCOPE
+_MERSCOPE_COLUMNS = {"gene": "gene", "global_x": "x", "global_y": "y", "global_z": "z"}
 
-MERFISH_CTRLS = ["^Blank"]
+MERSCOPE_CTRLS = ["^Blank"]
 """Patterns for Vizgen controls"""
 
 
-def read_MERFISH(
+def read_MERSCOPE(
     filepath: str | os.PathLike,
     z_scale: float = 1.5,
     *,
-    remove_genes: Collection[str] = MERFISH_CTRLS,
+    remove_genes: Collection[str] = MERSCOPE_CTRLS,
+    additional_columns: Collection[str] = [],
     n_threads: int | None = None,
 ) -> pl.DataFrame:
     """
@@ -125,12 +131,14 @@ def read_MERFISH(
     Parameters
     ----------
     filepath : os.PathLike or str
-        Path to the Vizgen transcripts file.
+        Path to the Vizgen transcripts file. Both .csv(.gz) and .parquet files are supported.
     z_scale : float
         Factor to scale z-plane index to um, i.e. distance between z-planes.
     remove_genes : collections.abc.Collection[str], optional
         List of regex patterns to filter the 'gene' column,
-        :py:attr:`ovrlpy.io.MERFISH_CTRLS` by default.
+        :py:attr:`ovrlpy.io.MERSCOPE_CTRLS` by default.
+    additional_columns : collections.abc.Collection[str], optional
+        Additional columns to load from the transcripts file.
     n_threads : int | None, optional
         Number of threads used for parsing the input file.
         If None, will default to number of available CPUs.
@@ -139,18 +147,38 @@ def read_MERFISH(
     -------
     polars.DataFrame
     """
+    filepath = Path(filepath)
+    columns = list(set(_MERSCOPE_COLUMNS.keys()) | set(additional_columns))
 
-    transcripts = pl.read_csv(
-        Path(filepath),
-        columns=list(_MERFISH_COLUMNS.keys()),
-        schema_overrides={"gene": pl.Categorical},
-        n_threads=n_threads,
-    ).rename(_MERFISH_COLUMNS)
+    if filepath.suffixes[-2:] == [".csv", ".gz"]:
+        transcripts = pl.read_csv(
+            filepath,
+            columns=columns,
+            schema_overrides={"gene": pl.Categorical},
+            n_threads=n_threads,
+        )
 
+    else:
+        if filepath.suffix == ".parquet":
+            transcripts = pl.scan_parquet(filepath)
+        elif filepath.suffix == ".csv":
+            transcripts = pl.scan_csv(filepath)
+        else:
+            raise ValueError(
+                "Unsupported file format; must be one of .csv(.gz) or .parquet"
+            )
+
+        with pl.StringCache():
+            transcripts = (
+                transcripts.select(columns)
+                .with_columns(pl.col("gene").cast(pl.String).cast(pl.Categorical))
+                .collect()
+            )
+
+    transcripts = transcripts.rename(_MERSCOPE_COLUMNS)
     transcripts = _filter_genes(transcripts, remove_genes)
 
     # convert plane to um
-
     transcripts = transcripts.with_columns(pl.col("z") * z_scale)
 
     return transcripts
@@ -168,6 +196,7 @@ def read_CosMx(
     scale: Mapping[str, float] = {"xy": 0.12028, "z": 0.8},
     *,
     remove_targets: Collection[str] = COSMX_CTRLS,
+    additional_columns: Collection[str] = [],
     n_threads: int | None = None,
 ) -> pl.DataFrame:
     """
@@ -182,6 +211,8 @@ def read_CosMx(
     remove_targets : collections.abc.Collection[str], optional
         List of regex patterns to filter the 'target' column,
         :py:attr:`ovrlpy.io.COSMX_CTRLS` by default.
+    additional_columns : collections.abc.Collection[str], optional
+        Additional columns to load from the transcripts file.
     n_threads : int | None, optional
         Number of threads used for parsing the input file.
         If None, will default to number of available CPUs.
@@ -193,7 +224,7 @@ def read_CosMx(
 
     transcripts = pl.read_csv(
         Path(filepath),
-        columns=list(_COSMX_COLUMNS.keys()),
+        columns=list(set(_COSMX_COLUMNS.keys()) | set(additional_columns)),
         schema_overrides={"target": pl.Categorical},
         n_threads=n_threads,
     ).rename(_COSMX_COLUMNS)