From b7ec813176371d123eea294307483ff1d682dc74 Mon Sep 17 00:00:00 2001 From: Jake Awe Date: Thu, 17 Jul 2025 12:32:58 -0700 Subject: [PATCH 01/40] DOC v25.10 Updates [skip ci] --- .../cuda12.9-conda/devcontainer.json | 6 ++--- .devcontainer/cuda12.9-pip/devcontainer.json | 6 ++--- .github/workflows/build.yaml | 16 ++++++------ .github/workflows/pr.yaml | 26 +++++++++---------- .github/workflows/test.yaml | 6 ++--- .../trigger-breaking-change-alert.yaml | 2 +- VERSION | 2 +- .../all_cuda-129_arch-aarch64.yaml | 2 +- .../all_cuda-129_arch-x86_64.yaml | 2 +- .../downstream/cmake/get_kvikio.cmake | 2 +- dependencies.yaml | 18 ++++++------- java/pom.xml | 2 +- python/kvikio/pyproject.toml | 6 ++--- 13 files changed, 48 insertions(+), 48 deletions(-) diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 7e000efb91..462343b8cc 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.08-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.9-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index 087acd2900..4acd41599a 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.08-cpp-cuda12.9-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda12.9-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.9-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 53d0402fef..d643e1aa9e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,7 +34,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -44,7 +44,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -54,7 +54,7 @@ jobs: 
upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -76,7 +76,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -89,7 +89,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,7 +101,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 908d30f9e4..3ef91ed573 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -27,7 +27,7 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.10 if: always() with: needs: ${{ toJSON(needs) }} @@ -43,7 +43,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.10 with: files_yaml: | test_cpp: @@ -86,20 +86,20 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.10 with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_cpp.sh conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.08 + uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -107,7 +107,7 @@ jobs: conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -117,14 +117,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_python.sh conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -132,7 +132,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -142,7 +142,7 @@ jobs: devcontainer: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 with: arch: '["amd64"]' cuda: '["12.9"]' @@ -153,7 +153,7 @@ jobs: sccache -s; wheel-cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -163,7 +163,7 @@ jobs: wheel-python-build: needs: wheel-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_wheel_python.sh @@ -172,7 +172,7 @@ jobs: wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b58dbd3ade..d98e3f330c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: 
rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 593fcb1086..48bf37afc4 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.10 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/VERSION b/VERSION index 3af4bda020..296e35288d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -25.08.00 +25.10.00 diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 0c352383d8..98070be5c6 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -34,7 +34,7 @@ dependencies: - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.8.*,>=0.0.0a0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 51d46cafd7..bb54f98ef5 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -34,7 +34,7 @@ dependencies: - python>=3.10,<3.14 - rangehttpserver - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.8.*,>=0.0.0a0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click diff --git a/cpp/examples/downstream/cmake/get_kvikio.cmake b/cpp/examples/downstream/cmake/get_kvikio.cmake index 5e4132f8e5..4e03834e56 100644 --- a/cpp/examples/downstream/cmake/get_kvikio.cmake +++ b/cpp/examples/downstream/cmake/get_kvikio.cmake @@ -30,4 +30,4 @@ function(find_and_configure_kvikio MIN_VERSION) endfunction() -find_and_configure_kvikio("25.08") +find_and_configure_kvikio("25.10") diff --git a/dependencies.yaml b/dependencies.yaml index 34d1f0b249..f8038dc76c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -151,7 +151,7 @@ dependencies: common: - output_types: conda packages: &libkvikio_packages - - libkvikio==25.8.*,>=0.0.0a0 + - libkvikio==25.10.*,>=0.0.0a0 specific: - output_types: [requirements, pyproject] matrices: @@ -159,7 +159,7 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.8.*,>=0.0.0a0 + - libkvikio-cu12==25.10.*,>=0.0.0a0 - {matrix: null, packages: *libkvikio_packages} build-py-wrapper: common: @@ -247,7 +247,7 @@ dependencies: common: - output_types: conda packages: - - &libkvikio_unsuffixed libkvikio==25.8.*,>=0.0.0a0 + - &libkvikio_unsuffixed libkvikio==25.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as 
a global option for the requirements.txt file @@ -261,7 +261,7 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.8.*,>=0.0.0a0 + - libkvikio-cu12==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*libkvikio_unsuffixed]} docs: common: @@ -321,14 +321,14 @@ dependencies: common: - output_types: conda packages: - - libkvikio==25.8.*,>=0.0.0a0 - - libkvikio-tests==25.8.*,>=0.0.0a0 + - libkvikio==25.10.*,>=0.0.0a0 + - libkvikio-tests==25.10.*,>=0.0.0a0 test_kvikio: common: - output_types: conda packages: - - libkvikio==25.8.*,>=0.0.0a0 - - kvikio==25.8.*,>=0.0.0a0 + - libkvikio==25.10.*,>=0.0.0a0 + - kvikio==25.10.*,>=0.0.0a0 test_cpp: common: - output_types: conda @@ -338,7 +338,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==25.8.*,>=0.0.0a0 + - rapids-dask-dependency==25.10.*,>=0.0.0a0 - pytest - pytest-asyncio - pytest-cov diff --git a/java/pom.xml b/java/pom.xml index 228c82a3d5..7e00c88661 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ ai.rapids.kvikio cufile - 25.08.0-SNAPSHOT + 25.10.0-SNAPSHOT cuFile diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index d71abdb608..b8ddc09945 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cupy-cuda12x>=12.0.0", - "libkvikio==25.8.*,>=0.0.0a0", + "libkvikio==25.10.*,>=0.0.0a0", "numcodecs !=0.12.0", "numpy>=1.23,<3.0a0", "packaging", @@ -48,7 +48,7 @@ test = [ "pytest-cov", "pytest-timeout", "rangehttpserver", - "rapids-dask-dependency==25.8.*,>=0.0.0a0", + "rapids-dask-dependency==25.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] @@ -127,7 +127,7 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.30.4", "cython>=3.0.0", - "libkvikio==25.8.*,>=0.0.0a0", + "libkvikio==25.10.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From fab9f0e568cab45acab76ddc7f2f206b95f2e5cc Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 24 Jul 2025 10:19:34 -0500 Subject: [PATCH 02/40] Allow latest OS in devcontainers (#780) This PR removes the OS suffix from devcontainers, allowing the upstream devcontainer images to determine the OS version. Contributes to https://github.com/rapidsai/build-planning/issues/200. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/kvikio/pull/780 --- .devcontainer/cuda12.9-conda/devcontainer.json | 2 +- .devcontainer/cuda12.9-pip/devcontainer.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 462343b8cc..fc3e89da7b 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -5,7 +5,7 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge" } }, "runArgs": [ diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index 4acd41599a..334b4b6d6c 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -5,7 +5,7 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.10-cpp-cuda12.9-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda12.9" } }, "runArgs": [ From 888fcbf0169acec7bdb3ea3d1faa452fcfbfd60e Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 25 Jul 2025 10:50:55 -0400 Subject: [PATCH 03/40] Relax mmap read requirement. Improve error message. (#781) cuDF PR https://github.com/rapidsai/cudf/pull/19164 currently has 4 failed unit tests when `LIBCUDF_MMAP_ENABLED=ON`: ``` 28 - CSV_TEST (Failed) 29 - ORC_TEST (Failed) 32 - JSON_TEST (Failed) 40 - DATA_CHUNK_SOURCE_TEST (Failed) ``` The fix entails code changes on both the KvikIO and cuDF sides. On the KvikIO side, the `MmapHandle::read()` and `MmapHandle::pread()` methods need to: - Allow the read size to be 0 - Allow `offset` to be equal to `initial_map_offset` (when the read size is 0) This PR makes this change. In addition, this PR adds more detailed error messages when out-of-range exception occurs. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/781 --- cpp/include/kvikio/mmap.hpp | 20 ++++++++------- cpp/src/mmap.cpp | 49 ++++++++++++++++++++++++++++--------- cpp/tests/test_mmap.cpp | 44 ++++++++++++++++++++++++++------- 3 files changed, 83 insertions(+), 30 deletions(-) diff --git a/cpp/include/kvikio/mmap.hpp b/cpp/include/kvikio/mmap.hpp index f5245858ef..b5730727f1 100644 --- a/cpp/include/kvikio/mmap.hpp +++ b/cpp/include/kvikio/mmap.hpp @@ -59,7 +59,6 @@ class MmapHandle { * * @exception std::out_of_range if the read region specified by `offset` and `size` is * outside the initial region specified when the mapping handle was constructed - * @exception std::invalid_argument if the size is given but is 0 * @exception std::runtime_error if the mapping handle is closed */ std::size_t validate_and_adjust_read_args(std::optional const& size, @@ -81,11 +80,16 @@ class MmapHandle { * - "w": "open for writing, truncating the file first" * - "a": "open for writing, appending to the end of file if it exists" * - "+": "open for updating (reading and writing)" - * @param initial_map_size Size in bytes of the mapped region. If not specified, map the region - * starting from `initial_map_offset` to the end of file + * @param initial_map_size Size in bytes of the mapped region. Must be greater than 0. 
If not + * specified, map the region starting from `initial_map_offset` to the end of file * @param initial_map_offset File offset of the mapped region * @param mode Access mode * @param map_flags Flags to be passed to the system call `mmap`. See `mmap(2)` for details + * @exception std::out_of_range if `initial_map_offset` (left bound of the mapped region) is equal + * to or greater than the file size + * @exception std::out_of_range if the sum of `initial_map_offset` and `initial_map_size` (right + * bound of the mapped region) is greater than the file size + * @exception std::invalid_argument if `initial_map_size` is given but is 0 */ MmapHandle(std::string const& file_path, std::string const& flags = "r", @@ -147,14 +151,13 @@ class MmapHandle { * destination buffer `buf` * * @param buf Address of the host or device memory (destination buffer) - * @param size Size in bytes to read. If not specified, read starts from `offset` to the end - * of file + * @param size Size in bytes to read. Can be 0 in which case nothing will be read. If not + * specified, read starts from `offset` to the end of file * @param offset File offset * @return Number of bytes that have been read * * @exception std::out_of_range if the read region specified by `offset` and `size` is * outside the initial region specified when the mapping handle was constructed - * @exception std::invalid_argument if the size is given but is 0 * @exception std::runtime_error if the mapping handle is closed */ std::size_t read(void* buf, @@ -166,15 +169,14 @@ class MmapHandle { * destination buffer `buf` * * @param buf Address of the host or device memory (destination buffer) - * @param size Size in bytes to read. If not specified, read starts from `offset` to the end - * of file + * @param size Size in bytes to read. Can be 0 in which case nothing will be read. If not + * specified, read starts from `offset` to the end of file * @param offset File offset * @param task_size Size of each task in bytes * @return Future that on completion returns the size of bytes that were successfully read. * * @exception std::out_of_range if the read region specified by `offset` and `size` is * outside the initial region specified when the mapping handle was constructed - * @exception std::invalid_argument if the size is given but is 0 * @exception std::runtime_error if the mapping handle is closed * * @note The `std::future` object's `wait()` or `get()` should not be called after the lifetime of diff --git a/cpp/src/mmap.cpp b/cpp/src/mmap.cpp index 11b0416c29..671340cb09 100644 --- a/cpp/src/mmap.cpp +++ b/cpp/src/mmap.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -282,8 +283,12 @@ MmapHandle::MmapHandle(std::string const& file_path, _file_size = get_file_size(_file_wrapper.fd()); if (_file_size == 0) { return; } - KVIKIO_EXPECT( - _initial_map_offset < _file_size, "Offset is past the end of file", std::out_of_range); + { + std::stringstream ss; + ss << "Offset must be less than the file size. initial_map_offset: " << _initial_map_offset + << ", file size: " << _file_size << "\n"; + KVIKIO_EXPECT(_initial_map_offset < _file_size, ss.str(), std::out_of_range); + } // An initial size of std::nullopt is a shorthand for "starting from _initial_map_offset to the // end of file". 
@@ -292,9 +297,14 @@ MmapHandle::MmapHandle(std::string const& file_path, KVIKIO_EXPECT( _initial_map_size > 0, "Mapped region should not be zero byte", std::invalid_argument); - KVIKIO_EXPECT(_initial_map_offset + _initial_map_size <= _file_size, - "Mapped region is past the end of file", - std::out_of_range); + + { + std::stringstream ss; + ss << "Mapped region is past the end of file. initial map offset: " << _initial_map_offset + << ", initial map size: " << _initial_map_size << ", file size: " << _file_size << "\n"; + KVIKIO_EXPECT( + _initial_map_offset + _initial_map_size <= _file_size, ss.str(), std::out_of_range); + } auto const page_size = get_page_size(); _map_offset = detail::align_down(_initial_map_offset, page_size); @@ -401,6 +411,7 @@ std::size_t MmapHandle::read(void* buf, std::optional size, std::si KVIKIO_NVTX_FUNC_RANGE(); auto actual_size = validate_and_adjust_read_args(size, offset); + if (actual_size == 0) { return actual_size; } auto const is_dst_buf_host_mem = is_host_memory(buf); CUcontext ctx{}; @@ -420,6 +431,7 @@ std::future MmapHandle::pread(void* buf, KVIKIO_EXPECT(task_size <= defaults::bounce_buffer_size(), "bounce buffer size cannot be less than task size."); auto actual_size = validate_and_adjust_read_args(size, offset); + if (actual_size == 0) { return make_ready_future(actual_size); } auto& [nvtx_color, call_idx] = detail::get_next_color_and_call_idx(); KVIKIO_NVTX_FUNC_RANGE(actual_size, nvtx_color); @@ -454,14 +466,27 @@ std::future MmapHandle::pread(void* buf, std::size_t MmapHandle::validate_and_adjust_read_args(std::optional const& size, std::size_t offset) { - KVIKIO_EXPECT(!closed(), "Cannot read from a closed MmapHandle", std::runtime_error); - KVIKIO_EXPECT(offset < _file_size, "Offset is past the end of file", std::out_of_range); + { + std::stringstream ss; + KVIKIO_EXPECT(!closed(), "Cannot read from a closed MmapHandle", std::runtime_error); + + ss << "Offset is past the end of file. offset: " << offset << ", file size: " << _file_size + << "\n"; + KVIKIO_EXPECT(offset <= _file_size, ss.str(), std::out_of_range); + } + auto actual_size = size.has_value() ? size.value() : _file_size - offset; - KVIKIO_EXPECT(actual_size > 0, "Read size must be greater than 0", std::invalid_argument); - KVIKIO_EXPECT(offset >= _initial_map_offset && - offset + actual_size <= _initial_map_offset + _initial_map_size, - "Read is out of bound", - std::out_of_range); + + { + std::stringstream ss; + ss << "Read is out of bound. 
offset: " << offset << ", actual size to read: " << actual_size + << ", initial map offset: " << _initial_map_offset + << ", initial map size: " << _initial_map_size << "\n"; + KVIKIO_EXPECT(offset >= _initial_map_offset && + offset + actual_size <= _initial_map_offset + _initial_map_size, + ss.str(), + std::out_of_range); + } return actual_size; } diff --git a/cpp/tests/test_mmap.cpp b/cpp/tests/test_mmap.cpp index b52730ee74..2de89c5cd7 100644 --- a/cpp/tests/test_mmap.cpp +++ b/cpp/tests/test_mmap.cpp @@ -107,8 +107,9 @@ TEST_F(MmapTest, constructor_invalid_range) ThrowsMessage(HasSubstr("Mapped region is past the end of file"))); // init_file_offset is too large (by 1 char) - EXPECT_THAT([=] { kvikio::MmapHandle(_filepath, "r", std::nullopt, _file_size); }, - ThrowsMessage(HasSubstr("Offset is past the end of file"))); + EXPECT_THAT( + [=] { kvikio::MmapHandle(_filepath, "r", std::nullopt, _file_size); }, + ThrowsMessage(HasSubstr("Offset must be less than the file size"))); // init_size is 0 EXPECT_THAT( @@ -134,29 +135,36 @@ TEST_F(MmapTest, read_invalid_range) std::size_t const initial_file_offset{512}; std::vector out_host_buf(_file_size / sizeof(value_type), {}); - // file_offset is too large + // Right bound is too large EXPECT_THAT( [&] { kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); mmap_handle.read(out_host_buf.data(), initial_map_size, _file_size); }, - ThrowsMessage(HasSubstr("Offset is past the end of file"))); + ThrowsMessage(HasSubstr("Read is out of bound"))); - // file_offset is too small + // Left bound is too large EXPECT_THAT( [&] { kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); - mmap_handle.read(out_host_buf.data(), initial_map_size, initial_file_offset - 128); + mmap_handle.read(out_host_buf.data(), 0, initial_file_offset + initial_map_size + 1); }, ThrowsMessage(HasSubstr("Read is out of bound"))); - // size is 0 + EXPECT_THAT( + [&] { + kvikio::MmapHandle mmap_handle(_filepath, "r"); + mmap_handle.read(out_host_buf.data(), 0, _file_size + 1); + }, + ThrowsMessage(HasSubstr("Offset is past the end of file"))); + + // Left bound is too small EXPECT_THAT( [&] { kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); - mmap_handle.read(out_host_buf.data(), 0, initial_file_offset); + mmap_handle.read(out_host_buf.data(), initial_map_size, initial_file_offset - 128); }, - ThrowsMessage(HasSubstr("Read size must be greater than 0"))); + ThrowsMessage(HasSubstr("Read is out of bound"))); // size is too large EXPECT_THAT( @@ -167,6 +175,24 @@ TEST_F(MmapTest, read_invalid_range) ThrowsMessage(HasSubstr("Read is out of bound"))); } +TEST_F(MmapTest, read_valid_range) +{ + std::size_t const initial_map_size{1024}; + std::size_t const initial_file_offset{512}; + std::vector out_host_buf(_file_size / sizeof(value_type), {}); + + // size is 0 + EXPECT_NO_THROW({ + kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); + mmap_handle.read(out_host_buf.data(), 0, initial_file_offset + initial_map_size); + }); + + EXPECT_NO_THROW({ + kvikio::MmapHandle mmap_handle(_filepath, "r"); + mmap_handle.read(out_host_buf.data(), 0, _file_size); + }); +} + TEST_F(MmapTest, read_seq) { auto do_test = [&](std::size_t num_elements_to_skip, std::size_t num_elements_to_read) { From 003d6711f08b2b2bc8ae0c227cb67d92c0a03980 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 25 Jul 2025 08:34:14 -0700 Subject: [PATCH 04/40] Use GCC 14 in conda 
builds. (#756) conda-forge is migrating to gcc 14, so this PR is updating for alignment. See https://github.com/rapidsai/build-planning/issues/188 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/kvikio/pull/756 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-129_arch-x86_64.yaml | 2 +- conda/recipes/kvikio/conda_build_config.yaml | 4 ++-- conda/recipes/libkvikio/recipe.yaml | 2 +- dependencies.yaml | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 3f0a88d8b6..45a7d1297c 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -15,7 +15,7 @@ dependencies: - cxx-compiler - cython>=3.0.0 - doxygen=1.9.1 -- gcc_linux-aarch64=13.* +- gcc_linux-aarch64=14.* - libcufile-dev - libcurl>=8.5.0,<9.0a0 - libnuma diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 48a3f925f8..4ade45d196 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cxx-compiler - cython>=3.0.0 - doxygen=1.9.1 -- gcc_linux-64=13.* +- gcc_linux-64=14.* - libcufile-dev - libcurl>=8.5.0,<9.0a0 - libnuma diff --git a/conda/recipes/kvikio/conda_build_config.yaml b/conda/recipes/kvikio/conda_build_config.yaml index c831f4c3cb..2694d62eac 100644 --- a/conda/recipes/kvikio/conda_build_config.yaml +++ b/conda/recipes/kvikio/conda_build_config.yaml @@ -1,8 +1,8 @@ c_compiler_version: - - 13 + - 14 cxx_compiler_version: - - 13 + - 14 cmake_version: - ">=3.30.4" diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 6d6316593e..55ddc6fcaf 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -10,7 +10,7 @@ context: # 3. 
Linux aarch64 with CUDA < 12.2, which does not use libcufile # Each case has different cuda-version constraints as expressed below should_use_cufile: ${{ x86_64 or (aarch64 and cuda_version >= "12.2") }} - c_compiler_version: ${{ 13 if should_use_cufile else 12 }} + c_compiler_version: ${{ 14 if should_use_cufile else 12 }} cxx_compiler_version: ${{ c_compiler_version }} # When reverting, instances of cuda_key_string can be replaced with cuda_major cuda_key_string: ${{ cuda_version | replace(".", "_") }} diff --git a/dependencies.yaml b/dependencies.yaml index 589715cdef..9b7b899780 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -133,13 +133,13 @@ dependencies: arch: x86_64 cuda: "12.*" packages: - - gcc_linux-64=13.* + - gcc_linux-64=14.* - sysroot_linux-64=2.28 - matrix: arch: aarch64 cuda: "12.*" packages: - - gcc_linux-aarch64=13.* + - gcc_linux-aarch64=14.* - sysroot_linux-aarch64=2.28 - output_types: conda matrices: From 074758eb31725660217c67b6caddf412512ea544 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 25 Jul 2025 13:58:04 -0400 Subject: [PATCH 05/40] Update build infra to support new branching strategy (#776) rapids_config will use `RAPIDS_BRANCH` contents to determine what branch to use Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/776 --- RAPIDS_BRANCH | 1 + cmake/RAPIDS.cmake | 6 +++--- cmake/rapids_config.cmake | 10 ++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 RAPIDS_BRANCH diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH new file mode 100644 index 0000000000..9b1c52d941 --- /dev/null +++ b/RAPIDS_BRANCH @@ -0,0 +1 @@ +branch-25.10 diff --git a/cmake/RAPIDS.cmake b/cmake/RAPIDS.cmake index d112951d3c..40de7cefcd 100644 --- a/cmake/RAPIDS.cmake +++ b/cmake/RAPIDS.cmake @@ -18,9 +18,9 @@ cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) # Allow users to control which version is used -if(NOT rapids-cmake-version OR NOT rapids-cmake-version MATCHES [[^([0-9][0-9])\.([0-9][0-9])$]]) +if(NOT rapids-cmake-branch OR NOT rapids-cmake-version) message( - FATAL_ERROR "The CMake variable rapids-cmake-version must be defined in the format MAJOR.MINOR." + FATAL_ERROR "The CMake variable `rapids-cmake-branch` or `rapids-cmake-version` must be defined" ) endif() @@ -33,7 +33,7 @@ endif() # Allow users to control which branch is fetched if(NOT rapids-cmake-branch) # Define a default branch if the user doesn't set one - set(rapids-cmake-branch "branch-${rapids-cmake-version}") + set(rapids-cmake-branch "release/${rapids-cmake-version}") endif() # Allow users to control the exact URL passed to FetchContent diff --git a/cmake/rapids_config.cmake b/cmake/rapids_config.cmake index abe468dce8..b706c926e7 100644 --- a/cmake/rapids_config.cmake +++ b/cmake/rapids_config.cmake @@ -26,5 +26,15 @@ else() ) endif() +# Use STRINGS to trim whitespace/newlines +file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/../RAPIDS_BRANCH" _rapids_branch) +if(NOT _rapids_branch) + message( + FATAL_ERROR + "Could not determine branch name to use for checking out rapids-cmake. The file \"${CMAKE_CURRENT_LIST_DIR}/../RAPIDS_BRANCH\" is missing." 
+ ) +endif() + set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") +set(rapids-cmake-branch "${_rapids_branch}") include("${CMAKE_CURRENT_LIST_DIR}/RAPIDS.cmake") From a1bd3eaed2297d615d8377a5ae8c99cebcf2c033 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 25 Jul 2025 11:38:49 -0700 Subject: [PATCH 06/40] Revert "Set compiler versions in context (#755)" (#784) Closes https://github.com/rapidsai/kvikio/issues/773 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/784 --- ci/build_cpp.sh | 32 +++++++++++++++++++++++------ conda/recipes/libkvikio/recipe.yaml | 2 -- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index b12ed048ad..700ee25707 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -26,13 +26,33 @@ export RAPIDS_ARTIFACTS_DIR source rapids-rattler-channel-string # Construct the extra variants according to the architecture -cat > variants.yaml << EOF -cuda_version: - - ${RAPIDS_CUDA_VERSION%.*} +if [[ "$(arch)" == "x86_64" ]]; then + cat > variants.yaml << EOF + c_compiler_version: + - 14 + + cxx_compiler_version: + - 14 + + cuda_version: + - ${RAPIDS_CUDA_VERSION%.*} EOF -if [[ "$(arch)" == "aarch64" ]]; then - cat >> variants.yaml << EOF - - 12.1 # The last version to not support cufile +else + cat > variants.yaml << EOF + zip_keys: + - [c_compiler_version, cxx_compiler_version, cuda_version] + + c_compiler_version: + - 12 + - 14 + + cxx_compiler_version: + - 12 + - 14 + + cuda_version: + - 12.1 # The last version to not support cufile + - ${RAPIDS_CUDA_VERSION%.*} EOF fi diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 55ddc6fcaf..a4e423d3ed 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -10,8 +10,6 @@ context: # 3. Linux aarch64 with CUDA < 12.2, which does not use libcufile # Each case has different cuda-version constraints as expressed below should_use_cufile: ${{ x86_64 or (aarch64 and cuda_version >= "12.2") }} - c_compiler_version: ${{ 14 if should_use_cufile else 12 }} - cxx_compiler_version: ${{ c_compiler_version }} # When reverting, instances of cuda_key_string can be replaced with cuda_major cuda_key_string: ${{ cuda_version | replace(".", "_") }} #cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} From 7bfaf3482505d305c04239ed08e62ef5adf9e05e Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 25 Jul 2025 16:06:06 -0400 Subject: [PATCH 07/40] Use C++20 standard (#749) This PR changes KvikIO C++ standard from 17 to 20. Depends on https://github.com/rapidsai/kvikio/pull/751 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/749 --- cpp/CMakeLists.txt | 5 +++++ cpp/benchmarks/CMakeLists.txt | 9 ++++++++- cpp/tests/CMakeLists.txt | 6 ++---- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 37d237e0e3..53351393ee 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -29,6 +29,11 @@ project( set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/") +# For now, disable CMake's automatic module scanning for C++ files. 
There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. +set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + # Write the version header rapids_cmake_write_version_file(include/kvikio/version_config.hpp) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 665d45edfb..fc857fdfdf 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -42,7 +42,14 @@ function(kvikio_add_benchmark) endif() add_executable(${_KVIKIO_NAME} ${_KVIKIO_SOURCES}) - set_target_properties(${_KVIKIO_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib") + set_target_properties( + ${_KVIKIO_NAME} + PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + ) target_link_libraries(${_KVIKIO_NAME} PUBLIC benchmark::benchmark kvikio::kvikio) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a6b8391928..3d53bbd86f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -50,11 +50,9 @@ function(kvikio_add_test) set_target_properties( ${_KVIKIO_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" - CXX_STANDARD 17 + CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON - # For std:: support of __int128_t. Can be removed once using cuda::std - CXX_EXTENSIONS ON - CUDA_STANDARD 17 + CUDA_STANDARD 20 CUDA_STANDARD_REQUIRED ON ) target_link_libraries( From c9935d144f56e5b2d2c3557db0417ce2c1e9207c Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 28 Jul 2025 11:02:57 -0400 Subject: [PATCH 08/40] Fix C++20 warning in the mmap test (#785) The unit tests of mmap contain lambda expressions. The style of capturing the current object (`*this`) is not consistent: some places use `[&]` and others use `[=]`. In both cases, `*this` is captured by reference. However, in C++20, implicit capture of `*this` when the capture default is `=` is deprecated. This PR fixes the warning messages by consistently using `[&]` on the ground that the lifetime of `*this` is longer than the point the closure is being called. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/785 --- cpp/tests/test_mmap.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/tests/test_mmap.cpp b/cpp/tests/test_mmap.cpp index 2de89c5cd7..7fb1e3a4fd 100644 --- a/cpp/tests/test_mmap.cpp +++ b/cpp/tests/test_mmap.cpp @@ -71,7 +71,7 @@ TEST_F(MmapTest, invalid_file_open_flag) { // Empty file open flag EXPECT_THAT( - [=] { + [&] { { kvikio::MmapHandle(_filepath, ""); } @@ -80,7 +80,7 @@ TEST_F(MmapTest, invalid_file_open_flag) // Invalid file open flag EXPECT_THAT( - [=] { + [&] { { kvikio::MmapHandle(_filepath, "z"); } @@ -91,7 +91,7 @@ TEST_F(MmapTest, invalid_file_open_flag) TEST_F(MmapTest, invalid_mmap_flag) { EXPECT_THAT( - [=] { + [&] { { int invalid_flag{-1}; kvikio::MmapHandle(_filepath, "r", std::nullopt, 0, kvikio::FileHandle::m644, invalid_flag); @@ -108,12 +108,12 @@ TEST_F(MmapTest, constructor_invalid_range) // init_file_offset is too large (by 1 char) EXPECT_THAT( - [=] { kvikio::MmapHandle(_filepath, "r", std::nullopt, _file_size); }, + [&] { kvikio::MmapHandle(_filepath, "r", std::nullopt, _file_size); }, ThrowsMessage(HasSubstr("Offset must be less than the file size"))); // init_size is 0 EXPECT_THAT( - [=] { kvikio::MmapHandle(_filepath, "r", 0); }, + [&] { kvikio::MmapHandle(_filepath, "r", 0); }, ThrowsMessage(HasSubstr("Mapped region should not be zero byte"))); } From 71c7638ef47cf137839385b8f5875e403d7dd4bf Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 31 Jul 2025 11:34:48 -0400 Subject: [PATCH 09/40] Support file-backed mapping (2/n): host/device read Python binding (#742) On top of https://github.com/rapidsai/kvikio/pull/740, this PR provides Python binding for file-backed memory mapping. Closes https://github.com/rapidsai/kvikio/issues/530 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Tom Augspurger (https://github.com/TomAugspurger) - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/742 --- cpp/src/mmap.cpp | 37 ++--- docs/source/api.rst | 7 + python/kvikio/kvikio/__init__.py | 2 + python/kvikio/kvikio/_lib/CMakeLists.txt | 2 +- python/kvikio/kvikio/_lib/defaults.pyx | 1 - python/kvikio/kvikio/_lib/file_handle.pyx | 7 +- python/kvikio/kvikio/_lib/mmap.pyx | 116 +++++++++++++ python/kvikio/kvikio/mmap.py | 171 ++++++++++++++++++++ python/kvikio/tests/test_mmap.py | 188 ++++++++++++++++++++++ 9 files changed, 505 insertions(+), 26 deletions(-) create mode 100644 python/kvikio/kvikio/_lib/mmap.pyx create mode 100644 python/kvikio/kvikio/mmap.py create mode 100644 python/kvikio/tests/test_mmap.py diff --git a/cpp/src/mmap.cpp b/cpp/src/mmap.cpp index 671340cb09..968b96aa42 100644 --- a/cpp/src/mmap.cpp +++ b/cpp/src/mmap.cpp @@ -274,13 +274,25 @@ MmapHandle::MmapHandle(std::string const& file_path, std::size_t initial_map_offset, mode_t mode, std::optional map_flags) - : _initial_map_offset(initial_map_offset), - _initialized{true}, - _file_wrapper(file_path, flags, false /* o_direct */, mode) + : _initial_map_offset(initial_map_offset), _initialized{true} { KVIKIO_NVTX_FUNC_RANGE(); - _file_size = get_file_size(_file_wrapper.fd()); + switch (flags[0]) { + case 'r': { + _map_protection = PROT_READ; + break; + } + case 'w': { + KVIKIO_FAIL("File-backed mmap write is not supported yet", std::invalid_argument); + } + default: { + KVIKIO_FAIL("Unknown file open flag", std::invalid_argument); + } + } + + _file_wrapper = FileWrapper(file_path, flags, false /* o_direct */, mode); + _file_size = get_file_size(_file_wrapper.fd()); if (_file_size == 0) { return; } { @@ -310,22 +322,7 @@ MmapHandle::MmapHandle(std::string const& file_path, _map_offset = detail::align_down(_initial_map_offset, page_size); auto const offset_delta = _initial_map_offset - _map_offset; _map_size = _initial_map_size + offset_delta; - - switch (flags[0]) { - case 'r': { - _map_protection = PROT_READ; - break; - } - case 'w': { - KVIKIO_FAIL("File-backed mmap write is not supported yet", std::invalid_argument); - } - default: { - KVIKIO_FAIL("Unknown file open flag", std::invalid_argument); - } - } - - _map_flags = map_flags.has_value() ? map_flags.value() : MAP_PRIVATE; - + _map_flags = map_flags.has_value() ? map_flags.value() : MAP_PRIVATE; _map_addr = mmap(nullptr, _map_size, _map_protection, _map_flags, _file_wrapper.fd(), _map_offset); SYSCALL_CHECK(_map_addr, "Cannot create memory mapping", MAP_FAILED); diff --git a/docs/source/api.rst b/docs/source/api.rst index 1e19f12bdc..7ae724dec5 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -33,6 +33,13 @@ CuFile driver .. autofunction:: initialize +Mmap +---- +.. currentmodule:: kvikio.mmap + +.. autoclass:: Mmap + :members: + Zarr ---- .. 
currentmodule:: kvikio.zarr diff --git a/python/kvikio/kvikio/__init__.py b/python/kvikio/kvikio/__init__.py index 9208d4e3ce..5c6d8b6353 100644 --- a/python/kvikio/kvikio/__init__.py +++ b/python/kvikio/kvikio/__init__.py @@ -15,6 +15,7 @@ from kvikio._lib.defaults import CompatMode # noqa: F401 from kvikio._version import __git_commit__, __version__ from kvikio.cufile import CuFile, clear_page_cache, get_page_cache_info +from kvikio.mmap import Mmap from kvikio.remote_file import RemoteFile, is_remote_file_available __all__ = [ @@ -22,6 +23,7 @@ "__version__", "clear_page_cache", "CuFile", + "Mmap", "get_page_cache_info", "is_remote_file_available", "RemoteFile", diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index 1ea9b85dff..7dfb47cc0f 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -14,7 +14,7 @@ # Set the list of Cython files to build, one .so per file set(cython_modules arr.pyx buffer.pyx defaults.pyx cufile_driver.pyx file_handle.pyx future.pyx - libnvcomp.pyx libnvcomp_ll.pyx + libnvcomp.pyx libnvcomp_ll.pyx mmap.pyx ) if(KvikIO_REMOTE_SUPPORT) diff --git a/python/kvikio/kvikio/_lib/defaults.pyx b/python/kvikio/kvikio/_lib/defaults.pyx index 00f1de4ec1..de511211e6 100644 --- a/python/kvikio/kvikio/_lib/defaults.pyx +++ b/python/kvikio/kvikio/_lib/defaults.pyx @@ -37,7 +37,6 @@ cdef extern from "" namespace "kvikio" nogil: vector[int] cpp_http_status_codes "kvikio::defaults::http_status_codes"() except + void cpp_set_http_status_codes \ "kvikio::defaults::set_http_status_codes"(vector[int] status_codes) except + - long cpp_http_timeout "kvikio::defaults::http_timeout"() except + void cpp_set_http_timeout\ "kvikio::defaults::set_http_timeout"(long timeout_seconds) except + diff --git a/python/kvikio/kvikio/_lib/file_handle.pyx b/python/kvikio/kvikio/_lib/file_handle.pyx index c0d71f36a7..b17d283433 100644 --- a/python/kvikio/kvikio/_lib/file_handle.pyx +++ b/python/kvikio/kvikio/_lib/file_handle.pyx @@ -6,7 +6,6 @@ import io import os -import pathlib from typing import Optional, Union from posix cimport fcntl @@ -97,8 +96,8 @@ cdef class CuFile: def __init__(self, file_path, flags="r"): self._handle = move( FileHandle( - str.encode(str(pathlib.Path(file_path))), - str.encode(str(flags)) + os.fsencode(file_path), + str(flags).encode() ) ) @@ -194,7 +193,7 @@ def get_page_cache_info(file: Union[os.PathLike, str, int, io.IOBase]) \ -> tuple[int, int]: if isinstance(file, os.PathLike) or isinstance(file, str): # file is a path or a string object - path_bytes = str(pathlib.Path(file)).encode() + path_bytes = os.fsencode(file) return cpp_get_page_cache_info_str(path_bytes) elif isinstance(file, int): # file is a file descriptor diff --git a/python/kvikio/kvikio/_lib/mmap.pyx b/python/kvikio/kvikio/_lib/mmap.pyx new file mode 100644 index 0000000000..ac4889b25c --- /dev/null +++ b/python/kvikio/kvikio/_lib/mmap.pyx @@ -0,0 +1,116 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
+ +# distutils: language = c++ +# cython: language_level=3 + +import os +from typing import Any, Optional + +from posix cimport fcntl, stat + +from libc.stdint cimport uintptr_t +from libcpp cimport bool +from libcpp.optional cimport nullopt, optional +from libcpp.string cimport string +from libcpp.utility cimport move, pair + +from kvikio._lib.arr cimport parse_buffer_argument +from kvikio._lib.future cimport IOFuture, _wrap_io_future, future + +from kvikio._lib import defaults + + +cdef extern from "" namespace "kvikio" nogil: + cdef cppclass CppMmapHandle "kvikio::MmapHandle": + CppMmapHandle() noexcept + CppMmapHandle(string file_path, string flags, optional[size_t] initial_map_size, + size_t initial_map_offset, fcntl.mode_t mode, + optional[int] map_flags) except + + size_t initial_map_size() noexcept + size_t initial_map_offset() noexcept + size_t file_size() except + + void close() noexcept + bool closed() noexcept + size_t read(void* buf, optional[size_t] size, size_t offset) except + + future[size_t] pread(void* buf, optional[size_t] size, size_t offset, + size_t task_size) except + + +cdef class InternalMmapHandle: + cdef CppMmapHandle _handle + + def __init__(self, file_path: os.PathLike, + flags: str = "r", + initial_map_size: Optional[int] = None, + initial_map_offset: int = 0, + mode: int = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH, + map_flags: Optional[int] = None): + if not os.path.exists(file_path): + raise RuntimeError("Unable to open file") + + cdef optional[size_t] cpp_initial_map_size + if initial_map_size is None: + cpp_initial_map_size = nullopt + else: + cpp_initial_map_size = (initial_map_size) + + path_bytes = os.fsencode(file_path) + flags_bytes = str(flags).encode() + + cdef optional[int] cpp_map_flags + if map_flags is None: + cpp_map_flags = nullopt + else: + cpp_map_flags = (map_flags) + + self._handle = move(CppMmapHandle(path_bytes, + flags_bytes, + cpp_initial_map_size, + initial_map_offset, + mode, + cpp_map_flags)) + + def initial_map_size(self) -> int: + return self._handle.initial_map_size() + + def initial_map_offset(self) -> int: + return self._handle.initial_map_offset() + + def file_size(self) -> int: + return self._handle.file_size() + + def close(self) -> None: + self._handle.close() + + def closed(self) -> bool: + return self._handle.closed() + + def read(self, buf: Any, size: Optional[int] = None, offset: int = 0) -> int: + cdef optional[size_t] cpp_size + if size is None: + cpp_size = nullopt + else: + cpp_size = (size) + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + return self._handle.read(info.first, + cpp_size, + offset) + + def pread(self, buf: Any, size: Optional[int] = None, offset: int = 0, + task_size: Optional[int] = None) -> IOFuture: + cdef optional[size_t] cpp_size + if size is None: + cpp_size = nullopt + else: + cpp_size = (size) + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + + if task_size is None: + cpp_task_size = defaults.task_size() + else: + cpp_task_size = task_size + + return _wrap_io_future(self._handle.pread(info.first, + cpp_size, + offset, + cpp_task_size)) diff --git a/python/kvikio/kvikio/mmap.py b/python/kvikio/kvikio/mmap.py new file mode 100644 index 0000000000..3b506ded02 --- /dev/null +++ b/python/kvikio/kvikio/mmap.py @@ -0,0 +1,171 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
+ +import os +import stat +from typing import Any, Optional + +from kvikio._lib.mmap import InternalMmapHandle +from kvikio.cufile import IOFuture + + +class Mmap: + """Handle of a memory-mapped file""" + + def __init__( + self, + file_path: os.PathLike, + flags: str = "r", + initial_map_size: Optional[int] = None, + initial_map_offset: int = 0, + mode: int = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH, + map_flags: Optional[int] = None, + ): + """Construct a new memory-mapped file handle + + Parameters + ---------- + file_path : os.PathLike + File path. + flags : str, optional + + - ``r``: Open for reading (default) + - ``w``: (Not implemented yet) Open for writing, truncating the file first + - ``a``: (Not implemented yet) Open for writing, appending to the end of + file if it exists + - ``+``: (Not implemented yet) Open for updating (reading and writing) + initial_map_size : int, optional + Size in bytes of the mapped region. If not specified, map the region + starting from ``initial_map_offset`` to the end of file. + initial_map_offset : int, optional + File offset of the mapped region. Default is 0. + mode : int, optional + Access mode (permissions) to use if creating a new file. Default is + 0644 (octal), 420 (decimal). + map_flags : int, optional + Flags to be passed to the system call ``mmap``. See `mmap(2)` for details. + """ + self._handle = InternalMmapHandle( + file_path, flags, initial_map_size, initial_map_offset, mode, map_flags + ) + + def initial_map_size(self) -> int: + """Size in bytes of the mapped region when the mapping handle was constructed + + Returns + ------- + int + Initial size of the mapped region. + """ + return self._handle.initial_map_size() + + def initial_map_offset(self) -> int: + """File offset of the mapped region when the mapping handle was constructed + + Returns + ------- + int + Initial file offset of the mapped region. + """ + return self._handle.initial_map_offset() + + def file_size(self) -> int: + """Get the file size if the file is open + + Returns 0 if the file is closed. + + Returns + ------- + int + The file size in bytes. + """ + return self._handle.file_size() + + def close(self) -> None: + """Close the mapping handle if it is open; do nothing otherwise + + Unmaps the memory region and closes the underlying file descriptor. + """ + self._handle.close() + + def closed(self) -> bool: + """Whether the mapping handle is closed + + Returns + ------- + bool + Boolean answer. + """ + return self._handle.closed() + + def read(self, buf: Any, size: Optional[int] = None, offset: int = 0) -> int: + """Sequential read ``size`` bytes from the file to the destination buffer + ``buf`` + + Parameters + ---------- + buf : buffer-like or array-like + Address of the host or device memory (destination buffer). + size : int, optional + Size in bytes to read. If not specified, read starts from ``offset`` + to the end of file. + offset : int, optional + File offset. Default is 0. + + Returns + ------- + int + Number of bytes that have been read. + + Raises + ------ + IndexError + If the read region specified by ``offset`` and ``size`` is outside the + initial region specified when the mapping handle was constructed. + RuntimeError + If the mapping handle is closed. 
+ """ + return self._handle.read(buf, size, offset) + + def pread( + self, + buf: Any, + size: Optional[int] = None, + offset: int = 0, + task_size: Optional[int] = None, + ) -> IOFuture: + """Parallel read ``size`` bytes from the file to the destination buffer ``buf`` + + Parameters + ---------- + buf : buffer-like or array-like + Address of the host or device memory (destination buffer). + size : int, optional + Size in bytes to read. If not specified, read starts from ``offset`` + to the end of file. + offset : int, optional + File offset. Default is 0. + task_size : int, optional + Size of each task in bytes for parallel execution. If None, uses + the default task size from :func:`kvikio.defaults.task_size`. + + Returns + ------- + IOFuture + Future that on completion returns the size of bytes that were successfully + read. + + Raises + ------ + IndexError + If the read region specified by ``offset`` and ``size`` is outside the + initial region specified when the mapping handle was constructed. + RuntimeError + If the mapping handle is closed. + + Notes + ----- + The returned IOFuture object's ``get()`` should not be called after the lifetime + of the MmapHandle object ends. Otherwise, the behavior is undefined. + """ + return IOFuture(self._handle.pread(buf, size, offset, task_size)) diff --git a/python/kvikio/tests/test_mmap.py b/python/kvikio/tests/test_mmap.py new file mode 100644 index 0000000000..6ad7468690 --- /dev/null +++ b/python/kvikio/tests/test_mmap.py @@ -0,0 +1,188 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import contextlib +import os + +import pytest + +import kvikio.defaults + +cupy = pytest.importorskip("cupy") +numpy = pytest.importorskip("numpy") + + +def test_no_file(tmp_path): + nonexistent_file = tmp_path / "nonexistent_file" + with pytest.raises(RuntimeError, match=r".*Unable to open file.*"): + kvikio.Mmap(nonexistent_file) + + +def test_invalid_file_open_flag(tmp_path): + filename = tmp_path / "read-only-test-file" + expected_data = numpy.arange(1024) + expected_data.tofile(filename) + + with pytest.raises(ValueError, match=r".*Unknown file open flag.*"): + kvikio.Mmap(filename, "") + + with pytest.raises(ValueError, match=r".*Unknown file open flag.*"): + kvikio.Mmap(filename, "z") + + +def test_constructor_invalid_range(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + + with pytest.raises(IndexError, match=r".*Offset must be less than the file size.*"): + kvikio.Mmap(filename, "r", None, test_data.nbytes * 2) + + with pytest.raises(IndexError, match=r".*Mapped region is past the end of file.*"): + kvikio.Mmap(filename, "r", test_data.nbytes * 2) + + with pytest.raises(ValueError, match=r".*Mapped region should not be zero byte.*"): + kvikio.Mmap(filename, "r", 0) + + +def test_read_invalid_range(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + output_data = xp.zeros_like(test_data) + + initial_size = 1024 + initial_file_offset = 512 + + with pytest.raises(IndexError, match=r".*Offset is past the end of file.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size, test_data.nbytes + 1) + + with pytest.raises(IndexError, match=r".*Read is out of bound.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, 
initial_size, initial_file_offset + 1) + + with pytest.raises(IndexError, match=r".*Read is out of bound.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size, initial_file_offset - 128) + + with pytest.raises(IndexError, match=r".*Read is out of bound.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size + 128, initial_file_offset) + + +def test_read_valid_range(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + output_data = xp.zeros_like(test_data) + + initial_size = 1024 + initial_file_offset = 512 + + with contextlib.nullcontext(): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, 0, initial_file_offset) + + +@pytest.mark.parametrize("num_elements_to_read", [None, 10, 9999]) +@pytest.mark.parametrize("num_elements_to_skip", [0, 10, 100, 1000, 9999]) +def test_read_seq(tmp_path, xp, num_elements_to_read, num_elements_to_skip): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + + if num_elements_to_read is None: + initial_size = None + actual_num_elements_to_read = int( + os.path.getsize(filename) / test_data.itemsize + ) + else: + initial_size = num_elements_to_read * test_data.itemsize + actual_num_elements_to_read = num_elements_to_read + + initial_file_offset = num_elements_to_skip * test_data.itemsize + expected_data = test_data[ + num_elements_to_skip : (num_elements_to_skip + actual_num_elements_to_read) + ] + actual_data = xp.zeros_like(expected_data) + + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + read_size = mmap_handle.read(actual_data, initial_size, initial_file_offset) + + assert read_size == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + +@pytest.mark.parametrize("num_elements_to_read", [None, 10, 9999]) +@pytest.mark.parametrize("num_elements_to_skip", [0, 10, 100, 1000, 9999]) +@pytest.mark.parametrize("task_size", [1024, 12345]) +def test_read_parallel( + tmp_path, xp, num_elements_to_read, num_elements_to_skip, task_size +): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + + if num_elements_to_read is None: + initial_size = None + actual_num_elements_to_read = int( + os.path.getsize(filename) / test_data.itemsize + ) + else: + initial_size = num_elements_to_read * test_data.itemsize + actual_num_elements_to_read = num_elements_to_read + + initial_file_offset = num_elements_to_skip * test_data.itemsize + expected_data = test_data[ + num_elements_to_skip : (num_elements_to_skip + actual_num_elements_to_read) + ] + actual_data = xp.zeros_like(expected_data) + + with kvikio.defaults.set("task_size", task_size): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + fut = mmap_handle.pread( + actual_data, initial_size, initial_file_offset, task_size + ) + + assert fut.get() == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + +def test_read_with_default_arguments(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + expected_data = xp.arange(1024 * 1024) + expected_data.tofile(filename) + actual_data = xp.zeros_like(expected_data) + + # Workaround for a CI failure where defaults.task_size() is somehow 0 + # instead of 4 MiB when 
KVIKIO_TASK_SIZE is unset + with kvikio.defaults.set("task_size", 4 * 1024 * 1024): + mmap_handle = kvikio.Mmap(filename, "r") + + read_size = mmap_handle.read(actual_data) + assert read_size == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + fut = mmap_handle.pread(actual_data) + assert fut.get() == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + +def test_closed_handle(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + expected_data = xp.arange(1024 * 1024) + expected_data.tofile(filename) + actual_data = xp.zeros_like(expected_data) + + mmap_handle = kvikio.Mmap(filename, "r") + mmap_handle.close() + + assert mmap_handle.closed() + assert mmap_handle.file_size() == 0 + + with pytest.raises(RuntimeError, match=r".*Cannot read from a closed MmapHandle.*"): + mmap_handle.read(actual_data) + + with pytest.raises(RuntimeError, match=r".*Cannot read from a closed MmapHandle.*"): + mmap_handle.pread(actual_data) From 6cffae5a905644f88227bdcad35cbad695258789 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 4 Aug 2025 09:35:54 -0400 Subject: [PATCH 10/40] Support file size query for S3 presigned URL (#789) ## Background Knowing the size of the remote file before reading is important in remote I/O, as it allows users to pre-allocate buffer to avoid expensive on-the-fly reallocation. Currently in KvikIO this is not possible for AWS S3 presigned URL, which is a special link generated by data owner to grant time-limited access without using AWS credentials. As is described in #585, file size query in KvikIO results in the HTTP 403 (forbidden) status code. This is because the query method is based on the `HEAD` request, and AWS S3 does not allow `HEAD` for presigned URL. ## Proposed solution This PR provides a solution. The idea is to send a `GET` request (instead of `HEAD`) with a 1-byte range, so that we can still obtain the header information at a negligible cost. Since the `content-length` header is now at a fixed value of 1, we instead extract the file size value from `content-range`. This PR adds a new C++ endpoint `S3EndpointWithPresignedUrl` and Python API `kvikio.RemoteFile.open_s3_presigned_url(url)`. ## Result The following code now works properly without 403 error: ```python import kvikio import cupy presigned_url = "" remote_file = kvikio.RemoteFile.open_s3_presigned_url(presigned_url) print("--> file size: {:}".format(remote_file.nbytes())) buf = cupy.zeros(remote_file.nbytes() // 8) fut = remote_file.pread(buf) read_size = fut.get() print("--> read_size: {:}", read_size) print(buf) ``` ## Limitation This PR is tested manually using a presigned URL. In a future PR, we need to add unit tests using `boto`. Closes #585 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/kvikio/pull/789 --- cpp/include/kvikio/remote_handle.hpp | 40 ++++-- cpp/src/remote_handle.cpp | 139 ++++++++++++++++++-- python/kvikio/kvikio/_lib/remote_handle.pyx | 16 +++ python/kvikio/kvikio/remote_file.py | 22 ++++ 4 files changed, 196 insertions(+), 21 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 7c197ea2b0..5f11d76f3d 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -18,12 +18,8 @@ #include #include #include -#include #include #include -#include -#include -#include #include #include @@ -48,6 +44,8 @@ class CurlHandle; // Prototype */ class RemoteEndpoint { public: + virtual ~RemoteEndpoint() = default; + /** * @brief Set needed connection options on a curl handle. * @@ -64,7 +62,12 @@ class RemoteEndpoint { */ virtual std::string str() const = 0; - virtual ~RemoteEndpoint() = default; + /** + * @brief Get the size of the remote file. + * + * @return The file size + */ + virtual std::size_t get_file_size() = 0; }; /** @@ -81,9 +84,11 @@ class HttpEndpoint : public RemoteEndpoint { * @param url The full http url to the remote file. */ HttpEndpoint(std::string url); + + ~HttpEndpoint() override = default; void setopt(CurlHandle& curl) override; std::string str() const override; - ~HttpEndpoint() override = default; + std::size_t get_file_size() override; }; /** @@ -189,9 +194,27 @@ class S3Endpoint : public RemoteEndpoint { std::optional aws_endpoint_url = std::nullopt, std::optional aws_session_token = std::nullopt); + ~S3Endpoint() override; void setopt(CurlHandle& curl) override; std::string str() const override; - ~S3Endpoint() override; + std::size_t get_file_size() override; +}; + +/** + * @brief A remote endpoint using AWS's S3 protocol and expecting a presigned URL. File access via + * this type of URL is time-limited and does not require AWS credentials. + */ +class S3EndpointWithPresignedUrl : public RemoteEndpoint { + private: + std::string _url; + + public: + explicit S3EndpointWithPresignedUrl(std::string presigned_url); + + ~S3EndpointWithPresignedUrl() override = default; + void setopt(CurlHandle& curl) override; + std::string str() const override; + std::size_t get_file_size() override; }; /** @@ -229,7 +252,8 @@ class RemoteHandle { /** * @brief Get the file size. * - * Note, this is very fast, no communication needed. + * Note, the file size is retrieved at construction so this method is very fast, no communication + * needed. * * @return The number of bytes. */ diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 485e0739ac..1aba93bdc5 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -133,12 +134,46 @@ class BounceBufferH2D { } }; +/** + * @brief Get the file size, if using `HEAD` request to obtain the content-length header is + * permitted. + * + * This function works for the `HttpEndpoint` and `S3Endpoint`, but not for + * `S3EndpointWithPresignedUrl`, which does not allow `HEAD` request. 
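+ *
+ * Implementation note: the size query is issued with `CURLOPT_NOBODY` set (i.e. as a
+ * `HEAD`-style request) and the result is read back via `CURLINFO_CONTENT_LENGTH_DOWNLOAD_T`.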
+ * + * @param endpoint The remote endpoint + * @param url The URL of the remote file + * @return The file size + */ +std::size_t get_file_size_using_head_impl(RemoteEndpoint& endpoint, std::string const& url) +{ + auto curl = create_curl_handle(); + + endpoint.setopt(curl); + curl.setopt(CURLOPT_NOBODY, 1L); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + curl.perform(); + curl_off_t cl; + curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl); + KVIKIO_EXPECT( + cl >= 0, + "cannot get size of " + endpoint.str() + ", content-length not provided by the server", + std::runtime_error); + return static_cast(cl); +} + } // namespace HttpEndpoint::HttpEndpoint(std::string url) : _url{std::move(url)} {} std::string HttpEndpoint::str() const { return _url; } +std::size_t HttpEndpoint::get_file_size() +{ + KVIKIO_NVTX_FUNC_RANGE(); + return get_file_size_using_head_impl(*this, _url); +} + void HttpEndpoint::setopt(CurlHandle& curl) { KVIKIO_NVTX_FUNC_RANGE(); @@ -286,28 +321,106 @@ S3Endpoint::~S3Endpoint() { curl_slist_free_all(_curl_header_list); } std::string S3Endpoint::str() const { return _url; } -RemoteHandle::RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) - : _endpoint{std::move(endpoint)}, _nbytes{nbytes} +std::size_t S3Endpoint::get_file_size() { KVIKIO_NVTX_FUNC_RANGE(); + return get_file_size_using_head_impl(*this, _url); } -RemoteHandle::RemoteHandle(std::unique_ptr endpoint) +S3EndpointWithPresignedUrl::S3EndpointWithPresignedUrl(std::string presigned_url) + : _url{std::move(presigned_url)} +{ +} + +void S3EndpointWithPresignedUrl::setopt(CurlHandle& curl) +{ + KVIKIO_NVTX_FUNC_RANGE(); + curl.setopt(CURLOPT_URL, _url.c_str()); +} + +std::string S3EndpointWithPresignedUrl::str() const { return _url; } + +namespace { +/** + * @brief Callback for the `CURLOPT_HEADERFUNCTION` parameter in libcurl + * + * The header callback is called once for each header and only complete header lines are passed on + * to the callback. The provided header line is not null-terminated. + * + * @param data Transfer buffer where new data is received + * @param size Curl internal implementation always sets this parameter to 1 + * @param num_bytes The size of new data received + * @param userdata User-defined data + * @return The number of bytes consumed by the callback + * @exception std::invalid_argument if the server does not know the file size, thereby using "*" as + * the filler text in the content-range header of the HTTP message. + */ +std::size_t callback_header(char* data, std::size_t size, std::size_t num_bytes, void* userdata) { + auto new_data_size = size * num_bytes; + auto* file_size = reinterpret_cast(userdata); + + // The header line is not null-terminated. This constructor overload ensures header_line.data() is + // null-terminated. + std::string const header_line{data, new_data_size}; + + // The content-range header has the format + // Content-Range: / + // Content-Range: /* + // Content-Range: */ + std::regex const pattern(R"(Content-Range:[^/]+/(.*))", std::regex::icase); + std::smatch match_result; + bool found = std::regex_search(header_line, match_result, pattern); + if (found) { + // If the file size is unknown (represented by "*" in the content-range header), string-to-long + // conversion will throw an `std::invalid_argument` exception. The exception message from + // `std::stol` is usually too concise to be useful (being simply a string of "stol"), so a + // custom exception is used instead. + try { + *file_size = std::stol(match_result[1].str()); + } catch (...) 
{ + KVIKIO_FAIL("File size information missing on the server side.", std::invalid_argument); + } + } + return new_data_size; +} +} // namespace + +std::size_t S3EndpointWithPresignedUrl::get_file_size() +{ + // Usually the `HEAD` request is used to obtain the content-length (file size). However, AWS S3 + // does not allow it for presigned URL. The workaround here is to send the `GET` request with + // 1-byte range, so that we can still obtain the header information at a negligible cost. Since + // the content-length header is now at a fixed value of 1, we instead extract the file size value + // from content-range. + KVIKIO_NVTX_FUNC_RANGE(); + auto curl = create_curl_handle(); + curl.setopt(CURLOPT_URL, _url.c_str()); + + // 1-byte range, specified in the format "-"" + std::string my_range{"0-0"}; + curl.setopt(CURLOPT_RANGE, my_range.c_str()); + + long file_size{}; + curl.setopt(CURLOPT_HEADERDATA, static_cast(&file_size)); + curl.setopt(CURLOPT_HEADERFUNCTION, callback_header); - endpoint->setopt(curl); - curl.setopt(CURLOPT_NOBODY, 1L); - curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); curl.perform(); - curl_off_t cl; - curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl); - KVIKIO_EXPECT( - cl >= 0, - "cannot get size of " + endpoint->str() + ", content-length not provided by the server", - std::runtime_error); - _nbytes = cl; + return file_size; +} + +RemoteHandle::RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) + : _endpoint{std::move(endpoint)}, _nbytes{nbytes} +{ + KVIKIO_NVTX_FUNC_RANGE(); +} + +RemoteHandle::RemoteHandle(std::unique_ptr endpoint) +{ + KVIKIO_NVTX_FUNC_RANGE(); + _nbytes = endpoint->get_file_size(); _endpoint = std::move(endpoint); } diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index dfb662a9fb..0c2ae4c3e4 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -31,6 +31,10 @@ cdef extern from "" nogil: pair[string, string] cpp_parse_s3_url \ "kvikio::S3Endpoint::parse_s3_url"(string url) except + + cdef cppclass cpp_S3EndpointWithPresignedUrl "kvikio::S3EndpointWithPresignedUrl" \ + (cpp_RemoteEndpoint): + cpp_S3EndpointWithPresignedUrl(string presigned_url) except + + cdef cppclass cpp_RemoteHandle "kvikio::RemoteHandle": cpp_RemoteHandle( unique_ptr[cpp_RemoteEndpoint] endpoint, size_t nbytes @@ -141,6 +145,18 @@ cdef class RemoteFile: nbytes ) + @staticmethod + def open_s3_from_http_presigned_url( + presigned_url: str, + nbytes: Optional[int], + ): + return RemoteFile._from_endpoint( + cast_to_remote_endpoint( + make_unique[cpp_S3EndpointWithPresignedUrl](_to_string(presigned_url)) + ), + nbytes + ) + def __str__(self) -> str: cdef string ep_str = deref(self._handle).endpoint().str() return f'<{self.__class__.__name__} "{ep_str.decode()}">' diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 41ec216e5c..f06a40b45f 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -142,6 +142,28 @@ def open_s3_url( ) raise ValueError(f"Unsupported protocol: {url}") + @classmethod + def open_s3_presigned_url( + cls, + presigned_url: str, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """Open a AWS S3 file from a presigned URL. + + Parameters + ---------- + presigned_url + Presigned URL to the remote file. + nbytes + The size of the file. If None, KvikIO will ask the server + for the file size. 
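+
+        Examples
+        --------
+        A minimal sketch, mirroring the example in this PR description (the URL
+        below is a placeholder):
+
+        >>> import cupy
+        >>> import kvikio
+        >>> remote_file = kvikio.RemoteFile.open_s3_presigned_url("<presigned-url>")
+        >>> buf = cupy.empty(remote_file.nbytes(), dtype=cupy.uint8)
+        >>> fut = remote_file.pread(buf)
+        >>> nbytes_read = fut.get()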
+ """ + return RemoteFile( + _get_remote_module().RemoteFile.open_s3_from_http_presigned_url( + presigned_url, nbytes + ) + ) + def close(self) -> None: """Close the file""" pass From ac5fc39b57e4761e8e9c768bb923093967be17d1 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Mon, 4 Aug 2025 14:26:33 -0400 Subject: [PATCH 11/40] Update rapids-build-backend to 0.4.0 (#790) Issue: https://github.com/rapidsai/build-planning/issues/207 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Lawrence Mitchell (https://github.com/wence-) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/kvikio/pull/790 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-129_arch-x86_64.yaml | 2 +- conda/recipes/kvikio/recipe.yaml | 2 +- dependencies.yaml | 2 +- python/kvikio/pyproject.toml | 2 +- python/libkvikio/pyproject.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 45a7d1297c..f72a97faf3 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -33,7 +33,7 @@ dependencies: - pytest-timeout - python>=3.10,<3.14 - rangehttpserver -- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 - rapids-dask-dependency==25.10.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 4ade45d196..890bfc29d7 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -33,7 +33,7 @@ dependencies: - pytest-timeout - python>=3.10,<3.14 - rangehttpserver -- rapids-build-backend>=0.3.0,<0.4.0.dev0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 - rapids-dask-dependency==25.10.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 3a7957133e..2e9e411d9e 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -69,7 +69,7 @@ requirements: - libnvcomp-dev ${{ nvcomp_version }} - pip - python =${{ py_version }} - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - cuda-cudart-dev run: diff --git a/dependencies.yaml b/dependencies.yaml index 9b7b899780..92fdec6bd5 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -283,7 +283,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-build-backend>=0.3.0,<0.4.0.dev0 + - rapids-build-backend>=0.4.0,<0.5.0.dev0 - output_types: conda packages: - scikit-build-core>=0.10.0 diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index b8ddc09945..8baa9aa89c 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -4,7 +4,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
diff --git a/python/libkvikio/pyproject.toml b/python/libkvikio/pyproject.toml index bbbd6f2e74..43b659aec8 100644 --- a/python/libkvikio/pyproject.toml +++ b/python/libkvikio/pyproject.toml @@ -4,7 +4,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 9afce55d806f806d8883ac207c90c11e33715e15 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 7 Aug 2025 14:22:21 -0400 Subject: [PATCH 12/40] Update rapids_config to handle user defined branch name (#794) rapids_config will use a user defined branch over `RAPIDS_BRANCH` contents Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/kvikio/pull/794 --- cmake/RAPIDS.cmake | 2 +- cmake/rapids_config.cmake | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cmake/RAPIDS.cmake b/cmake/RAPIDS.cmake index 40de7cefcd..ddef819498 100644 --- a/cmake/RAPIDS.cmake +++ b/cmake/RAPIDS.cmake @@ -18,7 +18,7 @@ cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) # Allow users to control which version is used -if(NOT rapids-cmake-branch OR NOT rapids-cmake-version) +if(NOT (rapids-cmake-branch OR rapids-cmake-version)) message( FATAL_ERROR "The CMake variable `rapids-cmake-branch` or `rapids-cmake-version` must be defined" ) diff --git a/cmake/rapids_config.cmake b/cmake/rapids_config.cmake index b706c926e7..b2c54a3f27 100644 --- a/cmake/rapids_config.cmake +++ b/cmake/rapids_config.cmake @@ -35,6 +35,10 @@ if(NOT _rapids_branch) ) endif() -set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") -set(rapids-cmake-branch "${_rapids_branch}") +if(NOT rapids-cmake-version) + set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") +endif() +if(NOT rapids-cmake-branch) + set(rapids-cmake-branch "${_rapids_branch}") +endif() include("${CMAKE_CURRENT_LIST_DIR}/RAPIDS.cmake") From acada8d748b5cc1371b856c4ef3876d61fea18dc Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 7 Aug 2025 22:23:27 -0400 Subject: [PATCH 13/40] Support WebHDFS (1/2): C++ implementation (#788) ## Summary This PR adds WebHDFS support to KvikIO. The background information is available at #787. ## Limitations This PR does not address: - Idiomatic and secure URL parsing and validation - Testing on URL encoding/decoding (which means percent-decoded URL may or may not work at the moment) - Advanced authentication such as Kerberos These features will be added in the future. Partially addresses #787 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Bradley Dice (https://github.com/bdice) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/788 --- .../recipes/libkvikio/conda_build_config.yaml | 2 +- conda/recipes/libkvikio/recipe.yaml | 7 +- cpp/CMakeLists.txt | 4 +- cpp/include/kvikio/detail/remote_handle.hpp | 34 +++ cpp/include/kvikio/hdfs.hpp | 62 ++++++ cpp/include/kvikio/remote_handle.hpp | 9 + cpp/src/detail/remote_handle.cpp | 32 +++ cpp/src/hdfs.cpp | 131 ++++++++++++ cpp/src/remote_handle.cpp | 43 +++- cpp/tests/CMakeLists.txt | 7 +- cpp/tests/test_basic_io.cpp | 9 +- cpp/tests/test_hdfs.cpp | 180 ++++++++++++++++ cpp/tests/test_mmap.cpp | 11 +- cpp/tests/utils/hdfs_helper.cpp | 197 ++++++++++++++++++ cpp/tests/utils/hdfs_helper.hpp | 61 ++++++ cpp/tests/utils/utils.hpp | 16 +- 16 files changed, 771 insertions(+), 34 deletions(-) create mode 100644 cpp/include/kvikio/detail/remote_handle.hpp create mode 100644 cpp/include/kvikio/hdfs.hpp create mode 100644 cpp/src/detail/remote_handle.cpp create mode 100644 cpp/src/hdfs.cpp create mode 100644 cpp/tests/test_hdfs.cpp create mode 100644 cpp/tests/utils/hdfs_helper.cpp create mode 100644 cpp/tests/utils/hdfs_helper.hpp diff --git a/conda/recipes/libkvikio/conda_build_config.yaml b/conda/recipes/libkvikio/conda_build_config.yaml index 1149e0f9fa..b67ab5d118 100644 --- a/conda/recipes/libkvikio/conda_build_config.yaml +++ b/conda/recipes/libkvikio/conda_build_config.yaml @@ -11,4 +11,4 @@ c_stdlib_version: - "2.28" libcurl_version: - - "==8.5.0" + - "8.5.0" diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index a4e423d3ed..f4164c9611 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -65,7 +65,7 @@ cache: - ${{ stdlib("c") }} host: - cuda-version =${{ cuda_version }} - - libcurl ${{ libcurl_version }} + - libcurl ==${{ libcurl_version }} - if: should_use_cufile then: - libcufile-dev @@ -91,7 +91,7 @@ outputs: - ${{ compiler("c") }} host: - cuda-version =${{ cuda_version }} - - libcurl ${{ libcurl_version }} + - libcurl ==${{ libcurl_version }} run: - if: x86_64 then: @@ -108,7 +108,6 @@ outputs: ignore_run_exports: by_name: - cuda-version - - libcurl - if: should_use_cufile then: - libcufile @@ -138,6 +137,7 @@ outputs: - ${{ pin_subpackage("libkvikio", exact=True) }} - cuda-version =${{ cuda_version }} - cuda-cudart-dev + - libcurl ==${{ libcurl_version }} - if: should_use_cufile then: - libcufile-dev @@ -156,7 +156,6 @@ outputs: by_name: - cuda-cudart - cuda-version - - libcurl - libnuma - if: should_use_cufile then: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 53351393ee..6107a0a795 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -163,7 +163,9 @@ set(SOURCES ) if(KvikIO_REMOTE_SUPPORT) - list(APPEND SOURCES "src/remote_handle.cpp" "src/shim/libcurl.cpp") + list(APPEND SOURCES "src/hdfs.cpp" "src/remote_handle.cpp" "src/detail/remote_handle.cpp" + "src/shim/libcurl.cpp" + ) endif() add_library(kvikio ${SOURCES}) diff --git a/cpp/include/kvikio/detail/remote_handle.hpp b/cpp/include/kvikio/detail/remote_handle.hpp new file mode 100644 index 0000000000..f6bd55c4c0 --- /dev/null +++ b/cpp/include/kvikio/detail/remote_handle.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace kvikio::detail { +/** + * @brief Callback for `CURLOPT_WRITEFUNCTION` that copies received data into a `std::string`. + * + * @param data Received data + * @param size Curl internal implementation always sets this parameter to 1 + * @param num_bytes Number of bytes received + * @param userdata Must be cast from `std::string*` + * @return The number of bytes consumed by the callback + */ +std::size_t callback_get_string_response(char* data, + std::size_t size, + std::size_t num_bytes, + void* userdata); +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/hdfs.hpp b/cpp/include/kvikio/hdfs.hpp new file mode 100644 index 0000000000..0b20d658bd --- /dev/null +++ b/cpp/include/kvikio/hdfs.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace kvikio { + +/** + * @brief A remote endpoint for Apache Hadoop WebHDFS. + * + * If KvikIO is run within a Docker, the argument `--network host` needs to be passed to the `docker + * run` command. + */ +class WebHdfsEndpoint : public RemoteEndpoint { + private: + std::string _url; + std::optional _username; + + public: + /** + * @brief Create an WebHDFS endpoint from a url. + * + * @param url The WebHDFS HTTP/HTTPS url to the remote file. + */ + explicit WebHdfsEndpoint(std::string url); + + /** + * @brief Create an WebHDFS endpoint from the host, port, file path and optionally username. + * + * @param host Host + * @param port Port + * @param remote_file_path Remote file path + * @param username User name + */ + explicit WebHdfsEndpoint(std::string host, + std::string port, + std::string remote_file_path, + std::optional username = std::nullopt); + + ~WebHdfsEndpoint() override = default; + void setopt(CurlHandle& curl) override; + std::string str() const override; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; +}; +} // namespace kvikio diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 5f11d76f3d..b2e2d1d0ff 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -68,6 +68,12 @@ class RemoteEndpoint { * @return The file size */ virtual std::size_t get_file_size() = 0; + + /** + * @brief Set up the range request in order to read part of a file given the file offset and read + * size. 
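+   *
+   * @param curl The curl handle to configure
+   * @param file_offset File offset in bytes of the region to read
+   * @param size Number of bytes to read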
+ */ + virtual void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) = 0; }; /** @@ -89,6 +95,7 @@ class HttpEndpoint : public RemoteEndpoint { void setopt(CurlHandle& curl) override; std::string str() const override; std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; }; /** @@ -198,6 +205,7 @@ class S3Endpoint : public RemoteEndpoint { void setopt(CurlHandle& curl) override; std::string str() const override; std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; }; /** @@ -215,6 +223,7 @@ class S3EndpointWithPresignedUrl : public RemoteEndpoint { void setopt(CurlHandle& curl) override; std::string str() const override; std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; }; /** diff --git a/cpp/src/detail/remote_handle.cpp b/cpp/src/detail/remote_handle.cpp new file mode 100644 index 0000000000..b023859ef8 --- /dev/null +++ b/cpp/src/detail/remote_handle.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace kvikio::detail { +std::size_t callback_get_string_response(char* data, + std::size_t size, + std::size_t num_bytes, + void* userdata) +{ + auto new_data_size = size * num_bytes; + auto* response = reinterpret_cast(userdata); + response->append(data, new_data_size); + return new_data_size; +} +} // namespace kvikio::detail diff --git a/cpp/src/hdfs.cpp b/cpp/src/hdfs.cpp new file mode 100644 index 0000000000..12455b3a26 --- /dev/null +++ b/cpp/src/hdfs.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +namespace kvikio { + +WebHdfsEndpoint::WebHdfsEndpoint(std::string url) +{ + // todo: Use libcurl URL API for more secure and idiomatic parsing. + // Split the URL into two parts: one without query and one with. + std::regex const pattern{R"(^([^?]+)\?([^#]*))"}; + // Regex meaning: + // ^: From the start of the line + // [^?]+: Matches non-question-mark characters one or more times. The question mark ushers in the + // URL query component. + // \?: Matches the question mark, which needs to be escaped. 
+ // [^#]*: Matches the non-pound characters zero or more times. The pound sign ushers in the URL + // fragment component. It is very likely that this part does not exist. + std::smatch match_results; + bool found = std::regex_search(url, match_results, pattern); + // If the match is not found, the URL contains no query. + if (!found) { + _url = url; + return; + } + + _url = match_results[1].str(); + auto query = match_results[2].str(); + + { + // Extract user name if provided. In WebHDFS, user name is specified as the key=value pair in + // the query + std::regex const pattern{R"(user.name=([^&]+))"}; + // Regex meaning: + // [^&]+: Matches the non-ampersand character one or more times. The ampersand delimits + // different parameters. + std::smatch match_results; + if (std::regex_search(query, match_results, pattern)) { _username = match_results[1].str(); } + } +} + +WebHdfsEndpoint::WebHdfsEndpoint(std::string host, + std::string port, + std::string file_path, + std::optional username) + : _username{std::move(username)} +{ + std::stringstream ss; + ss << "http://" << host << ":" << port << "/webhdfs/v1" << file_path; + _url = ss.str(); +} + +std::string WebHdfsEndpoint::str() const { return _url; } + +void WebHdfsEndpoint::setopt(CurlHandle& curl) +{ + KVIKIO_NVTX_FUNC_RANGE(); + curl.setopt(CURLOPT_URL, _url.c_str()); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); +} + +std::size_t WebHdfsEndpoint::get_file_size() +{ + KVIKIO_NVTX_FUNC_RANGE(); + + std::stringstream ss; + ss << _url << "?"; + if (_username.has_value()) { ss << "user.name=" << _username.value() << "&"; } + ss << "op=GETFILESTATUS"; + + auto curl = create_curl_handle(); + curl.setopt(CURLOPT_URL, ss.str().c_str()); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + + std::string response; + curl.setopt(CURLOPT_WRITEDATA, static_cast(&response)); + curl.setopt(CURLOPT_WRITEFUNCTION, detail::callback_get_string_response); + + curl.perform(); + + long http_status_code{}; + curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code); + KVIKIO_EXPECT(http_status_code == 200, "HTTP response is not successful."); + + // The response is in JSON format. The file size is given by `"length":`. + std::regex const pattern{R"("length"\s*:\s*(\d+)[^\d])"}; + // Regex meaning: + // \s*: Matches the space character zero or more times. + // \d+: Matches the digit one or more times. + // [^\d]: Matches a non-digit character. + std::smatch match_results; + bool found = std::regex_search(response, match_results, pattern); + KVIKIO_EXPECT( + found, "Regular expression search failed. Cannot extract file length from the JSON response."); + return std::stoull(match_results[1].str()); +} + +void WebHdfsEndpoint::setup_range_request(CurlHandle& curl, + std::size_t file_offset, + std::size_t size) +{ + // WebHDFS does not support CURLOPT_RANGE. The range is specified as query parameters in the URL. 
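+  // The resulting URL has the form (illustrative):
+  //   <base-url>?user.name=<user>&op=OPEN&offset=<file_offset>&length=<size>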
+ KVIKIO_NVTX_FUNC_RANGE(); + std::stringstream ss; + ss << _url << "?"; + if (_username.has_value()) { ss << "user.name=" << _username.value() << "&"; } + ss << "op=OPEN&offset=" << file_offset << "&length=" << size; + curl.setopt(CURLOPT_URL, ss.str().c_str()); +} +} // namespace kvikio diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 1aba93bdc5..23cf5c6305 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -162,6 +163,20 @@ std::size_t get_file_size_using_head_impl(RemoteEndpoint& endpoint, std::string return static_cast(cl); } +/** + * @brief Set up the range request for libcurl. Use this method when HTTP range request is supposed. + * + * @param curl A curl handle + * @param file_offset File offset + * @param size read size + */ +void setup_range_request_impl(CurlHandle& curl, std::size_t file_offset, std::size_t size) +{ + std::string const byte_range = + std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); + curl.setopt(CURLOPT_RANGE, byte_range.c_str()); +} + } // namespace HttpEndpoint::HttpEndpoint(std::string url) : _url{std::move(url)} {} @@ -174,15 +189,15 @@ std::size_t HttpEndpoint::get_file_size() return get_file_size_using_head_impl(*this, _url); } -void HttpEndpoint::setopt(CurlHandle& curl) +void HttpEndpoint::setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) { - KVIKIO_NVTX_FUNC_RANGE(); - curl.setopt(CURLOPT_URL, _url.c_str()); + setup_range_request_impl(curl, file_offset, size); } +void HttpEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); } + void S3Endpoint::setopt(CurlHandle& curl) { - KVIKIO_NVTX_FUNC_RANGE(); curl.setopt(CURLOPT_URL, _url.c_str()); curl.setopt(CURLOPT_AWS_SIGV4, _aws_sigv4.c_str()); curl.setopt(CURLOPT_USERPWD, _aws_userpwd.c_str()); @@ -327,6 +342,12 @@ std::size_t S3Endpoint::get_file_size() return get_file_size_using_head_impl(*this, _url); } +void S3Endpoint::setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) +{ + KVIKIO_NVTX_FUNC_RANGE(); + setup_range_request_impl(curl, file_offset, size); +} + S3EndpointWithPresignedUrl::S3EndpointWithPresignedUrl(std::string presigned_url) : _url{std::move(presigned_url)} { @@ -334,7 +355,6 @@ S3EndpointWithPresignedUrl::S3EndpointWithPresignedUrl(std::string presigned_url void S3EndpointWithPresignedUrl::setopt(CurlHandle& curl) { - KVIKIO_NVTX_FUNC_RANGE(); curl.setopt(CURLOPT_URL, _url.c_str()); } @@ -411,6 +431,14 @@ std::size_t S3EndpointWithPresignedUrl::get_file_size() return file_size; } +void S3EndpointWithPresignedUrl::setup_range_request(CurlHandle& curl, + std::size_t file_offset, + std::size_t size) +{ + KVIKIO_NVTX_FUNC_RANGE(); + setup_range_request_impl(curl, file_offset, size); +} + RemoteHandle::RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) : _endpoint{std::move(endpoint)}, _nbytes{nbytes} { @@ -510,10 +538,7 @@ std::size_t RemoteHandle::read(void* buf, std::size_t size, std::size_t file_off bool const is_host_mem = is_host_memory(buf); auto curl = create_curl_handle(); _endpoint->setopt(curl); - - std::string const byte_range = - std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); - curl.setopt(CURLOPT_RANGE, byte_range.c_str()); + _endpoint->setup_range_request(curl, file_offset, size); if (is_host_mem) { curl.setopt(CURLOPT_WRITEFUNCTION, callback_host_memory); diff --git a/cpp/tests/CMakeLists.txt 
b/cpp/tests/CMakeLists.txt index 3d53bbd86f..41de4bb6fa 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -57,7 +57,7 @@ function(kvikio_add_test) ) target_link_libraries( ${_KVIKIO_NAME} PRIVATE kvikio::kvikio GTest::gmock GTest::gmock_main GTest::gtest - GTest::gtest_main CUDA::cudart + GTest::gtest_main CUDA::cudart $ ) rapids_test_add( @@ -76,6 +76,9 @@ kvikio_add_test(NAME ERROR_TEST SOURCES test_error.cpp) kvikio_add_test(NAME MMAP_TEST SOURCES test_mmap.cpp) -kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) +if(KvikIO_REMOTE_SUPPORT) + kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) + kvikio_add_test(NAME HDFS_TEST SOURCES test_hdfs.cpp utils/hdfs_helper.cpp) +endif() rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/tests/libkvikio) diff --git a/cpp/tests/test_basic_io.cpp b/cpp/tests/test_basic_io.cpp index d72ba8841f..141fe386c3 100644 --- a/cpp/tests/test_basic_io.cpp +++ b/cpp/tests/test_basic_io.cpp @@ -27,15 +27,16 @@ class BasicIOTest : public testing::Test { TempDir tmp_dir{false}; _filepath = tmp_dir.path() / "test"; - _dev_a = std::move(DevBuffer::arange(100)); - _dev_b = std::move(DevBuffer::zero_like(_dev_a)); + _dev_a = std::move(DevBuffer::arange(100)); + _dev_b = std::move(DevBuffer::zero_like(_dev_a)); } void TearDown() override {} std::filesystem::path _filepath; - DevBuffer _dev_a; - DevBuffer _dev_b; + using value_type = std::int64_t; + DevBuffer _dev_a; + DevBuffer _dev_b; }; TEST_F(BasicIOTest, write_read) diff --git a/cpp/tests/test_hdfs.cpp b/cpp/tests/test_hdfs.cpp new file mode 100644 index 0000000000..354c8c4aca --- /dev/null +++ b/cpp/tests/test_hdfs.cpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +#include "utils/hdfs_helper.hpp" +#include "utils/utils.hpp" + +// This test makes the following assumptions: +// - This KvikIO unit test is run on the "name node" of a WebHDFS server. +// - Port 9870 (default for WebHDFS) is being used to listen to the requests. +// - The environment variable `KVIKIO_USER` is specified prior to the test. It contains a valid user +// name that has been granted access to the HDFS. +// - The user has the proper permission to create a file under the `/tmp` directory on the HDFS. +// - If the unit test is run within a Docker. The following arguments are passed to the `docker run` +// command: +// - `--network host` +// - `--env KVIKIO_USER=` +// +// If any of these assumptions is not satisfied, this unit test is expected to be skipped +// gracefully. 
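+//
+// Example invocation (illustrative only; the user name and the exact test binary or
+// ctest target name depend on the local setup and are not part of this patch):
+//   KVIKIO_USER=<hdfs-user> ctest -R HDFS_TEST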
+ +using value_type = double; + +namespace kvikio::test { +struct Config { + std::size_t num_elements{1024ull * 1024ull}; + std::vector host_buf; + kvikio::test::DevBuffer dev_buf; + std::string host; + std::string port; + std::string _username; + std::string remote_file_path; + bool file_created{false}; +}; +} // namespace kvikio::test + +class WebHdfsTest : public testing::Test { + protected: + static void SetUpTestSuite() + { + config.num_elements = 1024ull * 1024ull; + config.host_buf.resize(config.num_elements); + std::iota(config.host_buf.begin(), config.host_buf.end(), 0); + + config.dev_buf = kvikio::test::DevBuffer{config.host_buf}; + + config.host = "localhost"; + config.port = "9870"; + + config.remote_file_path = "/tmp/kvikio-test-webhdfs.bin"; + + auto res = std::getenv("KVIKIO_USER"); + if (res) { + config._username = res; + } else { + GTEST_SKIP() << "Environment variable KVIKIO_USER is not set for this test."; + } + + webhdfs_helper = + std::make_unique(config.host, config.port, config._username); + + if (!webhdfs_helper->can_connect()) { + GTEST_SKIP() << "Cannot connect to WebHDFS. Skipping all tests for this fixture."; + } + + std::span buffer{reinterpret_cast(config.host_buf.data()), + config.host_buf.size() * sizeof(value_type)}; + if (!webhdfs_helper->upload_data(buffer, config.remote_file_path)) { + GTEST_SKIP() + << "Failed to upload test data using WebHDFS. Skipping all tests for this fixture."; + }; + + config.file_created = true; + } + + static void TearDownTestSuite() + { + if (config.file_created) { webhdfs_helper->delete_data(config.remote_file_path); } + } + + static kvikio::test::Config config; + static std::unique_ptr webhdfs_helper; +}; + +kvikio::test::Config WebHdfsTest::config{}; +std::unique_ptr WebHdfsTest::webhdfs_helper{}; + +TEST_F(WebHdfsTest, constructor) +{ + auto do_test = [&](kvikio::RemoteHandle& remote_handle) { + kvikio::test::DevBuffer out_device_buf(config.num_elements); + auto read_size = remote_handle.read(out_device_buf.ptr, remote_handle.nbytes()); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = 0; i < config.num_elements; ++i) { + EXPECT_EQ(config.host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, remote_handle.nbytes()); + }; + + std::stringstream ss; + ss << "http://" << config.host << ":" << config.port << "/webhdfs/v1" << config.remote_file_path + << "?user.name=" << config._username; + std::vector remote_handles; + + remote_handles.emplace_back(std::make_unique(ss.str())); + remote_handles.emplace_back(std::make_unique( + config.host, config.port, config.remote_file_path, config._username)); + + for (auto& remote_handle : remote_handles) { + do_test(remote_handle); + } +} + +TEST_F(WebHdfsTest, read_parallel) +{ + auto do_test = [&](std::string const& url, + std::size_t num_elements_to_skip, + std::size_t num_elements_to_read, + std::size_t task_size) { + kvikio::RemoteHandle remote_handle{std::make_unique(url)}; + auto const offset = num_elements_to_skip * sizeof(value_type); + auto const expected_read_size = num_elements_to_read * sizeof(value_type); + + // host + { + std::vector out_host_buf(num_elements_to_read, {}); + auto fut = remote_handle.pread(out_host_buf.data(), expected_read_size, offset, task_size); + auto const read_size = fut.get(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(config.host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + + // device + { + kvikio::test::DevBuffer 
out_device_buf(num_elements_to_read); + auto fut = remote_handle.pread(out_device_buf.ptr, expected_read_size, offset, task_size); + auto const read_size = fut.get(); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(config.host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + }; + + std::stringstream ss; + ss << "http://" << config.host << ":" << config.port << "/webhdfs/v1" << config.remote_file_path + << "?user.name=" << config._username; + std::vector task_sizes{256, 1024, kvikio::defaults::task_size()}; + + for (const auto& task_size : task_sizes) { + for (const auto& num_elements_to_read : {10, 9999}) { + for (const auto& num_elements_to_skip : {0, 10, 100, 1000, 9999}) { + do_test(ss.str(), num_elements_to_skip, num_elements_to_read, task_size); + } + } + } +} diff --git a/cpp/tests/test_mmap.cpp b/cpp/tests/test_mmap.cpp index 7fb1e3a4fd..1c6e3a54b3 100644 --- a/cpp/tests/test_mmap.cpp +++ b/cpp/tests/test_mmap.cpp @@ -40,7 +40,7 @@ class MmapTest : public testing::Test { _filepath = tmp_dir.path() / "test.bin"; std::size_t num_elements = 1024ull * 1024ull; _host_buf = CreateTempFile(_filepath, num_elements); - _dev_buf = kvikio::test::DevBuffer{_host_buf}; + _dev_buf = kvikio::test::DevBuffer{_host_buf}; _page_size = kvikio::get_page_size(); } @@ -62,9 +62,8 @@ class MmapTest : public testing::Test { std::size_t _file_size; std::size_t _page_size; std::vector _host_buf; - kvikio::test::DevBuffer _dev_buf; - using value_type = decltype(_host_buf)::value_type; + kvikio::test::DevBuffer _dev_buf; }; TEST_F(MmapTest, invalid_file_open_flag) @@ -212,7 +211,7 @@ TEST_F(MmapTest, read_seq) // device { - kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); auto const read_size = mmap_handle.read(out_device_buf.ptr, expected_read_size, offset); auto out_host_buf = out_device_buf.to_vector(); for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { @@ -250,7 +249,7 @@ TEST_F(MmapTest, read_parallel) // device { - kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); auto fut = mmap_handle.pread(out_device_buf.ptr, expected_read_size, offset); auto const read_size = fut.get(); auto out_host_buf = out_device_buf.to_vector(); @@ -300,7 +299,7 @@ TEST_F(MmapTest, read_with_default_arguments) // device { - kvikio::test::DevBuffer out_device_buf(num_elements); + kvikio::test::DevBuffer out_device_buf(num_elements); { auto const read_size = mmap_handle.read(out_device_buf.ptr); diff --git a/cpp/tests/utils/hdfs_helper.cpp b/cpp/tests/utils/hdfs_helper.cpp new file mode 100644 index 0000000000..2bcbc7fed2 --- /dev/null +++ b/cpp/tests/utils/hdfs_helper.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "hdfs_helper.hpp" + +#include +#include +#include +#include +#include + +#include +#include + +namespace kvikio::test { + +namespace { + +/** + * @brief Helper struct that wraps a buffer view and tracks how many data have been processed via an + * offset value. + */ +struct tracked_buffer_t { + std::span buffer; + std::size_t offset; +}; + +/** + * @brief Callback for `CURLOPT_READFUNCTION` to upload data. + * + * @param data + * @param size Curl internal implementation always sets this parameter to 1 + * @param num_bytes_max The maximum number of bytes that can be uploaded + * @param userdata Must be cast from `tracked_buffer_t*` + * @return The number of bytes that have been copied to the transfer buffer. + */ +std::size_t callback_upload(char* data, std::size_t size, std::size_t num_bytes_max, void* userdata) +{ + auto new_data_size_max = size * num_bytes_max; + auto* tracked_buffer = reinterpret_cast(userdata); + + // All data have been uploaded. Nothing more to do. + if (tracked_buffer->offset >= tracked_buffer->buffer.size()) { return 0; } + + auto copy_size = + std::min(new_data_size_max, tracked_buffer->buffer.size() - tracked_buffer->offset); + std::memcpy(data, tracked_buffer->buffer.data() + tracked_buffer->offset, copy_size); + tracked_buffer->offset += copy_size; + + return copy_size; +} +} // namespace + +WebHdfsTestHelper::WebHdfsTestHelper(std::string const& host, + std::string const& port, + std::string const& username) + : _host{host}, _port{port}, _username{username} +{ + std::stringstream ss; + ss << "http://" << host << ":" << port << "/webhdfs/v1"; + _url_before_path = ss.str(); +} + +bool WebHdfsTestHelper::can_connect() noexcept +{ + try { + auto curl = create_curl_handle(); + + std::stringstream ss; + ss << _url_before_path << "/?user.name=" << _username << "&op=GETHOMEDIRECTORY"; + + curl.setopt(CURLOPT_URL, ss.str().c_str()); + + std::string response{}; + curl.setopt(CURLOPT_WRITEDATA, &response); + curl.setopt(CURLOPT_WRITEFUNCTION, kvikio::detail::callback_get_string_response); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + curl.perform(); + return true; + } catch (std::exception const& e) { + std::cout << e.what() << "\n"; + return false; + } +} + +bool WebHdfsTestHelper::upload_data(std::span buffer, + std::string const& remote_file_path) noexcept +{ + try { + // Official reference on how to create and write to a file: + // https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html#Create_and_Write_to_a_File + std::string redirect_url; + + { + // Step 1: Submit a HTTP PUT request without automatically following redirects and without + // sending the file data. 
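+      // Illustrative request: PUT http://<host>:<port>/webhdfs/v1<path>?user.name=<user>&op=CREATE
+      // The name node is expected to answer with HTTP 307 and a Location header pointing
+      // at a data node; the file data is then uploaded to that location in step 2.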
+ auto curl = create_curl_handle(); + + std::stringstream ss; + ss << _url_before_path << remote_file_path << "?user.name=" << _username << "&op=CREATE"; + std::string redirect_data_node_location{}; + + curl.setopt(CURLOPT_URL, ss.str().c_str()); + curl.setopt(CURLOPT_FOLLOWLOCATION, 0L); + curl.setopt(CURLOPT_CUSTOMREQUEST, "PUT"); + + std::string response{}; + curl.setopt(CURLOPT_HEADERDATA, &response); + curl.setopt(CURLOPT_HEADERFUNCTION, kvikio::detail::callback_get_string_response); + + curl.perform(); + + long http_status_code{}; + curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code); + KVIKIO_EXPECT(http_status_code == 307, "Redirection from name node to data node failed."); + + std::regex const pattern{R"(Location:\s*(.*)\s*)"}; + std::smatch match_results; + bool found = std::regex_search(response, match_results, pattern); + KVIKIO_EXPECT(found, + "Regular expression search failed. Cannot extract redirect location from the " + "JSON response."); + redirect_url = match_results[1].str(); + } + + { + // Step 2: Submit another HTTP PUT request using the URL in the Location header with the file + // data to be written. + auto curl = create_curl_handle(); + curl.setopt(CURLOPT_URL, redirect_url.c_str()); + curl.setopt(CURLOPT_UPLOAD, 1L); + + tracked_buffer_t tracked_buffer{.buffer = buffer, .offset = 0}; + curl.setopt(CURLOPT_READDATA, &tracked_buffer); + curl.setopt(CURLOPT_READFUNCTION, callback_upload); + curl.setopt(CURLOPT_INFILESIZE_LARGE, static_cast(buffer.size())); + + curl.perform(); + + long http_status_code{}; + curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code); + KVIKIO_EXPECT(http_status_code == 201, "File creation failed."); + } + + return true; + } catch (std::exception const& e) { + std::cout << e.what() << "\n"; + return false; + } +} + +bool WebHdfsTestHelper::delete_data(std::string const& remote_file_path) noexcept +{ + try { + // Official reference on how to delete a file: + // https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html#Delete_a_File.2FDirectory + auto curl = create_curl_handle(); + + std::stringstream ss; + ss << _url_before_path << remote_file_path << "?user.name=" << _username << "&op=DELETE"; + std::string const url = ss.str(); + std::string redirect_data_node_location{}; + + curl.setopt(CURLOPT_URL, url.c_str()); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + curl.setopt(CURLOPT_CUSTOMREQUEST, "DELETE"); + + std::string response{}; + curl.setopt(CURLOPT_HEADERDATA, &response); + curl.setopt(CURLOPT_HEADERFUNCTION, kvikio::detail::callback_get_string_response); + + curl.perform(); + + long http_status_code{}; + curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code); + KVIKIO_EXPECT(http_status_code == 200, "File deletion failed."); + + return true; + } catch (std::exception const& e) { + std::cout << e.what() << "\n"; + return false; + } +} +} // namespace kvikio::test diff --git a/cpp/tests/utils/hdfs_helper.hpp b/cpp/tests/utils/hdfs_helper.hpp new file mode 100644 index 0000000000..7cc5da377c --- /dev/null +++ b/cpp/tests/utils/hdfs_helper.hpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace kvikio::test { + +/** + * @brief Helper class to create and upload a file on WebHDFS so as to enable read testing. + */ +class WebHdfsTestHelper { + private: + std::string _host; + std::string _port; + std::string _username; + std::string _url_before_path; + + public: + WebHdfsTestHelper(std::string const& host, std::string const& port, std::string const& username); + + /** + * @brief Whether KvikIO can connect to the WebHDFS server. + * + * @return A boolean answer. + */ + bool can_connect() noexcept; + + /** + * @brief Copy the data from a host buffer to a remote file on the WebHDFS server. + * + * @param buffer View to the host buffer whose data will be copied to the WebHDFS server + * @param remote_file_path Remote file path + * @return True if the file has been successfully uploaded; false otherwise. + */ + bool upload_data(std::span buffer, std::string const& remote_file_path) noexcept; + + /** + * @brief Delete a remote file on the WebHDFS server. + * + * @param remote_file_path Remote file path + * @return True if the file has been successfully deleted; false otherwise. + */ + bool delete_data(std::string const& remote_file_path) noexcept; +}; + +} // namespace kvikio::test diff --git a/cpp/tests/utils/utils.hpp b/cpp/tests/utils/utils.hpp index bb50d07c13..7d733621a7 100644 --- a/cpp/tests/utils/utils.hpp +++ b/cpp/tests/utils/utils.hpp @@ -108,6 +108,7 @@ class TempDir { /** * @brief Help class for creating and comparing buffers. 
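+ * @tparam T Element type of the buffer (the tests instantiate, e.g.,
+ *           DevBuffer<std::int64_t> and DevBuffer<double>)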
*/ +template class DevBuffer { public: std::size_t nelem; @@ -116,12 +117,12 @@ class DevBuffer { DevBuffer() : nelem{0}, nbytes{0} {}; - DevBuffer(std::size_t nelem) : nelem{nelem}, nbytes{nelem * sizeof(std::int64_t)} + DevBuffer(std::size_t nelem) : nelem{nelem}, nbytes{nelem * sizeof(T)} { KVIKIO_CHECK_CUDA(cudaMalloc(&ptr, nbytes)); KVIKIO_CHECK_CUDA(cudaMemset(ptr, 0, nbytes)); } - DevBuffer(std::vector const& host_buffer) : DevBuffer{host_buffer.size()} + DevBuffer(std::vector const& host_buffer) : DevBuffer{host_buffer.size()} { KVIKIO_CHECK_CUDA(cudaMemcpy(ptr, host_buffer.data(), nbytes, cudaMemcpyHostToDevice)); } @@ -143,9 +144,9 @@ class DevBuffer { ~DevBuffer() noexcept { cudaFree(ptr); } - [[nodiscard]] static DevBuffer arange(std::size_t nelem, std::int64_t start = 0) + [[nodiscard]] static DevBuffer arange(std::size_t nelem, T start = 0) { - std::vector host_buffer(nelem); + std::vector host_buffer(nelem); std::iota(host_buffer.begin(), host_buffer.end(), start); return DevBuffer{host_buffer}; } @@ -157,9 +158,9 @@ class DevBuffer { return ret; } - [[nodiscard]] std::vector to_vector() const + [[nodiscard]] std::vector to_vector() const { - std::vector ret(nelem); + std::vector ret(nelem); KVIKIO_CHECK_CUDA(cudaMemcpy(ret.data(), this->ptr, nbytes, cudaMemcpyDeviceToHost)); return ret; } @@ -177,7 +178,8 @@ class DevBuffer { /** * @brief Check that two buffers are equal */ -inline void expect_equal(DevBuffer const& a, DevBuffer const& b) +template +inline void expect_equal(DevBuffer const& a, DevBuffer const& b) { EXPECT_EQ(a.nbytes, b.nbytes); auto a_vec = a.to_vector(); From a35bf582306553c40dc6eaa9c29a1e49ddb969fc Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 11 Aug 2025 11:31:18 -0400 Subject: [PATCH 14/40] Improve KvikIO Python binding performance by releasing GIL wherever deemed necessary (#796) This PR improves the Python binding performance by releasing the Global Interpreter Lock (GIL) wherever necessary. The tasks include: - For function declarations, add `nogil` if missing. Only one such case has been identified, which defines an embedded template function. - At the call site of a C++ function, add `with nogil` context if missing. All the other changes fall into this category. Closes #795 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/796 --- python/kvikio/kvikio/_lib/buffer.pyx | 13 +- python/kvikio/kvikio/_lib/cufile_driver.pyx | 74 ++++++--- python/kvikio/kvikio/_lib/defaults.pyx | 71 +++++++-- python/kvikio/kvikio/_lib/file_handle.pyx | 159 +++++++++++++------- python/kvikio/kvikio/_lib/future.pyx | 12 +- python/kvikio/kvikio/_lib/mmap.pyx | 70 ++++++--- python/kvikio/kvikio/_lib/remote_handle.pyx | 116 ++++++++++---- 7 files changed, 370 insertions(+), 145 deletions(-) diff --git a/python/kvikio/kvikio/_lib/buffer.pyx b/python/kvikio/kvikio/_lib/buffer.pyx index 3b90f09816..a7b638eaf4 100644 --- a/python/kvikio/kvikio/_lib/buffer.pyx +++ b/python/kvikio/kvikio/_lib/buffer.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. 
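# [Editor's illustration -- not part of this patch] A minimal sketch of the
# GIL-release pattern described in the PR message above: the C++ function is
# declared in a `nogil` extern block, Python arguments are converted to C types
# while the GIL is still held, the call itself runs inside `with nogil`, and the
# Python return value is built only after the block exits. The header name and
# `cpp_sum_bytes` function are hypothetical placeholders, not KvikIO symbols.
cdef extern from "my_lib.hpp" nogil:
    size_t cpp_sum_bytes(size_t nbytes) except +

def sum_bytes(nbytes: int) -> int:
    cdef size_t cpp_nbytes = nbytes  # convert the Python int while the GIL is held
    cdef size_t result
    with nogil:
        # No Python objects may be touched here; other Python threads can run.
        result = cpp_sum_bytes(cpp_nbytes)
    return result  # the GIL is reacquired before building the Python int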
# distutils: language = c++ @@ -17,14 +17,16 @@ def memory_register(buf) -> None: if not isinstance(buf, Array): buf = Array(buf) cdef Array arr = buf - cpp_memory_register(arr.ptr) + with nogil: + cpp_memory_register(arr.ptr) def memory_deregister(buf) -> None: if not isinstance(buf, Array): buf = Array(buf) cdef Array arr = buf - cpp_memory_deregister(arr.ptr) + with nogil: + cpp_memory_deregister(arr.ptr) cdef extern from "" nogil: @@ -32,4 +34,7 @@ cdef extern from "" nogil: def bounce_buffer_free() -> int: - return cpp_alloc_retain_clear() + cdef size_t result + with nogil: + result = cpp_alloc_retain_clear() + return result diff --git a/python/kvikio/kvikio/_lib/cufile_driver.pyx b/python/kvikio/kvikio/_lib/cufile_driver.pyx index 0488eb3b20..0bc44b2066 100644 --- a/python/kvikio/kvikio/_lib/cufile_driver.pyx +++ b/python/kvikio/kvikio/_lib/cufile_driver.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. # distutils: language = c++ @@ -15,15 +15,20 @@ cdef extern from "" nogil: def libcufile_version() -> int: - return cpp_libcufile_version() + cdef int version + with nogil: + version = cpp_libcufile_version() + return version def driver_open(): - cpp_driver_open() + with nogil: + cpp_driver_open() def driver_close(): - cpp_driver_close() + with nogil: + cpp_driver_close() cdef extern from "" nogil: @@ -49,55 +54,90 @@ cdef class DriverProperties: @property def is_gds_available(self) -> bool: + cdef bool result try: - return self._handle.is_gds_available() + with nogil: + result = self._handle.is_gds_available() + return result except RuntimeError: return False @property def major_version(self) -> bool: - return self._handle.get_nvfs_major_version() + cdef unsigned int version + with nogil: + version = self._handle.get_nvfs_major_version() + return version @property def minor_version(self) -> bool: - return self._handle.get_nvfs_minor_version() + cdef unsigned int version + with nogil: + version = self._handle.get_nvfs_minor_version() + return version @property def allow_compat_mode(self) -> bool: - return self._handle.get_nvfs_allow_compat_mode() + cdef bool result + with nogil: + result = self._handle.get_nvfs_allow_compat_mode() + return result @property def poll_mode(self) -> bool: - return self._handle.get_nvfs_poll_mode() + cdef bool result + with nogil: + result = self._handle.get_nvfs_poll_mode() + return result @poll_mode.setter def poll_mode(self, enable: bool) -> None: - self._handle.set_nvfs_poll_mode(enable) + cdef bool cpp_enable = enable + with nogil: + self._handle.set_nvfs_poll_mode(cpp_enable) @property def poll_thresh_size(self) -> int: - return self._handle.get_nvfs_poll_thresh_size() + cdef size_t size + with nogil: + size = self._handle.get_nvfs_poll_thresh_size() + return size @poll_thresh_size.setter def poll_thresh_size(self, size_in_kb: int) -> None: - self._handle.set_nvfs_poll_thresh_size(size_in_kb) + cdef size_t size = size_in_kb + with nogil: + self._handle.set_nvfs_poll_thresh_size(size) @property def max_device_cache_size(self) -> int: - return self._handle.get_max_device_cache_size() + cdef size_t size + with nogil: + size = self._handle.get_max_device_cache_size() + return size @max_device_cache_size.setter def max_device_cache_size(self, size_in_kb: int) -> None: - self._handle.set_max_device_cache_size(size_in_kb) + cdef size_t size = size_in_kb + with nogil: + self._handle.set_max_device_cache_size(size) @property def 
per_buffer_cache_size(self) -> int: - return self._handle.get_per_buffer_cache_size() + cdef size_t size + with nogil: + size = self._handle.get_per_buffer_cache_size() + return size @property def max_pinned_memory_size(self) -> int: - return self._handle.get_max_pinned_memory_size() + cdef size_t size + with nogil: + size = self._handle.get_max_pinned_memory_size() + return size @max_pinned_memory_size.setter def max_pinned_memory_size(self, size_in_kb: int) -> None: - self._handle.set_max_pinned_memory_size(size_in_kb) + cdef size_t size = size_in_kb + with nogil: + self._handle.set_max_pinned_memory_size(size) diff --git a/python/kvikio/kvikio/_lib/defaults.pyx b/python/kvikio/kvikio/_lib/defaults.pyx index de511211e6..2fb6065680 100644 --- a/python/kvikio/kvikio/_lib/defaults.pyx +++ b/python/kvikio/kvikio/_lib/defaults.pyx @@ -43,68 +43,107 @@ cdef extern from "" namespace "kvikio" nogil: def is_compat_mode_preferred() -> bool: - return cpp_is_compat_mode_preferred() + cdef bool result + with nogil: + result = cpp_is_compat_mode_preferred() + return result def compat_mode() -> CompatMode: - return cpp_compat_mode() + cdef CompatMode result + with nogil: + result = cpp_compat_mode() + return result def set_compat_mode(compat_mode: CompatMode) -> None: - cpp_set_compat_mode(compat_mode) + with nogil: + cpp_set_compat_mode(compat_mode) def thread_pool_nthreads() -> int: - return cpp_thread_pool_nthreads() + cdef unsigned int result + with nogil: + result = cpp_thread_pool_nthreads() + return result def set_thread_pool_nthreads(nthreads: int) -> None: - cpp_set_thread_pool_nthreads(nthreads) + cdef unsigned int cpp_nthreads = nthreads + with nogil: + cpp_set_thread_pool_nthreads(cpp_nthreads) def task_size() -> int: - return cpp_task_size() + cdef size_t result + with nogil: + result = cpp_task_size() + return result def set_task_size(nbytes: int) -> None: - cpp_set_task_size(nbytes) + cdef size_t cpp_nbytes = nbytes + with nogil: + cpp_set_task_size(cpp_nbytes) def gds_threshold() -> int: - return cpp_gds_threshold() + cdef size_t result + with nogil: + result = cpp_gds_threshold() + return result def set_gds_threshold(nbytes: int) -> None: - cpp_set_gds_threshold(nbytes) + cdef size_t cpp_nbytes = nbytes + with nogil: + cpp_set_gds_threshold(cpp_nbytes) def bounce_buffer_size() -> int: - return cpp_bounce_buffer_size() + cdef size_t result + with nogil: + result = cpp_bounce_buffer_size() + return result def set_bounce_buffer_size(nbytes: int) -> None: - cpp_set_bounce_buffer_size(nbytes) + cdef size_t cpp_nbytes = nbytes + with nogil: + cpp_set_bounce_buffer_size(cpp_nbytes) def http_max_attempts() -> int: - return cpp_http_max_attempts() + cdef size_t result + with nogil: + result = cpp_http_max_attempts() + return result def set_http_max_attempts(attempts: int) -> None: - cpp_set_http_max_attempts(attempts) + cdef size_t cpp_attempts = attempts + with nogil: + cpp_set_http_max_attempts(cpp_attempts) def http_timeout() -> int: - return cpp_http_timeout() + cdef long result + with nogil: + result = cpp_http_timeout() + return result def set_http_timeout(timeout: int) -> None: - return cpp_set_http_timeout(timeout) + cdef long cpp_timeout = timeout + with nogil: + cpp_set_http_timeout(cpp_timeout) def http_status_codes() -> list[int]: + # Cannot use nogil here because we need the GIL for list creation return cpp_http_status_codes() def set_http_status_codes(status_codes: list[int]) -> None: - return cpp_set_http_status_codes(status_codes) + # Cannot use nogil here because we need the 
GIL for list conversion + cpp_set_http_status_codes(status_codes) diff --git a/python/kvikio/kvikio/_lib/file_handle.pyx b/python/kvikio/kvikio/_lib/file_handle.pyx index b17d283433..062d0d9fb5 100644 --- a/python/kvikio/kvikio/_lib/file_handle.pyx +++ b/python/kvikio/kvikio/_lib/file_handle.pyx @@ -94,88 +94,127 @@ cdef class CuFile: cdef FileHandle _handle def __init__(self, file_path, flags="r"): - self._handle = move( - FileHandle( - os.fsencode(file_path), - str(flags).encode() + cdef string cpp_file_path = os.fsencode(file_path) + cdef string cpp_flags = str(flags).encode() + with nogil: + self._handle = move( + FileHandle( + cpp_file_path, + cpp_flags + ) ) - ) def close(self) -> None: - self._handle.close() + with nogil: + self._handle.close() def closed(self) -> bool: - return self._handle.closed() + cdef bool result + with nogil: + result = self._handle.closed() + return result def fileno(self) -> int: - return self._handle.fd() + cdef int result + with nogil: + result = self._handle.fd() + return result def open_flags(self) -> int: - return self._handle.fd_open_flags() + cdef int result + with nogil: + result = self._handle.fd_open_flags() + return result def pread(self, buf, size: Optional[int], file_offset: int, task_size) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return _wrap_io_future( - self._handle.pread( + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_task_size = task_size if task_size else defaults.task_size() + cdef future[size_t] fut + with nogil: + fut = self._handle.pread( info.first, info.second, - file_offset, - task_size if task_size else defaults.task_size() + cpp_file_offset, + cpp_task_size ) - ) + return _wrap_io_future(fut) def pwrite(self, buf, size: Optional[int], file_offset: int, task_size) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return _wrap_io_future( - self._handle.pwrite( + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_task_size = task_size if task_size else defaults.task_size() + cdef future[size_t] fut + with nogil: + fut = self._handle.pwrite( info.first, info.second, - file_offset, - task_size if task_size else defaults.task_size() + cpp_file_offset, + cpp_task_size ) - ) + return _wrap_io_future(fut) def read(self, buf, size: Optional[int], file_offset: int, dev_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return self._handle.read( - info.first, - info.second, - file_offset, - dev_offset, - ) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef size_t result + with nogil: + result = self._handle.read( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + ) + return result def write(self, buf, size: Optional[int], file_offset: int, dev_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return self._handle.write( - info.first, - info.second, - file_offset, - dev_offset, - ) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef size_t result + with nogil: + result = self._handle.write( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + ) + return result def read_async(self, buf, size: Optional[int], file_offset: int, dev_offset: int, st: uintptr_t) -> IOFutureStream: - stream = st + cdef CUstream stream = st cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return 
_wrap_stream_future(self._handle.read_async( - info.first, - info.second, - file_offset, - dev_offset, - stream, - )) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef cpp_StreamFuture fut + with nogil: + fut = self._handle.read_async( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + stream, + ) + return _wrap_stream_future(fut) def write_async(self, buf, size: Optional[int], file_offset: int, dev_offset: int, st: uintptr_t) -> IOFutureStream: - stream = st + cdef CUstream stream = st cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return _wrap_stream_future(self._handle.write_async( - info.first, - info.second, - file_offset, - dev_offset, - stream, - )) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef cpp_StreamFuture fut + with nogil: + fut = self._handle.write_async( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + stream, + ) + return _wrap_stream_future(fut) cdef extern from "" nogil: pair[size_t, size_t] cpp_get_page_cache_info_str \ @@ -191,21 +230,37 @@ cdef extern from "" nogil: def get_page_cache_info(file: Union[os.PathLike, str, int, io.IOBase]) \ -> tuple[int, int]: + cdef pair[size_t, size_t] result + cdef string path_bytes + cdef int fd + if isinstance(file, os.PathLike) or isinstance(file, str): # file is a path or a string object path_bytes = os.fsencode(file) - return cpp_get_page_cache_info_str(path_bytes) + with nogil: + result = cpp_get_page_cache_info_str(path_bytes) + return result elif isinstance(file, int): # file is a file descriptor - return cpp_get_page_cache_info_int(file) + fd = file + with nogil: + result = cpp_get_page_cache_info_int(fd) + return result elif isinstance(file, io.IOBase): # file is a file object # pass its file descriptor to the underlying C++ function - return cpp_get_page_cache_info_int(file.fileno()) + fd = file.fileno() + with nogil: + result = cpp_get_page_cache_info_int(fd) + return result else: raise ValueError("The type of `file` must be `os.PathLike`, `str`, `int`, " "or `io.IOBase`") -def clear_page_cache(reclaim_dentries_and_inodes: bool, clear_dirty_pages: bool): - return cpp_clear_page_cache(reclaim_dentries_and_inodes, clear_dirty_pages) +def clear_page_cache(reclaim_dentries_and_inodes: bool, + clear_dirty_pages: bool) -> bool: + cdef bool result + with nogil: + result = cpp_clear_page_cache(reclaim_dentries_and_inodes, clear_dirty_pages) + return result diff --git a/python/kvikio/kvikio/_lib/future.pyx b/python/kvikio/kvikio/_lib/future.pyx index da6ab308dc..94d9dd6d9c 100644 --- a/python/kvikio/kvikio/_lib/future.pyx +++ b/python/kvikio/kvikio/_lib/future.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. 
# distutils: language = c++ @@ -15,7 +15,10 @@ cdef extern from "" namespace "kvikio" nogil: cdef class IOFutureStream: """Wrap a C++ StreamFuture in a Python object""" def check_bytes_done(self) -> int: - return self._handle.check_bytes_done() + cdef size_t bytes_done + with nogil: + bytes_done = self._handle.check_bytes_done() + return bytes_done cdef IOFutureStream _wrap_stream_future(cpp_StreamFuture &fut): @@ -34,7 +37,10 @@ cdef class IOFuture: return ret def done(self) -> bool: - return is_future_done(self._handle) + cdef bool result + with nogil: + result = is_future_done(self._handle) + return result cdef IOFuture _wrap_io_future(future[size_t] &fut): diff --git a/python/kvikio/kvikio/_lib/mmap.pyx b/python/kvikio/kvikio/_lib/mmap.pyx index ac4889b25c..46fc3846d0 100644 --- a/python/kvikio/kvikio/_lib/mmap.pyx +++ b/python/kvikio/kvikio/_lib/mmap.pyx @@ -48,14 +48,17 @@ cdef class InternalMmapHandle: if not os.path.exists(file_path): raise RuntimeError("Unable to open file") + cdef string cpp_path_bytes = os.fsencode(file_path) + cdef string cpp_flags_bytes = str(flags).encode() + cdef optional[size_t] cpp_initial_map_size if initial_map_size is None: cpp_initial_map_size = nullopt else: cpp_initial_map_size = (initial_map_size) - path_bytes = os.fsencode(file_path) - flags_bytes = str(flags).encode() + cdef size_t cpp_initial_map_offset = initial_map_offset + cdef fcntl.mode_t cpp_mode = mode cdef optional[int] cpp_map_flags if map_flags is None: @@ -63,54 +66,79 @@ cdef class InternalMmapHandle: else: cpp_map_flags = (map_flags) - self._handle = move(CppMmapHandle(path_bytes, - flags_bytes, - cpp_initial_map_size, - initial_map_offset, - mode, - cpp_map_flags)) + with nogil: + self._handle = move(CppMmapHandle(cpp_path_bytes, + cpp_flags_bytes, + cpp_initial_map_size, + cpp_initial_map_offset, + cpp_mode, + cpp_map_flags)) def initial_map_size(self) -> int: - return self._handle.initial_map_size() + cdef size_t result + with nogil: + result = self._handle.initial_map_size() + return result def initial_map_offset(self) -> int: - return self._handle.initial_map_offset() + cdef size_t result + with nogil: + result = self._handle.initial_map_offset() + return result def file_size(self) -> int: - return self._handle.file_size() + cdef size_t result + with nogil: + result = self._handle.file_size() + return result def close(self) -> None: - self._handle.close() + with nogil: + self._handle.close() def closed(self) -> bool: - return self._handle.closed() + cdef bool result + with nogil: + result = self._handle.closed() + return result def read(self, buf: Any, size: Optional[int] = None, offset: int = 0) -> int: + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) cdef optional[size_t] cpp_size if size is None: cpp_size = nullopt else: cpp_size = (size) - cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return self._handle.read(info.first, - cpp_size, - offset) + cdef size_t cpp_offset = offset + cdef size_t result + with nogil: + result = self._handle.read(info.first, + cpp_size, + cpp_offset) + return result def pread(self, buf: Any, size: Optional[int] = None, offset: int = 0, task_size: Optional[int] = None) -> IOFuture: cdef optional[size_t] cpp_size + cdef size_t cpp_task_size + if size is None: cpp_size = nullopt else: cpp_size = (size) cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + cdef size_t cpp_offset = offset + if task_size is None: cpp_task_size = defaults.task_size() else: cpp_task_size = 
task_size - return _wrap_io_future(self._handle.pread(info.first, - cpp_size, - offset, - cpp_task_size)) + cdef future[size_t] cpp_future + with nogil: + cpp_future = self._handle.pread(info.first, + cpp_size, + cpp_offset, + cpp_task_size) + return _wrap_io_future(cpp_future) diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 0c2ae4c3e4..5cb2d5cdd9 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -65,8 +65,9 @@ cdef pair[string, string] _to_string_pair(str s1, str s2): """Wrap two Python string objects in a C++ pair""" return pair[string, string](_to_string(s1), _to_string(s2)) + # Helper function to cast an endpoint to its base class `RemoteEndpoint` -cdef extern from *: +cdef extern from * nogil: """ template std::unique_ptr cast_to_remote_endpoint(T endpoint) @@ -86,11 +87,16 @@ cdef class RemoteFile: nbytes: Optional[int], ): cdef RemoteFile ret = RemoteFile() + if nbytes is None: - ret._handle = make_unique[cpp_RemoteHandle](move(ep)) + with nogil: + ret._handle = make_unique[cpp_RemoteHandle](move(ep)) return ret + cdef size_t n = nbytes - ret._handle = make_unique[cpp_RemoteHandle](move(ep), n) + + with nogil: + ret._handle = make_unique[cpp_RemoteHandle](move(ep), n) return ret @staticmethod @@ -98,10 +104,16 @@ cdef class RemoteFile: url: str, nbytes: Optional[int], ): + cdef string cpp_url = _to_string(url) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_HttpEndpoint](cpp_url) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_HttpEndpoint](_to_string(url)) - ), + move(cpp_endpoint), nbytes ) @@ -111,12 +123,18 @@ cdef class RemoteFile: object_name: str, nbytes: Optional[int], ): + cdef pair[string, string] bucket_and_object_names = _to_string_pair( + bucket_name, object_name + ) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint](bucket_and_object_names) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint]( - _to_string_pair(bucket_name, object_name) - ) - ), + move(cpp_endpoint), nbytes ) @@ -125,10 +143,16 @@ cdef class RemoteFile: url: str, nbytes: Optional[int], ): + cdef string cpp_url = _to_string(url) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint](cpp_url) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint](_to_string(url)) - ), + move(cpp_endpoint), nbytes ) @@ -137,11 +161,18 @@ cdef class RemoteFile: url: str, nbytes: Optional[int], ): - cdef pair[string, string] bucket_and_object = cpp_parse_s3_url(_to_string(url)) + cdef string cpp_url = _to_string(url) + cdef pair[string, string] bucket_and_object_names + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + bucket_and_object_names = cpp_parse_s3_url(cpp_url) + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint](bucket_and_object_names) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint](bucket_and_object) - ), + move(cpp_endpoint), nbytes ) @@ -150,34 +181,55 @@ cdef class RemoteFile: presigned_url: str, nbytes: Optional[int], ): + cdef string cpp_url = _to_string(presigned_url) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + 
cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3EndpointWithPresignedUrl](cpp_url) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_S3EndpointWithPresignedUrl](_to_string(presigned_url)) - ), + move(cpp_endpoint), nbytes ) def __str__(self) -> str: - cdef string ep_str = deref(self._handle).endpoint().str() + cdef string ep_str + with nogil: + ep_str = deref(self._handle).endpoint().str() return f'<{self.__class__.__name__} "{ep_str.decode()}">' def nbytes(self) -> int: - return deref(self._handle).nbytes() + cdef size_t result + with nogil: + result = deref(self._handle).nbytes() + return result def read(self, buf, size: Optional[int], file_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return deref(self._handle).read( - info.first, - info.second, - file_offset, - ) + cdef size_t cpp_file_offset = file_offset + cdef size_t result + + with nogil: + result = deref(self._handle).read( + info.first, + info.second, + cpp_file_offset, + ) + + return result def pread(self, buf, size: Optional[int], file_offset: int) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return _wrap_io_future( - deref(self._handle).pread( + cdef size_t cpp_file_offset = file_offset + cdef future[size_t] fut + + with nogil: + fut = deref(self._handle).pread( info.first, info.second, - file_offset, + cpp_file_offset, ) - ) + + return _wrap_io_future(fut) From 9f8e8736cc3c7ce4f9e39ed23add4d65889fd7c8 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 15 Aug 2025 11:20:42 -0400 Subject: [PATCH 15/40] Support WebHDFS (2/2): Python binding (#791) ## Summary This PR adds Python binding for the WebHDFS support Depends on PR https://github.com/rapidsai/kvikio/pull/788 Closes https://github.com/rapidsai/kvikio/issues/787 Python's built-in package `http.server` is well suited to server mocking. It enables high-level testing for the client. Closes https://github.com/rapidsai/kvikio/issues/634 too. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/791 --- python/kvikio/kvikio/_lib/remote_handle.pyx | 15 ++ python/kvikio/kvikio/remote_file.py | 21 ++ python/kvikio/tests/test_hdfs_io.py | 234 ++++++++++++++++++++ python/kvikio/tests/test_s3_io.py | 13 +- python/kvikio/tests/utils.py | 15 ++ 5 files changed, 289 insertions(+), 9 deletions(-) create mode 100644 python/kvikio/tests/test_hdfs_io.py create mode 100644 python/kvikio/tests/utils.py diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 5cb2d5cdd9..17222685ae 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -53,6 +53,9 @@ cdef extern from "" nogil: size_t file_offset ) except + +cdef extern from "" nogil: + cdef cppclass cpp_WebHdfsEndpoint "kvikio::WebHdfsEndpoint"(cpp_RemoteEndpoint): + cpp_WebHdfsEndpoint(string url) except + cdef string _to_string(str s): """Convert Python object to a C++ string (if None, return the empty string)""" @@ -194,6 +197,18 @@ cdef class RemoteFile: nbytes ) + @staticmethod + def open_webhdfs( + url: str, + nbytes: Optional[int], + ): + return RemoteFile._from_endpoint( + cast_to_remote_endpoint( + make_unique[cpp_WebHdfsEndpoint](_to_string(url)) + ), + nbytes + ) + def __str__(self) -> str: cdef string ep_str with nogil: diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index f06a40b45f..105c42e438 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -164,6 +164,27 @@ def open_s3_presigned_url( ) ) + @classmethod + def open_webhdfs( + cls, + url: str, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """Open a file on Apache Hadoop Distributed File System (HDFS) using WebHDFS. + + If KvikIO is run within a Docker, the argument ``--network host`` needs to be + passed to the ``docker run`` command. + + Parameters + ---------- + url + URL to the remote file. + nbytes + The size of the file. If None, KvikIO will ask the server for the file + size. + """ + return RemoteFile(_get_remote_module().RemoteFile.open_webhdfs(url, nbytes)) + def close(self) -> None: """Close the file""" pass diff --git a/python/kvikio/tests/test_hdfs_io.py b/python/kvikio/tests/test_hdfs_io.py new file mode 100644 index 0000000000..eab4d0a5fb --- /dev/null +++ b/python/kvikio/tests/test_hdfs_io.py @@ -0,0 +1,234 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
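# [Editor's illustration -- not part of this patch] A short usage sketch of the
# open_webhdfs() API documented above, mirroring the calls exercised by the tests
# that follow. The server address, user, and file path are hypothetical
# placeholders; a CuPy array can be passed in place of the NumPy buffer.
import numpy as np
from kvikio import remote_file

url = "http://localhost:9870/webhdfs/v1/home/test_user/test_file.bin"
handle = remote_file.RemoteFile.open_webhdfs(url)  # file size is queried from the server
buf = np.empty(handle.nbytes(), dtype=np.uint8)
read_size = handle.pread(buf).get()                # wait for the parallel read to finish
assert read_size == handle.nbytes()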
+ +from __future__ import annotations + +import json +import urllib.parse +from http.server import BaseHTTPRequestHandler, HTTPServer +from multiprocessing import Process, Queue +from typing import Any, Generator + +import cupy as cp +import numpy as np +import numpy.typing as npt +import pytest +import utils + +import kvikio.defaults +from kvikio import remote_file + + +class RemoteFileData: + def __init__(self, file_path: str, num_elements: int, dtype: npt.DTypeLike) -> None: + self.file_path = file_path + self.num_elements = num_elements + self.dtype = dtype + self.buf = np.arange(0, self.num_elements, dtype=self.dtype) + self.file_size = self.buf.nbytes + + +@pytest.fixture(scope="module") +def remote_file_data() -> RemoteFileData: + return RemoteFileData( + file_path="/webhdfs/v1/home/test_user/test_file.bin", + num_elements=1024 * 1024, + dtype=np.float64, + ) + + +def run_mock_server(queue: Queue[int], file_size: int, buf: npt.NDArray[Any]) -> None: + """Run HTTP server in a separate process""" + + class WebHdfsHandler(BaseHTTPRequestHandler): + def do_GET(self) -> None: + parsed_url = urllib.parse.urlparse(self.path) + query_dict = urllib.parse.parse_qs(parsed_url.query) + op = query_dict["op"] + + # Client requests file size + if op == ["GETFILESTATUS"]: + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + response = json.dumps({"length": file_size}) + self.wfile.write(response.encode()) + + # Client requests file content + elif op == ["OPEN"]: + offset = int(query_dict["offset"][0]) + length = int(query_dict["length"][0]) + + # Convert byte offsets to element indices + element_size = buf.itemsize + begin_idx = offset // element_size + end_idx = (offset + length) // element_size + range_data = buf[begin_idx:end_idx].tobytes() + + self.send_response(200) + self.send_header("Content-Type", "application/octet-stream") + self.send_header("Content-Length", str(len(range_data))) + self.end_headers() + self.wfile.write(range_data) + else: + self.send_response(400) + self.end_headers() + + def log_message(self, format: str, *args: Any) -> None: + pass + + port = utils.find_free_port() + server = HTTPServer((utils.localhost(), port), WebHdfsHandler) + + # Send port back to parent process + queue.put(port) + + server.serve_forever() + + +@pytest.fixture +def mock_webhdfs_server(remote_file_data: RemoteFileData) -> Generator[str, None, None]: + """Start WebHDFS mock server in a separate process""" + queue: Queue[int] = Queue() + server_process = Process( + target=run_mock_server, + args=( + queue, + remote_file_data.file_size, + remote_file_data.buf, + ), + daemon=True, + ) + server_process.start() + + # Get the port the server is running on + port = queue.get(timeout=5) + + yield f"http://{utils.localhost()}:{port}" + + # Cleanup + server_process.terminate() + server_process.join(timeout=1) + + +class TestWebHdfsOperations: + @pytest.mark.parametrize("url_query", ["", "?op=OPEN"]) + def test_get_file_size( + self, + mock_webhdfs_server: str, + remote_file_data: RemoteFileData, + url_query: str, + ) -> None: + url = f"{mock_webhdfs_server}{remote_file_data.file_path}{url_query}" + handle = remote_file.RemoteFile.open_webhdfs(url) + file_size = handle.nbytes() + assert file_size == remote_file_data.file_size + + def test_parallel_read( + self, mock_webhdfs_server: str, remote_file_data: RemoteFileData, xp: Any + ) -> None: + url = f"{mock_webhdfs_server}{remote_file_data.file_path}" + handle = remote_file.RemoteFile.open_webhdfs(url) + 
result_buf = xp.arange( + 0, remote_file_data.num_elements, dtype=remote_file_data.dtype + ) + fut = handle.pread(result_buf) + read_size = fut.get() + + assert read_size == remote_file_data.file_size + + result_buf_np = result_buf + if isinstance(result_buf, cp.ndarray): + result_buf_np = cp.asnumpy(result_buf) + assert np.array_equal(result_buf_np, remote_file_data.buf) + + @pytest.mark.parametrize("size", [80, 8 * 9999]) + @pytest.mark.parametrize("offset", [0, 800, 8000, 8 * 9999]) + @pytest.mark.parametrize("num_threads", [1, 4]) + @pytest.mark.parametrize("task_size", [1024, 4096]) + def test_parallel_read_partial( + self, + mock_webhdfs_server: str, + remote_file_data: RemoteFileData, + size: int, + offset: int, + num_threads: int, + task_size: int, + xp: Any, + ) -> None: + url = f"{mock_webhdfs_server}{remote_file_data.file_path}" + element_size = remote_file_data.buf.itemsize + begin_idx = offset // element_size + end_idx = (offset + size) // element_size + expected_buf = remote_file_data.buf[begin_idx:end_idx] + + actual_num_elements = size // np.dtype(remote_file_data.dtype).itemsize + with kvikio.defaults.set({"num_threads": num_threads, "task_size": task_size}): + handle = remote_file.RemoteFile.open_webhdfs(url) + result_buf = xp.zeros(actual_num_elements, dtype=remote_file_data.dtype) + fut = handle.pread(result_buf, size, offset) + read_size = fut.get() + + assert read_size == size + + result_buf_np = result_buf + if isinstance(result_buf, cp.ndarray): + result_buf_np = cp.asnumpy(result_buf) + assert np.array_equal(result_buf_np, expected_buf) + + +class TestWebHdfsErrors: + @pytest.fixture + def mock_bad_server( + self, remote_file_data: RemoteFileData + ) -> Generator[str, None, None]: + """Start a bad WebHDFS server that returns invalid JSON""" + + def run_bad_server(queue: Queue[int]) -> None: + class BadHandler(BaseHTTPRequestHandler): + def do_GET(self): + parsed = urllib.parse.urlparse(self.path) + query = urllib.parse.parse_qs(parsed.query) + + if query.get("op") == ["GETFILESTATUS"]: + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + # Missing "length" field + response = json.dumps({}) + self.wfile.write(response.encode()) + else: + self.send_response(400) + self.end_headers() + + def log_message(self, format, *args): + pass + + port = utils.find_free_port() + server = HTTPServer((utils.localhost(), port), BadHandler) + queue.put(port) + server.serve_forever() + + queue: Queue[int] = Queue() + server_process = Process(target=run_bad_server, args=(queue,), daemon=True) + server_process.start() + + port = queue.get(timeout=5) + + yield f"http://{utils.localhost()}:{port}" + + server_process.terminate() + server_process.join(timeout=1) + + def test_missing_file_size( + self, mock_bad_server: str, remote_file_data: RemoteFileData + ) -> None: + url = f"{mock_bad_server}{remote_file_data.file_path}" + + with pytest.raises( + RuntimeError, + match="Regular expression search failed. " + "Cannot extract file length from the JSON response.", + ): + handle = remote_file.RemoteFile.open_webhdfs(url) + handle.nbytes() diff --git a/python/kvikio/tests/test_s3_io.py b/python/kvikio/tests/test_s3_io.py index 58a73184a8..2256bc1284 100644 --- a/python/kvikio/tests/test_s3_io.py +++ b/python/kvikio/tests/test_s3_io.py @@ -2,11 +2,11 @@ # See file LICENSE for terms. 
import multiprocessing as mp -import socket import time from contextlib import contextmanager import pytest +import utils import kvikio import kvikio.defaults @@ -26,18 +26,13 @@ @pytest.fixture(scope="session") -def endpoint_ip(): - return "127.0.0.1" +def endpoint_ip() -> str: + return utils.localhost() @pytest.fixture(scope="session") def endpoint_port(): - # Return a free port per worker session. - sock = socket.socket() - sock.bind(("127.0.0.1", 0)) - port = sock.getsockname()[1] - sock.close() - return port + return utils.find_free_port() def start_s3_server(ip_address, port): diff --git a/python/kvikio/tests/utils.py b/python/kvikio/tests/utils.py new file mode 100644 index 0000000000..58d801efcf --- /dev/null +++ b/python/kvikio/tests/utils.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import socket + + +def localhost() -> str: + return "127.0.0.1" + + +def find_free_port(host: str = localhost()) -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind((host, 0)) + _, port = s.getsockname() + return port From e20807ee11d2e8f7795c7ffd1c1bb5433e87a199 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 18 Aug 2025 13:50:31 -0700 Subject: [PATCH 16/40] Remove Python nvCOMP bindings and Zarr 2 support (#798) Removes the features that utilize nvCOMP - Python bindings and Zarr 2 support. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Gil Forsyth (https://github.com/gforsyth) - Tom Augspurger (https://github.com/TomAugspurger) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/798 --- README.md | 1 - docs/source/zarr.rst | 13 - notebooks/nvcomp_batch_codec.ipynb | 357 --- notebooks/nvcomp_vs_zarr_lz4.ipynb | 2832 ----------------- notebooks/zarr.ipynb | 364 --- python/kvikio/cli/gpu_compressor.py | 136 - python/kvikio/examples/zarr_cupy_nvcomp.py | 88 - python/kvikio/kvikio/_lib/CMakeLists.txt | 2 +- python/kvikio/kvikio/_lib/libnvcomp.pyx | 235 -- python/kvikio/kvikio/_lib/libnvcomp_ll.pyx | 1182 ------- python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd | 212 -- .../kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd | 362 --- python/kvikio/kvikio/_nvcomp.py | 368 --- python/kvikio/kvikio/_nvcomp_codec.py | 228 -- .../kvikio/benchmarks/single_node_io.py | 34 +- python/kvikio/kvikio/benchmarks/zarr_io.py | 87 +- python/kvikio/kvikio/nvcomp.py | 20 - python/kvikio/kvikio/nvcomp_codec.py | 9 - python/kvikio/kvikio/zarr/__init__.py | 2 - python/kvikio/kvikio/zarr/_zarr_python_2.py | 400 --- python/kvikio/kvikio/zarr/_zarr_python_3.py | 12 +- python/kvikio/tests/conftest.py | 13 - python/kvikio/tests/test_benchmarks.py | 52 - python/kvikio/tests/test_examples.py | 13 - python/kvikio/tests/test_nvcomp.py | 444 --- python/kvikio/tests/test_nvcomp_codec.py | 243 -- python/kvikio/tests/test_zarr.py | 292 -- 27 files changed, 38 insertions(+), 7963 deletions(-) delete mode 100644 notebooks/nvcomp_batch_codec.ipynb delete mode 100644 notebooks/nvcomp_vs_zarr_lz4.ipynb delete mode 100644 notebooks/zarr.ipynb delete mode 100755 python/kvikio/cli/gpu_compressor.py delete mode 100644 python/kvikio/examples/zarr_cupy_nvcomp.py delete mode 100644 python/kvikio/kvikio/_lib/libnvcomp.pyx delete mode 100644 python/kvikio/kvikio/_lib/libnvcomp_ll.pyx delete mode 100644 python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd delete mode 100644 python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd delete mode 100644 python/kvikio/kvikio/_nvcomp.py delete mode 100644 
python/kvikio/kvikio/_nvcomp_codec.py delete mode 100644 python/kvikio/kvikio/nvcomp.py delete mode 100644 python/kvikio/kvikio/nvcomp_codec.py delete mode 100644 python/kvikio/kvikio/zarr/_zarr_python_2.py delete mode 100644 python/kvikio/tests/test_nvcomp.py delete mode 100644 python/kvikio/tests/test_nvcomp_codec.py delete mode 100644 python/kvikio/tests/test_zarr.py diff --git a/README.md b/README.md index ac2ee2a4bd..549d8bcfc4 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ KvikIO also works efficiently when GDS isn't available and can read/write both h * Concurrent reads and writes using an internal thread pool. * Non-blocking API. * Transparently handles reads and writes to/from memory on both host and device. -* (Deprecated) Provides Python bindings to [nvCOMP](https://docs.nvidia.com/cuda/nvcomp/py_api.html). ### Documentation diff --git a/docs/source/zarr.rst b/docs/source/zarr.rst index 019eff2767..f8e4564011 100644 --- a/docs/source/zarr.rst +++ b/docs/source/zarr.rst @@ -28,16 +28,3 @@ You can use any store, but KvikIO provides :py:class:`kvikio.zarr.GDSStore` to e ... ) >>> type(z[:10, :10]) cupy.ndarray - - - -Zarr Python 2.x ---------------- - - -The following uses zarr-python 2.x, and is an example of how to use the convenience function :py:meth:`kvikio.zarr.open_cupy_array` -to create a new Zarr array and how to open an existing Zarr array. - - -.. literalinclude:: ../../python/kvikio/examples/zarr_cupy_nvcomp.py - :language: python diff --git a/notebooks/nvcomp_batch_codec.ipynb b/notebooks/nvcomp_batch_codec.ipynb deleted file mode 100644 index f4f4689f3c..0000000000 --- a/notebooks/nvcomp_batch_codec.ipynb +++ /dev/null @@ -1,357 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "b543ae63", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "import numcodecs\n", - "\n", - "import numpy as np\n", - "\n", - "import zarr\n", - "\n", - "from IPython.display import display\n", - "\n", - "np.set_printoptions(precision=4, suppress=True)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1a8e55d5", - "metadata": {}, - "source": [ - "### Basic usage\n", - "\n", - "Get nvCOMP codec from numcodecs registry:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "75524650", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "NvCompBatchCodec(algorithm='lz4', options={})" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "NVCOMP_CODEC_ID = \"nvcomp_batch\"\n", - "\n", - "# Currently supported algorithms.\n", - "LZ4_ALGO = \"LZ4\"\n", - "GDEFLATE_ALGO = \"Gdeflate\"\n", - "SNAPPY_ALGO = \"snappy\"\n", - "ZSTD_ALGO = \"zstd\"\n", - "\n", - "codec = numcodecs.registry.get_codec(dict(id=NVCOMP_CODEC_ID, algorithm=LZ4_ALGO))\n", - "# To pass algorithm-specific options, use options parameter:\n", - "# codec = numcodecs.registry.get_codec(dict(id=NVCOMP_CODEC_ID, algo=LZ4_ALGO, options={\"data_type\": 1}))\n", - "\n", - "display(codec)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "46641ccb", - "metadata": {}, - "source": [ - "Create data:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "12a4fffd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 1.6243, -0.6118, -0.5282, ..., 0.0436, -0.62 , 0.698 ],\n", - " [-0.4471, 1.2245, 0.4035, ..., 0.4203, 0.811 , 1.0444],\n", - " [-0.4009, 0.824 , -0.5623, ..., 0.7848, -0.9554, 0.5859],\n", - " ...,\n", - " [ 1.3797, 
0.1387, 1.2255, ..., 1.8051, 0.3722, 0.1253],\n", - " [ 0.7348, -0.7115, -0.1248, ..., -1.9533, -0.7684, -0.5345],\n", - " [ 0.2183, -0.8654, 0.8886, ..., -1.0141, -0.0627, -1.4379]],\n", - " dtype=float32)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Type               : zarr.core.Array
Data type          : float32
Shape              : (100, 100)
Chunk shape        : (10, 10)
Order              : C
Read-only          : False
Compressor         : NvCompBatchCodec(algorithm='lz4', options={})
Store type         : zarr.storage.KVStore
No. bytes          : 40000 (39.1K)
No. bytes stored   : 41006 (40.0K)
Storage ratio      : 1.0
Chunks initialized : 100/100
" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : NvCompBatchCodec(algorithm='lz4', options={})\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 41006 (40.0K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "shape = (100, 100)\n", - "chunks = (10, 10)\n", - "\n", - "np.random.seed(1)\n", - "\n", - "x = zarr.array(np.random.randn(*shape).astype(np.float32), chunks=chunks, compressor=codec)\n", - "display(x[:])\n", - "display(x.info)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c15cbdff", - "metadata": {}, - "source": [ - "Store and load back the data:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "730cde85", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'chunks': [10, 10],\n", - " 'compressor': {'algorithm': 'lz4', 'id': 'nvcomp_batch', 'options': {}},\n", - " 'dtype': 'Typezarr.core.ArrayData typefloat32Shape(100, 100)Chunk shape(10, 10)OrderCRead-onlyFalseCompressorNvCompBatchCodec(algorithm='lz4', options={})Store typezarr.storage.KVStoreNo. bytes40000 (39.1K)No. bytes stored41006 (40.0K)Storage ratio1.0Chunks initialized100/100" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : NvCompBatchCodec(algorithm='lz4', options={})\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 41006 (40.0K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "y = zarr.open_array(zarr_store)\n", - "display(y.info)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "5b6cc2ca", - "metadata": {}, - "outputs": [], - "source": [ - "# Test the roundtrip.\n", - "np.testing.assert_equal(y[:], x[:])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1a8eea79", - "metadata": {}, - "source": [ - "### CPU compression / GPU decompression\n", - "\n", - "Some algorithms, such as LZ4, can be used interchangeably on CPU and GPU. For example, the data might be created using CPU LZ4 codec and then decompressed using GPU version of LZ4 codec." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "87d25b76", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'chunks': [10, 10],\n", - " 'compressor': {'acceleration': 1, 'id': 'lz4'},\n", - " 'dtype': 'Typezarr.core.ArrayData typefloat32Shape(100, 100)Chunk shape(10, 10)OrderCRead-onlyFalseCompressorLZ4(acceleration=1)Store typezarr.storage.KVStoreNo. bytes40000 (39.1K)No. bytes stored40973 (40.0K)Storage ratio1.0Chunks initialized100/100" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : LZ4(acceleration=1)\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 40973 (40.0K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Type               : zarr.core.Array
Data type          : float32
Shape              : (100, 100)
Chunk shape        : (10, 10)
Order              : C
Read-only          : False
Compressor         : NvCompBatchCodec(algorithm='lz4', options={})
Store type         : zarr.storage.KVStore
No. bytes          : 40000 (39.1K)
No. bytes stored   : 40883 (39.9K)
Storage ratio      : 1.0
Chunks initialized : 100/100
" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : NvCompBatchCodec(algorithm='lz4', options={})\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 40883 (39.9K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Get default (CPU) implementation of LZ4 codec.\n", - "cpu_codec = numcodecs.registry.get_codec({\"id\": \"lz4\"})\n", - "\n", - "x = zarr.array(np.random.randn(*shape).astype(np.float32), chunks=chunks, compressor=cpu_codec)\n", - "# Define a simple, dictionary-based store. In real scenarios this can be a filesystem or some other persistent store.\n", - "store = {}\n", - "zarr.save_array(store, x, compressor=cpu_codec)\n", - "\n", - "# Check that the data was written by the expected codec.\n", - "meta = json.loads(store[\".zarray\"])\n", - "display(meta)\n", - "assert meta[\"compressor\"][\"id\"] == \"lz4\"\n", - "\n", - "# Change codec to GPU/nvCOMP-based.\n", - "meta[\"compressor\"] = {\"id\": NVCOMP_CODEC_ID, \"algorithm\": LZ4_ALGO}\n", - "store[\".zarray\"] = json.dumps(meta).encode()\n", - "\n", - "y = zarr.open_array(store, compressor=codec)\n", - "\n", - "display(x.info)\n", - "display(y.info)\n", - "\n", - "np.testing.assert_equal(x[:], y[:])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9294992", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/nvcomp_vs_zarr_lz4.ipynb b/notebooks/nvcomp_vs_zarr_lz4.ipynb deleted file mode 100644 index 3b6d947ac3..0000000000 --- a/notebooks/nvcomp_vs_zarr_lz4.ipynb +++ /dev/null @@ -1,2832 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 42, - "id": "f95b6759-533a-470b-8f08-5f91ebcea625", - "metadata": {}, - "outputs": [], - "source": [ - "import cupy as cp\n", - "import numpy as np\n", - "import pandas as pd\n", - "import time\n", - "import zarr\n", - "\n", - "import kvikio.nvcomp\n" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "d1e60a9b-0bca-4c66-b2f0-829acc3b1ba2", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# conda install -c conda-forge zarr" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "2cedb529-c0fa-4883-a2fd-78b1ad3c1a59", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2013929216, 1006964608, 503482304, 251741152, 125870576, 62935288, 31467644, 15733822, 7866911, 3933455, 1966727, 983363, 491681, 245840, 122920, 61460, 30730, 15365, 7682, 3841]\n" - ] - } - ], - "source": [ - "HOST_LZ4_MAX = 2013929216 # 2113929216\n", - "sizes = list(map(lambda x: HOST_LZ4_MAX//(2**x), np.arange(20)))\n", - "print(sizes)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "39483573-e79b-4dca-aee3-13bf392da3a7", - "metadata": {}, - "outputs": [], - "source": [ - "input_size = []\n", - 
"cascaded_size = []\n", - "cascaded_temp_size = []\n", - "cascaded_round_trip_time = []\n", - "lz4_gpu_size = []\n", - "lz4_gpu_temp_size = []\n", - "lz4_gpu_round_trip_time = []\n", - "bitcomp_gpu_size = []\n", - "bitcomp_gpu_temp_size = []\n", - "bitcomp_gpu_round_trip_time = []\n", - "lz4_size = []\n", - "lz4_round_trip_time = []" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "ccd9b1e7-b607-4948-8256-73bedf1ec7a8", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "821.61s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2023-09-29 13:44:43-- http://textfiles.com/etext/NONFICTION/kjv10.txt\n", - "Resolving textfiles.com (textfiles.com)... 208.86.224.90\n", - "Connecting to textfiles.com (textfiles.com)|208.86.224.90|:80... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 4432803 (4.2M) [text/plain]\n", - "Saving to: ‘kjv10.txt.2’\n", - "\n", - "kjv10.txt.2 100%[===================>] 4.23M 316KB/s in 14s \n", - "\n", - "2023-09-29 13:44:58 (304 KB/s) - ‘kjv10.txt.2’ saved [4432803/4432803]\n", - "\n" - ] - } - ], - "source": [ - "!wget http://textfiles.com/etext/NONFICTION/kjv10.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "4c9a5c4c-4c49-4834-8dc2-3e6fc11ea930", - "metadata": {}, - "outputs": [], - "source": [ - "text = open('kjv10.txt').read()\n", - "bib = np.frombuffer(bytes(text, 'utf-8'), dtype=np.int8)\n", - "data_buffer = np.tile(bib, 500)" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "74740819-b987-4012-ba6c-ed3d3b9afd60", - "metadata": {}, - "outputs": [], - "source": [ - "# One of the three below keys, this will set the arrangement of test data for a full run of the notebook.\n", - "TARGET = \"Ascending\"\n", - "DTYPE = cp.int32" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "0a1307ed-034c-4943-a7e1-36665cba8ad5", - "metadata": {}, - "outputs": [], - "source": [ - "data = {\n", - " \"Ascending\": np.arange(0, HOST_LZ4_MAX, dtype=np.int32),\n", - " \"Random\": np.random.randint(0, 100, HOST_LZ4_MAX, dtype=np.int32),\n", - " \"Text\": data_buffer\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "68adbb33-ddb7-4603-8863-fdd25b8bdc51", - "metadata": {}, - "outputs": [], - "source": [ - "def get_host_data(offset, dtype):\n", - " exemplar = np.array([1], dtype=dtype)\n", - " print(offset)\n", - " print(exemplar.itemsize)\n", - " print(data[TARGET].itemsize)\n", - " index = offset // data[TARGET].itemsize\n", - " index = index - (index % exemplar.itemsize)\n", - " print(index)\n", - " return data[TARGET][0:index].view(dtype)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "f067cdc2-ee14-4258-b89d-0bb4a224c698", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2013929216\n", - "4\n", - "4\n", - "503482304\n", - "-----\n", - "Input size: 2013929216\n", - "Cascaded GPU compressor output size: 33434464\n", - "Cascaded GPU decompressor output size: 2013929216\n", - "Cascaded GPU compress/decompress round trip time: 0.1076362133026123\n", - "2021826967\n", - "Lz4 zarr time: 4.681669235229492\n", - "Lz4 compressed size: 2021826967\n", - "1006964608\n", - "4\n", - "4\n", - "251741152\n", - "-----\n", - "Input size: 1006964608\n", - "Cascaded GPU compressor output size: 16717276\n", - "Cascaded GPU 
decompressor output size: 1006964608\n", - "Cascaded GPU compress/decompress round trip time: 0.11769247055053711\n", - "1010913478\n", - "Lz4 zarr time: 2.57978892326355\n", - "Lz4 compressed size: 1010913478\n", - "503482304\n", - "4\n", - "4\n", - "125870576\n", - "-----\n", - "Input size: 503482304\n", - "Cascaded GPU compressor output size: 8358716\n", - "Cascaded GPU decompressor output size: 503482304\n", - "Cascaded GPU compress/decompress round trip time: 0.05775332450866699\n", - "505456734\n", - "Lz4 zarr time: 1.2365527153015137\n", - "Lz4 compressed size: 505456734\n", - "251741152\n", - "4\n", - "4\n", - "62935288\n", - "-----\n", - "Input size: 251741152\n", - "Cascaded GPU compressor output size: 4179436\n", - "Cascaded GPU decompressor output size: 251741152\n", - "Cascaded GPU compress/decompress round trip time: 0.0284881591796875\n", - "252728362\n", - "Lz4 zarr time: 0.5986642837524414\n", - "Lz4 compressed size: 252728362\n", - "125870576\n", - "4\n", - "4\n", - "31467644\n", - "-----\n", - "Input size: 125870576\n", - "Cascaded GPU compressor output size: 2089796\n", - "Cascaded GPU decompressor output size: 125870576\n", - "Cascaded GPU compress/decompress round trip time: 0.01472783088684082\n", - "126364175\n", - "Lz4 zarr time: 0.30330395698547363\n", - "Lz4 compressed size: 126364175\n", - "62935288\n", - "4\n", - "4\n", - "15733820\n", - "-----\n", - "Input size: 62935280\n", - "Cascaded GPU compressor output size: 1044976\n", - "Cascaded GPU decompressor output size: 62935280\n", - "Cascaded GPU compress/decompress round trip time: 0.007399559020996094\n", - "63182074\n", - "Lz4 zarr time: 0.1610257625579834\n", - "Lz4 compressed size: 63182074\n", - "31467644\n", - "4\n", - "4\n", - "7866908\n", - "-----\n", - "Input size: 31467632\n", - "Cascaded GPU compressor output size: 522532\n", - "Cascaded GPU decompressor output size: 31467632\n", - "Cascaded GPU compress/decompress round trip time: 0.004503726959228516\n", - "31591024\n", - "Lz4 zarr time: 0.1471562385559082\n", - "Lz4 compressed size: 31591024\n", - "15733822\n", - "4\n", - "4\n", - "3933452\n", - "-----\n", - "Input size: 15733808\n", - "Cascaded GPU compressor output size: 261344\n", - "Cascaded GPU decompressor output size: 15733808\n", - "Cascaded GPU compress/decompress round trip time: 0.0025734901428222656\n", - "15795499\n", - "Lz4 zarr time: 0.03436875343322754\n", - "Lz4 compressed size: 15795499\n", - "7866911\n", - "4\n", - "4\n", - "1966724\n", - "-----\n", - "Input size: 7866896\n", - "Cascaded GPU compressor output size: 130716\n", - "Cascaded GPU decompressor output size: 7866896\n", - "Cascaded GPU compress/decompress round trip time: 0.0018618106842041016\n", - "7897736\n", - "Lz4 zarr time: 0.010539531707763672\n", - "Lz4 compressed size: 7897736\n", - "3933455\n", - "4\n", - "4\n", - "983360\n", - "-----\n", - "Input size: 3933440\n", - "Cascaded GPU compressor output size: 65436\n", - "Cascaded GPU decompressor output size: 3933440\n", - "Cascaded GPU compress/decompress round trip time: 0.0017323493957519531\n", - "3948855\n", - "Lz4 zarr time: 0.028203964233398438\n", - "Lz4 compressed size: 3948855\n", - "1966727\n", - "4\n", - "4\n", - "491680\n", - "-----\n", - "Input size: 1966720\n", - "Cascaded GPU compressor output size: 32796\n", - "Cascaded GPU decompressor output size: 1966720\n", - "Cascaded GPU compress/decompress round trip time: 0.0020630359649658203\n", - "1974422\n", - "Lz4 zarr time: 0.002621889114379883\n", - "Lz4 compressed size: 1974422\n", - "983363\n", - 
"4\n", - "4\n", - "245840\n", - "-----\n", - "Input size: 983360\n", - "Cascaded GPU compressor output size: 16476\n", - "Cascaded GPU decompressor output size: 983360\n", - "Cascaded GPU compress/decompress round trip time: 0.0014410018920898438\n", - "987206\n", - "Lz4 zarr time: 0.0007197856903076172\n", - "Lz4 compressed size: 987206\n", - "491681\n", - "4\n", - "4\n", - "122920\n", - "-----\n", - "Input size: 491680\n", - "Cascaded GPU compressor output size: 8316\n", - "Cascaded GPU decompressor output size: 491680\n", - "Cascaded GPU compress/decompress round trip time: 0.0011644363403320312\n", - "493597\n", - "Lz4 zarr time: 0.000965118408203125\n", - "Lz4 compressed size: 493597\n", - "245840\n", - "4\n", - "4\n", - "61460\n", - "-----\n", - "Input size: 245840\n", - "Cascaded GPU compressor output size: 4236\n", - "Cascaded GPU decompressor output size: 245840\n", - "Cascaded GPU compress/decompress round trip time: 0.0015044212341308594\n", - "246793\n", - "Lz4 zarr time: 0.0004220008850097656\n", - "Lz4 compressed size: 246793\n", - "122920\n", - "4\n", - "4\n", - "30728\n", - "-----\n", - "Input size: 122912\n", - "Cascaded GPU compressor output size: 2184\n", - "Cascaded GPU decompressor output size: 122912\n", - "Cascaded GPU compress/decompress round trip time: 0.0011115074157714844\n", - "123383\n", - "Lz4 zarr time: 0.0002646446228027344\n", - "Lz4 compressed size: 123383\n", - "61460\n", - "4\n", - "4\n", - "15364\n", - "-----\n", - "Input size: 61456\n", - "Cascaded GPU compressor output size: 1148\n", - "Cascaded GPU decompressor output size: 61456\n", - "Cascaded GPU compress/decompress round trip time: 0.0009233951568603516\n", - "61678\n", - "Lz4 zarr time: 0.00020623207092285156\n", - "Lz4 compressed size: 61678\n", - "30730\n", - "4\n", - "4\n", - "7680\n", - "-----\n", - "Input size: 30720\n", - "Cascaded GPU compressor output size: 632\n", - "Cascaded GPU decompressor output size: 30720\n", - "Cascaded GPU compress/decompress round trip time: 0.001186370849609375\n", - "30822\n", - "Lz4 zarr time: 0.00011777877807617188\n", - "Lz4 compressed size: 30822\n", - "15365\n", - "4\n", - "4\n", - "3840\n", - "-----\n", - "Input size: 15360\n", - "Cascaded GPU compressor output size: 360\n", - "Cascaded GPU decompressor output size: 15360\n", - "Cascaded GPU compress/decompress round trip time: 0.001523733139038086\n", - "15401\n", - "Lz4 zarr time: 0.0003781318664550781\n", - "Lz4 compressed size: 15401\n", - "7682\n", - "4\n", - "4\n", - "1920\n", - "-----\n", - "Input size: 7680\n", - "Cascaded GPU compressor output size: 224\n", - "Cascaded GPU decompressor output size: 7680\n", - "Cascaded GPU compress/decompress round trip time: 0.0012781620025634766\n", - "7699\n", - "Lz4 zarr time: 0.0001780986785888672\n", - "Lz4 compressed size: 7699\n", - "3841\n", - "4\n", - "4\n", - "960\n", - "-----\n", - "Input size: 3840\n", - "Cascaded GPU compressor output size: 156\n", - "Cascaded GPU decompressor output size: 3840\n", - "Cascaded GPU compress/decompress round trip time: 0.001318216323852539\n", - "3852\n", - "Lz4 zarr time: 0.00019931793212890625\n", - "Lz4 compressed size: 3852\n" - ] - } - ], - "source": [ - "input_size = []\n", - "cascaded_size = []\n", - "cascaded_temp_size = []\n", - "cascaded_round_trip_time = []\n", - "lz4_gpu_size = []\n", - "lz4_gpu_temp_size = []\n", - "lz4_gpu_round_trip_time = []\n", - "lz4_size = []\n", - "lz4_round_trip_time = []\n", - "for size in sizes:\n", - " data_host = get_host_data(size, DTYPE)\n", - " data_gpu = 
cp.array(data_host)\n", - " \"\"\"Cascaded GPU\"\"\"\n", - " t_gpu = time.time()\n", - " compressor = kvikio.nvcomp.CascadedManager(dtype=data_gpu.dtype)\n", - " compressed = compressor.compress(data_gpu)\n", - " output_size = compressed.nbytes\n", - "\n", - " decompressed = compressor.decompress(compressed)\n", - " decompressed_size = decompressed.size * decompressed.itemsize\n", - " input_size.append(data_gpu.size * data_gpu.itemsize)\n", - " cascaded_round_trip_time.append(time.time() - t_gpu)\n", - " cascaded_size.append(output_size)\n", - " print('-----')\n", - " print('Input size: ', data_gpu.size * data_gpu.itemsize)\n", - " print('Cascaded GPU compressor output size: ', output_size)\n", - " print('Cascaded GPU decompressor output size: ', decompressed_size)\n", - " print('Cascaded GPU compress/decompress round trip time: ',time.time() - t_gpu)\n", - " \n", - " del compressor\n", - " \n", - " \"\"\"LZ4 Host\"\"\"\n", - " lz4 = zarr.LZ4()\n", - " t_host = time.time()\n", - " host_compressed = lz4.encode(data_gpu.get())\n", - " del data_gpu\n", - " print(len(host_compressed))\n", - " host_compressed = host_compressed[:2113929216]\n", - " host_decompressed = lz4.decode(host_compressed)\n", - " print('Lz4 zarr time: ', time.time() - t_host)\n", - " print('Lz4 compressed size: ', len(host_compressed))\n", - " lz4_size.append(len(host_compressed))\n", - " lz4_round_trip_time.append(time.time() - t_host)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "c981e8bc-e96a-4af4-9fe1-414aa2ff4c99", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2013929216\n", - "4\n", - "4\n", - "503482304\n", - "lz4 GPU compressor output size: 2022340697\n", - "lz4 GPU decompressor output size: 2013929216\n", - "lz4 GPU compress/decompress round trip time: 0.7271463871002197\n", - "1006964608\n", - "4\n", - "4\n", - "251741152\n", - "lz4 GPU compressor output size: 1011170371\n", - "lz4 GPU decompressor output size: 1006964608\n", - "lz4 GPU compress/decompress round trip time: 0.36713171005249023\n", - "503482304\n", - "4\n", - "4\n", - "125870576\n", - "lz4 GPU compressor output size: 505585200\n", - "lz4 GPU decompressor output size: 503482304\n", - "lz4 GPU compress/decompress round trip time: 0.1900792121887207\n", - "251741152\n", - "4\n", - "4\n", - "62935288\n", - "lz4 GPU compressor output size: 252792621\n", - "lz4 GPU decompressor output size: 251741152\n", - "lz4 GPU compress/decompress round trip time: 0.09049177169799805\n", - "125870576\n", - "4\n", - "4\n", - "31467644\n", - "lz4 GPU compressor output size: 126396327\n", - "lz4 GPU decompressor output size: 125870576\n", - "lz4 GPU compress/decompress round trip time: 0.04643416404724121\n", - "62935288\n", - "4\n", - "4\n", - "15733820\n", - "lz4 GPU compressor output size: 63198181\n", - "lz4 GPU decompressor output size: 62935280\n", - "lz4 GPU compress/decompress round trip time: 0.02284073829650879\n", - "31467644\n", - "4\n", - "4\n", - "7866908\n", - "lz4 GPU compressor output size: 31599109\n", - "lz4 GPU decompressor output size: 31467632\n", - "lz4 GPU compress/decompress round trip time: 0.015845537185668945\n", - "15733822\n", - "4\n", - "4\n", - "3933452\n", - "lz4 GPU compressor output size: 15799573\n", - "lz4 GPU decompressor output size: 15733808\n", - "lz4 GPU compress/decompress round trip time: 0.009501934051513672\n", - "7866911\n", - "4\n", - "4\n", - "1966724\n", - "lz4 GPU compressor output size: 7899801\n", - "lz4 GPU decompressor output size: 
7866896\n", - "lz4 GPU compress/decompress round trip time: 0.011568546295166016\n", - "3933455\n", - "4\n", - "4\n", - "983360\n", - "lz4 GPU compressor output size: 3949915\n", - "lz4 GPU decompressor output size: 3933440\n", - "lz4 GPU compress/decompress round trip time: 0.00696110725402832\n", - "1966727\n", - "4\n", - "4\n", - "491680\n", - "lz4 GPU compressor output size: 1974981\n", - "lz4 GPU decompressor output size: 1966720\n", - "lz4 GPU compress/decompress round trip time: 0.012327194213867188\n", - "983363\n", - "4\n", - "4\n", - "245840\n", - "lz4 GPU compressor output size: 987514\n", - "lz4 GPU decompressor output size: 983360\n", - "lz4 GPU compress/decompress round trip time: 0.006538867950439453\n", - "491681\n", - "4\n", - "4\n", - "122920\n", - "lz4 GPU compressor output size: 493774\n", - "lz4 GPU decompressor output size: 491680\n", - "lz4 GPU compress/decompress round trip time: 0.012677907943725586\n", - "245840\n", - "4\n", - "4\n", - "61460\n", - "lz4 GPU compressor output size: 246904\n", - "lz4 GPU decompressor output size: 245840\n", - "lz4 GPU compress/decompress round trip time: 0.006706953048706055\n", - "122920\n", - "4\n", - "4\n", - "30728\n", - "lz4 GPU compressor output size: 123459\n", - "lz4 GPU decompressor output size: 122912\n", - "lz4 GPU compress/decompress round trip time: 0.010996580123901367\n", - "61460\n", - "4\n", - "4\n", - "15364\n", - "lz4 GPU compressor output size: 61745\n", - "lz4 GPU decompressor output size: 61456\n", - "lz4 GPU compress/decompress round trip time: 0.006911039352416992\n", - "30730\n", - "4\n", - "4\n", - "7680\n", - "lz4 GPU compressor output size: 30907\n", - "lz4 GPU decompressor output size: 30720\n", - "lz4 GPU compress/decompress round trip time: 0.004134178161621094\n", - "15365\n", - "4\n", - "4\n", - "3840\n", - "lz4 GPU compressor output size: 15498\n", - "lz4 GPU decompressor output size: 15360\n", - "lz4 GPU compress/decompress round trip time: 0.0048847198486328125\n", - "7682\n", - "4\n", - "4\n", - "1920\n", - "lz4 GPU compressor output size: 7787\n", - "lz4 GPU decompressor output size: 7680\n", - "lz4 GPU compress/decompress round trip time: 0.0031135082244873047\n", - "3841\n", - "4\n", - "4\n", - "960\n", - "lz4 GPU compressor output size: 3940\n", - "lz4 GPU decompressor output size: 3840\n", - "lz4 GPU compress/decompress round trip time: 0.0027506351470947266\n" - ] - } - ], - "source": [ - "lz4_gpu_size = []\n", - "lz4_gpu_temp_size = []\n", - "lz4_gpu_round_trip_time = []\n", - "for size in sizes:\n", - " data_host = get_host_data(size, DTYPE)\n", - " data_gpu = cp.array(data_host)\n", - "\n", - " \"\"\"LZ4 GPU\"\"\"\n", - " data_gpu = cp.array(data_host)\n", - " t_gpu = time.time()\n", - " compressor = kvikio.nvcomp.LZ4Manager(dtype=data_gpu.dtype)\n", - " compressed = compressor.compress(data_gpu)\n", - " output_size = compressed.nbytes\n", - "\n", - " decompressed = compressor.decompress(compressed)\n", - " decompressed_size = decompressed.size * decompressed.itemsize\n", - " lz4_gpu_round_trip_time.append(time.time() - t_gpu)\n", - " lz4_gpu_size.append(output_size)\n", - " print('lz4 GPU compressor output size: ', output_size)\n", - " print('lz4 GPU decompressor output size: ', decompressed_size)\n", - " print('lz4 GPU compress/decompress round trip time: ',time.time() - t_gpu)" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "0b9e6efb-439b-4d9e-b221-1a728adee7d6", - "metadata": {}, - "outputs": [], - "source": [ - "# zarr lz4 max buffer size is 264241152 
int64s\n", - "# zarr lz4 max buffer size is 2113929216 bytes\n", - "# cascaded max buffer size is 2147483640 bytes\n", - "# cascaded max buffer size is 268435456 int64s" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "9cd69f83-88de-4929-b760-b8ebfb916b8f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2013929216, 1006964608, 503482304, 251741152, 125870576, 62935280, 31467632, 15733808, 7866896, 3933440, 1966720, 983360, 491680, 245840, 122912, 61456, 30720, 15360, 7680, 3840]\n", - "[33434464, 16717276, 8358716, 4179436, 2089796, 1044976, 522532, 261344, 130716, 65436, 32796, 16476, 8316, 4236, 2184, 1148, 632, 360, 224, 156]\n", - "[]\n", - "[0.10751104354858398, 0.11756682395935059, 0.05767321586608887, 0.028416156768798828, 0.014620304107666016, 0.007331132888793945, 0.004427194595336914, 0.0025060176849365234, 0.0017902851104736328, 0.0016641616821289062, 0.001974821090698242, 0.0013790130615234375, 0.0011060237884521484, 0.0014438629150390625, 0.0010533332824707031, 0.0008640289306640625, 0.001127481460571289, 0.0014081001281738281, 0.0011692047119140625, 0.0012063980102539062]\n", - "[2022340697, 1011170371, 505585200, 252792621, 126396327, 63198181, 31599109, 15799573, 7899801, 3949915, 1974981, 987514, 493774, 246904, 123459, 61745, 30907, 15498, 7787, 3940]\n", - "[]\n", - "[0.7270452976226807, 0.3670234680175781, 0.18999958038330078, 0.09043264389038086, 0.04634451866149902, 0.022789478302001953, 0.015785932540893555, 0.009443283081054688, 0.011508703231811523, 0.00690460205078125, 0.012271881103515625, 0.00648951530456543, 0.012626171112060547, 0.006663322448730469, 0.010945320129394531, 0.00687098503112793, 0.004094123840332031, 0.004844188690185547, 0.0030717849731445312, 0.0027098655700683594]\n", - "[2021826967, 1010913478, 505456734, 252728362, 126364175, 63182074, 31591024, 15795499, 7897736, 3948855, 1974422, 987206, 493597, 246793, 123383, 61678, 30822, 15401, 7699, 3852]\n", - "[4.681788921356201, 2.579982280731201, 1.2367866039276123, 0.5987403392791748, 0.3033754825592041, 0.16110515594482422, 0.1472797393798828, 0.03442859649658203, 0.010602712631225586, 0.028273344039916992, 0.0026633739471435547, 0.0007534027099609375, 0.0009970664978027344, 0.0004544258117675781, 0.0002968311309814453, 0.0002384185791015625, 0.00015044212341308594, 0.00044274330139160156, 0.00023889541625976562, 0.00026869773864746094]\n" - ] - } - ], - "source": [ - "print(input_size)\n", - "print(cascaded_size)\n", - "print(cascaded_temp_size)\n", - "print(cascaded_round_trip_time)\n", - "print(lz4_gpu_size)\n", - "print(lz4_gpu_temp_size)\n", - "print(lz4_gpu_round_trip_time)\n", - "print(lz4_size)\n", - "print(lz4_round_trip_time)\n", - "df = pd.DataFrame({\n", - " 'Input Size (Bytes)': input_size,\n", - " 'cascaded_size': cascaded_size,\n", - " 'cascaded_round_trip_time': cascaded_round_trip_time,\n", - " 'lz4_gpu_size': lz4_gpu_size,\n", - " 'lz4_gpu_round_trip_time': lz4_gpu_round_trip_time,\n", - " 'lz4_size': lz4_size,\n", - " 'lz4_round_trip_time': lz4_round_trip_time\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "c7a23383-a073-4156-9be6-9da6b8c9026e", - "metadata": {}, - "outputs": [], - "source": [ - "### You'll need the following to display the upcoming plots. 
###\n", - "\n", - "# !conda install -c conda-forge plotly\n", - "# !npm install require" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "8a7d2c60-79d7-4840-a5fb-c7e1eb42f829", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['Input Size (Bytes)', 'cascaded_size', 'cascaded_round_trip_time',\n", - " 'lz4_gpu_size', 'lz4_gpu_round_trip_time', 'lz4_size',\n", - " 'lz4_round_trip_time', 'Cascaded Compression Ratio',\n", - " 'Lz4 Gpu Compression Ratio', 'Lz4 Host Compression Ratio',\n", - " 'Cascaded Speedup', 'Lz4 Gpu Speedup'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "df['Cascaded Compression Ratio'] = df['Input Size (Bytes)'] / df['cascaded_size']\n", - "df['Lz4 Gpu Compression Ratio'] = df['Input Size (Bytes)'] / df['lz4_gpu_size']\n", - "df['Lz4 Host Compression Ratio'] = df['Input Size (Bytes)'] / df['lz4_size']\n", - "df['Cascaded Speedup'] = df['lz4_round_trip_time'] / df['cascaded_round_trip_time']\n", - "df['Lz4 Gpu Speedup'] = df['lz4_round_trip_time'] / df['lz4_gpu_round_trip_time']\n", - "print(df.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "8c6f225a-61e6-42b2-a991-6eeab56aae48", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "hovertemplate": "variable=Cascaded Speedup
[Plotly figure JSON elided: line traces "Cascaded Speedup" and "Lz4 Gpu Speedup" versus x-axis "Input Size (Bytes)", y-axis "Multiple Faster"; the rest of this output cell is the default Plotly template boilerplate. The underlying timing data is printed as lists in the cell output above.]
"white", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "Gpu Acceleration over Zarr Lz4 - Ascending " - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Input Size (Bytes)" - }, - "type": "category" - }, - "yaxis": { - "anchor": "x", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Multiple Faster" - } - } - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import plotly.express as px\n", - "title = 'Gpu Acceleration over Zarr Lz4 - ' + TARGET + \" \" + str(DTYPE)\n", - "subtitle = 'Includes host->gpu copy time'\n", - "fig = px.line(df, x='Input Size (Bytes)',\n", - " y=['Cascaded Speedup', 'Lz4 Gpu Speedup'],\n", - " labels={'value': 'Multiple Faster'},\n", - " title=title)\n", - "fig.update_xaxes(type='category')\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "e3d57a90-ca86-41da-9747-696151d66184", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "hovertemplate": "variable=Lz4 Gpu Compression Ratio
[Plotly figure JSON elided: line traces "Lz4 Gpu Compression Ratio", "Cascaded Compression Ratio", and "Lz4 Host Compression Ratio" versus x-axis "Input Size (Bytes)", y-axis "Compression Factor"; the rest of this output cell is the default Plotly template boilerplate.]
"zerolinecolor": "white", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "Compression - Ascending " - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Input Size (Bytes)" - }, - "type": "category" - }, - "yaxis": { - "anchor": "x", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Compression Factor" - } - } - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import plotly.express as px\n", - "title = 'Compression - ' + TARGET + \" \" + str(DTYPE)\n", - "fig = px.line(df, x='Input Size (Bytes)',\n", - " y=[\n", - " 'Lz4 Gpu Compression Ratio',\n", - " 'Cascaded Compression Ratio',\n", - " 'Lz4 Host Compression Ratio'\n", - " ],\n", - " labels={'value': 'Compression Factor'},\n", - " title=title)\n", - "fig.update_xaxes(type='category')\n", - "fig.show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/zarr.ipynb b/notebooks/zarr.ipynb deleted file mode 100644 index 33a981ebf5..0000000000 --- a/notebooks/zarr.ipynb +++ /dev/null @@ -1,364 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 23, - "id": "7a060f7d-9a0c-4763-98df-7dc82409c6ba", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "In this tutorial, we will show how to use KvikIO to read and write GPU memory directly to/from Zarr files.\n", - "\"\"\"\n", - "import json\n", - "import shutil\n", - "import numpy\n", - "import cupy\n", - "import zarr\n", - "import kvikio\n", - "import kvikio.zarr\n", - "from kvikio.nvcomp_codec import NvCompBatchCodec\n", - "from numcodecs import LZ4" - ] - }, - { - "cell_type": "markdown", - "id": "99f4d25b-2006-4026-8629-1accafb338ef", - "metadata": {}, - "source": [ - "We need to set three Zarr arguments: \n", - " - `meta_array`: in order to make Zarr read into GPU memory (instead of CPU memory), we set the `meta_array` argument to an empty CuPy array. \n", - " - `store`: we need to use a GPU compatible Zarr Store, which will be KvikIO’s GDS store in our case. \n", - " - `compressor`: finally, we need to use a GPU compatible compressor (or `None`). KvikIO provides a nvCOMP compressor `kvikio.nvcomp_codec.NvCompBatchCodec` that we will use." 
- ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "c179c24a-766e-4e09-83c5-349868042576", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(,\n", - " NvCompBatchCodec(algorithm='lz4', options={}),\n", - " )" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Let's create a new Zarr array using KvikIO's GDS store and LZ4 compression\n", - "z = zarr.array(\n", - " cupy.arange(10), \n", - " chunks=2, \n", - " store=kvikio.zarr.GDSStore(\"my-zarr-file.zarr\"), \n", - " meta_array=cupy.empty(()),\n", - " compressor=NvCompBatchCodec(\"lz4\"),\n", - " overwrite=True,\n", - ")\n", - "z, z.compressor, z.store" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "cupy.ndarray" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# And because we set the `meta_array` argument, reading the Zarr array returns a CuPy array\n", - "type(z[:])" - ] - }, - { - "cell_type": "markdown", - "id": "549ded39-1053-4f82-a8a7-5a2ee999a4a1", - "metadata": {}, - "source": [ - "From this point onwards, `z` can be used just like any other Zarr array." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "8221742d-f15c-450a-9701-dc8c05326126", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([1, 2, 3, 4, 5, 6, 7, 8])" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "z[1:9]" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "f0c451c1-a240-4b26-a5ef-6e70a5bbeb55", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([42, 43, 44, 45, 46, 47, 48, 49, 50, 51])" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "z[:] + 42" - ] - }, - { - "cell_type": "markdown", - "id": "7797155f-40f4-4c50-b704-2356ca64cba3", - "metadata": {}, - "source": [ - "### GPU compression / CPU decompression" - ] - }, - { - "cell_type": "markdown", - "id": "a0029deb-19b9-4dbb-baf0-ce4b199605a5", - "metadata": {}, - "source": [ - "In order to read GPU-written Zarr file into a NumPy array, we simply open that file **without** setting the `meta_array` argument:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "399f23f7-4475-496a-a537-a7163a35c888", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(numpy.ndarray,\n", - " kvikio.nvcomp_codec.NvCompBatchCodec,\n", - " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "z = zarr.open_array(kvikio.zarr.GDSStore(\"my-zarr-file.zarr\"))\n", - "type(z[:]), type(z.compressor), z[:]" - ] - }, - { - "cell_type": "markdown", - "id": "8e9f31d5", - "metadata": {}, - "source": [ - "And we don't need to use `kvikio.zarr.GDSStore` either:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "4b1f46b2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(numpy.ndarray,\n", - " kvikio.nvcomp_codec.NvCompBatchCodec,\n", - " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "z = zarr.open_array(\"my-zarr-file.zarr\")\n", - "type(z[:]), type(z.compressor), z[:]" - ] - }, - { - "cell_type": "markdown", - "id": 
"f10fd704-35f7-46b7-aabe-ea68fb2bf88d", - "metadata": {}, - "source": [ - "However, the above use `NvCompBatchCodec(\"lz4\")` for decompression. In the following, we will show how to read Zarr file written and compressed using a GPU on the CPU.\n", - "\n", - "Some algorithms, such as LZ4, can be used interchangeably on CPU and GPU but Zarr will always use the compressor used to write the Zarr file. We are working with the Zarr team to fix this shortcoming but for now, we will use a workaround where we _patch_ the metadata manually." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "d980361a-e132-4f29-ab13-cbceec5bbbb5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(numpy.ndarray, numcodecs.lz4.LZ4, array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Read the Zarr metadata and replace the compressor with a CPU implementation of LZ4\n", - "store = zarr.DirectoryStore(\"my-zarr-file.zarr\") # We could also have used kvikio.zarr.GDSStore\n", - "meta = json.loads(store[\".zarray\"])\n", - "meta[\"compressor\"] = LZ4().get_config()\n", - "store[\".zarray\"] = json.dumps(meta).encode() # NB: this changes the Zarr metadata on disk\n", - "\n", - "# And then open the file as usually\n", - "z = zarr.open_array(store)\n", - "type(z[:]), type(z.compressor), z[:]" - ] - }, - { - "cell_type": "markdown", - "id": "8ea73705", - "metadata": {}, - "source": [ - "### CPU compression / GPU decompression\n", - "\n", - "Now, let's try the otherway around." - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "c9b2d56a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(,\n", - " LZ4(acceleration=1),\n", - " )" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numcodecs\n", - "# Let's create a new Zarr array using the default compression.\n", - "z = zarr.array(\n", - " numpy.arange(10), \n", - " chunks=2, \n", - " store=\"my-zarr-file.zarr\", \n", - " overwrite=True,\n", - " # The default (CPU) implementation of LZ4 codec.\n", - " compressor=numcodecs.registry.get_codec({\"id\": \"lz4\"})\n", - ")\n", - "z, z.compressor, z.store" - ] - }, - { - "cell_type": "markdown", - "id": "dedd4623", - "metadata": {}, - "source": [ - "Again, we will use a workaround where we _patch_ the metadata manually." 
- ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "ac3f30b1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(cupy.ndarray,\n", - " kvikio.nvcomp_codec.NvCompBatchCodec,\n", - " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Read the Zarr metadata and replace the compressor with a GPU implementation of LZ4\n", - "store = kvikio.zarr.GDSStore(\"my-zarr-file.zarr\") # We could also have used zarr.DirectoryStore\n", - "meta = json.loads(store[\".zarray\"])\n", - "meta[\"compressor\"] = NvCompBatchCodec(\"lz4\").get_config()\n", - "store[\".zarray\"] = json.dumps(meta).encode() # NB: this changes the Zarr metadata on disk\n", - "\n", - "# And then open the file as usually\n", - "z = zarr.open_array(store, meta_array=cupy.empty(()))\n", - "type(z[:]), type(z.compressor), z[:]" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "80682922-b7b0-4b08-b595-228c2b446a78", - "metadata": {}, - "outputs": [], - "source": [ - "# Clean up\n", - "shutil.rmtree(\"my-zarr-file.zarr\", ignore_errors=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/python/kvikio/cli/gpu_compressor.py b/python/kvikio/cli/gpu_compressor.py deleted file mode 100755 index ac34b15e8f..0000000000 --- a/python/kvikio/cli/gpu_compressor.py +++ /dev/null @@ -1,136 +0,0 @@ -# NVIDIA 2022 - -import argparse -import os -import sys -import time - -import cupy - -import kvikio -import kvikio.nvcomp as nvcomp - - -def get_parser(): - class NvcompParser(argparse.ArgumentParser): - """ - Handle special case and show help on invalid argument - """ - - def error(self, message): - sys.stderr.write("\nERROR: {}\n\n".format(message)) - self.print_help() - sys.exit(2) - - parser = NvcompParser() - parser.add_argument("-v", "--verbose", action="store_true", help="Verbose Output") - parser.add_argument( - "-o", - "--out_file", - action="store", - dest="out_file", - help="Output filename", - ) - parser.add_argument( - "-c", - choices=["ans", "bitcomp", "cascaded", "gdeflate", "lz4", "snappy"], - action="store", - dest="compression", - help="Which GPU algorithm to use for compression.", - ) - parser.add_argument( - "-d", - action="store_true", - help="Decompress the incoming file", - ) - parser.add_argument(action="store", dest="filename", help="Relative Filename") - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - print("GPU Compression Initialized") if args.verbose else None - - file_size = os.path.getsize(args.filename) - """ test - data = cupy.arange(10000, dtype="uint8") - """ - data = cupy.zeros(file_size, dtype=cupy.int8) - t = time.time() - f = kvikio.CuFile(args.filename, "r") - f.read(data) - f.close() - read_time = time.time() - t - print(f"File read time: {read_time:.3} seconds.") if args.verbose else None - - if args.d: - compressor = nvcomp.ManagedDecompressionManager(data) - elif args.compression == "ans": - compressor = nvcomp.ANSManager() - elif args.compression == "bitcomp": - compressor = nvcomp.BitcompManager() - elif 
args.compression == "cascaded": - compressor = nvcomp.CascadedManager() - elif args.compression == "gdeflate": - compressor = nvcomp.GdeflateManager() - elif args.compression == "snappy": - compressor = nvcomp.SnappyManager() - else: - compressor = nvcomp.LZ4Manager(chunk_size=1 << 16) - - if args.d is True: - print(f"Decompressing {file_size} bytes") if args.verbose else None - t = time.time() - converted = compressor.decompress(data) - decompress_time = time.time() - t - print( - f"Decompression time: {decompress_time:.3} seconds" - ) if args.verbose else None - - if not args.out_file: - raise ValueError("Must specify filename with -o for decompression.") - - t = time.time() - o = kvikio.CuFile(args.out_file, "w") - o.write(converted) - o.close() - io_time = time.time() - t - print(f"File write time: {io_time:.3} seconds") if args.verbose else None - - print( - f"Decompressed file size {os.path.getsize(args.out_file)}" - ) if args.verbose else None - else: - file_size = os.path.getsize(args.filename) - - print(f"Compressing {file_size} bytes") if args.verbose else None - t = time.time() - converted = compressor.compress(data) - compress_time = time.time() - t - print(f"Compression time: {compress_time:.3} seconds") if args.verbose else None - - t = time.time() - if args.out_file: - o = kvikio.CuFile(args.out_file, "w") - else: - o = kvikio.CuFile(args.filename + ".gpc", "w") - o.write(converted) - o.close() - io_time = time.time() - t - print(f"File write time: {io_time:.3} seconds") if args.verbose else None - - print( - f"Compressed file size {compressor.get_compressed_output_size(converted)}" - ) if args.verbose else None - - if args.out_file: - end_name = args.out_file - else: - end_name = args.filename + ".gpc" - print(f"Created file {end_name}") if args.verbose else None - - -if __name__ == "__main__": - main() diff --git a/python/kvikio/examples/zarr_cupy_nvcomp.py b/python/kvikio/examples/zarr_cupy_nvcomp.py deleted file mode 100644 index 9f05f7874a..0000000000 --- a/python/kvikio/examples/zarr_cupy_nvcomp.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -import cupy -import numpy -import zarr - -import kvikio -import kvikio.zarr - - -def main(path): - a = cupy.arange(20) - - # Let's use KvikIO's convenience function `open_cupy_array()` to create - # a new Zarr file on disk. Its semantic is the same as `zarr.open_array()` - # but uses a GDS file store, nvCOMP compression, and CuPy arrays. - z = kvikio.zarr.open_cupy_array(store=path, mode="w", shape=(20,), chunks=(5,)) - - # `z` is a regular Zarr Array that we can write to as usual - z[0:10] = numpy.arange(0, 10) - # but it also support direct reads and writes of CuPy arrays - z[10:20] = cupy.arange(10, 20) - - # Reading `z` returns a CuPy array - assert isinstance(z[:], cupy.ndarray) - assert (a == z[:]).all() - - # Normally, we cannot assume that GPU and CPU compressors are compatible. - # E.g., `open_cupy_array()` uses nvCOMP's Snappy GPU compression by default, - # which, as far as we know, isn’t compatible with any CPU compressor. Thus, - # let's re-write our Zarr array using a CPU and GPU compatible compressor. - # - # Warning: it isn't possible to use `CompatCompressor` as a compressor argument - # in Zarr directly. It is only meant for `open_cupy_array()`. However, - # in an example further down, we show how to write using regular Zarr. 
- z = kvikio.zarr.open_cupy_array( - store=path, - mode="w", - shape=(20,), - chunks=(5,), - compressor=kvikio.zarr.CompatCompressor.lz4(), - ) - z[:] = a - - # Because we are using a CompatCompressor, it is now possible to open the file - # using Zarr's built-in LZ4 decompressor that uses the CPU. - z = zarr.open_array(path) - # `z` is now read as a regular NumPy array - assert isinstance(z[:], numpy.ndarray) - assert (a.get() == z[:]).all() - # and we can write to is as usual - z[:] = numpy.arange(20, 40) - - # And we can read the Zarr file back into a CuPy array. - z = kvikio.zarr.open_cupy_array(store=path, mode="r") - assert isinstance(z[:], cupy.ndarray) - assert (cupy.arange(20, 40) == z[:]).all() - - # Similarly, we can also open a file written by regular Zarr. - # Let's write the file without any compressor. - ary = numpy.arange(10) - z = zarr.open(store=path, mode="w", shape=ary.shape, compressor=None) - z[:] = ary - # This works as before where the file is read as a CuPy array - z = kvikio.zarr.open_cupy_array(store=path) - assert isinstance(z[:], cupy.ndarray) - assert (z[:] == cupy.asarray(ary)).all() - - # Using a compressor is a bit more tricky since not all CPU compressors - # are GPU compatible. To make sure we use a compable compressor, we use - # the CPU-part of `CompatCompressor.lz4()`. - ary = numpy.arange(10) - z = zarr.open( - store=path, - mode="w", - shape=ary.shape, - compressor=kvikio.zarr.CompatCompressor.lz4().cpu, - ) - z[:] = ary - # This works as before where the file is read as a CuPy array - z = kvikio.zarr.open_cupy_array(store=path) - assert isinstance(z[:], cupy.ndarray) - assert (z[:] == cupy.asarray(ary)).all() - - -if __name__ == "__main__": - main("/tmp/zarr-cupy-nvcomp") diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index 7dfb47cc0f..5b454295c0 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -14,7 +14,7 @@ # Set the list of Cython files to build, one .so per file set(cython_modules arr.pyx buffer.pyx defaults.pyx cufile_driver.pyx file_handle.pyx future.pyx - libnvcomp.pyx libnvcomp_ll.pyx mmap.pyx + mmap.pyx ) if(KvikIO_REMOTE_SUPPORT) diff --git a/python/kvikio/kvikio/_lib/libnvcomp.pyx b/python/kvikio/kvikio/_lib/libnvcomp.pyx deleted file mode 100644 index dc5359e9b3..0000000000 --- a/python/kvikio/kvikio/_lib/libnvcomp.pyx +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2022 Carson Swope -# Use, modification, and distribution is subject to the MIT License -# https://github.com/carsonswope/py-nvcomp/blob/main/LICENSE) -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: MIT -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. - -from enum import Enum - -from libc.stdint cimport uint8_t, uintptr_t -from libcpp cimport nullptr -from libcpp.memory cimport make_shared, shared_ptr -from libcpp.utility cimport move - -from kvikio._lib.arr cimport Array -from kvikio._lib.nvcomp_cxx_api cimport ( - ANSManager, - BitcompManager, - CascadedManager, - CompressionConfig, - DecompressionConfig, - GdeflateManager, - LZ4Manager, - SnappyManager, - create_manager, - nvcompBatchedANSDefaultOpts, - nvcompBatchedANSOpts_t, - nvcompBatchedBitcompFormatOpts, - nvcompBatchedCascadedDefaultOpts, - nvcompBatchedCascadedOpts_t, - nvcompBatchedGdeflateOpts_t, - nvcompBatchedLZ4Opts_t, - nvcompBatchedSnappyDefaultOpts, - nvcompBatchedSnappyOpts_t, - nvcompManagerBase, - nvcompType_t, -) - - -class pyNvcompType_t(Enum): - pyNVCOMP_TYPE_CHAR = nvcompType_t.NVCOMP_TYPE_CHAR - pyNVCOMP_TYPE_UCHAR = nvcompType_t.NVCOMP_TYPE_UCHAR - pyNVCOMP_TYPE_SHORT = nvcompType_t.NVCOMP_TYPE_SHORT - pyNVCOMP_TYPE_USHORT = nvcompType_t.NVCOMP_TYPE_USHORT - pyNVCOMP_TYPE_INT = nvcompType_t.NVCOMP_TYPE_INT - pyNVCOMP_TYPE_UINT = nvcompType_t.NVCOMP_TYPE_UINT - pyNVCOMP_TYPE_LONGLONG = nvcompType_t.NVCOMP_TYPE_LONGLONG - pyNVCOMP_TYPE_ULONGLONG = nvcompType_t.NVCOMP_TYPE_ULONGLONG - pyNVCOMP_TYPE_BITS = nvcompType_t.NVCOMP_TYPE_BITS - - -cdef class _nvcompManager: - # Temporary storage for factory allocated manager to prevent cleanup - cdef shared_ptr[nvcompManagerBase] _mgr - cdef nvcompManagerBase* _impl - cdef shared_ptr[CompressionConfig] _compression_config - cdef shared_ptr[DecompressionConfig] _decompression_config - - def __dealloc__(self): - # `ManagedManager` uses a temporary object, self._mgr - # to retain a reference count to the Manager created by - # create_manager. If it is present, then the `shared_ptr` - # system will free self._impl. Otherwise, we need to free - # self._iNonempl - if self._mgr == nullptr: - del self._impl - - def configure_compression(self, decomp_buffer_size): - cdef shared_ptr[CompressionConfig] partial = make_shared[ - CompressionConfig]( - self._impl.configure_compression(decomp_buffer_size) - ) - self._compression_config = make_shared[CompressionConfig]( - (move(partial.get()[0])) - ) - cdef const CompressionConfig* compression_config_ptr = \ - self._compression_config.get() - return { - "uncompressed_buffer_size": compression_config_ptr. - uncompressed_buffer_size, - "max_compressed_buffer_size": compression_config_ptr. 
- max_compressed_buffer_size, - "num_chunks": compression_config_ptr.num_chunks - } - - def compress(self, Array decomp_buffer, Array comp_buffer): - cdef uintptr_t comp_buffer_ptr = comp_buffer.ptr - self._impl.compress( - decomp_buffer.ptr, - comp_buffer_ptr, - self._compression_config.get()[0] - ) - size = self._impl.get_compressed_output_size( - comp_buffer_ptr - ) - return size - - def configure_decompression_with_compressed_buffer( - self, - Array comp_buffer - ) -> dict: - cdef shared_ptr[DecompressionConfig] partial = make_shared[ - DecompressionConfig](self._impl.configure_decompression( - comp_buffer.ptr - ) - ) - self._decompression_config = make_shared[DecompressionConfig]( - (move(partial.get()[0])) - ) - cdef const DecompressionConfig* decompression_config_ptr = \ - self._decompression_config.get() - return { - "decomp_data_size": decompression_config_ptr.decomp_data_size, - "num_chunks": decompression_config_ptr.num_chunks - } - - def decompress( - self, - Array decomp_buffer, - Array comp_buffer, - ): - self._impl.decompress( - decomp_buffer.ptr, - comp_buffer.ptr, - self._decompression_config.get()[0] - ) - - def get_compressed_output_size(self, Array comp_buffer): - return self._impl.get_compressed_output_size( - comp_buffer.ptr - ) - - -cdef class _ANSManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - ): - self._impl = new ANSManager( - uncomp_chunk_size, - nvcompBatchedANSDefaultOpts - ) - - -cdef class _BitcompManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - nvcompType_t data_type, - int bitcomp_algo, - ): - cdef opts = nvcompBatchedBitcompFormatOpts(bitcomp_algo, data_type) - self._impl = new BitcompManager( - uncomp_chunk_size, - opts - ) - - -cdef class _CascadedManager(_nvcompManager): - def __cinit__( - self, - _options, - ): - self._impl = new CascadedManager( - _options["chunk_size"], - nvcompBatchedCascadedDefaultOpts - ) - - -cdef class _GdeflateManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - int algo, - ): - cdef opts = nvcompBatchedGdeflateOpts_t(algo) - self._impl = new GdeflateManager( - uncomp_chunk_size, - opts - ) - - -cdef class _LZ4Manager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - nvcompType_t data_type, - ): - # TODO: Doesn't work with user specified streams passed down - # from anywhere up. I'm not going to rabbit hole on it until - # everything else works. - # cdef cudaStream_t stream = user_stream - cdef opts = nvcompBatchedLZ4Opts_t(data_type) - self._impl = new LZ4Manager( - uncomp_chunk_size, - opts - ) - - -cdef class _SnappyManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - ): - # TODO: Doesn't work with user specified streams passed down - # from anywhere up. I'm not going to rabbit hole on it until - # everything else works. - self._impl = new SnappyManager( - uncomp_chunk_size, - nvcompBatchedSnappyDefaultOpts - ) - - -cdef class _ManagedManager(_nvcompManager): - def __init__(self, compressed_buffer): - cdef shared_ptr[nvcompManagerBase] _mgr = create_manager( - compressed_buffer.ptr - ) - self._mgr = _mgr - self._impl = move(_mgr).get() diff --git a/python/kvikio/kvikio/_lib/libnvcomp_ll.pyx b/python/kvikio/kvikio/_lib/libnvcomp_ll.pyx deleted file mode 100644 index 46c7b399a9..0000000000 --- a/python/kvikio/kvikio/_lib/libnvcomp_ll.pyx +++ /dev/null @@ -1,1182 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
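The `_nvcompManager` wrappers deleted above were the Cython backing for the Python-level managers in `kvikio._nvcomp`, which this patch removes further down. For reference, a minimal sketch of the round trip that API supported; the module, class, and method names are exactly the ones being deleted here, so this is illustrative only and will not run once the patch is applied (users are presumably expected to move to nvCOMP's own Python API).

```python
# Illustrative only: kvikio._nvcomp and its Cython backend are removed by this patch.
import cupy as cp

from kvikio._nvcomp import LZ4Manager  # deleted later in this patch

data = cp.arange(1 << 16, dtype="uint8")
manager = LZ4Manager(chunk_size=1 << 16)

compressed = manager.compress(data)        # GPU buffer of compressed bytes
restored = manager.decompress(compressed)  # GPU buffer, viewed as the input dtype

assert bool((restored.view(data.dtype)[: data.size] == data).all())
```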
- -from __future__ import annotations - -from abc import ABC, abstractmethod -from enum import IntEnum - -from libc.stdint cimport uint32_t, uintptr_t - -from kvikio._lib.nvcomp_ll_cxx_api cimport cudaStream_t, nvcompStatus_t, nvcompType_t - -import cupy - - -class nvCompStatus(IntEnum): - Success = nvcompStatus_t.nvcompSuccess, - ErrorInvalidValue = nvcompStatus_t.nvcompErrorInvalidValue, - ErrorNotSupported = nvcompStatus_t.nvcompErrorNotSupported, - ErrorCannotDecompress = nvcompStatus_t.nvcompErrorCannotDecompress, - ErrorBadChecksum = nvcompStatus_t.nvcompErrorBadChecksum, - ErrorCannotVerifyChecksums = nvcompStatus_t.nvcompErrorCannotVerifyChecksums, - ErrorCudaError = nvcompStatus_t.nvcompErrorCudaError, - ErrorInternal = nvcompStatus_t.nvcompErrorInternal, - - -class nvCompType(IntEnum): - CHAR = nvcompType_t.NVCOMP_TYPE_CHAR - UCHAR = nvcompType_t.NVCOMP_TYPE_UCHAR - SHORT = nvcompType_t.NVCOMP_TYPE_SHORT - USHORT = nvcompType_t.NVCOMP_TYPE_USHORT - INT = nvcompType_t.NVCOMP_TYPE_INT - UINT = nvcompType_t.NVCOMP_TYPE_UINT - LONGLONG = nvcompType_t.NVCOMP_TYPE_LONGLONG - ULONGLONG = nvcompType_t.NVCOMP_TYPE_ULONGLONG - BITS = nvcompType_t.NVCOMP_TYPE_BITS - - -class nvCompBatchAlgorithm(ABC): - """Abstract class that provides interface to nvCOMP batched algorithms.""" - - # TODO(akamenev): it might be possible to have a simpler implementation that - # eilminates the need to have a separate implementation class for each algorithm, - # potentially using fused types in Cython (similar to C++ templates), - # but I could not figure out how to do that (e.g. each algorithm API set has - # a different type for the options and so on). - - def get_compress_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - """Get temporary space required for compression. - - Parameters - ---------- - batch_size: int - The number of items in the batch. - max_uncompressed_chunk_bytes: int - The maximum size in bytes of a chunk in the batch. - - Returns - ------- - int - The size in bytes of the required GPU workspace for compression. - """ - err, temp_size = self._get_comp_temp_size( - batch_size, - max_uncompressed_chunk_bytes - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get compress temp buffer size, " - f"error: {nvCompStatus(err)!r}." - ) - return temp_size - - @abstractmethod - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - """Algorithm-specific implementation.""" - ... - - def get_compress_chunk_size(self, size_t max_uncompressed_chunk_bytes): - """Get the maximum size any chunk could compress to in the batch. - - Parameters - ---------- - max_uncompressed_chunk_bytes: int - The maximum size in bytes of a chunk in the batch. - - Returns - ------- - int - The maximum compressed size in bytes of the largest chunk. That is, - the minimum amount of output memory required to be given to - the corresponding *CompressAsync function. - """ - err, comp_chunk_size = self._get_comp_chunk_size(max_uncompressed_chunk_bytes) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get output buffer size, " - f"error: {nvCompStatus(err)!r}." - ) - return comp_chunk_size - - @abstractmethod - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - """Algorithm-specific implementation.""" - ... 
- - def compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ): - """Perform compression. - - Parameters - ---------- - uncomp_chunks: cp.ndarray[uintp] - The pointers on the GPU, to uncompressed batched items. - uncomp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each uncompressed batch item on the GPU. - max_uncomp_chunk_bytes: int - The maximum size in bytes of the largest chunk in the batch. - batch_size: int - The number of chunks to compress. - temp_buf: cp.ndarray - The temporary GPU workspace. - comp_chunks: cp.ndarray[uintp] - (output) The list of pointers on the GPU, to the output location for each - compressed batch item. - comp_chunk_sizes: cp.ndarray[uint64] - (output) The compressed size in bytes of each chunk. - stream: cp.cuda.Stream - CUDA stream. - """ - - err = self._compress( - uncomp_chunks, - uncomp_chunk_sizes, - max_uncomp_chunk_bytes, - batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError(f"Compression failed, error: {nvCompStatus(err)!r}.") - - @abstractmethod - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - """Algorithm-specific implementation.""" - ... - - def get_decompress_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - """Get the amount of temp space required on the GPU for decompression. - - Parameters - ---------- - batch_size: int - The number of items in the batch. - max_uncompressed_chunk_bytes: int - The size in bytes of the largest chunk when uncompressed. - - Returns - ------- - int - The amount of temporary GPU space in bytes that will be - required to decompress. - """ - err, temp_size = self._get_decomp_temp_size( - batch_size, - max_uncompressed_chunk_bytes - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get decompress temp buffer size, " - f"error: {nvCompStatus(err)!r}." - ) - - return temp_size - - @abstractmethod - def _get_decomp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - """Algorithm-specific implementation.""" - ... - - def get_decompress_size( - self, - comp_chunks, - comp_chunk_sizes, - stream, - ): - """Get the amount of space required on the GPU for decompression. - - Parameters - ---------- - comp_chunks: cp.ndarray[uintp] - The pointers on the GPU, to compressed batched items. - comp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each compressed batch item. - stream: cp.cuda.Stream - CUDA stream. - - Returns - ------- - cp.ndarray[uint64] - The amount of GPU space in bytes that will be required - to decompress each chunk. - """ - - assert len(comp_chunks) == len(comp_chunk_sizes) - batch_size = len(comp_chunks) - - # nvCOMP requires all buffers to be in GPU memory. - uncomp_chunk_sizes = cupy.empty_like(comp_chunk_sizes) - - err = self._get_decomp_size( - comp_chunks, - comp_chunk_sizes, - batch_size, - uncomp_chunk_sizes, - stream, - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get decompress buffer size, error: {nvCompStatus(err)!r}." 
- ) - - return uncomp_chunk_sizes - - @abstractmethod - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - """Algorithm-specific implementation.""" - ... - - def decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - """Perform decompression. - - Parameters - ---------- - comp_chunks: cp.ndarray[uintp] - The pointers on the GPU, to compressed batched items. - comp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each compressed batch item. - batch_size: int - The number of chunks to decompress. - temp_buf: cp.ndarray - The temporary GPU workspace. - uncomp_chunks: cp.ndarray[uintp] - (output) The pointers on the GPU, to the output location for each - decompressed batch item. - uncomp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each decompress chunk location on the GPU. - actual_uncomp_chunk_sizes: cp.ndarray[uint64] - (output) The actual decompressed size in bytes of each chunk on the GPU. - statuses: cp.ndarray - (output) The status for each chunk of whether it was decompressed or not. - stream: cp.cuda.Stream - CUDA stream. - """ - - err = self._decompress( - comp_chunks, - comp_chunk_sizes, - batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError(f"Decompression failed, error: {nvCompStatus(err)!r}.") - - @abstractmethod - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - """Algorithm-specific implementation.""" - ... - - -cdef uintptr_t to_ptr(buf): - return buf.data.ptr - - -cdef cudaStream_t to_stream(stream): - return stream.ptr - - -# -# LZ4 algorithm. -# - -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedLZ4CompressAsync, - nvcompBatchedLZ4CompressGetMaxOutputChunkSize, - nvcompBatchedLZ4CompressGetTempSize, - nvcompBatchedLZ4DecompressAsync, - nvcompBatchedLZ4DecompressGetTempSize, - nvcompBatchedLZ4DefaultOpts, - nvcompBatchedLZ4GetDecompressSizeAsync, - nvcompBatchedLZ4Opts_t, -) - - -class nvCompBatchAlgorithmLZ4(nvCompBatchAlgorithm): - """LZ4 algorithm implementation.""" - - algo_id: str = "lz4" - - options: nvcompBatchedLZ4Opts_t - - HEADER_SIZE_BYTES: size_t = sizeof(uint32_t) - - def __init__(self, data_type: int = None, has_header: bool = True): - """Initialize the codec. - - Parameters - ---------- - data_type: int or None - Source data type. If None, uses nvcomp default options. - has_header: bool - Whether the compressed data has a header. - This enables data compatibility between numcodecs LZ4 codec, - which has the header and nvCOMP LZ4 codec which does not - require the header. - """ - if data_type is None: - self.options = nvcompBatchedLZ4DefaultOpts - else: - self.options = nvcompBatchedLZ4Opts_t(data_type) - - self.has_header = has_header - - # Note on LZ4 header structure: numcodecs LZ4 codec prepends - # a 4-byte (uint32_t) header to each compressed chunk. - # The header stores the size of the original (uncompressed) data: - # https://github.com/zarr-developers/numcodecs/blob/cb155432e36536e17a2d054c8c24b7bf6f4a7347/numcodecs/lz4.pyx#L89 - # - # The following CUDA kernels read / write chunk header by - # casting the chunk pointer to a pointer to unsigned int. 
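The note above describes the framing that keeps this codec's LZ4 output interchangeable with numcodecs: each chunk is prefixed by a little-endian `uint32` holding the uncompressed size. That layout can be checked from the host with plain numcodecs and NumPy, independent of the CUDA kernels defined next; a small sketch, assuming `numcodecs` is installed.

```python
# The numcodecs LZ4 codec prepends a little-endian uint32 with the
# uncompressed size -- the same 4-byte header the kernels below read/write.
import numcodecs
import numpy as np

payload = np.arange(256, dtype=np.uint8)
frame = np.frombuffer(numcodecs.LZ4().encode(payload), dtype=np.uint8)

(stored_size,) = np.frombuffer(frame[:4].tobytes(), dtype="<u4")
assert stored_size == payload.nbytes  # 256
```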
- - # CUDA kernel that copies uncompressed chunk size from the chunk header. - self._get_size_from_header_kernel = cupy.ElementwiseKernel( - "uint64 comp_chunk_ptr", - "uint64 uncomp_chunk_size", - "uncomp_chunk_size = *((unsigned int *)comp_chunk_ptr)", - "get_size_from_header", - ) - - # CUDA kernel that copies uncompressed chunk size to the chunk header. - self._set_chunk_size_header_kernel = cupy.ElementwiseKernel( - "uint64 uncomp_chunk_size", - "uint64 comp_chunk_ptr", - "((unsigned int *)comp_chunk_ptr)[0] = (unsigned int)uncomp_chunk_size", - "set_chunk_size_header", - no_return=True, - ) - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedLZ4CompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - # Add header size, if needed. - if err == nvcompStatus_t.nvcompSuccess and self.has_header: - max_compressed_bytes += self.HEADER_SIZE_BYTES - - return (err, max_compressed_bytes) - - def compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ): - if self.has_header: - # If there is a header, we need to: - # 1. Copy the uncompressed chunk size to the compressed chunk header. - # 2. Update target pointers in comp_chunks to skip the header portion, - # which is not compressed. - # - self._set_chunk_size_header_kernel(uncomp_chunk_sizes, comp_chunks) - # Update chunk pointer to skip the header. - comp_chunks += self.HEADER_SIZE_BYTES - - super().compress( - uncomp_chunks, - uncomp_chunk_sizes, - max_uncomp_chunk_bytes, - batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ) - - if self.has_header: - # Update chunk pointer and size to include the header. - comp_chunks -= self.HEADER_SIZE_BYTES - comp_chunk_sizes += self.HEADER_SIZE_BYTES - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. 
- return nvcompBatchedLZ4CompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedLZ4DecompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def get_decompress_size( - self, - comp_chunks, - comp_chunk_sizes, - stream, - ): - if not self.has_header: - return super().get_decompress_size( - comp_chunks, - comp_chunk_sizes, - stream, - ) - - return self._get_size_from_header_kernel(comp_chunks) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedLZ4GetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - if self.has_header: - # Update chunk pointer and size to exclude the header. - comp_chunks += self.HEADER_SIZE_BYTES - comp_chunk_sizes -= self.HEADER_SIZE_BYTES - - super().decompress( - comp_chunks, - comp_chunk_sizes, - batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. - return nvcompBatchedLZ4DecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - NULL, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - NULL, - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}(data_type={self.options['data_type']})" - - -# -# Gdeflate algorithm. 
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedGdeflateCompressAsync, - nvcompBatchedGdeflateCompressGetMaxOutputChunkSize, - nvcompBatchedGdeflateCompressGetTempSize, - nvcompBatchedGdeflateDecompressAsync, - nvcompBatchedGdeflateDecompressGetTempSize, - nvcompBatchedGdeflateDefaultOpts, - nvcompBatchedGdeflateGetDecompressSizeAsync, - nvcompBatchedGdeflateOpts_t, -) - - -class nvCompBatchAlgorithmGdeflate(nvCompBatchAlgorithm): - """Gdeflate algorithm implementation.""" - - algo_id: str = "gdeflate" - - options: nvcompBatchedGdeflateOpts_t - - def __init__(self, algo: int = None): - if algo is None: - self.options = nvcompBatchedGdeflateDefaultOpts - else: - self.options = nvcompBatchedGdeflateOpts_t(algo) - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedGdeflateCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedGdeflateCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - return nvcompBatchedGdeflateCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedGdeflateDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedGdeflateGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - return nvcompBatchedGdeflateDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - NULL, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - NULL, - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}(algo={self.options['algo']})" - - -# -# zstd algorithm. 
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedZstdCompressAsync, - nvcompBatchedZstdCompressGetMaxOutputChunkSize, - nvcompBatchedZstdCompressGetTempSize, - nvcompBatchedZstdDecompressAsync, - nvcompBatchedZstdDecompressGetTempSize, - nvcompBatchedZstdDefaultOpts, - nvcompBatchedZstdGetDecompressSizeAsync, - nvcompBatchedZstdOpts_t, -) - - -class nvCompBatchAlgorithmZstd(nvCompBatchAlgorithm): - """zstd algorithm implementation.""" - - algo_id: str = "zstd" - - options: nvcompBatchedZstdOpts_t - - def __init__(self): - self.options = nvcompBatchedZstdDefaultOpts - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedZstdCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedZstdCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - return nvcompBatchedZstdCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedZstdDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedZstdGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - return nvcompBatchedZstdDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - to_ptr(actual_uncomp_chunk_sizes), - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - to_ptr(statuses), - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}()" - - -# -# Snappy algorithm. 
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedSnappyCompressAsync, - nvcompBatchedSnappyCompressGetMaxOutputChunkSize, - nvcompBatchedSnappyCompressGetTempSize, - nvcompBatchedSnappyDecompressAsync, - nvcompBatchedSnappyDecompressGetTempSize, - nvcompBatchedSnappyDefaultOpts, - nvcompBatchedSnappyGetDecompressSizeAsync, - nvcompBatchedSnappyOpts_t, -) - - -class nvCompBatchAlgorithmSnappy(nvCompBatchAlgorithm): - """Snappy algorithm implementation.""" - - algo_id: str = "snappy" - - options: nvcompBatchedSnappyOpts_t - - def __init__(self): - self.options = nvcompBatchedSnappyDefaultOpts - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedSnappyCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - return nvcompBatchedSnappyCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedSnappyDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedSnappyGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - return nvcompBatchedSnappyDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - NULL, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - NULL, - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}()" - - -# -# Deflate algorithm. 
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedDeflateCompressAsync, - nvcompBatchedDeflateCompressGetMaxOutputChunkSize, - nvcompBatchedDeflateCompressGetTempSize, - nvcompBatchedDeflateDecompressAsync, - nvcompBatchedDeflateDecompressGetTempSize, - nvcompBatchedDeflateDefaultOpts, - nvcompBatchedDeflateGetDecompressSizeAsync, - nvcompBatchedDeflateOpts_t, -) - - -class nvCompBatchAlgorithmDeflate(nvCompBatchAlgorithm): - """Deflate algorithm implementation.""" - - algo_id: str = "deflate" - - options: nvcompBatchedDeflateOpts_t - - def __init__(self, algo: int = None): - if algo is None: - self.options = nvcompBatchedDeflateDefaultOpts - else: - self.options = nvcompBatchedDeflateOpts_t(algo) - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedDeflateCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. - return nvcompBatchedDeflateCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedDeflateDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedDeflateGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. 
- return nvcompBatchedDeflateDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - to_ptr(actual_uncomp_chunk_sizes), - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - to_ptr(statuses), - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}(algo={self.options['algo']})" - - -SUPPORTED_ALGORITHMS = { - a.algo_id: a for a in [ - nvCompBatchAlgorithmLZ4, - nvCompBatchAlgorithmGdeflate, - nvCompBatchAlgorithmZstd, - nvCompBatchAlgorithmSnappy, - nvCompBatchAlgorithmDeflate, - ] -} diff --git a/python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd b/python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd deleted file mode 100644 index b86797a93f..0000000000 --- a/python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) 2022 Carson Swope -# Use, modification, and distribution is subject to the MIT License -# https://github.com/carsonswope/py-nvcomp/blob/main/LICENSE) -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: MIT -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. 
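`SUPPORTED_ALGORITHMS`, deleted just above, mapped an algorithm id to its batch implementation class; at this level the API works on device pointer arrays rather than Python buffers, mirroring the nvCOMP batched C signatures declared in the `.pxd` files below. A hedged, single-chunk sketch of how that interface was driven; it targets the module removed by this patch, so it is reference material only.

```python
# Reference-only: kvikio._lib.libnvcomp_ll is removed by this patch.
import cupy as cp

from kvikio._lib.libnvcomp_ll import SUPPORTED_ALGORITHMS  # deleted module

algo = SUPPORTED_ALGORITHMS["lz4"]()     # nvCompBatchAlgorithmLZ4, header enabled
stream = cp.cuda.Stream.ptds

chunk = cp.arange(1 << 16, dtype=cp.uint8)
uncomp_ptrs = cp.array([chunk.data.ptr], dtype=cp.uintp)    # device pointer array
uncomp_sizes = cp.array([chunk.nbytes], dtype=cp.uint64)

temp_buf = cp.empty(algo.get_compress_temp_size(1, chunk.nbytes), dtype=cp.uint8)
out_buf = cp.empty(algo.get_compress_chunk_size(chunk.nbytes), dtype=cp.uint8)
comp_ptrs = cp.array([out_buf.data.ptr], dtype=cp.uintp)
comp_sizes = cp.empty(1, dtype=cp.uint64)                   # filled by the call

algo.compress(uncomp_ptrs, uncomp_sizes, chunk.nbytes, 1,
              temp_buf, comp_ptrs, comp_sizes, stream)
stream.synchronize()

lz4_frame = out_buf[: int(comp_sizes[0])]  # header + compressed bytes
```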
- -from libc.stdint cimport uint8_t, uint32_t -from libcpp.memory cimport shared_ptr -from libcpp.vector cimport vector - - -cdef extern from "cuda_runtime.h": - ctypedef void* cudaStream_t - -cdef extern from "nvcomp.h": - ctypedef enum nvcompType_t: - NVCOMP_TYPE_CHAR = 0, # 1B - NVCOMP_TYPE_UCHAR = 1, # 1B - NVCOMP_TYPE_SHORT = 2, # 2B - NVCOMP_TYPE_USHORT = 3, # 2B - NVCOMP_TYPE_INT = 4, # 4B - NVCOMP_TYPE_UINT = 5, # 4B - NVCOMP_TYPE_LONGLONG = 6, # 8B - NVCOMP_TYPE_ULONGLONG = 7, # 8B - NVCOMP_TYPE_BITS = 0xff # 1b - - -cdef extern from "nvcomp/shared_types.h": - ctypedef enum nvcompStatus_t: - nvcompSuccess = 0, - nvcompErrorInvalidValue = 10, - nvcompErrorNotSupported = 11, - nvcompErrorCannotDecompress = 12, - nvcompErrorBadChecksum = 13, - nvcompErrorCannotVerifyChecksums = 14, - nvcompErrorCudaError = 1000, - nvcompErrorInternal = 10000, - -# Manager Factory -cdef extern from "nvcomp/nvcompManagerFactory.hpp" namespace 'nvcomp': - cdef shared_ptr[nvcompManagerBase] create_manager "nvcomp::create_manager"( - const uint8_t* comp_buffer - ) except + - - -# Compression Manager -cdef extern from "nvcomp/nvcompManager.hpp" namespace 'nvcomp': - cdef cppclass PinnedPtrPool[T]: - pass - - cdef cppclass CompressionConfig "nvcomp::CompressionConfig": - const size_t uncompressed_buffer_size - const size_t max_compressed_buffer_size - const size_t num_chunks - CompressionConfig( - PinnedPtrPool[nvcompStatus_t]* pool, - size_t uncompressed_buffer_size) except + - nvcompStatus_t* get_status() const - CompressionConfig(CompressionConfig& other) - CompressionConfig& operator=(const CompressionConfig& other) except + - # Commented as Cython doesn't support rvalues, but a user can call - # `move` with the existing operator and generate correct C++ code - # xref: https://github.com/cython/cython/issues/1445 - # CompressionConfig& operator=(CompressionConfig&& other) except + - - cdef cppclass DecompressionConfig "nvcomp::DecompressionConfig": - size_t decomp_data_size - uint32_t num_chunks - DecompressionConfig(PinnedPtrPool[nvcompStatus_t]& pool) except + - nvcompStatus_t* get_status() const - DecompressionConfig(DecompressionConfig& other) - DecompressionConfig& operator=(const DecompressionConfig& other) except + - # Commented as Cython doesn't support rvalues, but a user can call - # `move` with the existing operator and generate correct C++ code - # xref: https://github.com/cython/cython/issues/1445 - # DecompressionConfig& operator=(DecompressionConfig&& other) except + - - cdef cppclass nvcompManagerBase "nvcomp::nvcompManagerBase": - CompressionConfig configure_compression( - const size_t decomp_buffer_size) - void compress( - const uint8_t* decomp_buffer, - uint8_t* comp_buffer, - const CompressionConfig& comp_config) except + - DecompressionConfig configure_decompression( - const uint8_t* comp_buffer) - DecompressionConfig configure_decompression( - const CompressionConfig& comp_config) - void decompress( - uint8_t* decomp_buffer, - const uint8_t* comp_buffer, - const DecompressionConfig& decomp_config) - size_t get_compressed_output_size(uint8_t* comp_buffer) except + - - cdef cppclass PimplManager "nvcomp::PimplManager": - CompressionConfig configure_compression( - const size_t decomp_buffer_size) except + - void compress( - const uint8_t* decomp_buffer, - uint8_t* comp_buffer, - const CompressionConfig& comp_config) except + - DecompressionConfig configure_decompression( - const uint8_t* comp_buffer) - DecompressionConfig configure_decompression( - const CompressionConfig& 
comp_config) - void decompress( - uint8_t* decomp_buffer, - const uint8_t* comp_buffer, - const DecompressionConfig& decomp_config) except + - size_t get_compressed_output_size(uint8_t* comp_buffer) except + - -# C++ Concrete ANS Manager -cdef extern from "nvcomp/ans.h" nogil: - ctypedef enum nvcompANSType_t: - nvcomp_rANS = 0 - - ctypedef struct nvcompBatchedANSOpts_t: - nvcompANSType_t type - cdef nvcompBatchedANSOpts_t nvcompBatchedANSDefaultOpts - -cdef extern from "nvcomp/ans.hpp": - cdef cppclass ANSManager "nvcomp::ANSManager": - ANSManager( - size_t uncomp_chunk_size, - const nvcompBatchedANSOpts_t& format_opts, - ) except + - -# C++ Concrete Bitcomp Manager -cdef extern from "nvcomp/bitcomp.h" nogil: - ctypedef struct nvcompBatchedBitcompFormatOpts: - int algorithm_type - nvcompType_t data_type - cdef nvcompBatchedBitcompFormatOpts nvcompBatchedBitcompDefaultOpts - -cdef extern from "nvcomp/bitcomp.hpp": - cdef cppclass BitcompManager "nvcomp::BitcompManager": - BitcompManager( - size_t uncomp_chunk_size, - const nvcompBatchedBitcompFormatOpts& format_opts, - ) except + - -# C++ Concrete Cascaded Manager -cdef extern from "nvcomp/cascaded.h" nogil: - ctypedef struct nvcompBatchedCascadedOpts_t: - size_t chunk_size - nvcompType_t type - int num_RLEs - int num_deltas - int use_bp - cdef nvcompBatchedCascadedOpts_t nvcompBatchedCascadedDefaultOpts - -cdef extern from "nvcomp/cascaded.hpp" nogil: - cdef cppclass CascadedManager "nvcomp::CascadedManager": - CascadedManager( - size_t uncomp_chunk_size, - const nvcompBatchedCascadedOpts_t& options, - ) - -# C++ Concrete Gdeflate Manager -cdef extern from "nvcomp/gdeflate.h" nogil: - ctypedef struct nvcompBatchedGdeflateOpts_t: - int algo - cdef nvcompBatchedGdeflateOpts_t nvcompBatchedGdeflateDefaultOpts - -cdef extern from "nvcomp/gdeflate.hpp": - cdef cppclass GdeflateManager "nvcomp::GdeflateManager": - GdeflateManager( - int uncomp_chunk_size, - const nvcompBatchedGdeflateOpts_t& format_opts, - ) except + - -# C++ Concrete LZ4 Manager -cdef extern from "nvcomp/gdeflate.h" nogil: - ctypedef struct nvcompBatchedLZ4Opts_t: - nvcompType_t data_type - cdef nvcompBatchedLZ4Opts_t nvcompBatchedLZ4DefaultOpts - -cdef extern from "nvcomp/lz4.hpp": - cdef cppclass LZ4Manager "nvcomp::LZ4Manager": - LZ4Manager( - size_t uncomp_chunk_size, - const nvcompBatchedLZ4Opts_t& format_opts, - ) except + - -# C++ Concrete Snappy Manager -cdef extern from "nvcomp/snappy.h" nogil: - ctypedef struct nvcompBatchedSnappyOpts_t: - int reserved - cdef nvcompBatchedSnappyOpts_t nvcompBatchedSnappyDefaultOpts - -cdef extern from "nvcomp/snappy.hpp": - cdef cppclass SnappyManager "nvcomp::SnappyManager": - SnappyManager( - size_t uncomp_chunk_size, - const nvcompBatchedSnappyOpts_t& format_opts, - ) except + diff --git a/python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd b/python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd deleted file mode 100644 index 6a23eb5cd1..0000000000 --- a/python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd +++ /dev/null @@ -1,362 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
- -# distutils: language = c++ -# cython: language_level=3 - -cdef extern from "cuda_runtime.h": - ctypedef void* cudaStream_t - - ctypedef enum cudaMemcpyKind: - cudaMemcpyHostToHost = 0, - cudaMemcpyHostToDevice = 1, - cudaMemcpyDeviceToHost = 2, - cudaMemcpyDeviceToDevice = 3, - cudaMemcpyDefault = 4 - -cdef extern from "nvcomp.h": - ctypedef enum nvcompType_t: - NVCOMP_TYPE_CHAR = 0, # 1B - NVCOMP_TYPE_UCHAR = 1, # 1B - NVCOMP_TYPE_SHORT = 2, # 2B - NVCOMP_TYPE_USHORT = 3, # 2B - NVCOMP_TYPE_INT = 4, # 4B - NVCOMP_TYPE_UINT = 5, # 4B - NVCOMP_TYPE_LONGLONG = 6, # 8B - NVCOMP_TYPE_ULONGLONG = 7, # 8B - NVCOMP_TYPE_BITS = 0xff # 1b - -cdef extern from "nvcomp/shared_types.h": - ctypedef enum nvcompStatus_t: - nvcompSuccess = 0, - nvcompErrorInvalidValue = 10, - nvcompErrorNotSupported = 11, - nvcompErrorCannotDecompress = 12, - nvcompErrorBadChecksum = 13, - nvcompErrorCannotVerifyChecksums = 14, - nvcompErrorCudaError = 1000, - nvcompErrorInternal = 10000, - -# nvCOMP Low-Level Interface. -# https://github.com/NVIDIA/nvcomp/blob/main/doc/lowlevel_c_quickstart.md - -# -# LZ4 batch compression/decompression API. -# -cdef extern from "nvcomp/lz4.h" nogil: - ctypedef struct nvcompBatchedLZ4Opts_t: - nvcompType_t data_type - - cdef nvcompBatchedLZ4Opts_t nvcompBatchedLZ4DefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedLZ4CompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedLZ4Opts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedLZ4CompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedLZ4Opts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedLZ4CompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedLZ4Opts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedLZ4DecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedLZ4GetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedLZ4DecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - -# -# Gdeflate batch compression/decompression API. -# -cdef extern from "nvcomp/gdeflate.h" nogil: - ctypedef struct nvcompBatchedGdeflateOpts_t: - int algo - - cdef nvcompBatchedGdeflateOpts_t nvcompBatchedGdeflateDefaultOpts - - # Compression API. 
- cdef nvcompStatus_t nvcompBatchedGdeflateCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedGdeflateOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedGdeflateCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedGdeflateOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedGdeflateCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedGdeflateOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedGdeflateDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedGdeflateGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedGdeflateDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - -# -# zstd batch compression/decompression API. -# -cdef extern from "nvcomp/zstd.h" nogil: - ctypedef struct nvcompBatchedZstdOpts_t: - int reserved - - cdef nvcompBatchedZstdOpts_t nvcompBatchedZstdDefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedZstdCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedZstdOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedZstdCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedZstdOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedZstdCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedZstdOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedZstdDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedZstdGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedZstdDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - -# -# Snappy batch compression/decompression API. 
-# -cdef extern from "nvcomp/snappy.h" nogil: - ctypedef struct nvcompBatchedSnappyOpts_t: - int reserved - - cdef nvcompBatchedSnappyOpts_t nvcompBatchedSnappyDefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedSnappyCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedSnappyOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedSnappyOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedSnappyCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedSnappyOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedSnappyDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedSnappyGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedSnappyDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - - -# -# Deflate batch compression/decompression API. -# -cdef extern from "nvcomp/deflate.h" nogil: - ctypedef struct nvcompBatchedDeflateOpts_t: - int algo - - cdef nvcompBatchedDeflateOpts_t nvcompBatchedDeflateDefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedDeflateCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedDeflateOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedDeflateCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedDeflateOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedDeflateCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedDeflateOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. 
- cdef nvcompStatus_t nvcompBatchedDeflateDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedDeflateGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedDeflateDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) diff --git a/python/kvikio/kvikio/_nvcomp.py b/python/kvikio/kvikio/_nvcomp.py deleted file mode 100644 index 5606ad5ce5..0000000000 --- a/python/kvikio/kvikio/_nvcomp.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -from enum import Enum - -import cupy as cp -import numpy as np - -import kvikio._lib.libnvcomp as _lib -from kvikio._lib.arr import asarray - -_dtype_map = { - cp.dtype("int8"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_CHAR, - cp.dtype("uint8"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_UCHAR, - cp.dtype("int16"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_SHORT, - cp.dtype("uint16"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_USHORT, - cp.dtype("int32"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_INT, - cp.dtype("uint32"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_UINT, - cp.dtype("int64"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_LONGLONG, - cp.dtype("uint64"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_ULONGLONG, -} - - -def cp_to_nvcomp_dtype(in_type: cp.dtype) -> Enum: - """Convert np/cp dtypes to nvcomp integral dtypes. - - Parameters - ---------- - in_type - A type argument that can be used to initialize a cupy/numpy dtype. - - Returns - ------- - int - The value of the NVCOMP_TYPE for supported dtype. - """ - cp_type = cp.dtype(in_type) - return _dtype_map[cp_type] - - -class nvCompManager: - """Base class for nvComp Compression Managers. - - Compression managers compress uncompressed data and decompress the result. - - Child types of nvCompManager implement only their constructor, as they each - take different options to build. The rest of their implementation is - in nvCompManager. - - nvCompManager also keeps all of the options for its child types. - """ - - _manager: _lib._nvcompManager = None - config: dict = {} - decompression_config: dict = {} - - # This is a python option: What type was the data when it was passed in? - # This is used only for returning a decompressed view of the original - # datatype. Untested so far. - input_type = cp.int8 - - # Default options exist for every option type for every class that inherits - # from nvCompManager, which takes advantage of the below property-setting - # code. - chunk_size: int = 1 << 16 - data_type: _lib.pyNvcompType_t = _lib.pyNvcompType_t.pyNVCOMP_TYPE_UCHAR - # Some classes have this defined as type, some as data_type. - type: _lib.pyNvcompType_t = _lib.pyNvcompType_t.pyNVCOMP_TYPE_UCHAR - - # Bitcomp Defaults - bitcomp_algo: int = 0 - - # Gdeflate defaults - algo: int = 0 - - def __init__(self, kwargs): - """Stores the results of all input arguments as class members. - - This code does type correction, fixing inputs to have an expected - shape before calling one of the nvCompManager methods on a child - class. 
- - Special case: Convert data_type to a _lib.pyNvcompType_t - """ - # data_type will be passed in as a python object. Convert it to - # a C++ nvcompType_t here. - if kwargs.get("data_type"): - if not isinstance(kwargs["data_type"], _lib.pyNvcompType_t): - kwargs["input_type"] = kwargs.get("data_type") - kwargs["data_type"] = cp_to_nvcomp_dtype( - cp.dtype(kwargs["data_type"]).type - ) - # Special case: Convert type to a _lib.pyNvcompType_t - if kwargs.get("type"): - if not isinstance(kwargs["type"], _lib.pyNvcompType_t): - kwargs["input_type"] = kwargs.get("type") - kwargs["type"] = cp_to_nvcomp_dtype(cp.dtype(kwargs["type"]).type) - for k, v in kwargs.items(): - setattr(self, k, v) - - def compress(self, data: cp.ndarray) -> cp.ndarray: - """Compress a buffer. - - Parameters - ---------- - data: cp.ndarray - A GPU buffer of data to compress. - - Returns - ------- - cp.ndarray - A GPU buffer of compressed bytes. - """ - # TODO: An option: check if incoming data size matches the size of the - # last incoming data, and reuse temp and out buffer if so. - data_size = data.size * data.itemsize - self.config = self._manager.configure_compression(data_size) - self.compress_out_buffer = cp.empty( - self.config["max_compressed_buffer_size"], dtype="uint8" - ) - size = self._manager.compress(asarray(data), asarray(self.compress_out_buffer)) - return self.compress_out_buffer[0:size] - - def decompress(self, data: cp.ndarray) -> cp.ndarray: - """Decompress a GPU buffer. - - Parameters - ---------- - data: cp.ndarray - A GPU buffer of data to decompress. - - Returns - ------- - cp.ndarray - An array of `self.dtype` produced after decompressing the input argument. - """ - self.decompression_config = ( - self._manager.configure_decompression_with_compressed_buffer(asarray(data)) - ) - decomp_buffer = cp.empty( - self.decompression_config["decomp_data_size"], dtype="uint8" - ) - self._manager.decompress(asarray(decomp_buffer), asarray(data)) - return decomp_buffer.view(self.input_type) - - def configure_compression(self, data_size: int) -> dict: - """Return the compression configuration object. - - Parameters - ---------- - data_size: int - The size of the buffer that is staged to be compressed. - - Returns - ------- - dict { - "uncompressed_buffer_size": The size of the input data - "max_compressed_buffer_size": The maximum size of the compressed data. The - size of the buffer that must be allocated before calling compress. - "num_chunks": The number of configured chunks to compress the data over - } - """ - return self._manager.configure_compression(data_size) - - def configure_decompression_with_compressed_buffer( - self, data: cp.ndarray - ) -> cp.ndarray: - """Return the decompression configuration object. - - Parameters - ---------- - data: cp.ndarray - A GPU buffer of previously compressed data. - - Returns - ------- - dict { - "decomp_data_size": The size of each decompression chunk. - "num_chunks": The number of chunks that the decompressed data is returned - in. - } - """ - return self._manager.configure_decompression_with_compressed_buffer( - asarray(data) - ) - - def get_compressed_output_size(self, comp_buffer: cp.ndarray) -> int: - """Return the actual size of compression result. - - Returns the number of bytes that should be copied out of - `comp_buffer`. - - Parameters - ---------- - comp_buffer: cp.ndarray - A GPU buffer that has been previously compressed. 
- - Returns - ------- - int - """ - return self._manager.get_compressed_output_size(asarray(comp_buffer)) - - -class ANSManager(nvCompManager): - def __init__(self, **kwargs): - """Initialize an ANSManager object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - Defaults to 4096. - """ - super().__init__(kwargs) - - self._manager = _lib._ANSManager(self.chunk_size) - - -class BitcompManager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU BitcompCompressor object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - Defaults to 4096. - """ - super().__init__(kwargs) - - self._manager = _lib._BitcompManager( - self.chunk_size, - self.data_type.value, - self.bitcomp_algo, - ) - - -class CascadedManager(nvCompManager): - def __init__(self, **kwargs): - """Initialize a CascadedManager for a specific dtype. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - Defaults to 4096 and can't currently be changed. - dtype: cp.dtype (optional) - The dtype of the input buffer to be compressed. - num_RLEs: int (optional) - Number of Run-Length Encoders to use, see [algorithms overview.md]( - https://github.com/NVIDIA/nvcomp/blob/main/doc/algorithms_overview.md#run-length-encoding-rle) # noqa: E501 - num_deltas: int (optional) - Number of Delta Encoders to use, see [algorithms overview.md]( - https://github.com/NVIDIA/nvcomp/blob/main/doc/algorithms_overview.md#delta-encoding) # noqa: E501 - use_bp: bool (optional) - Enable Bitpacking, see [algorithms overview.md]( - https://github.com/NVIDIA/nvcomp/blob/main/doc/algorithms_overview.md#bitpacking) # noqa: E501 - """ - super().__init__(kwargs) - default_options = { - "chunk_size": 1 << 12, - "type": np.int32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - } - # Replace any options that may have been excluded, they are not optional. - for k, v in default_options.items(): - try: - getattr(self, k) - except Exception: - setattr(self, k, v) - - self.options = { - "chunk_size": self.chunk_size, - "type": self.type, - "num_RLEs": self.num_RLEs, - "num_deltas": self.num_deltas, - "use_bp": self.use_bp, - } - self._manager = _lib._CascadedManager(default_options) - - -class GdeflateManager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU GdeflateCompressor object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - algo: int (optional) - Integer in the range [0, 1, 2]. Only algorithm #0 is currently - supported. - """ - super().__init__(kwargs) - - self._manager = _lib._GdeflateManager(self.chunk_size, self.algo) - - -class LZ4Manager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU LZ4Compressor object. - - Used to compress and decompress GPU buffers of a specific dtype. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - The size of each chunk of data to decompress indepentently with - LZ4. Must be within the range of [32768, 16777216]. Larger sizes will - result in higher compression, but with decreased parallelism. The - recommended size is 65536. 
- Defaults to the recommended size. - data_type: pyNVCOMP_TYPE (optional) - The data type returned for decompression. - Defaults to pyNVCOMP_TYPE.UCHAR - """ - super().__init__(kwargs) - self._manager = _lib._LZ4Manager(self.chunk_size, self.data_type.value) - - -class SnappyManager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU SnappyCompressor object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - """ - super().__init__(kwargs) - self._manager = _lib._SnappyManager(self.chunk_size) - - -class ManagedDecompressionManager(nvCompManager): - def __init__(self, compressed_buffer): - """Create a Managed compressor using the - create_manager factory method. - - This function is used in order to automatically - identify which compression algorithm was used on - an input buffer. - - It returns a ManagedDecompressionManager that can - then be used normally to decompress the unknown - compressed binary data, or compress other data - into the same format. - - Parameters - ---------- - compressed_buffer: cp.ndarray - A buffer of compressed bytes of unknown origin. - """ - super().__init__({}) - self._manager = _lib._ManagedManager(asarray(compressed_buffer)) diff --git a/python/kvikio/kvikio/_nvcomp_codec.py b/python/kvikio/kvikio/_nvcomp_codec.py deleted file mode 100644 index dc60d9c7dc..0000000000 --- a/python/kvikio/kvikio/_nvcomp_codec.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -from typing import Any, Mapping, Optional, Sequence - -import cupy as cp -import cupy.typing -from numcodecs.compat import ensure_contiguous_ndarray_like - -from kvikio._lib.libnvcomp_ll import SUPPORTED_ALGORITHMS -from kvikio.numcodecs import BufferLike, CudaCodec - - -class NvCompBatchCodec(CudaCodec): - """Codec that uses batch algorithms from nvCOMP library. - - An algorithm is selected using `algorithm` parameter. - If the algorithm takes additional options, they can be - passed to the algorithm using `options` dictionary. - """ - - # Header stores original uncompressed size. This is required to enable - # data compatibility between existing numcodecs codecs and NvCompBatchCodec. - HEADER_SIZE_BYTES: int = 4 - - codec_id: str = "nvcomp_batch" - algorithm: str - options: Mapping[str, Any] - - def __init__( - self, - algorithm: str, - options: Optional[Mapping[str, Any]] = None, - stream: Optional[cp.cuda.Stream] = None, - ) -> None: - algo_id = algorithm.lower() - algo_t = SUPPORTED_ALGORITHMS.get(algo_id, None) - if algo_t is None: - raise ValueError( - f"{algorithm} is not supported. " - f"Must be one of: {list(SUPPORTED_ALGORITHMS.keys())}" - ) - - self.algorithm = algo_id - self.options = dict(options) if options is not None else {} - - # Create an algorithm. - self._algo = algo_t(**self.options) - # Use default stream, if needed. - self._stream = stream if stream is not None else cp.cuda.Stream.ptds - - def encode(self, buf: BufferLike) -> cupy.typing.NDArray: - return self.encode_batch([buf])[0] - - def encode_batch(self, bufs: Sequence[Any]) -> Sequence[Any]: - """Encode data in `bufs` using nvCOMP. - - Parameters - ---------- - bufs : - Data to be encoded. Each buffer in the list may be any object - supporting the new-style buffer protocol. - - Returns - ------- - List of encoded buffers. Each buffer may be any object supporting - the new-style buffer protocol. 
- """ - num_chunks = len(bufs) - if num_chunks == 0: - return [] - - bufs = [cp.asarray(ensure_contiguous_ndarray_like(b)) for b in bufs] - buf_sizes = [b.size * b.itemsize for b in bufs] - - max_chunk_size = max(buf_sizes) - - # Get temp and output buffer sizes. - temp_size = self._algo.get_compress_temp_size(num_chunks, max_chunk_size) - comp_chunk_size = self._algo.get_compress_chunk_size(max_chunk_size) - - # Prepare data and size buffers. - # uncomp_chunks is used as a container that stores pointers to actual chunks. - # nvCOMP requires this and sizes buffers to be in GPU memory. - uncomp_chunks = cp.array([b.data.ptr for b in bufs], dtype=cp.uintp) - uncomp_chunk_sizes = cp.array(buf_sizes, dtype=cp.uint64) - - temp_buf = cp.empty(temp_size, dtype=cp.uint8) - - comp_chunks = cp.empty((num_chunks, comp_chunk_size), dtype=cp.uint8) - # Array of pointers to each compressed chunk. - comp_chunk_ptrs = cp.array([c.data.ptr for c in comp_chunks], dtype=cp.uintp) - # Resulting compressed chunk sizes. - comp_chunk_sizes = cp.empty(num_chunks, dtype=cp.uint64) - - self._algo.compress( - uncomp_chunks, - uncomp_chunk_sizes, - max_chunk_size, - num_chunks, - temp_buf, - comp_chunk_ptrs, - comp_chunk_sizes, - self._stream, - ) - - res = [] - # Copy to host to subsequently avoid many smaller D2H copies. - comp_chunks = cp.asnumpy(comp_chunks, self._stream) - comp_chunk_sizes = cp.asnumpy(comp_chunk_sizes, self._stream) - self._stream.synchronize() - - for i in range(num_chunks): - res.append(comp_chunks[i, : comp_chunk_sizes[i]].tobytes()) - return res - - def decode(self, buf: BufferLike, out: Optional[BufferLike] = None) -> BufferLike: - return self.decode_batch([buf], [out])[0] - - def decode_batch( - self, bufs: Sequence[Any], out: Optional[Sequence[Any]] = None - ) -> Sequence[Any]: - """Decode data in `bufs` using nvCOMP. - - Parameters - ---------- - bufs : - Encoded data. Each buffer in the list may be any object - supporting the new-style buffer protocol. - out : - List of writeable buffers to store decoded data. - N.B. if provided, each buffer must be exactly the right size - to store the decoded data. - - Returns - ------- - List of decoded buffers. Each buffer may be any object supporting - the new-style buffer protocol. - """ - num_chunks = len(bufs) - if num_chunks == 0: - return [] - - # TODO(akamenev): check only first buffer, assuming they are all - # of the same kind. - is_host_buffer = not hasattr(bufs[0], "__cuda_array_interface__") - if is_host_buffer: - bufs = [cp.asarray(ensure_contiguous_ndarray_like(b)) for b in bufs] - - # Prepare compressed chunks buffers. - comp_chunks = cp.array([b.data.ptr for b in bufs], dtype=cp.uintp) - comp_chunk_sizes = cp.array([b.size for b in bufs], dtype=cp.uint64) - - # Get uncompressed chunk sizes. - uncomp_chunk_sizes = self._algo.get_decompress_size( - comp_chunks, - comp_chunk_sizes, - self._stream, - ) - - # Check whether the uncompressed chunks are all the same size. - # cupy.unique returns sorted sizes. - sorted_chunk_sizes = cp.unique(uncomp_chunk_sizes) - max_chunk_size = sorted_chunk_sizes[-1].item() - is_equal_chunks = sorted_chunk_sizes.shape[0] == 1 - - # Get temp buffer size. - temp_size = self._algo.get_decompress_temp_size(num_chunks, max_chunk_size) - - temp_buf = cp.empty(temp_size, dtype=cp.uint8) - - # Prepare uncompressed chunks buffers. - # First, allocate chunks of max_chunk_size and then - # copy the pointers to a pointer array in GPU memory as required by nvCOMP. 
- # For performance reasons, we use max_chunk_size so we can create - # a rectangular array with the same pointer increments. - uncomp_chunks = cp.empty((num_chunks, max_chunk_size), dtype=cp.uint8) - p_start = uncomp_chunks.data.ptr - uncomp_chunk_ptrs = cp.uint64(p_start) + ( - cp.arange(0, num_chunks * max_chunk_size, max_chunk_size, dtype=cp.uint64) - ) - - # TODO(akamenev): currently we provide the following 2 buffers to decompress() - # but do not check/use them afterwards since some of the algos - # (e.g. LZ4 and Gdeflate) do not require it and run faster - # without those arguments passed, while other algos (e.g. zstd) require - # these buffers to be valid. - actual_uncomp_chunk_sizes = cp.empty(num_chunks, dtype=cp.uint64) - statuses = cp.empty(num_chunks, dtype=cp.int32) - - self._algo.decompress( - comp_chunks, - comp_chunk_sizes, - num_chunks, - temp_buf, - uncomp_chunk_ptrs, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - self._stream, - ) - - # If all chunks are the same size, we can just return uncomp_chunks. - if is_equal_chunks and out is None: - return cp.asnumpy(uncomp_chunks) if is_host_buffer else uncomp_chunks - - res = [] - uncomp_chunk_sizes = uncomp_chunk_sizes.get() - for i in range(num_chunks): - ret = uncomp_chunks[i, : uncomp_chunk_sizes[i]] - if out is None or out[i] is None: - res.append(cp.asnumpy(ret) if is_host_buffer else ret) - else: - o = ensure_contiguous_ndarray_like(out[i]) - if hasattr(o, "__cuda_array_interface__"): - cp.copyto(o, ret.view(dtype=o.dtype), casting="no") - else: - cp.asnumpy(ret.view(dtype=o.dtype), out=o, stream=self._stream) - res.append(o) - self._stream.synchronize() - - return res - - def __repr__(self): - return ( - f"{self.__class__.__name__}" - f"(algorithm={self.algorithm!r}, options={self.options!r})" - ) diff --git a/python/kvikio/kvikio/benchmarks/single_node_io.py b/python/kvikio/kvikio/benchmarks/single_node_io.py index e3b152cbaf..ddc6680167 100644 --- a/python/kvikio/kvikio/benchmarks/single_node_io.py +++ b/python/kvikio/kvikio/benchmarks/single_node_io.py @@ -8,7 +8,7 @@ import statistics import tempfile from time import perf_counter as clock -from typing import Any, ContextManager, Dict, Union +from typing import ContextManager, Union import cupy from dask.utils import format_bytes, parse_bytes @@ -19,21 +19,6 @@ from kvikio.benchmarks.utils import parse_directory, pprint_sys_info -def get_zarr_compressors() -> Dict[str, Any]: - """Returns a dict of available Zarr compressors""" - try: - import kvikio.zarr - except ImportError: - return {} - try: - compressors = kvikio.zarr.nvcomp_compressors - except AttributeError: - # zarr-python 3.x - return {} - else: - return {c.__name__.lower(): c for c in compressors} - - def create_data(nbytes): """Return a random uint8 cupy array""" return cupy.arange(nbytes, dtype="uint8") @@ -223,10 +208,6 @@ def run_zarr(args): if not kvikio.zarr.supported: raise RuntimeError(f"requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}") - compressor = None - if args.zarr_compressor is not None: - compressor = get_zarr_compressors()[args.zarr_compressor]() - a = create_data(args.nbytes) shutil.rmtree(str(dir_path), ignore_errors=True) @@ -236,7 +217,6 @@ def run_zarr(args): z = zarr.array( a, chunks=False, - compressor=compressor, store=kvikio.zarr.GDSStore(dir_path), meta_array=cupy.empty(()), ) @@ -277,8 +257,6 @@ def main(args): print(f"directory | {args.dir}") print(f"nthreads | {args.nthreads}") print(f"nruns | {args.nruns}") - if args.zarr_compressor is not None: - 
print(f"Zarr compressor | {args.zarr_compressor}") print("==================================") # Run each benchmark using the requested APIs @@ -354,16 +332,6 @@ def pprint_api_res(name, samples): choices=tuple(API.keys()) + ("all",), help="List of APIs to use {%(choices)s}", ) - parser.add_argument( - "--zarr-compressor", - metavar="COMPRESSOR", - default=None, - choices=tuple(get_zarr_compressors().keys()), - help=( - "Set a nvCOMP compressor to use with Zarr " - "{%(choices)s} (default: %(default)s)" - ), - ) args = parser.parse_args() if "all" in args.api: diff --git a/python/kvikio/kvikio/benchmarks/zarr_io.py b/python/kvikio/kvikio/benchmarks/zarr_io.py index 7882fcad8c..c4d51a6d5f 100644 --- a/python/kvikio/kvikio/benchmarks/zarr_io.py +++ b/python/kvikio/kvikio/benchmarks/zarr_io.py @@ -13,9 +13,9 @@ from typing import ContextManager, Union import cupy -import numcodecs.blosc import numpy import zarr +import zarr.storage from dask.utils import format_bytes, parse_bytes import kvikio @@ -26,59 +26,47 @@ if not kvikio.zarr.supported: raise RuntimeError(f"requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}") -compressors = { - "none": (None, None), - "lz4": (numcodecs.blosc.Blosc(cname="lz4"), kvikio.zarr.LZ4()), -} - def create_src_data(args): return cupy.random.random(args.nelem, dtype=args.dtype) def run_kvikio(args): - dir_path = args.dir / "kvikio" - shutil.rmtree(str(dir_path), ignore_errors=True) - - # Get the GPU compressor - compressor = compressors[args.compressor][1] - - src = create_src_data(args) - - # Write - if args.drop_vm_cache: - drop_vm_cache() - t0 = clock() - z = zarr.create( - shape=(args.nelem,), - chunks=(args.chunksize,), - dtype=args.dtype, - compressor=compressor, - store=kvikio.zarr.GDSStore(dir_path), - meta_array=cupy.empty(()), - ) - z[:] = src - os.sync() - write_time = clock() - t0 - - # Read - if args.drop_vm_cache: - drop_vm_cache() - t0 = clock() - res = z[:] - read_time = clock() - t0 - assert res.nbytes == args.nbytes - - return read_time, write_time + with zarr.config.enable_gpu(): + dir_path = args.dir / "kvikio" + shutil.rmtree(str(dir_path), ignore_errors=True) + + src = create_src_data(args) + + # Write + if args.drop_vm_cache: + drop_vm_cache() + t0 = clock() + z = zarr.create( + shape=(args.nelem,), + chunks=(args.chunksize,), + dtype=args.dtype, + store=kvikio.zarr.GDSStore(dir_path), + ) + z[:] = src + os.sync() + write_time = clock() - t0 + + # Read + if args.drop_vm_cache: + drop_vm_cache() + t0 = clock() + res = z[:] + read_time = clock() - t0 + assert res.nbytes == args.nbytes + + return read_time, write_time def run_posix(args): dir_path = args.dir / "posix" shutil.rmtree(str(dir_path), ignore_errors=True) - # Get the CPU compressor - compressor = compressors[args.compressor][0] - src = create_src_data(args) # Write @@ -89,9 +77,7 @@ def run_posix(args): shape=(args.nelem,), chunks=(args.chunksize,), dtype=args.dtype, - compressor=compressor, - store=zarr.DirectoryStore(dir_path), - meta_array=numpy.empty(()), + store=zarr.storage.LocalStore(dir_path), ) z[:] = src.get() os.sync() @@ -135,7 +121,6 @@ def main(args): print(f"directory | {args.dir}") print(f"nthreads | {args.nthreads}") print(f"nruns | {args.nruns}") - print(f"compressor | {args.compressor}") print("==================================") # Run each benchmark using the requested APIs @@ -226,16 +211,6 @@ def pprint_api_res(name, samples): choices=tuple(API.keys()) + ("all",), help="List of APIs to use {%(choices)s}", ) - parser.add_argument( - "--compressor", - 
metavar="COMPRESSOR", - default="none", - choices=tuple(compressors.keys()), - help=( - "Set a nvCOMP compressor to use with Zarr " - "{%(choices)s} (default: %(default)s)" - ), - ) parser.add_argument( "--drop-vm-cache", action="store_true", diff --git a/python/kvikio/kvikio/nvcomp.py b/python/kvikio/kvikio/nvcomp.py deleted file mode 100644 index 3b62e51e8c..0000000000 --- a/python/kvikio/kvikio/nvcomp.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - - -from kvikio._nvcomp import ( # noqa: F401 - ANSManager, - BitcompManager, - CascadedManager, - GdeflateManager, - LZ4Manager, - ManagedDecompressionManager, - SnappyManager, - cp_to_nvcomp_dtype, - nvCompManager, -) -from kvikio.utils import kvikio_deprecate_module - -kvikio_deprecate_module( - "Use the official nvCOMP API from 'nvidia.nvcomp' instead.", since="25.06" -) diff --git a/python/kvikio/kvikio/nvcomp_codec.py b/python/kvikio/kvikio/nvcomp_codec.py deleted file mode 100644 index ded350cdd5..0000000000 --- a/python/kvikio/kvikio/nvcomp_codec.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -from kvikio._nvcomp_codec import NvCompBatchCodec # noqa: F401 -from kvikio.utils import kvikio_deprecate_module - -kvikio_deprecate_module( - "Use the official nvCOMP API from 'nvidia.nvcomp' instead.", since="25.06" -) diff --git a/python/kvikio/kvikio/zarr/__init__.py b/python/kvikio/kvikio/zarr/__init__.py index 7ec22c275a..758670ea21 100644 --- a/python/kvikio/kvikio/zarr/__init__.py +++ b/python/kvikio/kvikio/zarr/__init__.py @@ -6,5 +6,3 @@ if _parse(_metadata.version("zarr")) >= _Version("3.0.0"): from ._zarr_python_3 import * # noqa: F401,F403 -else: - from ._zarr_python_2 import * # type: ignore[assignment] # noqa: F401,F403 diff --git a/python/kvikio/kvikio/zarr/_zarr_python_2.py b/python/kvikio/kvikio/zarr/_zarr_python_2.py deleted file mode 100644 index bd1418e799..0000000000 --- a/python/kvikio/kvikio/zarr/_zarr_python_2.py +++ /dev/null @@ -1,400 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. -from __future__ import annotations - -import contextlib -import os -import os.path -from abc import abstractmethod -from typing import Any, Literal, Mapping, Optional, Sequence, Union - -import cupy -import cupy.typing -import numcodecs -import numpy -import numpy as np -import zarr -import zarr.creation -import zarr.errors -import zarr.storage -from numcodecs.abc import Codec -from numcodecs.compat import ensure_contiguous_ndarray_like -from numcodecs.registry import register_codec -from packaging.version import parse - -import kvikio -import kvikio._nvcomp -import kvikio._nvcomp_codec -import kvikio.zarr -from kvikio._nvcomp_codec import NvCompBatchCodec -from kvikio.numcodecs import BufferLike, CudaCodec - -MINIMUM_ZARR_VERSION = "2.15" - -# Is this version of zarr supported? We depend on the `Context` -# argument introduced in https://github.com/zarr-developers/zarr-python/pull/1131 -# in zarr v2.15. -supported = parse(zarr.__version__) >= parse(MINIMUM_ZARR_VERSION) - - -class GDSStore(zarr.storage.DirectoryStore): # type: ignore[name-defined] - """GPUDirect Storage (GDS) class using directories and files. - - This class works like `zarr.storage.DirectoryStore` but implements - getitems() in order to support direct reading into device memory. 
- It uses KvikIO for reads and writes, which in turn will use GDS - when applicable. - - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - compressor_config_overwrite - If not None, use this `Mapping` to specify what is written to the Zarr metadata - file on disk (`.zarray`). Normally, Zarr writes the configuration[1] given by - the `compressor` argument to the `.zarray` file. Use this argument to overwrite - the normal configuration and use the specified `Mapping` instead. - decompressor_config_overwrite - If not None, use this `Mapping` to specify what compressor configuration[1] is - used for decompressing no matter the configuration found in the Zarr metadata - on disk (the `.zarray` file). - - [1] https://github.com/zarr-developers/numcodecs/blob/cb155432/numcodecs/abc.py#L79 - - Notes - ----- - Atomic writes are used, which means that data are first written to a - temporary file, then moved into place when the write is successfully - completed. Files are only held open while they are being read or written and are - closed immediately afterwards, so there is no need to manually close any files. - - Safe to write in multiple threads or processes. - """ - - # The default output array type used by getitems(). - default_meta_array = numpy.empty(()) - - def __init__( - self, - path, - normalize_keys=False, - dimension_separator=None, - *, - compressor_config_overwrite: Optional[Mapping] = None, - decompressor_config_overwrite: Optional[Mapping] = None, - ) -> None: - if not kvikio.zarr.supported: - raise RuntimeError( - f"GDSStore requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}" - ) - super().__init__( - path, normalize_keys=normalize_keys, dimension_separator=dimension_separator - ) - self.compressor_config_overwrite = compressor_config_overwrite - self.decompressor_config_overwrite = decompressor_config_overwrite - - def __eq__(self, other): - return isinstance(other, GDSStore) and self.path == other.path - - def _tofile(self, a, fn): - with kvikio.CuFile(fn, "w") as f: - written = f.write(a) - assert written == a.nbytes - - def __getitem__(self, key): - ret = super().__getitem__(key) - if self.decompressor_config_overwrite and key == ".zarray": - meta = self._metadata_class.decode_array_metadata(ret) - if meta["compressor"]: - meta["compressor"] = self.decompressor_config_overwrite - ret = self._metadata_class.encode_array_metadata(meta) - return ret - - def __setitem__(self, key, value): - if self.compressor_config_overwrite and key == ".zarray": - meta = self._metadata_class.decode_array_metadata(value) - if meta["compressor"]: - meta["compressor"] = self.compressor_config_overwrite - value = self._metadata_class.encode_array_metadata(meta) - super().__setitem__(key, value) - - def getitems( - self, - keys: Sequence[str], - *, - contexts: Mapping[str, Mapping] = {}, - ) -> Mapping[str, Any]: - """Retrieve data from multiple keys. - - Parameters - ---------- - keys : Iterable[str] - The keys to retrieve - contexts: Mapping[str, Context] - A mapping of keys to their context. 
Each context is a mapping of store - specific information. If the "meta_array" key exist, GDSStore use its - values as the output array otherwise GDSStore.default_meta_array is used. - - Returns - ------- - Mapping - A collection mapping the input keys to their results. - """ - ret = {} - io_results = [] - - with contextlib.ExitStack() as stack: - for key in keys: - filepath = os.path.join(self.path, key) - if not os.path.isfile(filepath): - continue - try: - meta_array = contexts[key]["meta_array"] - except KeyError: - meta_array = self.default_meta_array - - nbytes = os.path.getsize(filepath) - f = stack.enter_context(kvikio.CuFile(filepath, "r")) - ret[key] = numpy.empty_like(meta_array, shape=(nbytes,), dtype="u1") - io_results.append((f.pread(ret[key]), nbytes)) - - for future, nbytes in io_results: - nbytes_read = future.get() - if nbytes_read != nbytes: - raise RuntimeError( - f"Incomplete read ({nbytes_read}) expected {nbytes}" - ) - return ret - - -class NVCompCompressor(CudaCodec): - """Abstract base class for nvCOMP compressors - - The derived classes must set `codec_id` and implement - `get_nvcomp_manager` - """ - - @abstractmethod - def get_nvcomp_manager(self) -> kvikio.nvcomp.nvCompManager: - """Abstract method that should return the nvCOMP compressor manager""" - pass # TODO: cache Manager - - def encode(self, buf: BufferLike) -> cupy.typing.NDArray: - buf = cupy.asarray(ensure_contiguous_ndarray_like(buf)) - return self.get_nvcomp_manager().compress(buf) - - def decode(self, buf: BufferLike, out: Optional[BufferLike] = None) -> BufferLike: - buf = ensure_contiguous_ndarray_like(buf) - is_host_buffer = not hasattr(buf, "__cuda_array_interface__") - if is_host_buffer: - buf = cupy.asarray(buf) - - ret = self.get_nvcomp_manager().decompress(buf) - - if is_host_buffer: - ret = cupy.asnumpy(ret) - - if out is not None: - out = ensure_contiguous_ndarray_like(out) - if hasattr(out, "__cuda_array_interface__"): - cupy.copyto(out, ret.view(dtype=out.dtype), casting="no") - else: - np.copyto(out, cupy.asnumpy(ret.view(dtype=out.dtype)), casting="no") - return ret - - -class ANS(NVCompCompressor): - codec_id = "nvcomp_ANS" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.ANSManager() - - -class Bitcomp(NVCompCompressor): - codec_id = "nvcomp_Bitcomp" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.BitcompManager() - - -class Cascaded(NVCompCompressor): - codec_id = "nvcomp_Cascaded" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.CascadedManager() - - -class Gdeflate(NVCompCompressor): - codec_id = "nvcomp_Gdeflate" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.GdeflateManager() - - -class LZ4(NVCompCompressor): - codec_id = "nvcomp_LZ4" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.LZ4Manager() - - -class Snappy(NVCompCompressor): - codec_id = "nvcomp_Snappy" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.SnappyManager() - - -# Expose a list of available nvCOMP compressors and register them as Zarr condecs -nvcomp_compressors = [ANS, Bitcomp, Cascaded, Gdeflate, LZ4, Snappy] -for c in nvcomp_compressors: - register_codec(c) - - -class CompatCompressor: - """A pair of compatible compressors one using the CPU and one using the GPU - - Warning - ------- - `CompatCompressor` is only supported by KvikIO's `open_cupy_array()` and - cannot be used as a compressor argument in Zarr functions like `open()` - and `open_array()` directly. 
However, it is possible to use its `.cpu` - like: `open(..., compressor=CompatCompressor.lz4().cpu)`. - - Parameters - ---------- - cpu - The CPU compressor. - gpu - The GPU compressor. - """ - - def __init__(self, cpu: Codec, gpu: CudaCodec) -> None: - self.cpu = cpu - self.gpu = gpu - - @classmethod - def lz4(cls) -> CompatCompressor: - """A compatible pair of LZ4 compressors""" - return cls(cpu=numcodecs.LZ4(), gpu=NvCompBatchCodec("lz4")) - - -def open_cupy_array( - store: Union[os.PathLike, str], - mode: Literal["r", "r+", "a", "w", "w-"] = "a", - compressor: Codec | CompatCompressor = Snappy(), - meta_array=cupy.empty(()), - **kwargs, -) -> zarr.Array: - """Open an Zarr array as a CuPy-like array using file-mode-like semantics. - - This function is a CUDA friendly version of `zarr.open_array` that reads - and writes to CuPy arrays. Beside the arguments listed below, the arguments - have the same semantics as in `zarr.open_array`. - - Parameters - ---------- - store - Path to directory in file system. As opposed to `zarr.open_array`, - Store and path to zip files isn't supported. - mode - Persistence mode: 'r' means read only (must exist); 'r+' means - read/write (must exist); 'a' means read/write (create if doesn't - exist); 'w' means create (overwrite if exists); 'w-' means create - (fail if exists). - compressor - The compressor used when creating a Zarr file or None if no compressor - is to be used. If a `CompatCompressor` is given, `CompatCompressor.gpu` - is used for compression and decompression; and `CompatCompressor.cpu` - is written as the compressor in the Zarr file metadata on disk. - This argument is ignored in "r" and "r+" mode. By default the - Snappy compressor by nvCOMP is used. - meta_array : array-like, optional - An CuPy-like array instance to use for determining arrays to create and - return to users. It must implement `__cuda_array_interface__`. - **kwargs - The rest of the arguments are forwarded to `zarr.open_array` as-is. - - Returns - ------- - Zarr array backed by a GDS file store, nvCOMP compression, and CuPy arrays. - """ - - if not isinstance(store, (str, os.PathLike)): - raise ValueError("store must be a path") - store = str(os.fspath(store)) - if not hasattr(meta_array, "__cuda_array_interface__"): - raise ValueError("meta_array must implement __cuda_array_interface__") - - if mode in ("r", "r+", "a"): - # In order to handle "a", we start by trying to open the file in read mode. - try: - ret = zarr.open_array( - store=kvikio.zarr.GDSStore(path=store), # type: ignore[call-arg] - mode="r+", - meta_array=meta_array, - **kwargs, - ) - except ( - zarr.errors.ContainsGroupError, - zarr.errors.ArrayNotFoundError, # type: ignore[attr-defined] - ): - # If we are reading, this is a genuine error. - if mode in ("r", "r+"): - raise - else: - if ret.compressor is None: - return ret - # If we are reading a LZ4-CPU compressed file, we overwrite the - # metadata on-the-fly to make Zarr use LZ4-GPU for both compression - # and decompression. 
- compat_lz4 = CompatCompressor.lz4() - if ret.compressor == compat_lz4.cpu: - ret = zarr.open_array( - store=kvikio.zarr.GDSStore( # type: ignore[call-arg] - path=store, - compressor_config_overwrite=compat_lz4.cpu.get_config(), - decompressor_config_overwrite=compat_lz4.gpu.get_config(), - ), - mode=mode, - meta_array=meta_array, - **kwargs, - ) - elif not isinstance(ret.compressor, CudaCodec): - raise ValueError( - "The Zarr file was written using a non-CUDA compatible " - f"compressor, {ret.compressor}, please use something " - "like kvikio.zarr.CompatCompressor" - ) - return ret - - # At this point, we known that we are writing a new array - if mode not in ("w", "w-", "a"): - raise ValueError(f"Unknown mode: {mode}") - - if isinstance(compressor, CompatCompressor): - compressor_config_overwrite = compressor.cpu.get_config() - decompressor_config_overwrite = compressor.gpu.get_config() - compressor = compressor.gpu - else: - compressor_config_overwrite = None - decompressor_config_overwrite = None - - return zarr.open_array( - store=kvikio.zarr.GDSStore( # type: ignore[call-arg] - path=store, - compressor_config_overwrite=compressor_config_overwrite, - decompressor_config_overwrite=decompressor_config_overwrite, - ), - mode=mode, - meta_array=meta_array, - compressor=compressor, - **kwargs, - ) diff --git a/python/kvikio/kvikio/zarr/_zarr_python_3.py b/python/kvikio/kvikio/zarr/_zarr_python_3.py index 5305cd9b72..2f21be360a 100644 --- a/python/kvikio/kvikio/zarr/_zarr_python_3.py +++ b/python/kvikio/kvikio/zarr/_zarr_python_3.py @@ -8,6 +8,7 @@ import packaging import zarr.storage +from packaging.version import parse from zarr.abc.store import ( ByteRequest, OffsetByteRequest, @@ -23,6 +24,10 @@ # at https://github.com/zarr-developers/zarr-python/blob/main/src/zarr/storage/_local.py # with differences coming swapping in `cuFile` for the stdlib open file object. +MINIMUM_ZARR_VERSION = "3" + +supported = parse(zarr.__version__) >= parse(MINIMUM_ZARR_VERSION) + @functools.cache def _is_ge_zarr_3_0_7(): @@ -138,10 +143,3 @@ async def _set(self, key: str, value: Buffer, exclusive: bool = False) -> None: path = self.root / key await asyncio.to_thread(_put, path, value, start=None, exclusive=exclusive) - - -# Matching the check that zarr.__version__ > 2.15 that's -# part of the public API for our zarr 2.x support -# This module is behind a check that zarr.__version__ > 3 -# so we can just assume it's already checked and supported. 
-supported = True diff --git a/python/kvikio/tests/conftest.py b/python/kvikio/tests/conftest.py index 07636095eb..06aef52ecc 100644 --- a/python/kvikio/tests/conftest.py +++ b/python/kvikio/tests/conftest.py @@ -57,19 +57,6 @@ def run_cmd(cmd: Iterable[str], cwd, verbose=True): p.kill() -@pytest.fixture() -def managers(): - libnvcomp = pytest.importorskip("kvikio.nvcomp") - return [ - libnvcomp.ANSManager, - libnvcomp.BitcompManager, - libnvcomp.CascadedManager, - libnvcomp.GdeflateManager, - libnvcomp.LZ4Manager, - libnvcomp.SnappyManager, - ] - - @pytest.fixture( params=[("cupy", False), ("cupy", True), ("numpy", False)], ids=["cupy", "cupy_async", "numpy"], diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 8450fdfc25..584f719fc8 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -7,7 +7,6 @@ from pathlib import Path import pytest -from packaging.version import parse import kvikio @@ -26,25 +25,12 @@ "cufile-mfma", "cufile-mf", "cufile-ma", - "zarr", ], ) @pytest.mark.timeout(30, method="thread") def test_single_node_io(run_cmd, tmp_path, api): """Test benchmarks/single_node_io.py""" - if "zarr" in api: - kz = pytest.importorskip("kvikio.zarr") - import zarr - - if not kz.supported: - pytest.skip(f"requires Zarr >={kz.MINIMUM_ZARR_VERSION}") - - if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip( - "requires Zarr<3", - ) - retcode = run_cmd( cmd=[ sys.executable or "python", @@ -61,44 +47,6 @@ def test_single_node_io(run_cmd, tmp_path, api): assert retcode == 0 -@pytest.mark.parametrize( - "api", - [ - "kvikio", - "posix", - ], -) -@pytest.mark.timeout(30, method="thread") -def test_zarr_io(run_cmd, tmp_path, api): - """Test benchmarks/zarr_io.py""" - - kz = pytest.importorskip("kvikio.zarr") - import zarr - - if not kz.supported: - pytest.skip(f"requires Zarr >={kz.MINIMUM_ZARR_VERSION}") - - if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip( - "requires Zarr<3", - ) - - retcode = run_cmd( - cmd=[ - sys.executable or "python", - "zarr_io.py", - "-n", - "1MiB", - "-d", - str(tmp_path), - "--api", - api, - ], - cwd=benchmarks_path, - ) - assert retcode == 0 - - @pytest.mark.parametrize( "api", [ diff --git a/python/kvikio/tests/test_examples.py b/python/kvikio/tests/test_examples.py index f32485b6c4..1d08525a23 100644 --- a/python/kvikio/tests/test_examples.py +++ b/python/kvikio/tests/test_examples.py @@ -6,7 +6,6 @@ from pathlib import Path import pytest -from packaging.version import parse import kvikio @@ -21,18 +20,6 @@ def test_hello_world(tmp_path, monkeypatch): import_module("hello_world").main(tmp_path / "test-file") -def test_zarr_cupy_nvcomp(tmp_path, monkeypatch): - """Test examples/zarr_cupy_nvcomp.py""" - - # `examples/zarr_cupy_nvcomp.py` requires the Zarr submodule - zarr = pytest.importorskip("zarr") - if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip(reason="Requires zarr<3") - - monkeypatch.syspath_prepend(str(examples_path)) - import_module("zarr_cupy_nvcomp").main(tmp_path / "test-file") - - def test_http_io(tmp_path, monkeypatch): """Test examples/http_io.py""" diff --git a/python/kvikio/tests/test_nvcomp.py b/python/kvikio/tests/test_nvcomp.py deleted file mode 100644 index 356c5e77cd..0000000000 --- a/python/kvikio/tests/test_nvcomp.py +++ /dev/null @@ -1,444 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
- -import pytest - -np = pytest.importorskip("numpy") -cupy = pytest.importorskip("cupy") -kvikio = pytest.importorskip("kvikio") -libnvcomp = pytest.importorskip("kvikio.nvcomp") - - -# TODO: don't hardcode the following expected values -LEN = { - "ANS": 11144, - "Bitcomp": 3208, - "Cascaded": 600, - "Gdeflate": 760, - "LZ4": 393, - "Snappy": 3548, -} - - -def assert_compression_size(actual, desired, rtol=0.1): - """Compression ratios might change slightly between library versions - - We mark a failure as "xfail" - """ - try: - np.testing.assert_allclose(actual, desired, rtol=rtol) - except AssertionError: - pytest.xfail("mismatch in compression ratios is acceptable") - raise - - -def managers(): - return [ - libnvcomp.ANSManager, - libnvcomp.BitcompManager, - libnvcomp.CascadedManager, - libnvcomp.GdeflateManager, - libnvcomp.LZ4Manager, - libnvcomp.SnappyManager, - ] - - -def dtypes(): - return [ - "uint8", - "uint16", - "uint32", - "int8", - "int16", - "int32", - ] - - -@pytest.mark.parametrize("manager, dtype", zip(managers(), dtypes())) -def test_round_trip_dtypes(manager, dtype): - length = 10000 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = manager(data_type=dtype) - compressed = compressor_instance.compress(data) - decompressed = compressor_instance.decompress(compressed) - assert (data == decompressed).all() - - -# -# ANS Options test -# -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "chunk_size": 1 << 16, - }, - { - "chunk_size": 1 << 16, - }, - ], -) -def test_ans_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.ANSManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["ANS"]) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "data_type": np.uint8, - "algo": 0, - }, - {"data_type": np.uint8}, - { - "algo": 0, - }, - ], -) -def test_bitcomp_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.BitcompManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["Bitcomp"]) - - -@pytest.mark.parametrize( - "inputs, expected", - zip( - [ - {"algo": 0}, - {"algo": 1}, - {"algo": 2}, - ], - [LEN["Bitcomp"], LEN["Bitcomp"], LEN["Bitcomp"]], - ), -) -def test_bitcomp_algorithms(inputs, expected): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.BitcompManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), expected) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "options": { - "chunk_size": 1 << 12, - "type": np.uint32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - }, - }, - { - "options": { - "chunk_size": 1 << 12, - "type": np.uint32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - }, - "chunk_size": 1 << 16, - }, - { - "options": { - "chunk_size": 1 << 12, - "type": np.uint32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - }, - "data_type": np.uint8, - }, - { - "options": { - "chunk_size": 1 << 12, - "type": np.uint32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - }, - }, - ], -) -def 
test_cascaded_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.CascadedManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["Cascaded"]) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "chunk_size": 1 << 16, - "algo": 0, - }, - { - "chunk_size": 1 << 16, - }, - { - "algo": 0, - }, - ], -) -def test_gdeflate_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.GdeflateManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["Gdeflate"]) - - -@pytest.mark.parametrize( - "inputs, expected", - zip( - [ - {"algo": 0}, - ], - [LEN["Gdeflate"]], - ), -) -def test_gdeflate_algorithms(inputs, expected): - size = 10000 - dtype = np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.GdeflateManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), expected) - - -@pytest.mark.xfail(raises=ValueError) -@pytest.mark.parametrize( - "inputs, expected", - zip([{"algo": 1}, {"algo": 2}], [LEN["Gdeflate"], LEN["Gdeflate"]]), -) -def test_gdeflate_algorithms_not_implemented(inputs, expected): - size = 10000 - dtype = np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.GdeflateManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), expected) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "chunk_size": 1 << 16, - "data_type": np.uint8, - }, - { - "chunk_size": 1 << 16, - }, - { - "data_type": np.uint8, - }, - ], -) -def test_lz4_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.LZ4Manager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["LZ4"]) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "chunk_size": 1 << 16, - }, - { - "chunk_size": 1 << 16, - }, - {}, - ], -) -def test_snappy_inputs(inputs): - size = 10000 - dtype = np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.SnappyManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["Snappy"]) - - -@pytest.mark.parametrize( - "compressor_size", - zip( - managers(), - [ - { # ANS - "max_compressed_buffer_size": 89373, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - { # Bitcomp - "max_compressed_buffer_size": 16432, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - { # Cascaded - "max_compressed_buffer_size": 12460, - "num_chunks": 3, - "uncompressed_buffer_size": 10000, - }, - { # Gdeflate - "max_compressed_buffer_size": 131160, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - { # LZ4 - "max_compressed_buffer_size": 65888, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - { # Snappy - "max_compressed_buffer_size": 76575, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - ], - ), -) -def test_get_compression_config_with_default_options(compressor_size): - compressor = compressor_size[0] - expected = compressor_size[1] - 
length = 10000 - dtype = cupy.uint8 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = compressor() - result = compressor_instance.configure_compression(len(data)) - assert_compression_size( - result.pop("max_compressed_buffer_size"), - expected.pop("max_compressed_buffer_size"), - ) - assert result == expected - - -@pytest.mark.parametrize( - "manager,expected", - zip( - managers(), - [ - { # ANS - "num_chunks": 1, - "decomp_data_size": 10000, - }, - { # Bitcomp - "num_chunks": 1, - "decomp_data_size": 10000, - }, - { # Cascaded - "num_chunks": 3, - "decomp_data_size": 10000, - }, - { # Gdeflate - "num_chunks": 1, - "decomp_data_size": 10000, - }, - { # LZ4 - "num_chunks": 1, - "decomp_data_size": 10000, - }, - { # Snappy - "num_chunks": 1, - "decomp_data_size": 10000, - }, - ], - ), -) -def test_get_decompression_config_with_default_options(manager, expected): - length = 10000 - dtype = cupy.uint8 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = manager() - compressed = compressor_instance.compress(data) - result = compressor_instance.configure_decompression_with_compressed_buffer( - compressed - ) - assert_compression_size( - result.pop("decomp_data_size"), expected.pop("decomp_data_size") - ) - assert result == expected - - -@pytest.mark.parametrize( - "manager, expected", - zip(managers(), list(LEN.values())), -) -def test_get_compressed_output_size(manager, expected): - length = 10000 - dtype = cupy.uint8 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = manager() - compressed = compressor_instance.compress(data) - buffer_size = compressor_instance.get_compressed_output_size(compressed) - assert_compression_size(buffer_size, expected) - - -@pytest.mark.parametrize("manager", managers()) -def test_managed_manager(manager): - length = 10000 - dtype = cupy.uint8 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = manager() - compressed = compressor_instance.compress(data) - manager = libnvcomp.ManagedDecompressionManager(compressed) - decompressed = manager.decompress(compressed) - assert len(decompressed) == 10000 diff --git a/python/kvikio/tests/test_nvcomp_codec.py b/python/kvikio/tests/test_nvcomp_codec.py deleted file mode 100644 index 29e50ad64b..0000000000 --- a/python/kvikio/tests/test_nvcomp_codec.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
- -import itertools as it -import json - -import cupy as cp -import numcodecs -import numpy as np -import packaging -import packaging.version -import pytest -import zarr -from numpy.testing import assert_equal - -from kvikio.nvcomp_codec import NvCompBatchCodec - -NVCOMP_CODEC_ID = "nvcomp_batch" - -LZ4_ALGO = "LZ4" -GDEFLATE_ALGO = "Gdeflate" -SNAPPY_ALGO = "snappy" -ZSTD_ALGO = "zstd" -DEFLATE_ALGO = "deflate" - -SUPPORTED_CODECS = [LZ4_ALGO, GDEFLATE_ALGO, SNAPPY_ALGO, ZSTD_ALGO, DEFLATE_ALGO] - - -def skip_if_zarr_v3(): - return pytest.mark.skipif( - packaging.version.parse(zarr.__version__) >= packaging.version.Version("3.0.0"), - reason="zarr 3.x not supported.", - ) - - -def _get_codec(algo: str, **kwargs): - codec_args = {"id": NVCOMP_CODEC_ID, "algorithm": algo, "options": kwargs} - return numcodecs.registry.get_codec(codec_args) - - -@pytest.fixture(params=[(32,), (8, 16), (16, 16)]) -def shape(request): - return request.param - - -# Separate fixture for combinations of shapes and chunks, since -# chunks array must have the same rank as data array. -@pytest.fixture( - params=it.chain( - it.product([(64,)], [(64,), (100,)]), - it.product([(16, 8), (16, 16)], [(8, 16), (16, 16), (40, 12)]), - ) -) -def shape_chunks(request): - return request.param - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -def test_codec_registry(algo: str): - codec = _get_codec(algo) - assert isinstance(codec, numcodecs.abc.Codec) - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -def test_basic(algo: str, shape): - codec = NvCompBatchCodec(algo) - - # Create data. - dtype = np.float32 - data = np.ones(shape, dtype=dtype) - # Do roundtrip. - comp_data = codec.encode(data) - # Decompress and cast to original data type/shape. - decomp_data = codec.decode(comp_data).view(dtype).reshape(shape) - - assert_equal(decomp_data, data) - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -@skip_if_zarr_v3() -def test_basic_zarr(algo: str, shape_chunks): - shape, chunks = shape_chunks - - codec = NvCompBatchCodec(algo) - - data = np.ones(shape, dtype=np.float32) - - # This will do the compression. - z = zarr.array(data, chunks=chunks, compressor=codec) - - # Test the decompression. 
- assert_equal(z[:], data[:]) - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -@pytest.mark.parametrize("chunk_sizes", [(100, 100), (100, 150)]) -@pytest.mark.parametrize("out", [None, "cpu", "gpu"]) -def test_batch_comp_decomp(algo: str, chunk_sizes, out: str): - codec = _get_codec(algo) - - np.random.seed(1) - - dtype = np.float32 - chunks = [np.random.randn(s).astype(dtype) for s in chunk_sizes] - out_buf = None - if out == "cpu": - out_buf = [np.empty_like(c) for c in chunks] - elif out == "gpu": - out_buf = [cp.empty_like(c) for c in chunks] - - comp_chunks = codec.encode_batch([c.tobytes() for c in chunks]) - assert len(comp_chunks) == 2 - - decomp_chunks = codec.decode_batch(comp_chunks, out=out_buf) - assert len(decomp_chunks) == 2 - - for i, dc in enumerate(decomp_chunks): - dc = dc.view(dtype=dtype) - if isinstance(dc, cp.ndarray): - dc = dc.get() - assert_equal(dc, chunks[i], f"{i=}") - - if out_buf is not None: - ob = out_buf[i] - if isinstance(ob, cp.ndarray): - ob = ob.get() - assert_equal(ob, chunks[i], f"{i=}") - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -@skip_if_zarr_v3() -def test_comp_decomp(algo: str, shape_chunks): - shape, chunks = shape_chunks - - codec = _get_codec(algo) - - np.random.seed(1) - - data = np.random.randn(*shape).astype(np.float32) - - z1 = zarr.array(data, chunks=chunks, compressor=codec) - - zarr_store = zarr.MemoryStore() - zarr.save_array(zarr_store, z1, compressor=codec) - # Check the store. - meta = json.loads(zarr_store[".zarray"]) - assert meta["compressor"]["id"] == NVCOMP_CODEC_ID - assert meta["compressor"]["algorithm"] == algo.lower() - - # Read back/decompress. - z2 = zarr.open_array(zarr_store) - - assert_equal(z1[:], z2[:]) - - -@pytest.mark.parametrize( - "algo, options", - [ - ("lz4", {"data_type": 4}), # NVCOMP_TYPE_INT data type. - ("gdeflate", {"algo": 1}), # low-throughput, high compression ratio algo - ], -) -@skip_if_zarr_v3() -def test_codec_options(algo, options): - codec = NvCompBatchCodec(algo, options) - - shape = (16, 16) - chunks = (8, 8) - - data = np.ones(shape, dtype=np.float32) - - z = zarr.array(data, chunks=chunks, compressor=codec) - - assert_equal(z[:], data[:]) - - -@skip_if_zarr_v3() -def test_codec_invalid_options(): - # There are currently only 3 supported algos in Gdeflate - codec = NvCompBatchCodec(GDEFLATE_ALGO, options={"algo": 10}) - - data = np.ones((16, 16), dtype=np.float32) - - with pytest.raises(RuntimeError): - zarr.array(data, compressor=codec) - - -@pytest.mark.parametrize( - "cpu_algo, gpu_algo", - [ - ("lz4", LZ4_ALGO), - ("zstd", ZSTD_ALGO), - ], -) -@skip_if_zarr_v3() -def test_cpu_comp_gpu_decomp(cpu_algo, gpu_algo): - cpu_codec = numcodecs.registry.get_codec({"id": cpu_algo}) - gpu_codec = _get_codec(gpu_algo) - - shape = (16, 16) - chunks = (8, 8) - - data = np.ones(shape, dtype=np.float32) - - z1 = zarr.array(data, chunks=chunks) - store = {} - zarr.save_array(store, z1, compressor=cpu_codec) - - meta = json.loads(store[".zarray"]) - assert meta["compressor"]["id"] == cpu_algo - - meta["compressor"] = {"id": NVCOMP_CODEC_ID, "algorithm": gpu_algo} - store[".zarray"] = json.dumps(meta).encode() - - z2 = zarr.open_array(store, compressor=gpu_codec) - - assert_equal(z1[:], z2[:]) - - -@skip_if_zarr_v3() -def test_lz4_codec_header(shape_chunks): - shape, chunks = shape_chunks - - # Test LZ4 nvCOMP codecs with and without the header. 
- codec_h = _get_codec(LZ4_ALGO, has_header=True) - codec_no_h = _get_codec(LZ4_ALGO, has_header=False) - - np.random.seed(1) - - data = np.random.randn(*shape).astype(np.float32) - - z_h = zarr.array(data, chunks=chunks, compressor=codec_h) - z_no_h = zarr.array(data, chunks=chunks, compressor=codec_no_h) - - # Result must be the same regardless of the header presence. - assert_equal(z_h[:], z_no_h[:]) - - -def test_empty_batch(): - codec = _get_codec(LZ4_ALGO) - - assert len(codec.encode_batch([])) == 0 - assert len(codec.decode_batch([])) == 0 diff --git a/python/kvikio/tests/test_zarr.py b/python/kvikio/tests/test_zarr.py deleted file mode 100644 index a793e2568e..0000000000 --- a/python/kvikio/tests/test_zarr.py +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - - -import math - -import numpy -import pytest -from packaging.version import parse - -cupy = pytest.importorskip("cupy") -zarr = pytest.importorskip("zarr") -kvikio_zarr = pytest.importorskip("kvikio.zarr") -kvikio_nvcomp_codec = pytest.importorskip("kvikio.nvcomp_codec") -numcodecs = pytest.importorskip("numcodecs") - -if not kvikio_zarr.supported: - pytest.skip( - f"requires Zarr >={kvikio_zarr.MINIMUM_ZARR_VERSION}", - allow_module_level=True, - ) - -if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip( - "requires Zarr<3", - allow_module_level=True, - ) - - -@pytest.fixture -def store(tmp_path): - """Fixture that creates a GDS Store""" - return kvikio_zarr.GDSStore(tmp_path / "test-file.zarr") - - -def test_direct_store_access(store, xp): - """Test accessing the GDS Store directly""" - - a = xp.arange(5, dtype="u1") - store["a"] = a - b = store["a"] - - # Notice, unless using getitems(), GDSStore always returns bytes - assert isinstance(b, bytes) - assert (xp.frombuffer(b, dtype="u1") == a).all() - - -@pytest.mark.parametrize("xp_write", ["numpy", "cupy"]) -@pytest.mark.parametrize("xp_read_a", ["numpy", "cupy"]) -@pytest.mark.parametrize("xp_read_b", ["numpy", "cupy"]) -def test_direct_store_access_getitems(store, xp_write, xp_read_a, xp_read_b): - """Test accessing the GDS Store directly using getitems()""" - - xp_read_a = pytest.importorskip(xp_read_a) - xp_read_b = pytest.importorskip(xp_read_b) - xp_write = pytest.importorskip(xp_write) - a = xp_write.arange(5, dtype="u1") - b = a * 2 - store["a"] = a - store["b"] = b - - res = store.getitems( - keys=["a", "b"], - contexts={ - "a": {"meta_array": xp_read_a.empty(())}, - "b": {"meta_array": xp_read_b.empty(())}, - }, - ) - assert isinstance(res["a"], xp_read_a.ndarray) - assert isinstance(res["b"], xp_read_b.ndarray) - cupy.testing.assert_array_equal(res["a"], a) - cupy.testing.assert_array_equal(res["b"], b) - - -def test_array(store, xp): - """Test Zarr array""" - - a = xp.arange(100) - z = zarr.array(a, chunks=10, compressor=None, store=store, meta_array=xp.empty(())) - assert isinstance(z.meta_array, type(a)) - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(a, type(z[:])) - xp.testing.assert_array_equal(a, z[:]) - - -def test_group(store, xp): - """Test Zarr group""" - - g = zarr.open_group(store, meta_array=xp.empty(())) - g.ones("data", shape=(10, 11), dtype=int, compressor=None) - a = g["data"] - assert a.shape == (10, 11) - assert a.dtype == int - assert isinstance(a, zarr.Array) - assert isinstance(a.meta_array, xp.ndarray) - assert isinstance(a[:], xp.ndarray) - assert (a[:] == 1).all() - - -def test_open_array(store, xp): - """Test Zarr's 
open_array()""" - - a = xp.arange(10) - z = zarr.open_array( - store, - shape=a.shape, - dtype=a.dtype, - chunks=(10,), - compressor=None, - meta_array=xp.empty(()), - ) - z[:] = a - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(a, type(z[:])) - xp.testing.assert_array_equal(a, z[:]) - - -@pytest.mark.parametrize("inline_array", [True, False]) -def test_dask_read(store, xp, inline_array): - """Test Zarr read in Dask""" - - da = pytest.importorskip("dask.array") - a = xp.arange(100) - z = zarr.array(a, chunks=10, compressor=None, store=store, meta_array=xp.empty(())) - d = da.from_zarr(z, inline_array=inline_array) - d += 1 - xp.testing.assert_array_equal(a + 1, d.compute()) - - -def test_dask_write(store, xp): - """Test Zarr write in Dask""" - - da = pytest.importorskip("dask.array") - - # Write dask array to disk using Zarr - a = xp.arange(100) - d = da.from_array(a, chunks=10) - da.to_zarr(d, store, compressor=None, meta_array=xp.empty(())) - - # Validate the written Zarr array - z = zarr.open_array(store) - xp.testing.assert_array_equal(a, z[:]) - - -@pytest.mark.parametrize("xp_read", ["numpy", "cupy"]) -@pytest.mark.parametrize("xp_write", ["numpy", "cupy"]) -@pytest.mark.parametrize("compressor", kvikio_zarr.nvcomp_compressors) -def test_compressor(store, xp_write, xp_read, compressor): - xp_read = pytest.importorskip(xp_read) - xp_write = pytest.importorskip(xp_write) - - shape = (10, 1) - chunks = (10, 1) - a = xp_write.arange(math.prod(shape)).reshape(shape) - z = zarr.creation.create( - shape=shape, - chunks=chunks, - compressor=compressor(), - store=store, - meta_array=xp_read.empty(()), - ) - z[:] = a - b = z[:] - assert isinstance(b, xp_read.ndarray) - cupy.testing.assert_array_equal(b, a) - - -@pytest.mark.parametrize("algo", ["lz4", "zstd"]) -def test_decompressor_config_overwrite(tmp_path, xp, algo): - cpu_codec = numcodecs.registry.get_codec({"id": algo}) - gpu_codec = kvikio_nvcomp_codec.NvCompBatchCodec(algo) - - # Write using Zarr's default file store and the `cpu_codec` compressor - z = zarr.open_array(tmp_path, mode="w", shape=(10,), compressor=cpu_codec) - z[:] = range(10) - assert z.compressor == cpu_codec - - # Open file using GDSStore and use `gpu_codec` as decompressor. - z = zarr.open_array( - kvikio_zarr.GDSStore( - tmp_path, - decompressor_config_overwrite=gpu_codec.get_config(), - ), - mode="r", - meta_array=xp.empty(()), - ) - assert z.compressor == gpu_codec - assert isinstance(z[:], xp.ndarray) - xp.testing.assert_array_equal(z[:], range(10)) - - -@pytest.mark.parametrize("algo", ["lz4"]) -def test_compressor_config_overwrite(tmp_path, xp, algo): - cpu_codec = numcodecs.registry.get_codec({"id": algo}) - gpu_codec = kvikio_nvcomp_codec.NvCompBatchCodec(algo) - - # Write file using GDSStore and the `gpu_codec` compressor. In order - # to make the file compatible with Zarr's builtin CPU decompressor, - # we set `cpu_codec` as the compressor in the meta file on disk. 
- z = zarr.open_array( - kvikio_zarr.GDSStore( - tmp_path, - compressor_config_overwrite=cpu_codec.get_config(), - decompressor_config_overwrite=gpu_codec.get_config(), - ), - mode="w", - shape=10, - compressor=gpu_codec, - meta_array=xp.empty(()), - ) - assert z.compressor == gpu_codec - z[:] = xp.arange(10) - - # We can now open the file using Zarr's builtin CPU decompressor - z = zarr.open_array(tmp_path, mode="r") - assert isinstance(z[:], numpy.ndarray) - numpy.testing.assert_array_equal(z[:], range(10)) - - -@pytest.mark.parametrize("write_mode", ["w", "w-", "a"]) -@pytest.mark.parametrize("read_mode", ["r", "r+", "a"]) -def test_open_cupy_array(tmp_path, write_mode, read_mode): - a = cupy.arange(10) - z = kvikio_zarr.open_cupy_array( - tmp_path, - mode=write_mode, - shape=a.shape, - dtype=a.dtype, - chunks=(2,), - compressor=kvikio_zarr.CompatCompressor.lz4(), - ) - z[:] = a - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(z[:], type(a)) - assert z.compressor == kvikio_nvcomp_codec.NvCompBatchCodec("lz4") - cupy.testing.assert_array_equal(a, z[:]) - - z = kvikio_zarr.open_cupy_array( - tmp_path, - mode=read_mode, - ) - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(z[:], type(a)) - assert z.compressor == kvikio_nvcomp_codec.NvCompBatchCodec("lz4") - cupy.testing.assert_array_equal(a, z[:]) - - z = zarr.open_array(tmp_path, mode=read_mode) - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(z[:], numpy.ndarray) - assert z.compressor == kvikio_zarr.CompatCompressor.lz4().cpu - numpy.testing.assert_array_equal(a.get(), z[:]) - - -@pytest.mark.parametrize("compressor", [None, kvikio_zarr.CompatCompressor.lz4().cpu]) -def test_open_cupy_array_written_by_zarr(tmp_path, compressor): - data = numpy.arange(100) - z = zarr.open_array( - tmp_path, - shape=data.shape, - mode="w", - compressor=compressor, - ) - z[:] = data - - z = kvikio_zarr.open_cupy_array(tmp_path, mode="r") - assert isinstance(z[:], cupy.ndarray) - cupy.testing.assert_array_equal(z[:], data) - - -@pytest.mark.parametrize("mode", ["r", "r+", "a"]) -def test_open_cupy_array_incompatible_compressor(tmp_path, mode): - zarr.create((10,), store=tmp_path, compressor=numcodecs.Blosc()) - - with pytest.raises(ValueError, match="non-CUDA compatible compressor"): - kvikio_zarr.open_cupy_array(tmp_path, mode=mode) - - -def test_open_cupy_array_unknown_mode(tmp_path): - a = cupy.arange(10) - with pytest.raises(ValueError, match="Unknown mode: x"): - kvikio_zarr.open_cupy_array( - tmp_path, - mode="x", - shape=a.shape, - dtype=a.dtype, - chunks=(2,), - ) From c1391d696fa5eca90048c98ba9f29cd54a588393 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Aug 2025 18:35:38 -0700 Subject: [PATCH 17/40] Remove remaining nvcomp references (#801) https://github.com/rapidsai/kvikio/pull/798 removed usage of nvcomp but left the linkage in place, kvikio extension modules still relied on nvcomp existing even though they didn't actually use any of its functionality. That is now causing problems in #800. Removing the linkage entirely here (while still revendoring manually until we can move the vendoring to cudf) should resolve that. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/801 --- cpp/CMakeLists.txt | 1 - python/kvikio/CMakeLists.txt | 4 --- python/kvikio/cmake/CMakeLists.txt | 15 --------- .../kvikio/cmake/thirdparty/get_nvcomp.cmake | 33 ------------------- python/kvikio/kvikio/_lib/CMakeLists.txt | 12 +------ python/kvikio/pyproject.toml | 7 ---- python/libkvikio/CMakeLists.txt | 29 +++++----------- python/libkvikio/libkvikio/load.py | 5 +-- 8 files changed, 12 insertions(+), 94 deletions(-) delete mode 100644 python/kvikio/cmake/CMakeLists.txt delete mode 100644 python/kvikio/cmake/thirdparty/get_nvcomp.cmake diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6107a0a795..9fa8e1f798 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -49,7 +49,6 @@ option(KvikIO_BUILD_EXAMPLES "Configure CMake to build examples" ON) option(KvikIO_BUILD_TESTS "Configure CMake to build tests" ON) option(KvikIO_REMOTE_SUPPORT "Configure CMake to build with remote IO support" ON) option(KvikIO_CUDA_SUPPORT "Configure CMake to build with CUDA support" ON) -option(KvikIO_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON) # ################################################################################################## # * conda environment ------------------------------------------------------------------------------ diff --git a/python/kvikio/CMakeLists.txt b/python/kvikio/CMakeLists.txt index 3e5af8c857..480e485d2b 100644 --- a/python/kvikio/CMakeLists.txt +++ b/python/kvikio/CMakeLists.txt @@ -26,8 +26,6 @@ project( LANGUAGES CXX CUDA ) -option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF) - find_package(kvikio REQUIRED "${RAPIDS_VERSION}") find_package(CUDAToolkit REQUIRED) @@ -35,8 +33,6 @@ find_package(CUDAToolkit REQUIRED) include(rapids-cython-core) rapids_cython_init() -add_subdirectory(cmake) - set(cython_lib_dir kvikio) add_subdirectory(kvikio/_lib) diff --git a/python/kvikio/cmake/CMakeLists.txt b/python/kvikio/cmake/CMakeLists.txt deleted file mode 100644 index d3882b5ab3..0000000000 --- a/python/kvikio/cmake/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -include(thirdparty/get_nvcomp.cmake) diff --git a/python/kvikio/cmake/thirdparty/get_nvcomp.cmake b/python/kvikio/cmake/thirdparty/get_nvcomp.cmake deleted file mode 100644 index a2c6326e76..0000000000 --- a/python/kvikio/cmake/thirdparty/get_nvcomp.cmake +++ /dev/null @@ -1,33 +0,0 @@ -# ============================================================================= -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(KVIKIO_USE_PROPRIETARY_BINARY ON) - -# This function finds nvcomp and sets any additional necessary environment variables. -function(find_and_configure_nvcomp) - - include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - set(export_args) - if(KvikIO_EXPORT_NVCOMP) - set(export_args BUILD_EXPORT_SET kvikio-exports INSTALL_EXPORT_SET kvikio-exports) - endif() - rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${KVIKIO_USE_PROPRIETARY_BINARY}) - - # Per-thread default stream - if(TARGET nvcomp AND PER_THREAD_DEFAULT_STREAM) - target_compile_definitions(nvcomp PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM) - endif() -endfunction() - -find_and_configure_nvcomp() diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index 5b454295c0..b46d59c960 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -30,15 +30,5 @@ endif() rapids_cython_create_modules( CXX SOURCE_FILES "${cython_modules}" - LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp + LINKED_LIBRARIES kvikio::kvikio ) -if(USE_NVCOMP_RUNTIME_WHEEL) - set(rpaths "$ORIGIN/../../nvidia/nvcomp") - foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) - set_property( - TARGET ${tgt} - PROPERTY INSTALL_RPATH ${rpaths} - APPEND - ) - endforeach() -endif() diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 8baa9aa89c..234738fe5b 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -110,12 +110,6 @@ skip = [ [tool.mypy] ignore_missing_imports = true -exclude = [ - # we type check against zarr-python 3.x - # and ignore modules using 2.x - "python/kvikio/kvikio/zarr/_zarr_python_2.py", - "python/kvikio/tests/test_nvcomp_codec.py", -] [project.entry-points."numcodecs.codecs"] nvcomp_batch = "kvikio.nvcomp_codec:NvCompBatchCodec" @@ -160,7 +154,6 @@ filterwarnings = [ "error", "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache", "ignore::DeprecationWarning:botocore.*", - "ignore:This module is deprecated since.*Use the official nvCOMP API from 'nvidia.nvcomp' instead.:FutureWarning:.*nvcomp|.*nvcomp_codec", ] markers = [ "cufile: tests to skip if cuFile isn't available e.g. 
run with `pytest -m 'not cufile'`", diff --git a/python/libkvikio/CMakeLists.txt b/python/libkvikio/CMakeLists.txt index ecde2dc288..231bebbd84 100644 --- a/python/libkvikio/CMakeLists.txt +++ b/python/libkvikio/CMakeLists.txt @@ -39,28 +39,15 @@ unset(kvikio_FOUND) set(KvikIO_BUILD_BENCHMARKS OFF) set(KvikIO_BUILD_EXAMPLES OFF) set(KvikIO_BUILD_TESTS OFF) -if(USE_NVCOMP_RUNTIME_WHEEL) - set(KvikIO_EXPORT_NVCOMP OFF) -else() - # vendor nvcomp but not the entire kvikio-export set because that's huge - include(cmake/thirdparty/get_nvcomp.cmake) - include(cmake/Modules/WheelHelpers.cmake) - install_aliased_imported_targets( - TARGETS nvcomp::nvcomp DESTINATION ${SKBUILD_PLATLIB_DIR}/libkvikio/lib64/ - ) -endif() + +# vendor nvcomp but not the entire kvikio-export set because that's huge TODO: Move nvcomp vendoring +# to libcudf wheel instead +include(cmake/thirdparty/get_nvcomp.cmake) +include(cmake/Modules/WheelHelpers.cmake) +install_aliased_imported_targets( + TARGETS nvcomp::nvcomp DESTINATION ${SKBUILD_PLATLIB_DIR}/libkvikio/lib64/ +) set(CUDA_STATIC_RUNTIME ON) add_subdirectory(../../cpp kvikio-cpp) - -if(USE_NVCOMP_RUNTIME_WHEEL) - set(rpaths "$ORIGIN/../../nvidia/nvcomp") - foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) - set_property( - TARGET ${tgt} - PROPERTY INSTALL_RPATH ${rpaths} - APPEND - ) - endforeach() -endif() diff --git a/python/libkvikio/libkvikio/load.py b/python/libkvikio/libkvikio/load.py index c790d2dd00..2e573a0889 100644 --- a/python/libkvikio/libkvikio/load.py +++ b/python/libkvikio/libkvikio/load.py @@ -44,8 +44,9 @@ def _load_wheel_installation(soname: str): def load_library(): - # TODO: remove this nvcomp load when `nvcomp` is re-de-vendored - # https://github.com/rapidsai/build-planning/issues/171 + # TODO: remove this nvcomp load when `nvcomp` is vendored into cudf instead. + # Currently this load only exists to ensure that libcudf wheels are not broken by + # prematurely removing the load _load_library("libnvcomp.so.4") return _load_library("libkvikio.so") From 0ff3c675c72e82d1120f38dfac9e66d0aedcffab Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Tue, 19 Aug 2025 09:52:38 -0700 Subject: [PATCH 18/40] Use build cluster in devcontainers (#797) RAPIDS has deployed an autoscaling cloud build cluster that can be used to accelerate building large RAPIDS projects. This contributes to https://github.com/rapidsai/build-planning/issues/209. 
Authors: - Paul Taylor (https://github.com/trxcllnt) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/797 --- .devcontainer/Dockerfile | 34 +++++++++++++++++-- .../cuda12.9-conda/devcontainer.json | 4 ++- .devcontainer/cuda12.9-pip/devcontainer.json | 4 ++- .github/workflows/pr.yaml | 15 +++++--- .../cmake/thirdparty/get_nvcomp.cmake | 2 +- 5 files changed, 50 insertions(+), 9 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 5d1d536704..4775d28f23 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -13,6 +13,8 @@ ENV DEFAULT_CONDA_ENV=rapids FROM ${PYTHON_PACKAGE_MANAGER}-base +ARG TARGETARCH + ARG CUDA ENV CUDAARCHS="RAPIDS" ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}" @@ -24,7 +26,35 @@ ENV PYTHONSAFEPATH="1" ENV PYTHONUNBUFFERED="1" ENV PYTHONDONTWRITEBYTECODE="1" +ENV HISTFILE="/home/coder/.cache/._bash_history" + +### +# sccache configuration +### +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" -ENV HISTFILE="/home/coder/.cache/._bash_history" +# 2hr (1 minute longer than sccache-dist request timeout) +ENV SCCACHE_IDLE_TIMEOUT=7200 + +### +# sccache-dist configuration +### +# Enable sccache-dist by default +ENV DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST=1 +# Compile locally if max retries exceeded +ENV SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=true +# Retry transient errors 4 times (for a total of 5 attempts) +ENV SCCACHE_DIST_MAX_RETRIES=4 +ENV SCCACHE_DIST_CONNECT_TIMEOUT=30 +ENV SCCACHE_DIST_CONNECTION_POOL=false +# 1hr 59min (to accommodate debug builds) +ENV SCCACHE_DIST_REQUEST_TIMEOUT=7140 +ENV SCCACHE_DIST_KEEPALIVE_ENABLED=true +ENV SCCACHE_DIST_KEEPALIVE_INTERVAL=20 +ENV SCCACHE_DIST_KEEPALIVE_TIMEOUT=600 +ENV SCCACHE_DIST_URL="https://${TARGETARCH}.linux.sccache.rapids.nvidia.com" + +# Build as much in parallel as possible +ENV INFER_NUM_DEVICE_ARCHITECTURES=1 +ENV MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL=20 diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index fc3e89da7b..175af7a73c 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index 334b4b6d6c..ff13ba7e3c 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-pip", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7a3dada523..8db71539ab 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -144,13 +144,20 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 with: - 
arch: '["amd64"]' + arch: '["amd64", "arm64"]' cuda: '["12.9"]' + node_type: "cpu8" + rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN + env: | + SCCACHE_DIST_MAX_RETRIES=inf + SCCACHE_SERVER_LOG=sccache=debug + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false + SCCACHE_DIST_AUTH_TOKEN_VAR=RAPIDS_AUX_SECRET_1 build_command: | - sccache -z; - build-all --verbose; + sccache --zero-stats; + build-all --verbose 2>&1 | tee telemetry-artifacts/build.log; python -c "import kvikio; print(kvikio.__version__)"; - sccache -s; + sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; wheel-cpp-build: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 diff --git a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake b/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake index 0901c1e349..fdaeeef407 100644 --- a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake +++ b/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake @@ -21,7 +21,7 @@ function(find_and_configure_nvcomp) set(export_args) if(KvikIO_EXPORT_NVCOMP) # We're vendoring nvcomp and we only want `libnvcomp.so.4` - set(export_args BUILD_EXPORT_SET nvcomp) + set(export_args BUILD_EXPORT_SET nvcomp INSTALL_EXPORT_SET nvcomp) endif() rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${KVIKIO_USE_PROPRIETARY_BINARY}) From 927b5e36f079f938262773901166515858066b65 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 19 Aug 2025 12:35:13 -0500 Subject: [PATCH 19/40] Remove more nvcomp packaging for conda (#804) Follow-up to #798 and #801. After libcudf wheels vendor libnvcomp, we can finalize removal of nvcomp from kvikio. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/kvikio/pull/804 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 1 - conda/environments/all_cuda-129_arch-x86_64.yaml | 1 - conda/recipes/kvikio/conda_build_config.yaml | 3 --- conda/recipes/kvikio/recipe.yaml | 1 - dependencies.yaml | 10 ---------- python/kvikio/pyproject.toml | 3 --- 6 files changed, 19 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index f72a97faf3..0a9f52ca87 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -19,7 +19,6 @@ dependencies: - libcufile-dev - libcurl>=8.5.0,<9.0a0 - libnuma -- libnvcomp-dev==4.2.0.11 - moto>=4.0.8 - ninja - numcodecs !=0.12.0 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 890bfc29d7..dd4b21ea32 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -19,7 +19,6 @@ dependencies: - libcufile-dev - libcurl>=8.5.0,<9.0a0 - libnuma -- libnvcomp-dev==4.2.0.11 - moto>=4.0.8 - ninja - numcodecs !=0.12.0 diff --git a/conda/recipes/kvikio/conda_build_config.yaml b/conda/recipes/kvikio/conda_build_config.yaml index 2694d62eac..f5f37a39a2 100644 --- a/conda/recipes/kvikio/conda_build_config.yaml +++ b/conda/recipes/kvikio/conda_build_config.yaml @@ -18,6 +18,3 @@ c_stdlib_version: libcurl_version: - "==8.5.0" - -nvcomp_version: - - "=4.2.0.11" diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 2e9e411d9e..7ec50ec153 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -66,7 +66,6 @@ requirements: - cython >=3.0.0 - libcurl ${{ 
libcurl_version }} - libkvikio =${{ version }} - - libnvcomp-dev ${{ nvcomp_version }} - pip - python =${{ py_version }} - rapids-build-backend >=0.4.0,<0.5.0.dev0 diff --git a/dependencies.yaml b/dependencies.yaml index 92fdec6bd5..13b4f3c54d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -13,7 +13,6 @@ files: - cuda - cuda_version - depends_on_cupy - - depends_on_libnvcomp - docs - py_version - rapids_build_skbuild @@ -66,10 +65,6 @@ files: table: project includes: - depends_on_cupy - # TODO: restore runtime dependency when we no longer vendor nvcomp - # (when nvcomp ships C++ wheels) - # https://github.com/rapidsai/build-planning/issues/171 - # - depends_on_libnvcomp - depends_on_libkvikio - run py_rapids_build_libkvikio: @@ -220,11 +215,6 @@ dependencies: - matrix: # All CUDA 12 versions packages: - cupy-cuda12x>=12.0.0 - depends_on_libnvcomp: - common: - - output_types: conda - packages: - - libnvcomp-dev==4.2.0.11 depends_on_libkvikio: common: - output_types: conda diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 234738fe5b..1d3191cc67 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -111,9 +111,6 @@ skip = [ [tool.mypy] ignore_missing_imports = true -[project.entry-points."numcodecs.codecs"] -nvcomp_batch = "kvikio.nvcomp_codec:NvCompBatchCodec" - [tool.rapids-build-backend] build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" From 8820d52067dd6fe4367278f6ec870954fb53cc50 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 19 Aug 2025 16:35:39 -0500 Subject: [PATCH 20/40] Optionally require zarr>=3.0.0 (#802) This makes zarr an optional dependency of kvikio. The `pyproject.toml` now includes an optional dependency group 'zarr' that requires zarr>=3.0.0. `zarr` is no longer present as a (required) dependency in the conda recipes. 
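
To make the user-facing effect concrete, here is a minimal sketch (not part of the diff below; the `pip install "kvikio[zarr]"` spelling is an assumption — published wheels use a CUDA-suffixed name such as `kvikio-cu12`):

```python
# Sketch: kvikio.zarr is now guarded behind the optional 'zarr' extra.
# Assumed install command (wheel names carry a CUDA suffix, e.g. kvikio-cu12):
#   pip install "kvikio[zarr]"   # also pulls in zarr>=3.0.0 on Python >= 3.11
try:
    import kvikio.zarr  # importable only when zarr-python >= 3 is installed
    print("GDS-backed Zarr store available:", kvikio.zarr.GDSStore)
except ImportError as err:
    # kvikio/zarr/__init__.py now raises ImportError when zarr is missing or is 2.x
    print("kvikio.zarr unavailable:", err)
```
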
Authors: - Tom Augspurger (https://github.com/TomAugspurger) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/802 --- .../all_cuda-129_arch-aarch64.yaml | 3 +- .../all_cuda-129_arch-x86_64.yaml | 3 +- conda/recipes/kvikio/recipe.yaml | 3 -- dependencies.yaml | 27 ++++++++++++-- docs/source/zarr.rst | 12 +++---- python/kvikio/kvikio/benchmarks/zarr_io.py | 3 -- python/kvikio/kvikio/zarr/__init__.py | 10 +++--- python/kvikio/kvikio/zarr/_zarr_python_3.py | 30 +++++++++------- python/kvikio/pyproject.toml | 6 ++-- python/kvikio/tests/test_zarr_missing.py | 36 +++++++++++++++++++ python/kvikio/tests/test_zarr_v3.py | 5 +-- 11 files changed, 97 insertions(+), 41 deletions(-) create mode 100644 python/kvikio/tests/test_zarr_missing.py diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 0a9f52ca87..cd8fcff024 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -21,7 +21,6 @@ dependencies: - libnuma - moto>=4.0.8 - ninja -- numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc - packaging @@ -39,5 +38,5 @@ dependencies: - sphinx-click - sphinx_rtd_theme - sysroot_linux-aarch64=2.28 -- zarr>=2.0.0,<4.0.0 +- zarr>=3.0.0,<4.0.0 name: all_cuda-129_arch-aarch64 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index dd4b21ea32..3c18c5589c 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -21,7 +21,6 @@ dependencies: - libnuma - moto>=4.0.8 - ninja -- numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc - packaging @@ -39,5 +38,5 @@ dependencies: - sphinx-click - sphinx_rtd_theme - sysroot_linux-64=2.28 -- zarr>=2.0.0,<4.0.0 +- zarr>=3.0.0,<4.0.0 name: all_cuda-129_arch-x86_64 diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 7ec50ec153..f4d8f07143 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -75,12 +75,9 @@ requirements: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - cupy >=12.0.0 - libkvikio =${{ version }} - # See https://github.com/zarr-developers/numcodecs/pull/475 - - numcodecs !=0.12.0 - numpy >=1.23,<3.0a0 - packaging - python - - zarr >=2.0.0,<4.0.0a0 - cuda-cudart ignore_run_exports: by_name: diff --git a/dependencies.yaml b/dependencies.yaml index 13b4f3c54d..5d70bdba18 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -86,6 +86,14 @@ files: - build-cpp - build-py-wrapper - build-use-libkvikio-wheel + py_optional_zarr: + output: pyproject + pyproject_dir: python/kvikio + extras: + table: project.optional-dependencies + key: zarr + includes: + - zarr py_optional_test: output: pyproject pyproject_dir: python/kvikio @@ -94,6 +102,7 @@ files: key: test includes: - test_python + - zarr test_java: output: none includes: @@ -240,6 +249,7 @@ dependencies: - output_types: [conda, requirements] packages: - numpydoc + - zarr>=3.0.0,<4.0.0 - sphinx - sphinx-click - sphinx_rtd_theme @@ -285,10 +295,12 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - numpy>=1.23,<3.0a0 - - zarr>=2.0.0,<4.0.0 - # See https://github.com/zarr-developers/numcodecs/pull/475 - - numcodecs !=0.12.0 - packaging + zarr: + common: + - output_types: [requirements, pyproject] + packages: + - "zarr>=3.0.0,<4.0.0; python_version >= '3.11'" 
test_libkvikio: common: - output_types: conda @@ -324,6 +336,15 @@ dependencies: packages: - moto>=4.0.8 specific: + - output_types: [conda] + matrices: + # zarr 3 is not supported on Python 3.10 + - matrix: + py: "3.1[123]" + packages: + - zarr>=3.0.0,<4.0.0 + - matrix: + packages: - output_types: [conda, requirements, pyproject] matrices: - matrix: # All CUDA 12 versions diff --git a/docs/source/zarr.rst b/docs/source/zarr.rst index f8e4564011..baa48fa3e2 100644 --- a/docs/source/zarr.rst +++ b/docs/source/zarr.rst @@ -3,18 +3,16 @@ Zarr `Zarr `_ is a binary file format for chunked, compressed, N-Dimensional array. It is used throughout the PyData ecosystem and especially for climate and biological science applications. - `Zarr-Python `_ is the official Python package for reading and writing Zarr arrays. Its main feature is a NumPy-like array that translates array operations into file IO seamlessly. KvikIO provides a GPU backend to Zarr-Python that enables `GPUDirect Storage (GDS) `_ seamlessly. -KvikIO supports either zarr-python 2.x or zarr-python 3.x. -However, the API provided in :mod:`kvikio.zarr` differs based on which version of zarr you have, following the differences between zarr-python 2.x and zarr-python 3.x. - +If the optional zarr-python dependency is installed, then ``kvikio.zarr`` will be available. +KvikIO supports zarr-python 3.x. -Zarr Python 3.x ---------------- +Usage +----- -Zarr-python includes native support for reading Zarr chunks into device memory if you `configure Zarr `__ to use GPUs. +Zarr-Python includes native support for reading Zarr chunks into device memory if you `configure Zarr `__ to use GPUs. You can use any store, but KvikIO provides :py:class:`kvikio.zarr.GDSStore` to efficiently load data directly into GPU memory. .. code-block:: python diff --git a/python/kvikio/kvikio/benchmarks/zarr_io.py b/python/kvikio/kvikio/benchmarks/zarr_io.py index c4d51a6d5f..14c9fb38d3 100644 --- a/python/kvikio/kvikio/benchmarks/zarr_io.py +++ b/python/kvikio/kvikio/benchmarks/zarr_io.py @@ -23,9 +23,6 @@ import kvikio.zarr from kvikio.benchmarks.utils import drop_vm_cache, parse_directory, pprint_sys_info -if not kvikio.zarr.supported: - raise RuntimeError(f"requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}") - def create_src_data(args): return cupy.random.random(args.nelem, dtype=args.dtype) diff --git a/python/kvikio/kvikio/zarr/__init__.py b/python/kvikio/kvikio/zarr/__init__.py index 758670ea21..ca6cfde7e9 100644 --- a/python/kvikio/kvikio/zarr/__init__.py +++ b/python/kvikio/kvikio/zarr/__init__.py @@ -1,8 +1,8 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-from importlib import metadata as _metadata +try: + from ._zarr_python_3 import GDSStore +except ImportError as e: + raise ImportError("kvikio.zarr requires the optional 'zarr>=3' dependency") from e -from packaging.version import Version as _Version, parse as _parse - -if _parse(_metadata.version("zarr")) >= _Version("3.0.0"): - from ._zarr_python_3 import * # noqa: F401,F403 +__all__ = ["GDSStore"] diff --git a/python/kvikio/kvikio/zarr/_zarr_python_3.py b/python/kvikio/kvikio/zarr/_zarr_python_3.py index 2f21be360a..b12ff93125 100644 --- a/python/kvikio/kvikio/zarr/_zarr_python_3.py +++ b/python/kvikio/kvikio/zarr/_zarr_python_3.py @@ -6,32 +6,38 @@ import os from pathlib import Path -import packaging -import zarr.storage -from packaging.version import parse -from zarr.abc.store import ( +import packaging.version +import zarr + +_zarr_version = packaging.version.parse(zarr.__version__) + +if _zarr_version < packaging.version.parse("3.0.0"): + # We include this runtime package checking to help users who relied on + # installing kvikio to also include zarr, which is not an optional dependency. + raise ImportError( + f"'zarr>=3' is required, but 'zarr=={_zarr_version}' is installed." + ) + +import zarr.storage # noqa: E402 +from zarr.abc.store import ( # noqa: E402 ByteRequest, OffsetByteRequest, RangeByteRequest, SuffixByteRequest, ) -from zarr.core.buffer import Buffer, BufferPrototype -from zarr.core.buffer.core import default_buffer_prototype +from zarr.core.buffer import Buffer, BufferPrototype # noqa: E402 +from zarr.core.buffer.core import default_buffer_prototype # noqa: E402 -import kvikio +import kvikio # noqa: E402 # The GDSStore implementation follows the `LocalStore` implementation # at https://github.com/zarr-developers/zarr-python/blob/main/src/zarr/storage/_local.py # with differences coming swapping in `cuFile` for the stdlib open file object. -MINIMUM_ZARR_VERSION = "3" - -supported = parse(zarr.__version__) >= parse(MINIMUM_ZARR_VERSION) - @functools.cache def _is_ge_zarr_3_0_7(): - return packaging.version.parse(zarr.__version__) >= packaging.version.parse("3.0.7") + return _zarr_version >= packaging.version.parse("3.0.7") def _get( diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 1d3191cc67..51ff2419de 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -21,10 +21,8 @@ requires-python = ">=3.10" dependencies = [ "cupy-cuda12x>=12.0.0", "libkvikio==25.10.*,>=0.0.0a0", - "numcodecs !=0.12.0", "numpy>=1.23,<3.0a0", "packaging", - "zarr>=2.0.0,<4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -49,6 +47,10 @@ test = [ "pytest-timeout", "rangehttpserver", "rapids-dask-dependency==25.10.*,>=0.0.0a0", + "zarr>=3.0.0,<4.0.0; python_version >= '3.11'", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +zarr = [ + "zarr>=3.0.0,<4.0.0; python_version >= '3.11'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.urls] diff --git a/python/kvikio/tests/test_zarr_missing.py b/python/kvikio/tests/test_zarr_missing.py new file mode 100644 index 0000000000..ef25b41dab --- /dev/null +++ b/python/kvikio/tests/test_zarr_missing.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import sys + +import pytest + + +def test_zarr_missing_raises(monkeypatch): + modules = list(sys.modules) + for module in modules: + pkg = module.split(".")[0] + if pkg == "kvikio": + # remove from the import cache + monkeypatch.delitem(sys.modules, module, raising=False) + elif pkg == "zarr": + # force an ImportError + monkeypatch.setitem(sys.modules, module, None) + + with pytest.raises(ImportError): + import kvikio.zarr # noqa: F401 + + +def test_zarr_2_installed_raises(monkeypatch): + modules = list(sys.modules) + zarr = pytest.importorskip("zarr") + monkeypatch.setattr(zarr, "__version__", "2.0.0") + + for module in modules: + pkg = module.split(".")[0] + if pkg == "kvikio": + # remove from the import cache + monkeypatch.delitem(sys.modules, module, raising=False) + + with pytest.raises(ImportError): + import kvikio.zarr # noqa: F401 diff --git a/python/kvikio/tests/test_zarr_v3.py b/python/kvikio/tests/test_zarr_v3.py index e44de36d22..59331d5513 100644 --- a/python/kvikio/tests/test_zarr_v3.py +++ b/python/kvikio/tests/test_zarr_v3.py @@ -6,16 +6,17 @@ import cupy as cp import pytest -import kvikio.zarr - pytest.importorskip("zarr", minversion="3.0.0") +# these must follow the pytest.importorskip import zarr.core.buffer # noqa: E402 import zarr.storage # noqa: E402 from zarr.core.buffer.gpu import Buffer # noqa: E402 from zarr.testing.store import StoreTests # noqa: E402 +import kvikio.zarr # noqa: E402 + @pytest.mark.asyncio async def test_basic(tmp_path: pathlib.Path) -> None: From e43b3d196f2e1b3b802499b08d6e62c07fa3c126 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 19 Aug 2025 17:11:26 -0700 Subject: [PATCH 21/40] Upgrade to nvCOMP 5.0.0.6 (#800) Upgrade the nvCOMP dependency to 5.0.0.6. This library is not used directly, but it's till vendored and used in libcudf wheels. Future changes will completely remove the dependency in kvikIO. 
Depends on https://github.com/rapidsai/rapids-cmake/pull/896 Authors: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/800 --- ci/build_wheel_python.sh | 2 +- python/libkvikio/cmake/thirdparty/get_nvcomp.cmake | 2 +- python/libkvikio/libkvikio/load.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index 55ee0c05a0..ed2df80542 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -22,7 +22,7 @@ export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" python -m auditwheel repair \ --exclude libkvikio.so \ - --exclude libnvcomp.so.4 \ + --exclude libnvcomp.so.5 \ -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ ${package_dir}/dist/* diff --git a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake b/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake index fdaeeef407..9d62dfda38 100644 --- a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake +++ b/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake @@ -20,7 +20,7 @@ function(find_and_configure_nvcomp) include(${rapids-cmake-dir}/cpm/nvcomp.cmake) set(export_args) if(KvikIO_EXPORT_NVCOMP) - # We're vendoring nvcomp and we only want `libnvcomp.so.4` + # We're vendoring nvcomp and we only want `libnvcomp.so.5` set(export_args BUILD_EXPORT_SET nvcomp INSTALL_EXPORT_SET nvcomp) endif() diff --git a/python/libkvikio/libkvikio/load.py b/python/libkvikio/libkvikio/load.py index 2e573a0889..7a432d95f8 100644 --- a/python/libkvikio/libkvikio/load.py +++ b/python/libkvikio/libkvikio/load.py @@ -47,7 +47,7 @@ def load_library(): # TODO: remove this nvcomp load when `nvcomp` is vendored into cudf instead. # Currently this load only exists to ensure that libcudf wheels are not broken by # prematurely removing the load - _load_library("libnvcomp.so.4") + _load_library("libnvcomp.so.5") return _load_library("libkvikio.so") From c7740795e5904289b721a7ad5d536a9a59654e19 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 20 Aug 2025 16:02:50 -0500 Subject: [PATCH 22/40] Build and test with CUDA 13.0.0 (#803) Contributes to https://github.com/rapidsai/build-planning/issues/208 * uses CUDA 13.0.0 to build and test Contributes to https://github.com/rapidsai/build-planning/issues/68 * updates to CUDA 13 dependencies in fallback entries in `dependencies.yaml` matrices (i.e., the ones that get written to `pyproject.toml` in source control) ## Notes for Reviewers This switches GitHub Actions workflows to the `cuda13.0` branch from here: https://github.com/rapidsai/shared-workflows/pull/413 A future round of PRs will revert that back to `branch-25.10`, once all of RAPIDS supports CUDA 13. 
Authors: - James Lamb (https://github.com/jameslamb) - Bradley Dice (https://github.com/bdice) - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/803 --- .../cuda13.0-conda/devcontainer.json | 44 ++++++++++++ .devcontainer/cuda13.0-pip/devcontainer.json | 43 ++++++++++++ .github/workflows/build.yaml | 16 ++--- .github/workflows/pr.yaml | 28 ++++---- .github/workflows/test.yaml | 6 +- .../trigger-breaking-change-alert.yaml | 2 +- .../all_cuda-129_arch-aarch64.yaml | 4 +- .../all_cuda-129_arch-x86_64.yaml | 4 +- .../all_cuda-130_arch-aarch64.yaml | 42 ++++++++++++ .../all_cuda-130_arch-x86_64.yaml | 42 ++++++++++++ conda/recipes/kvikio/recipe.yaml | 2 +- cpp/doxygen/main_page.md | 9 +++ cpp/src/mmap.cpp | 2 + cpp/src/shim/cuda.cpp | 2 +- dependencies.yaml | 67 ++++++++++++------- docs/source/install.rst | 18 +++-- python/kvikio/pyproject.toml | 4 +- 17 files changed, 271 insertions(+), 64 deletions(-) create mode 100644 .devcontainer/cuda13.0-conda/devcontainer.json create mode 100644 .devcontainer/cuda13.0-pip/devcontainer.json create mode 100644 conda/environments/all_cuda-130_arch-aarch64.yaml create mode 100644 conda/environments/all_cuda-130_arch-x86_64.yaml diff --git a/.devcontainer/cuda13.0-conda/devcontainer.json b/.devcontainer/cuda13.0-conda/devcontainer.json new file mode 100644 index 0000000000..21e7bf83c4 --- /dev/null +++ b/.devcontainer/cuda13.0-conda/devcontainer.json @@ -0,0 +1,44 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "13.0", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge" + } + }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda13.0-conda", + "--ulimit", + "nofile=500000" + ], + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda13.0-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda13.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda13.0-pip/devcontainer.json b/.devcontainer/cuda13.0-pip/devcontainer.json new file mode 100644 index 0000000000..3c035eee05 --- /dev/null +++ b/.devcontainer/cuda13.0-pip/devcontainer.json @@ -0,0 +1,43 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "13.0", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda13.0" + } + }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda13.0-pip", + "--ulimit", + "nofile=500000" + ], + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda13.0-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda13.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0070274f37..e119ba8ac1 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,7 +34,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -44,7 +44,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -54,7 +54,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -76,7 +76,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -89,7 +89,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,7 +101,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 + 
uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 8db71539ab..4afc5ca548 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -27,7 +27,7 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda13.0 if: always() with: needs: ${{ toJSON(needs) }} @@ -43,7 +43,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@cuda13.0 with: files_yaml: | test_cpp: @@ -86,20 +86,20 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda13.0 with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda13.0 with: build_type: pull-request script: ci/build_cpp.sh conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -107,7 +107,7 @@ jobs: conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -117,14 +117,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda13.0 with: build_type: pull-request script: ci/build_python.sh conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -132,7 +132,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -142,10 +142,10 @@ jobs: devcontainer: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda13.0 with: arch: '["amd64", "arm64"]' - cuda: '["12.9"]' + cuda: '["13.0"]' node_type: "cpu8" rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN env: | @@ -160,7 +160,7 @@ jobs: sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; 
wheel-cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -170,7 +170,7 @@ jobs: wheel-python-build: needs: wheel-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 with: build_type: pull-request script: ci/build_wheel_python.sh @@ -179,7 +179,7 @@ jobs: wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5f618498b3..4c4ff60385 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 48bf37afc4..72751d071b 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.10 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@cuda13.0 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index cd8fcff024..279b130e55 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -9,9 +9,9 @@ dependencies: - c-compiler - cmake>=3.30.4 - cuda-nvcc -- cuda-python>=12.6.2,<13.0a0 +- cuda-python>=12.9.2,<13.0a0 - cuda-version=12.9 -- cupy>=12.0.0 +- cupy>=13.6.0 - cxx-compiler - cython>=3.0.0 - doxygen=1.9.1 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 3c18c5589c..5460d73ab7 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ 
b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -9,9 +9,9 @@ dependencies: - c-compiler - cmake>=3.30.4 - cuda-nvcc -- cuda-python>=12.6.2,<13.0a0 +- cuda-python>=12.9.2,<13.0a0 - cuda-version=12.9 -- cupy>=12.0.0 +- cupy>=13.6.0 - cxx-compiler - cython>=3.0.0 - doxygen=1.9.1 diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml new file mode 100644 index 0000000000..1b752cbe8e --- /dev/null +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -0,0 +1,42 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- conda-forge +dependencies: +- boto3>=1.21.21 +- c-compiler +- cmake>=3.30.4 +- cuda-nvcc +- cuda-python>=13.0.1,<14.0a0 +- cuda-version=13.0 +- cupy>=13.6.0 +- cxx-compiler +- cython>=3.0.0 +- doxygen=1.9.1 +- gcc_linux-aarch64=14.* +- libcufile-dev +- libcurl>=8.5.0,<9.0a0 +- libnuma +- moto>=4.0.8 +- ninja +- numpy>=1.23,<3.0a0 +- numpydoc +- packaging +- pre-commit +- pytest +- pytest-asyncio +- pytest-cov +- pytest-timeout +- python>=3.10,<3.14 +- rangehttpserver +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- scikit-build-core>=0.10.0 +- sphinx +- sphinx-click +- sphinx_rtd_theme +- sysroot_linux-aarch64=2.28 +- zarr>=3.0.0,<4.0.0 +name: all_cuda-130_arch-aarch64 diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml new file mode 100644 index 0000000000..812bfdad5e --- /dev/null +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -0,0 +1,42 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- conda-forge +dependencies: +- boto3>=1.21.21 +- c-compiler +- cmake>=3.30.4 +- cuda-nvcc +- cuda-python>=13.0.1,<14.0a0 +- cuda-version=13.0 +- cupy>=13.6.0 +- cxx-compiler +- cython>=3.0.0 +- doxygen=1.9.1 +- gcc_linux-64=14.* +- libcufile-dev +- libcurl>=8.5.0,<9.0a0 +- libnuma +- moto>=4.0.8 +- ninja +- numpy>=1.23,<3.0a0 +- numpydoc +- packaging +- pre-commit +- pytest +- pytest-asyncio +- pytest-cov +- pytest-timeout +- python>=3.10,<3.14 +- rangehttpserver +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- scikit-build-core>=0.10.0 +- sphinx +- sphinx-click +- sphinx_rtd_theme +- sysroot_linux-64=2.28 +- zarr>=3.0.0,<4.0.0 +name: all_cuda-130_arch-x86_64 diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index f4d8f07143..c4c52b540c 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -73,7 +73,7 @@ requirements: - cuda-cudart-dev run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - cupy >=12.0.0 + - cupy >=13.6.0 - libkvikio =${{ version }} - numpy >=1.23,<3.0a0 - packaging diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index 8d9d0e9320..cd9fe05557 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -30,9 +30,14 @@ For convenience we release Conda packages that makes it easy to include KvikIO i We strongly recommend using [mamba](https://github.com/mamba-org/mamba) in place of conda, which we will do throughout the documentation. 
Install the **stable release** from the ``rapidsai`` channel with the following: + ```sh # Install in existing environment mamba install -c rapidsai -c conda-forge libkvikio + +# Create new environment (CUDA 13) +mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=13.0 libkvikio + # Create new environment (CUDA 12) mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=12.9 libkvikio ``` @@ -42,6 +47,10 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel with the f ```sh # Install in existing environment mamba install -c rapidsai-nightly -c conda-forge libkvikio + +# Create new environment (CUDA 13) +mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=13.0 libkvikio + # Create new environment (CUDA 12) mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.9 libkvikio ``` diff --git a/cpp/src/mmap.cpp b/cpp/src/mmap.cpp index 968b96aa42..ee41a55d3d 100644 --- a/cpp/src/mmap.cpp +++ b/cpp/src/mmap.cpp @@ -218,7 +218,9 @@ void read_impl(void* dst_buf, &attrs, attrs_idxs, static_cast(1) /* num_attrs */, +#if CUDA_VERSION < 13000 static_cast(nullptr), +#endif stream)); } else { // Fall back to the conventional H2D copy if the batch copy API is not available. diff --git a/cpp/src/shim/cuda.cpp b/cpp/src/shim/cuda.cpp index 8053ea152a..ae69a495aa 100644 --- a/cpp/src/shim/cuda.cpp +++ b/cpp/src/shim/cuda.cpp @@ -56,7 +56,7 @@ cudaAPI::cudaAPI() CUDA_DRIVER_TRY(DriverGetVersion(&driver_version)); #if CUDA_VERSION >= 12080 - // cuMemcpyBatchAsync was introduced in CUDA 12.8. + // cuMemcpyBatchAsync was introduced in CUDA 12.8, and its parameters were changed in CUDA 13.0. try { decltype(cuMemcpyBatchAsync)* fp; get_symbol(fp, lib, KVIKIO_STRINGIFY(cuMemcpyBatchAsync)); diff --git a/dependencies.yaml b/dependencies.yaml index 5d70bdba18..ce1b6f1b8d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["12.9"] + cuda: ["12.9", "13.0"] arch: [aarch64, x86_64] includes: - build-universal @@ -128,6 +128,7 @@ dependencies: - output_types: conda packages: - c-compiler + - cuda-nvcc - cxx-compiler - libcurl>=8.5.0,<9.0a0 specific: @@ -135,22 +136,14 @@ dependencies: matrices: - matrix: arch: x86_64 - cuda: "12.*" packages: - gcc_linux-64=14.* - sysroot_linux-64=2.28 - matrix: arch: aarch64 - cuda: "12.*" packages: - gcc_linux-aarch64=14.* - sysroot_linux-aarch64=2.28 - - output_types: conda - matrices: - - matrix: - cuda: "12.*" - packages: - - cuda-nvcc build-use-libkvikio-wheel: common: - output_types: conda @@ -164,6 +157,11 @@ dependencies: cuda_suffixed: "true" packages: - libkvikio-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - libkvikio-cu13==25.10.*,>=0.0.0a0 - {matrix: null, packages: *libkvikio_packages} build-py-wrapper: common: @@ -199,31 +197,35 @@ dependencies: cuda: "12.9" packages: - cuda-version=12.9 - cuda: - specific: - - output_types: conda - matrices: - matrix: - cuda: "12.*" + cuda: "13.0" packages: + - cuda-version=13.0 + cuda: + common: - output_types: conda - matrices: - - matrix: - cuda: "12.*" - packages: - - libcufile-dev - - libnuma + packages: + - libcufile-dev + - libnuma depends_on_cupy: common: - output_types: conda packages: - - cupy>=12.0.0 + - &cupy_unsuffixed cupy>=13.6.0 specific: - output_types: [requirements, pyproject] matrices: - - matrix: # All CUDA 12 versions + - matrix: + cuda: "12.*" + packages: + - cupy-cuda12x>=13.6.0 + - 
matrix: + cuda: "13.*" + packages: + - &cupy_cu13 cupy-cuda13x>=13.6.0 + - matrix: packages: - - cupy-cuda12x>=12.0.0 + - *cupy_cu13 depends_on_libkvikio: common: - output_types: conda @@ -232,7 +234,7 @@ dependencies: - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file - # This index is needed for libkvikio-cu12. + # This index is needed for libkvikio-cu{12,13}. - --extra-index-url=https://pypi.nvidia.com - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: @@ -243,6 +245,11 @@ dependencies: cuda_suffixed: "true" packages: - libkvikio-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - libkvikio-cu13==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*libkvikio_unsuffixed]} docs: common: @@ -347,9 +354,17 @@ dependencies: packages: - output_types: [conda, requirements, pyproject] matrices: - - matrix: # All CUDA 12 versions + - matrix: + cuda: "12.*" + packages: + - cuda-python>=12.9.2,<13.0a0 + - matrix: + cuda: "13.*" + packages: + - &cuda_python_cu13 cuda-python>=13.0.1,<14.0a0 + - matrix: packages: - - cuda-python>=12.6.2,<13.0a0 + - *cuda_python_cu13 test_java: common: - output_types: conda diff --git a/docs/source/install.rst b/docs/source/install.rst index 5642b137af..91cc39af1f 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -15,6 +15,10 @@ Install the **stable release** from the ``rapidsai`` channel like: # Install in existing environment mamba install -c rapidsai -c conda-forge kvikio + + # Create new environment (CUDA 13) + mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=13.0 kvikio + # Create new environment (CUDA 12) mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=12.9 kvikio @@ -24,10 +28,13 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel like: # Install in existing environment mamba install -c rapidsai-nightly -c conda-forge kvikio + + # Create new environment (CUDA 13) + mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=13.0 kvikio + # Create new environment (CUDA 12) mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.9 kvikio - .. note:: If the nightly install doesn't work, set ``channel_priority: flexible`` in your ``.condarc``. @@ -40,8 +47,11 @@ KvikIO is also available on PyPI. Install the latest release like: .. code-block:: - pip install kvikio-cu12 # for CUDA 12 + pip install kvikio-cu13 # for CUDA 13 + +.. code-block:: + pip install kvikio-cu12 # for CUDA 12 Build from source ----------------- @@ -50,8 +60,8 @@ In order to setup a development environment, we recommend Conda: .. 
code-block:: - # CUDA 12 - mamba env create --name kvikio-dev --file conda/environments/all_cuda-128_arch-x86_64.yaml + # CUDA 13 + mamba env create --name kvikio-dev --file conda/environments/all_cuda-130_arch-x86_64.yaml The Python library depends on the C++ library, thus we build and install both: diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index 51ff2419de..8f214c31d9 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ - "cupy-cuda12x>=12.0.0", + "cupy-cuda13x>=13.6.0", "libkvikio==25.10.*,>=0.0.0a0", "numpy>=1.23,<3.0a0", "packaging", @@ -39,7 +39,7 @@ classifiers = [ [project.optional-dependencies] test = [ "boto3>=1.21.21", - "cuda-python>=12.6.2,<13.0a0", + "cuda-python>=13.0.1,<14.0a0", "moto[server]>=4.0.8", "pytest", "pytest-asyncio", From 7841a2529e1d37f5526f18ea75a06586140d49be Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 22 Aug 2025 18:11:50 -0500 Subject: [PATCH 23/40] Devendor libnvcomp from libkvikio (#805) This fully devendors libnvcomp from libkvikio wheels. A complementary PR is needed to vendor libnvcomp.so.* inside of libcudf wheels: https://github.com/rapidsai/cudf/pull/19743 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Matthew Murray (https://github.com/Matt711) - Mike Sarahan (https://github.com/msarahan) URL: https://github.com/rapidsai/kvikio/pull/805 --- ci/build_wheel_cpp.sh | 1 - ci/build_wheel_python.sh | 2 - python/libkvikio/CMakeLists.txt | 8 --- .../cmake/Modules/WheelHelpers.cmake | 59 ------------------- .../cmake/thirdparty/get_nvcomp.cmake | 35 ----------- python/libkvikio/libkvikio/load.py | 4 -- python/libkvikio/pyproject.toml | 4 +- 7 files changed, 1 insertion(+), 112 deletions(-) delete mode 100644 python/libkvikio/cmake/Modules/WheelHelpers.cmake delete mode 100644 python/libkvikio/cmake/thirdparty/get_nvcomp.cmake diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh index 8107f24160..61654c66b5 100755 --- a/ci/build_wheel_cpp.sh +++ b/ci/build_wheel_cpp.sh @@ -27,7 +27,6 @@ rapids-pip-retry install \ # 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) export PIP_NO_BUILD_ISOLATION=0 -export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=OFF" ./ci/build_wheel.sh "${package_name}" "${package_dir}" python -m auditwheel repair \ diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index ed2df80542..312fedc0e7 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -17,12 +17,10 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" LIBKVIKIO_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="libkvikio_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github cpp) echo "libkvikio-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo "${LIBKVIKIO_WHEELHOUSE}"/libkvikio_*.whl)" >> "${PIP_CONSTRAINT}" -export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh "${package_name}" "${package_dir}" python -m auditwheel repair \ --exclude libkvikio.so \ - --exclude libnvcomp.so.5 \ -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ ${package_dir}/dist/* diff --git a/python/libkvikio/CMakeLists.txt b/python/libkvikio/CMakeLists.txt index 231bebbd84..fe0019bf58 100644 --- a/python/libkvikio/CMakeLists.txt +++ b/python/libkvikio/CMakeLists.txt @@ -40,14 +40,6 @@ set(KvikIO_BUILD_BENCHMARKS OFF) set(KvikIO_BUILD_EXAMPLES OFF) 
set(KvikIO_BUILD_TESTS OFF) -# vendor nvcomp but not the entire kvikio-export set because that's huge TODO: Move nvcomp vendoring -# to libcudf wheel instead -include(cmake/thirdparty/get_nvcomp.cmake) -include(cmake/Modules/WheelHelpers.cmake) -install_aliased_imported_targets( - TARGETS nvcomp::nvcomp DESTINATION ${SKBUILD_PLATLIB_DIR}/libkvikio/lib64/ -) - set(CUDA_STATIC_RUNTIME ON) add_subdirectory(../../cpp kvikio-cpp) diff --git a/python/libkvikio/cmake/Modules/WheelHelpers.cmake b/python/libkvikio/cmake/Modules/WheelHelpers.cmake deleted file mode 100644 index abdde95298..0000000000 --- a/python/libkvikio/cmake/Modules/WheelHelpers.cmake +++ /dev/null @@ -1,59 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= -include_guard(GLOBAL) - -# Making libraries available inside wheels by installing the associated targets. -function(install_aliased_imported_targets) - list(APPEND CMAKE_MESSAGE_CONTEXT "install_aliased_imported_targets") - - set(options "") - set(one_value "DESTINATION") - set(multi_value "TARGETS") - cmake_parse_arguments(_ "${options}" "${one_value}" "${multi_value}" ${ARGN}) - - message(VERBOSE "Installing targets '${__TARGETS}' into lib_dir '${__DESTINATION}'") - - foreach(target IN LISTS __TARGETS) - - if(NOT TARGET ${target}) - message(VERBOSE "No target named ${target}") - continue() - endif() - - get_target_property(alias_target ${target} ALIASED_TARGET) - if(alias_target) - set(target ${alias_target}) - endif() - - get_target_property(is_imported ${target} IMPORTED) - if(NOT is_imported) - # If the target isn't imported, install it into the wheel - install(TARGETS ${target} DESTINATION ${__DESTINATION}) - message(VERBOSE "install(TARGETS ${target} DESTINATION ${__DESTINATION})") - else() - # If the target is imported, make sure it's global - get_target_property(type ${target} TYPE) - if(${type} STREQUAL "UNKNOWN_LIBRARY") - install(FILES $ DESTINATION ${__DESTINATION}) - message(VERBOSE "install(FILES $ DESTINATION ${__DESTINATION})") - else() - install(IMPORTED_RUNTIME_ARTIFACTS ${target} DESTINATION ${__DESTINATION}) - message( - VERBOSE - "install(IMPORTED_RUNTIME_ARTIFACTS $ DESTINATION ${__DESTINATION})" - ) - endif() - endif() - endforeach() -endfunction() diff --git a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake b/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake deleted file mode 100644 index 9d62dfda38..0000000000 --- a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake +++ /dev/null @@ -1,35 +0,0 @@ -# ============================================================================= -# Copyright (c) 2021-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(KVIKIO_USE_PROPRIETARY_BINARY ON) - -# This function finds nvcomp and sets any additional necessary environment variables. -function(find_and_configure_nvcomp) - - include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - set(export_args) - if(KvikIO_EXPORT_NVCOMP) - # We're vendoring nvcomp and we only want `libnvcomp.so.5` - set(export_args BUILD_EXPORT_SET nvcomp INSTALL_EXPORT_SET nvcomp) - endif() - - rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${KVIKIO_USE_PROPRIETARY_BINARY}) - - # Per-thread default stream - if(TARGET nvcomp AND PER_THREAD_DEFAULT_STREAM) - target_compile_definitions(nvcomp PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM) - endif() -endfunction() - -find_and_configure_nvcomp() diff --git a/python/libkvikio/libkvikio/load.py b/python/libkvikio/libkvikio/load.py index 7a432d95f8..2fd25e642c 100644 --- a/python/libkvikio/libkvikio/load.py +++ b/python/libkvikio/libkvikio/load.py @@ -44,10 +44,6 @@ def _load_wheel_installation(soname: str): def load_library(): - # TODO: remove this nvcomp load when `nvcomp` is vendored into cudf instead. - # Currently this load only exists to ensure that libcudf wheels are not broken by - # prematurely removing the load - _load_library("libnvcomp.so.5") return _load_library("libkvikio.so") diff --git a/python/libkvikio/pyproject.toml b/python/libkvikio/pyproject.toml index 43b659aec8..3239d8c651 100644 --- a/python/libkvikio/pyproject.toml +++ b/python/libkvikio/pyproject.toml @@ -67,6 +67,4 @@ select = [ ] # PyPI limit is 100 MiB, fail CI before we get too close to that -# TODO: drop this to 75M after we re-de-vendor nvcomp -# https://github.com/rapidsai/build-planning/issues/171 -max_allowed_size_compressed = '90M' +max_allowed_size_compressed = '75M' From 46fa7dd127994df2f8b816eebea0aef6488c508a Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 25 Aug 2025 12:03:50 -0400 Subject: [PATCH 24/40] Add a unified remote I/O interface that infers the endpoint type from URL (1/2): C++ implementation (#793) This PR adds a new remote I/O utility function `RemoteHandle::open(url)` that infers the remote endpoint type from the URL to facilitate `RemoteHandle` creation. - Supported endpoint types include S3, S3 with presigned URL, WebHDFS, and generic HTTP/HTTPS. - Optionally, instead of letting `open` figure it out, users can explicitly specify the endpoint type by passing an enum argument `RemoteEndpointType`. - Optionally, users can provide an allowlist that restricts the endpoint candidates - Optionally, users can specify the expected file size. This design is to fully support the existing constructor overload `RemoteHandle(endpoint, nbytes)`. A byproduct of this PR is an internal utility class `UrlParser` that uses the idiomatic libcurl URL API to validate the URL against "[RFC 3986 plus](https://curl.se/docs/url-syntax.html)". ## This PR depends on - [x] #791 - [x] #788 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/793 --- cpp/CMakeLists.txt | 2 +- cpp/include/kvikio/detail/url.hpp | 199 +++++++++++++++++++++++ cpp/include/kvikio/hdfs.hpp | 8 + cpp/include/kvikio/remote_handle.hpp | 136 ++++++++++++++++ cpp/src/detail/url.cpp | 142 ++++++++++++++++ cpp/src/hdfs.cpp | 15 +- cpp/src/remote_handle.cpp | 197 +++++++++++++++++++++- cpp/tests/CMakeLists.txt | 1 + cpp/tests/test_defaults.cpp | 2 +- cpp/tests/test_remote_handle.cpp | 233 ++++++++++++++++++++++++++- cpp/tests/test_url.cpp | 64 ++++++++ 11 files changed, 991 insertions(+), 8 deletions(-) create mode 100644 cpp/include/kvikio/detail/url.hpp create mode 100644 cpp/src/detail/url.cpp create mode 100644 cpp/tests/test_url.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9fa8e1f798..6f9c249cfa 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -163,7 +163,7 @@ set(SOURCES if(KvikIO_REMOTE_SUPPORT) list(APPEND SOURCES "src/hdfs.cpp" "src/remote_handle.cpp" "src/detail/remote_handle.cpp" - "src/shim/libcurl.cpp" + "src/detail/url.cpp" "src/shim/libcurl.cpp" ) endif() diff --git a/cpp/include/kvikio/detail/url.hpp b/cpp/include/kvikio/detail/url.hpp new file mode 100644 index 0000000000..e57d2c4c94 --- /dev/null +++ b/cpp/include/kvikio/detail/url.hpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace kvikio::detail { +/** + * @brief RAII wrapper for libcurl's URL handle (CURLU) + * + * This class provides automatic resource management for libcurl URL handles, + * ensuring proper cleanup when the handle goes out of scope. The class is + * move-only to prevent accidental sharing of the underlying resource. + */ +class CurlUrlHandle { + private: + CURLU* _handle{nullptr}; + + public: + /** + * @brief Create a new libcurl URL handle + * + * @exception std::runtime_error if libcurl cannot allocate the handle (usually due to out of + * memory) + */ + CurlUrlHandle(); + + /** + * @brief Clean up the underlying URL handle + */ + ~CurlUrlHandle() noexcept; + + CurlUrlHandle(CurlUrlHandle const&) = delete; + CurlUrlHandle& operator=(CurlUrlHandle const&) = delete; + + CurlUrlHandle(CurlUrlHandle&& other) noexcept; + CurlUrlHandle& operator=(CurlUrlHandle&& other) noexcept; + + /** + * @brief Get the underlying libcurl URL handle + * + * @return Pointer to the underlying libcurl URL handle + * @note The returned pointer should not be freed manually as it is managed by this class + */ + CURLU* get() const; +}; + +/** + * @brief URL parsing utility using libcurl's URL API + * + * This class provides static methods for parsing URLs into their constituent + * components (scheme, host, port, path, query, fragment). + * + * @note This class uses libcurl's URL parsing which follows RFC 3986 plus. 
See + * https://curl.se/docs/url-syntax.html + * + * Example: + * @code{.cpp} + * auto components = UrlParser::parse("https://example.com:8080/path?query=1#frag"); + * if (components.scheme.has_value()) { + * std::cout << "Scheme: " << components.scheme.value() << std::endl; + * } + * if (components.host.has_value()) { + * std::cout << "Host: " << components.host.value() << std::endl; + * } + * @endcode + */ +class UrlParser { + public: + /** + * @brief Container for parsed URL components + */ + struct UrlComponents { + /** + * @brief The URL scheme (e.g., "http", "https", "ftp"). May be empty for scheme-relative URLs + * or paths. + */ + std::optional scheme; + + /** + * @brief The hostname or IP address. May be empty for URLs without an authority component + * (e.g., "file:///path"). + */ + std::optional host; + + /** + * @brief The port number as a string. Will be empty if no explicit port is specified in the + * URL. + * @note Default ports (e.g., 80 for HTTP, 443 for HTTPS) are not automatically filled in. + */ + std::optional port; + + /** + * @brief The path component of the URL. Libcurl ensures that the path component is always + * present, even if empty (will be "/" for URLs like "http://example.com"). + */ + std::optional path; + + /** + * @brief The query string (without the leading "?"). Empty if no query parameters are present. + */ + std::optional query; + + /** + * @brief The fragment identifier (without the leading "#"). Empty if no fragment is present. + */ + std::optional fragment; + }; + + /** + * @brief Parses the given URL according to RFC 3986 plus and extracts its components. + * + * @param url The URL string to parse + * @param bitmask_url_flags Optional flags for URL parsing. Common flags include: + * - CURLU_DEFAULT_SCHEME: Allows URLs without schemes + * - CURLU_NON_SUPPORT_SCHEME: Accept non-supported schemes + * - CURLU_URLENCODE: URL encode the path + * @param bitmask_component_flags Optional flags for component extraction. 
Common flags include: + * - CURLU_URLDECODE: URL decode the component + * - CURLU_PUNYCODE: Return host as punycode + * + * @return UrlComponents structure containing the parsed URL components + * + * @throw std::runtime_error if the URL cannot be parsed or if component extraction fails + * + * Example: + * @code{.cpp} + * // Basic parsing + * auto components = UrlParser::parse("https://api.example.com/v1/users?page=1"); + * + * // Parsing with URL decoding + * auto decoded = UrlParser::parse( + * "https://example.com/hello%20world", + * std::nullopt, + * CURLU_URLDECODE + * ); + * + * // Allow non-standard schemes + * auto custom = UrlParser::parse( + * "myscheme://example.com", + * CURLU_NON_SUPPORT_SCHEME + * ); + * @endcode + */ + static UrlComponents parse(std::string const& url, + std::optional bitmask_url_flags = std::nullopt, + std::optional bitmask_component_flags = std::nullopt); + + /** + * @brief Extract a specific component from a CurlUrlHandle + * + * @param handle The CurlUrlHandle containing the parsed URL + * @param part The URL part to extract (e.g., CURLUPART_SCHEME) + * @param bitmask_component_flags Flags controlling extraction behavior + * @param allowed_err_code Optional error code to treat as valid (e.g., CURLUE_NO_SCHEME) + * @return The extracted component as a string, or std::nullopt if not present + * @throw std::runtime_error if extraction fails with an unexpected error + */ + static std::optional extract_component( + CurlUrlHandle const& handle, + CURLUPart part, + std::optional bitmask_component_flags = std::nullopt, + std::optional allowed_err_code = std::nullopt); + + /** + * @brief Extract a specific component from a URL string + * + * @param url The URL string from which to extract a component + * @param part The URL part to extract + * @param bitmask_url_flags Optional flags for URL parsing. + * @param bitmask_component_flags Flags controlling extraction behavior + * @param allowed_err_code Optional error code to treat as valid + * @return The extracted component as a string, or std::nullopt if not present + * @throw std::runtime_error if extraction fails with an unexpected error + */ + static std::optional extract_component( + std::string const& url, + CURLUPart part, + std::optional bitmask_url_flags = std::nullopt, + std::optional bitmask_component_flags = std::nullopt, + std::optional allowed_err_code = std::nullopt); +}; +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/hdfs.hpp b/cpp/include/kvikio/hdfs.hpp index 0b20d658bd..345051bcbd 100644 --- a/cpp/include/kvikio/hdfs.hpp +++ b/cpp/include/kvikio/hdfs.hpp @@ -58,5 +58,13 @@ class WebHdfsEndpoint : public RemoteEndpoint { std::string str() const override; std::size_t get_file_size() override; void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for the WebHDFS endpoints. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; }; } // namespace kvikio diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index b2e2d1d0ff..0d56231d03 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -34,6 +34,18 @@ namespace kvikio { class CurlHandle; // Prototype +/** + * @brief Type of remote file. + */ +enum class RemoteEndpointType : uint8_t { + AUTO, ///< Let KvikIO infer the type of remote file from the URL and create a proper endpoint. 
+ S3, ///< AWS S3 (based on HTTP/HTTPS protocols). + S3_PRESIGNED_URL, ///< AWS S3 presigned URL (based on HTTP/HTTPS protocols). + WEBHDFS, ///< Apache Hadoop WebHDFS (based on HTTP/HTTPS protocols). + HTTP, ///< Generic HTTP/HTTPS, excluding all the specific types listed above that use HTTP/HTTPS + ///< protocols. +}; + /** * @brief Abstract base class for remote endpoints. * @@ -43,6 +55,10 @@ class CurlHandle; // Prototype * its own ctor that takes communication protocol specific arguments. */ class RemoteEndpoint { + protected: + RemoteEndpointType _remote_endpoint_type{RemoteEndpointType::AUTO}; + RemoteEndpoint(RemoteEndpointType remote_endpoint_type); + public: virtual ~RemoteEndpoint() = default; @@ -74,6 +90,13 @@ class RemoteEndpoint { * size. */ virtual void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) = 0; + + /** + * @brief Get the type of the remote file. + * + * @return The type of the remote file. + */ + [[nodiscard]] RemoteEndpointType remote_endpoint_type() const noexcept; }; /** @@ -96,6 +119,14 @@ class HttpEndpoint : public RemoteEndpoint { std::string str() const override; std::size_t get_file_size() override; void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for HTTP/HTTPS endpoints. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; }; /** @@ -206,6 +237,14 @@ class S3Endpoint : public RemoteEndpoint { std::string str() const override; std::size_t get_file_size() override; void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for S3 endpoints (excluding presigned URL). + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; }; /** @@ -224,6 +263,14 @@ class S3EndpointWithPresignedUrl : public RemoteEndpoint { std::string str() const override; std::size_t get_file_size() override; void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for S3 endpoints with presigned URL. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; }; /** @@ -235,6 +282,88 @@ class RemoteHandle { std::size_t _nbytes; public: + /** + * @brief Create a remote file handle from a URL. + * + * This function creates a RemoteHandle for reading data from various remote endpoints + * including HTTP/HTTPS servers, AWS S3 buckets, S3 presigned URLs, and WebHDFS. + * The endpoint type can be automatically detected from the URL or explicitly specified. + * + * @param url The URL of the remote file. Supported formats include: + * - S3 with credentials + * - S3 presigned URL + * - WebHDFS + * - HTTP/HTTPS + * @param remote_endpoint_type The type of remote endpoint. Default is RemoteEndpointType::AUTO + * which automatically detects the endpoint type from the URL. Can be explicitly set to + * RemoteEndpointType::S3, RemoteEndpointType::S3_PRESIGNED_URL, RemoteEndpointType::WEBHDFS, or + * RemoteEndpointType::HTTP to force a specific endpoint type. + * @param allow_list Optional list of allowed endpoint types. If provided: + * - If remote_endpoint_type is RemoteEndpointType::AUTO, Types are tried in the exact order + * specified until a match is found. 
+ * - In explicit mode, the specified type must be in this list, otherwise an exception is + * thrown. + * + * If not provided, defaults to all supported types in this order: RemoteEndpointType::S3, + * RemoteEndpointType::S3_PRESIGNED_URL, RemoteEndpointType::WEBHDFS, and + * RemoteEndpointType::HTTP. + * @param nbytes Optional file size in bytes. If not provided, the function sends additional + * request to the server to query the file size. + * @return A RemoteHandle object that can be used to read data from the remote file. + * @exception std::runtime_error If: + * - If the URL is malformed or missing required components. + * - RemoteEndpointType::AUTO mode is used and the URL doesn't match any supported endpoint + * type. + * - The specified endpoint type is not in the `allow_list`. + * - The URL is invalid for the specified endpoint type. + * - Unable to connect to the remote server or determine file size (when nbytes not provided). + * + * Example: + * - Auto-detect endpoint type from URL + * @code{.cpp} + * auto handle = kvikio::RemoteHandle::open( + * "https://bucket.s3.amazonaws.com/object?X-Amz-Algorithm=AWS4-HMAC-SHA256" + * "&X-Amz-Credential=...&X-Amz-Signature=..." + * ); + * @endcode + * + * - Open S3 file with explicit endpoint type + * @code{.cpp} + * + * auto handle = kvikio::RemoteHandle::open( + * "https://my-bucket.s3.us-east-1.amazonaws.com/data.bin", + * kvikio::RemoteEndpointType::S3 + * ); + * @endcode + * + * - Restrict endpoint type candidates + * @code{.cpp} + * std::vector allow_list = { + * kvikio::RemoteEndpointType::HTTP, + * kvikio::RemoteEndpointType::S3_PRESIGNED_URL + * }; + * auto handle = kvikio::RemoteHandle::open( + * user_provided_url, + * kvikio::RemoteEndpointType::AUTO, + * allow_list + * ); + * @endcode + * + * - Provide known file size to skip HEAD request + * @code{.cpp} + * auto handle = kvikio::RemoteHandle::open( + * "https://example.com/large-file.bin", + * kvikio::RemoteEndpointType::HTTP, + * std::nullopt, + * 1024 * 1024 * 100 // 100 MB + * ); + * @endcode + */ + static RemoteHandle open(std::string url, + RemoteEndpointType remote_endpoint_type = RemoteEndpointType::AUTO, + std::optional> allow_list = std::nullopt, + std::optional nbytes = std::nullopt); + /** * @brief Create a new remote handle from an endpoint and a file size. * @@ -258,6 +387,13 @@ class RemoteHandle { RemoteHandle(RemoteHandle const&) = delete; RemoteHandle& operator=(RemoteHandle const&) = delete; + /** + * @brief Get the type of the remote file. + * + * @return The type of the remote file. + */ + [[nodiscard]] RemoteEndpointType remote_endpoint_type() const noexcept; + /** * @brief Get the file size. * diff --git a/cpp/src/detail/url.cpp b/cpp/src/detail/url.cpp new file mode 100644 index 0000000000..64f5b8fde9 --- /dev/null +++ b/cpp/src/detail/url.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include + +#define CHECK_CURL_URL_ERR(err_code) check_curl_url_err(err_code, __LINE__, __FILE__) + +namespace kvikio::detail { +namespace { +void check_curl_url_err(CURLUcode err_code, int line_number, char const* filename) +{ + if (err_code == CURLUcode::CURLUE_OK) { return; } + + std::stringstream ss; + ss << "KvikIO detects an URL error at: " << filename << ":" << line_number << ": "; + char const* msg = curl_url_strerror(err_code); + if (msg == nullptr) { + ss << "(no message)"; + } else { + ss << msg; + } + throw std::runtime_error(ss.str()); +} +} // namespace + +CurlUrlHandle::CurlUrlHandle() : _handle(curl_url()) +{ + KVIKIO_EXPECT(_handle != nullptr, + "Libcurl is unable to allocate a URL handle (likely out of memory)."); +} + +CurlUrlHandle::~CurlUrlHandle() noexcept +{ + if (_handle) { curl_url_cleanup(_handle); } +} + +CurlUrlHandle::CurlUrlHandle(CurlUrlHandle&& other) noexcept + : _handle{std::exchange(other._handle, nullptr)} +{ +} + +CurlUrlHandle& CurlUrlHandle::operator=(CurlUrlHandle&& other) noexcept +{ + if (this != &other) { + if (_handle) { curl_url_cleanup(_handle); } + _handle = std::exchange(other._handle, nullptr); + } + + return *this; +} + +CURLU* CurlUrlHandle::get() const { return _handle; } + +std::optional UrlParser::extract_component( + CurlUrlHandle const& handle, + CURLUPart part, + std::optional bitmask_component_flags, + std::optional allowed_err_code) +{ + if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; } + + char* value{}; + auto err_code = curl_url_get(handle.get(), part, &value, bitmask_component_flags.value()); + + if (err_code == CURLUcode::CURLUE_OK && value != nullptr) { + std::string result{value}; + curl_free(value); + return result; + } + + if (allowed_err_code.has_value() && allowed_err_code.value() == err_code) { return std::nullopt; } + + // Throws an exception and explains the reason. 
+ CHECK_CURL_URL_ERR(err_code); + return std::nullopt; +} + +std::optional UrlParser::extract_component( + std::string const& url, + CURLUPart part, + std::optional bitmask_url_flags, + std::optional bitmask_component_flags, + std::optional allowed_err_code) +{ + if (!bitmask_url_flags.has_value()) { bitmask_url_flags = 0U; } + if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; } + + CurlUrlHandle handle; + CHECK_CURL_URL_ERR( + curl_url_set(handle.get(), CURLUPART_URL, url.c_str(), bitmask_url_flags.value())); + + return extract_component(handle, part, bitmask_component_flags, allowed_err_code); +} + +UrlParser::UrlComponents UrlParser::parse(std::string const& url, + std::optional bitmask_url_flags, + std::optional bitmask_component_flags) +{ + if (!bitmask_url_flags.has_value()) { bitmask_url_flags = 0U; } + if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; } + + CurlUrlHandle handle; + CHECK_CURL_URL_ERR( + curl_url_set(handle.get(), CURLUPART_URL, url.c_str(), bitmask_url_flags.value())); + + UrlComponents components; + CURLUcode err_code{}; + + components.scheme = extract_component( + handle, CURLUPART_SCHEME, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_SCHEME); + components.host = extract_component( + handle, CURLUPART_HOST, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_HOST); + components.port = extract_component( + handle, CURLUPART_PORT, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_PORT); + components.path = extract_component(handle, CURLUPART_PATH, bitmask_component_flags.value()); + components.query = extract_component( + handle, CURLUPART_QUERY, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_QUERY); + components.fragment = extract_component( + handle, CURLUPART_FRAGMENT, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_FRAGMENT); + + return components; +} +} // namespace kvikio::detail diff --git a/cpp/src/hdfs.cpp b/cpp/src/hdfs.cpp index 12455b3a26..2e032a1af7 100644 --- a/cpp/src/hdfs.cpp +++ b/cpp/src/hdfs.cpp @@ -25,7 +25,7 @@ namespace kvikio { -WebHdfsEndpoint::WebHdfsEndpoint(std::string url) +WebHdfsEndpoint::WebHdfsEndpoint(std::string url) : RemoteEndpoint{RemoteEndpointType::WEBHDFS} { // todo: Use libcurl URL API for more secure and idiomatic parsing. // Split the URL into two parts: one without query and one with. @@ -64,7 +64,7 @@ WebHdfsEndpoint::WebHdfsEndpoint(std::string host, std::string port, std::string file_path, std::optional username) - : _username{std::move(username)} + : RemoteEndpoint{RemoteEndpointType::WEBHDFS}, _username{std::move(username)} { std::stringstream ss; ss << "http://" << host << ":" << port << "/webhdfs/v1" << file_path; @@ -128,4 +128,15 @@ void WebHdfsEndpoint::setup_range_request(CurlHandle& curl, ss << "op=OPEN&offset=" << file_offset << "&length=" << size; curl.setopt(CURLOPT_URL, ss.str().c_str()); } + +bool WebHdfsEndpoint::is_url_valid(std::string const& url) noexcept +{ + try { + std::regex const pattern(R"(^https?://[^/]+:\d+/webhdfs/v1/.+$)", std::regex_constants::icase); + std::smatch match_result; + return std::regex_match(url, match_result, pattern); + } catch (...) { + return false; + } +} } // namespace kvikio diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 23cf5c6305..3cf2acc862 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include #include @@ -26,7 +27,9 @@ #include #include +#include #include +#include #include #include #include @@ -177,9 +180,69 @@ void setup_range_request_impl(CurlHandle& curl, std::size_t file_offset, std::si curl.setopt(CURLOPT_RANGE, byte_range.c_str()); } +/** + * @brief Whether the given URL is compatible with the S3 endpoint (including the credential-based + * access and presigned URL) which uses HTTP/HTTPS. + * + * @param url A URL. + * @return Boolean answer. + */ +bool url_has_aws_s3_http_format(std::string const& url) +{ + // Currently KvikIO supports the following AWS S3 HTTP URL formats: + static std::array const s3_patterns = { + // Virtual host style: https://.s3..amazonaws.com/ + std::regex(R"(https?://[^/]+\.s3\.[^.]+\.amazonaws\.com/.+$)", std::regex_constants::icase), + + // Path style (deprecated but still popular): + // https://s3..amazonaws.com// + std::regex(R"(https?://s3\.[^.]+\.amazonaws\.com/[^/]+/.+$)", std::regex_constants::icase), + + // Legacy global endpoint: no region code + std::regex(R"(https?://[^/]+\.s3\.amazonaws\.com/.+$)", std::regex_constants::icase), + std::regex(R"(https?://s3\.amazonaws\.com/[^/]+/.+$)", std::regex_constants::icase), + + // Legacy regional endpoint: s3 and region code are delimited by - instead of . + std::regex(R"(https?://[^/]+\.s3-[^.]+\.amazonaws\.com/.+$)", std::regex_constants::icase), + std::regex(R"(https?://s3-[^.]+\.amazonaws\.com/[^/]+/.+$)", std::regex_constants::icase)}; + + return std::any_of(s3_patterns.begin(), s3_patterns.end(), [&url = url](auto const& pattern) { + std::smatch match_result; + return std::regex_match(url, match_result, pattern); + }); +} + +char const* get_remote_endpoint_type_name(RemoteEndpointType remote_endpoint_type) +{ + switch (remote_endpoint_type) { + case RemoteEndpointType::S3: return "S3"; + case RemoteEndpointType::S3_PRESIGNED_URL: return "S3 with presigned URL"; + case RemoteEndpointType::WEBHDFS: return "WebHDFS"; + case RemoteEndpointType::HTTP: return "HTTP"; + case RemoteEndpointType::AUTO: return "AUTO"; + default: + // Unreachable + KVIKIO_FAIL("Unknown RemoteEndpointType: " + + std::to_string(static_cast(remote_endpoint_type))); + return "UNKNOWN"; + } +} } // namespace -HttpEndpoint::HttpEndpoint(std::string url) : _url{std::move(url)} {} +RemoteEndpoint::RemoteEndpoint(RemoteEndpointType remote_endpoint_type) + : _remote_endpoint_type{remote_endpoint_type} +{ +} + +RemoteEndpointType RemoteEndpoint::remote_endpoint_type() const noexcept +{ + return _remote_endpoint_type; +} + +HttpEndpoint::HttpEndpoint(std::string url) + : RemoteEndpoint{RemoteEndpointType::HTTP}, _url{std::move(url)} +{ +} std::string HttpEndpoint::str() const { return _url; } @@ -194,6 +257,19 @@ void HttpEndpoint::setup_range_request(CurlHandle& curl, std::size_t file_offset setup_range_request_impl(curl, file_offset, size); } +bool HttpEndpoint::is_url_valid(std::string const& url) noexcept +{ + try { + auto parsed_url = detail::UrlParser::parse(url); + if ((parsed_url.scheme != "http") && (parsed_url.scheme != "https")) { return false; }; + + // Check whether the file path exists, excluding the leading "/" + return parsed_url.path->length() > 1; + } catch (...) 
{ + return false; + } +} + void HttpEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); } void S3Endpoint::setopt(CurlHandle& curl) @@ -256,7 +332,7 @@ S3Endpoint::S3Endpoint(std::string url, std::optional aws_access_key, std::optional aws_secret_access_key, std::optional aws_session_token) - : _url{std::move(url)} + : RemoteEndpoint{RemoteEndpointType::S3}, _url{std::move(url)} { KVIKIO_NVTX_FUNC_RANGE(); // Regular expression to match http[s]:// @@ -348,8 +424,29 @@ void S3Endpoint::setup_range_request(CurlHandle& curl, std::size_t file_offset, setup_range_request_impl(curl, file_offset, size); } +bool S3Endpoint::is_url_valid(std::string const& url) noexcept +{ + try { + auto parsed_url = detail::UrlParser::parse(url, CURLU_NON_SUPPORT_SCHEME); + + if (parsed_url.scheme == "s3") { + if (!parsed_url.host.has_value()) { return false; } + if (!parsed_url.path.has_value()) { return false; } + + // Check whether the S3 object key exists + std::regex const pattern(R"(^/[^/]+$)", std::regex::icase); + std::smatch match_result; + return std::regex_search(parsed_url.path.value(), match_result, pattern); + } else if ((parsed_url.scheme == "http") || (parsed_url.scheme == "https")) { + return url_has_aws_s3_http_format(url) && !S3EndpointWithPresignedUrl::is_url_valid(url); + } + } catch (...) { + } + return false; +} + S3EndpointWithPresignedUrl::S3EndpointWithPresignedUrl(std::string presigned_url) - : _url{std::move(presigned_url)} + : RemoteEndpoint{RemoteEndpointType::S3_PRESIGNED_URL}, _url{std::move(presigned_url)} { } @@ -439,6 +536,95 @@ void S3EndpointWithPresignedUrl::setup_range_request(CurlHandle& curl, setup_range_request_impl(curl, file_offset, size); } +bool S3EndpointWithPresignedUrl::is_url_valid(std::string const& url) noexcept +{ + try { + if (!url_has_aws_s3_http_format(url)) { return false; } + + auto parsed_url = detail::UrlParser::parse(url); + if (!parsed_url.query.has_value()) { return false; } + + // Reference: https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-query-string-auth.html + return parsed_url.query->find("X-Amz-Algorithm") != std::string::npos && + parsed_url.query->find("X-Amz-Signature") != std::string::npos; + } catch (...) 
{ + return false; + } +} + +RemoteHandle RemoteHandle::open(std::string url, + RemoteEndpointType remote_endpoint_type, + std::optional> allow_list, + std::optional nbytes) +{ + if (!allow_list.has_value()) { + allow_list = {RemoteEndpointType::S3, + RemoteEndpointType::S3_PRESIGNED_URL, + RemoteEndpointType::WEBHDFS, + RemoteEndpointType::HTTP}; + } + + auto const scheme = + detail::UrlParser::extract_component(url, CURLUPART_SCHEME, CURLU_NON_SUPPORT_SCHEME); + KVIKIO_EXPECT(scheme.has_value(), "Missing scheme in URL."); + + // Helper to create endpoint based on type + auto create_endpoint = + [&url = url, &scheme = scheme](RemoteEndpointType type) -> std::unique_ptr { + switch (type) { + case RemoteEndpointType::S3: + if (!S3Endpoint::is_url_valid(url)) { return nullptr; } + if (scheme.value() == "s3") { + auto const [bucket, object] = S3Endpoint::parse_s3_url(url); + return std::make_unique(std::pair{bucket, object}); + } + return std::make_unique(url); + + case RemoteEndpointType::S3_PRESIGNED_URL: + if (!S3EndpointWithPresignedUrl::is_url_valid(url)) { return nullptr; } + return std::make_unique(url); + + case RemoteEndpointType::WEBHDFS: + if (!WebHdfsEndpoint::is_url_valid(url)) { return nullptr; } + return std::make_unique(url); + + case RemoteEndpointType::HTTP: + if (!HttpEndpoint::is_url_valid(url)) { return nullptr; } + return std::make_unique(url); + + default: return nullptr; + } + }; + + std::unique_ptr endpoint; + + if (remote_endpoint_type == RemoteEndpointType::AUTO) { + // Try each allowed type in the order of allowlist + for (auto const& type : allow_list.value()) { + endpoint = create_endpoint(type); + if (endpoint) { break; } + } + KVIKIO_EXPECT(endpoint.get() != nullptr, "Unsupported endpoint URL.", std::runtime_error); + } else { + // Validate it is in the allow list + KVIKIO_EXPECT( + std::find(allow_list->begin(), allow_list->end(), remote_endpoint_type) != allow_list->end(), + std::string{get_remote_endpoint_type_name(remote_endpoint_type)} + + " is not in the allowlist.", + std::runtime_error); + + // Create the specific type + endpoint = create_endpoint(remote_endpoint_type); + KVIKIO_EXPECT(endpoint.get() != nullptr, + std::string{"Invalid URL for "} + + get_remote_endpoint_type_name(remote_endpoint_type) + " endpoint", + std::runtime_error); + } + + return nbytes.has_value() ? 
RemoteHandle(std::move(endpoint), nbytes.value()) + : RemoteHandle(std::move(endpoint)); +} + RemoteHandle::RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) : _endpoint{std::move(endpoint)}, _nbytes{nbytes} { @@ -452,6 +638,11 @@ RemoteHandle::RemoteHandle(std::unique_ptr endpoint) _endpoint = std::move(endpoint); } +RemoteEndpointType RemoteHandle::remote_endpoint_type() const noexcept +{ + return _endpoint->remote_endpoint_type(); +} + std::size_t RemoteHandle::nbytes() const noexcept { return _nbytes; } RemoteEndpoint const& RemoteHandle::endpoint() const noexcept { return *_endpoint; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 41de4bb6fa..a6fd2c67e4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -79,6 +79,7 @@ kvikio_add_test(NAME MMAP_TEST SOURCES test_mmap.cpp) if(KvikIO_REMOTE_SUPPORT) kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) kvikio_add_test(NAME HDFS_TEST SOURCES test_hdfs.cpp utils/hdfs_helper.cpp) + kvikio_add_test(NAME URL_TEST SOURCES test_url.cpp) endif() rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/tests/libkvikio) diff --git a/cpp/tests/test_defaults.cpp b/cpp/tests/test_defaults.cpp index 89bbe7399c..a74f38c86f 100644 --- a/cpp/tests/test_defaults.cpp +++ b/cpp/tests/test_defaults.cpp @@ -19,9 +19,9 @@ #include #include +#include #include -#include "kvikio/compat_mode.hpp" #include "utils/env.hpp" using ::testing::HasSubstr; diff --git a/cpp/tests/test_remote_handle.cpp b/cpp/tests/test_remote_handle.cpp index 918479b0f0..ffb7c82266 100644 --- a/cpp/tests/test_remote_handle.cpp +++ b/cpp/tests/test_remote_handle.cpp @@ -14,12 +14,101 @@ * limitations under the License. */ +#include +#include +#include +#include +#include + +#include #include +#include #include #include "utils/env.hpp" -TEST(RemoteHandleTest, s3_endpoint_constructor) +using ::testing::HasSubstr; +using ::testing::ThrowsMessage; + +class RemoteHandleTest : public testing::Test { + protected: + void SetUp() override + { + _sample_urls = { + // Endpoint type: S3 + {"s3://bucket-name/object-key-name", kvikio::RemoteEndpointType::S3}, + {"https://bucket-name.s3.region-code.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3}, + {"https://s3.region-code.amazonaws.com/bucket-name/object-key-name", + kvikio::RemoteEndpointType::S3}, + {"https://bucket-name.s3.amazonaws.com/object-key-name", kvikio::RemoteEndpointType::S3}, + {"https://s3.amazonaws.com/bucket-name/object-key-name", kvikio::RemoteEndpointType::S3}, + {"https://bucket-name.s3-region-code.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3}, + {"https://s3-region-code.amazonaws.com/bucket-name/object-key-name", + kvikio::RemoteEndpointType::S3}, + + // Endpoint type: S3 presigned URL + {"https://bucket-name.s3.region-code.amazonaws.com/" + "object-key-name?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Signature=sig&X-Amz-Credential=cred&" + "X-Amz-SignedHeaders=host", + kvikio::RemoteEndpointType::S3_PRESIGNED_URL}, + + // Endpoint type: WebHDFS + {"https://host:1234/webhdfs/v1/data.bin", kvikio::RemoteEndpointType::WEBHDFS}, + }; + } + + void TearDown() override {} + + void test_helper(kvikio::RemoteEndpointType expected_endpoint_type, + std::function url_validity_checker) + { + for (auto const& [url, endpoint_type] : _sample_urls) { + if (endpoint_type == expected_endpoint_type) { + // Given that the URL is the expected endpoint type + + // Test URL validity checker + 
EXPECT_TRUE(url_validity_checker(url)); + + // Test unified interface + { + // Here we pass the 1-byte argument to RemoteHandle::open. This prevents the endpoint + // constructor from querying the file size and sending requests to the server, thus + // allowing us to use dummy URLs for testing purpose. + auto remote_handle = + kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); + EXPECT_EQ(remote_handle.remote_endpoint_type(), expected_endpoint_type); + } + + // Test explicit endpoint type specification + { + EXPECT_NO_THROW({ + auto remote_handle = + kvikio::RemoteHandle::open(url, expected_endpoint_type, std::nullopt, 1); + }); + } + } else { + // Given that the URL is NOT the expected endpoint type + + // Test URL validity checker + EXPECT_FALSE(url_validity_checker(url)); + + // Test explicit endpoint type specification + { + EXPECT_ANY_THROW({ + auto remote_handle = + kvikio::RemoteHandle::open(url, expected_endpoint_type, std::nullopt, 1); + }); + } + } + } + } + + std::vector> _sample_urls; +}; + +TEST_F(RemoteHandleTest, s3_endpoint_constructor) { kvikio::test::EnvVarContext env_var_ctx{{"AWS_DEFAULT_REGION", "my_aws_default_region"}, {"AWS_ACCESS_KEY_ID", "my_aws_access_key_id"}, @@ -37,3 +126,145 @@ TEST(RemoteHandleTest, s3_endpoint_constructor) EXPECT_EQ(s1.str(), s2.str()); } + +TEST_F(RemoteHandleTest, test_http_url) +{ + // Invalid URLs + { + std::vector const invalid_urls{// Incorrect scheme + "s3://example.com", + "hdfs://example.com", + // Missing file path + "http://example.com"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::HttpEndpoint::is_url_valid(invalid_url)); + } + } +} + +TEST_F(RemoteHandleTest, test_s3_url) +{ + kvikio::test::EnvVarContext env_var_ctx{{"AWS_DEFAULT_REGION", "my_aws_default_region"}, + {"AWS_ACCESS_KEY_ID", "my_aws_access_key_id"}, + {"AWS_SECRET_ACCESS_KEY", "my_aws_secrete_access_key"}}; + + { + test_helper(kvikio::RemoteEndpointType::S3, kvikio::S3Endpoint::is_url_valid); + } + + // Invalid URLs + { + std::vector const invalid_urls{ + // Lack object-name + "s3://bucket-name", + "https://bucket-name.s3.region-code.amazonaws.com", + // Presigned URL + "https://bucket-name.s3.region-code.amazonaws.com/" + "object-key-name?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Signature=sig&X-Amz-Credential=" + "cred&" + "X-Amz-SignedHeaders=host"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::S3Endpoint::is_url_valid(invalid_url)); + } + } +} + +TEST_F(RemoteHandleTest, test_s3_url_with_presigned_url) +{ + { + test_helper(kvikio::RemoteEndpointType::S3_PRESIGNED_URL, + kvikio::S3EndpointWithPresignedUrl::is_url_valid); + } + + // Invalid URLs + { + std::vector const invalid_urls{ + // Presigned URL should not use S3 scheme + "s3://bucket-name/object-key-name", + + // Completely missing query + "https://bucket-name.s3.region-code.amazonaws.com/object-key-name", + + // Missing key parameters ("X-Amz-..."") in query + "https://bucket-name.s3.region-code.amazonaws.com/object-key-name?k0=v0&k1=v2"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::S3EndpointWithPresignedUrl::is_url_valid(invalid_url)); + } + } +} + +TEST_F(RemoteHandleTest, test_webhdfs_url) +{ + { + test_helper(kvikio::RemoteEndpointType::WEBHDFS, kvikio::WebHdfsEndpoint::is_url_valid); + } + + // Invalid URLs + { + std::vector const invalid_urls{// Missing file + "https://host:1234/webhdfs/v1", + "https://host:1234/webhdfs/v1/", + + // Missing WebHDFS identifier + 
"https://host:1234/data.bin", + + // Missing port number + "https://host/webhdfs/v1/data.bin"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::WebHdfsEndpoint::is_url_valid(invalid_url)); + } + } +} + +TEST_F(RemoteHandleTest, test_open) +{ + // Missing scheme + { + std::vector const urls{ + "example.com/path", "example.com:8080/path", "//example.com/path", "://example.com/path"}; + for (auto const& url : urls) { + EXPECT_THROW( + { kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); }, + std::runtime_error); + } + } + + // Unsupported type + { + std::string const url{"unsupported://example.com/path"}; + EXPECT_THAT( + [&] { kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); }, + ThrowsMessage(HasSubstr("Unsupported endpoint URL"))); + } + + // Specified URL not in the allowlist + { + std::string const url{"https://host:1234/webhdfs/v1/data.bin"}; + std::vector> const wrong_allowlists{ + {}, + {kvikio::RemoteEndpointType::S3}, + }; + for (auto const& wrong_allowlist : wrong_allowlists) { + EXPECT_THAT( + [&] { + kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::WEBHDFS, wrong_allowlist, 1); + }, + ThrowsMessage(HasSubstr("is not in the allowlist"))); + } + } + + // Invalid URLs + { + std::vector> const invalid_urls{ + {"s3://bucket-name", kvikio::RemoteEndpointType::S3}, + {"https://bucket-name.s3.region-code.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3_PRESIGNED_URL}, + {"https://host:1234/webhdfs/v1", kvikio::RemoteEndpointType::WEBHDFS}, + {"http://example.com", kvikio::RemoteEndpointType::HTTP}, + }; + for (auto const& [invalid_url, endpoint_type] : invalid_urls) { + EXPECT_THAT([&] { kvikio::RemoteHandle::open(invalid_url, endpoint_type, std::nullopt, 1); }, + ThrowsMessage(HasSubstr("Invalid URL"))); + } + } +} diff --git a/cpp/tests/test_url.cpp b/cpp/tests/test_url.cpp new file mode 100644 index 0000000000..ce419ed5a5 --- /dev/null +++ b/cpp/tests/test_url.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +using ::testing::HasSubstr; +using ::testing::ThrowsMessage; + +TEST(UrlTest, parse_scheme) +{ + { + std::vector invalid_scheme_urls{ + "invalid_scheme://host", + // The S3 scheme is not supported by libcurl. Without the CURLU_NON_SUPPORT_SCHEME flag, an + // exception is expected. + "s3://host"}; + + for (auto const& invalid_scheme_url : invalid_scheme_urls) { + EXPECT_THAT([&] { kvikio::detail::UrlParser::parse(invalid_scheme_url); }, + ThrowsMessage(HasSubstr("KvikIO detects an URL error"))); + } + } + + // With the CURLU_NON_SUPPORT_SCHEME flag, the S3 scheme is now accepted. 
+ { + std::vector schemes{"s3", "S3"}; + for (auto const& scheme : schemes) { + auto parsed_url = + kvikio::detail::UrlParser::parse(scheme + "://host", CURLU_NON_SUPPORT_SCHEME); + EXPECT_EQ(parsed_url.scheme.value(), "s3"); // Lowercase due to CURL's normalization + } + } +} + +TEST(UrlTest, parse_host) +{ + std::vector invalid_host_urls{"http://host with spaces.com", + "http://host[brackets].com", + "http://host{braces}.com", + "http://host.com", + R"(http://host\backslash.com)", + "http://host^caret.com", + "http://host`backtick.com"}; + for (auto const& invalid_host_url : invalid_host_urls) { + EXPECT_THROW({ kvikio::detail::UrlParser::parse(invalid_host_url); }, std::runtime_error); + } +} From ecfe488b6f5c9d18efefb0a6cf0ce7c1616e7cd1 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Tue, 26 Aug 2025 16:24:46 -0400 Subject: [PATCH 25/40] Update rapids-dependency-file-generator (#809) This PR updates the rapids-dependency-file-generator hook to get https://github.com/rapidsai/dependency-file-generator/pull/163. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/kvikio/pull/809 --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f095cc2807..17ccb948fc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -95,10 +95,10 @@ repos: - id: verify-codeowners args: [--fix, --project-prefix=kvikio] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.19.0 + rev: v1.20.0 hooks: - id: rapids-dependency-file-generator - args: ["--clean"] + args: ["--clean", "--warn-all", "--strict"] - repo: https://github.com/shellcheck-py/shellcheck-py rev: v0.10.0.1 hooks: From fed9abdf09d4fa727278d7fac265b91e132fb153 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 27 Aug 2025 09:22:11 -0400 Subject: [PATCH 26/40] Fix an S3 parsing bug in the open function. Improve regex usage (#810) AWS S3 provides a non-standard S3 scheme for internal use (such as for AWS CLI). The URL takes the form `s3:///`, where `` may contain `/` characters indicating subdirectories. The newly added `open` function for remote I/O currently uses an incorrect regular expression, causing object names containing subdirectories to be rejected. This PR fixes this bug. This PR also improves the usage of regular expression by making the pattern constant `static`. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/810 --- cpp/src/hdfs.cpp | 12 ++++++------ cpp/src/http_status_codes.cpp | 4 ++-- cpp/src/remote_handle.cpp | 11 +++++------ cpp/tests/test_remote_handle.cpp | 1 + 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/src/hdfs.cpp b/cpp/src/hdfs.cpp index 2e032a1af7..2aa5f53278 100644 --- a/cpp/src/hdfs.cpp +++ b/cpp/src/hdfs.cpp @@ -29,7 +29,7 @@ WebHdfsEndpoint::WebHdfsEndpoint(std::string url) : RemoteEndpoint{RemoteEndpoin { // todo: Use libcurl URL API for more secure and idiomatic parsing. // Split the URL into two parts: one without query and one with. - std::regex const pattern{R"(^([^?]+)\?([^#]*))"}; + std::regex static const pattern{R"(^([^?]+)\?([^#]*))"}; // Regex meaning: // ^: From the start of the line // [^?]+: Matches non-question-mark characters one or more times. 
The question mark ushers in the @@ -51,7 +51,7 @@ WebHdfsEndpoint::WebHdfsEndpoint(std::string url) : RemoteEndpoint{RemoteEndpoin { // Extract user name if provided. In WebHDFS, user name is specified as the key=value pair in // the query - std::regex const pattern{R"(user.name=([^&]+))"}; + std::regex static const pattern{R"(user.name=([^&]+))"}; // Regex meaning: // [^&]+: Matches the non-ampersand character one or more times. The ampersand delimits // different parameters. @@ -104,7 +104,7 @@ std::size_t WebHdfsEndpoint::get_file_size() KVIKIO_EXPECT(http_status_code == 200, "HTTP response is not successful."); // The response is in JSON format. The file size is given by `"length":`. - std::regex const pattern{R"("length"\s*:\s*(\d+)[^\d])"}; + std::regex static const pattern{R"("length"\s*:\s*(\d+)[^\d])"}; // Regex meaning: // \s*: Matches the space character zero or more times. // \d+: Matches the digit one or more times. @@ -132,9 +132,9 @@ void WebHdfsEndpoint::setup_range_request(CurlHandle& curl, bool WebHdfsEndpoint::is_url_valid(std::string const& url) noexcept { try { - std::regex const pattern(R"(^https?://[^/]+:\d+/webhdfs/v1/.+$)", std::regex_constants::icase); - std::smatch match_result; - return std::regex_match(url, match_result, pattern); + std::regex static const pattern(R"(^https?://[^/]+:\d+/webhdfs/v1/.+$)", + std::regex_constants::icase); + return std::regex_match(url, pattern); } catch (...) { return false; } diff --git a/cpp/src/http_status_codes.cpp b/cpp/src/http_status_codes.cpp index 9b9cd3d793..da4a4cc379 100644 --- a/cpp/src/http_status_codes.cpp +++ b/cpp/src/http_status_codes.cpp @@ -31,13 +31,13 @@ std::vector parse_http_status_codes(std::string_view env_var_name, std::string const& status_codes) { // Ensure `status_codes` consists only of 3-digit integers separated by commas, allowing spaces. - std::regex const check_pattern(R"(^\s*\d{3}\s*(\s*,\s*\d{3}\s*)*$)"); + std::regex static const check_pattern(R"(^\s*\d{3}\s*(\s*,\s*\d{3}\s*)*$)"); KVIKIO_EXPECT(std::regex_match(status_codes, check_pattern), std::string{env_var_name} + ": invalid format, expected comma-separated integers.", std::invalid_argument); // Match every integer in `status_codes`. - std::regex const number_pattern(R"(\d+)"); + std::regex static const number_pattern(R"(\d+)"); // For each match, we push_back `std::stoi(match.str())` into `ret`. 
std::vector ret; diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 3cf2acc862..a173804321 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -320,7 +320,7 @@ std::pair S3Endpoint::parse_s3_url(std::string const& { KVIKIO_NVTX_FUNC_RANGE(); // Regular expression to match s3:/// - std::regex const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase}; + std::regex static const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase}; std::smatch matches; if (std::regex_match(s3_url, matches, pattern)) { return {matches[1].str(), matches[2].str()}; } KVIKIO_FAIL("Input string does not match the expected S3 URL format.", std::invalid_argument); @@ -336,7 +336,7 @@ S3Endpoint::S3Endpoint(std::string url, { KVIKIO_NVTX_FUNC_RANGE(); // Regular expression to match http[s]:// - std::regex pattern{R"(^https?://.*)", std::regex_constants::icase}; + std::regex static const pattern{R"(^https?://.*)", std::regex_constants::icase}; KVIKIO_EXPECT(std::regex_search(_url, pattern), "url must start with http:// or https://", std::invalid_argument); @@ -434,9 +434,8 @@ bool S3Endpoint::is_url_valid(std::string const& url) noexcept if (!parsed_url.path.has_value()) { return false; } // Check whether the S3 object key exists - std::regex const pattern(R"(^/[^/]+$)", std::regex::icase); - std::smatch match_result; - return std::regex_search(parsed_url.path.value(), match_result, pattern); + std::regex static const pattern(R"(^/.+$)"); + return std::regex_search(parsed_url.path.value(), pattern); } else if ((parsed_url.scheme == "http") || (parsed_url.scheme == "https")) { return url_has_aws_s3_http_format(url) && !S3EndpointWithPresignedUrl::is_url_valid(url); } @@ -485,7 +484,7 @@ std::size_t callback_header(char* data, std::size_t size, std::size_t num_bytes, // Content-Range: / // Content-Range: /* // Content-Range: */ - std::regex const pattern(R"(Content-Range:[^/]+/(.*))", std::regex::icase); + std::regex static const pattern(R"(Content-Range:[^/]+/(.*))", std::regex::icase); std::smatch match_result; bool found = std::regex_search(header_line, match_result, pattern); if (found) { diff --git a/cpp/tests/test_remote_handle.cpp b/cpp/tests/test_remote_handle.cpp index ffb7c82266..a7e3d1e829 100644 --- a/cpp/tests/test_remote_handle.cpp +++ b/cpp/tests/test_remote_handle.cpp @@ -37,6 +37,7 @@ class RemoteHandleTest : public testing::Test { _sample_urls = { // Endpoint type: S3 {"s3://bucket-name/object-key-name", kvikio::RemoteEndpointType::S3}, + {"s3://bucket-name/object-key-name-dir/object-key-name-file", kvikio::RemoteEndpointType::S3}, {"https://bucket-name.s3.region-code.amazonaws.com/object-key-name", kvikio::RemoteEndpointType::S3}, {"https://s3.region-code.amazonaws.com/bucket-name/object-key-name", From 6efd22dc6ae3389caea7d3e736c7f954b9db0619 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Wed, 27 Aug 2025 09:44:43 -0400 Subject: [PATCH 27/40] Add a unified remote I/O interface that infers the endpoint type from URL (2/2): Python binding (#808) This PR adds Python binding to https://github.com/rapidsai/kvikio/pull/793 Closes #807 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/808 --- cpp/include/kvikio/remote_handle.hpp | 22 ++- docs/source/api.rst | 2 + python/kvikio/kvikio/__init__.py | 3 +- python/kvikio/kvikio/_lib/remote_handle.pyx | 86 +++++++++- python/kvikio/kvikio/remote_file.py | 170 ++++++++++++++++++++ python/kvikio/tests/test_s3_io.py | 61 +++++++ 6 files changed, 333 insertions(+), 11 deletions(-) diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 0d56231d03..1c6c887281 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -35,15 +35,23 @@ namespace kvikio { class CurlHandle; // Prototype /** - * @brief Type of remote file. + * @brief Types of remote file endpoints supported by KvikIO. + * + * This enum defines the different protocols and services that can be used to access remote files. + * It is used to specify or detect the type of remote endpoint when opening files. */ enum class RemoteEndpointType : uint8_t { - AUTO, ///< Let KvikIO infer the type of remote file from the URL and create a proper endpoint. - S3, ///< AWS S3 (based on HTTP/HTTPS protocols). - S3_PRESIGNED_URL, ///< AWS S3 presigned URL (based on HTTP/HTTPS protocols). - WEBHDFS, ///< Apache Hadoop WebHDFS (based on HTTP/HTTPS protocols). - HTTP, ///< Generic HTTP/HTTPS, excluding all the specific types listed above that use HTTP/HTTPS - ///< protocols. + AUTO, ///< Automatically detect the endpoint type from the URL. KvikIO will attempt to infer the + ///< appropriate protocol based on the URL format. + S3, ///< AWS S3 endpoint using credentials-based authentication. Requires AWS environment + ///< variables (such as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION) to be + ///< set. + S3_PRESIGNED_URL, ///< AWS S3 endpoint using a presigned URL. No credentials required as + ///< authentication is embedded in the URL with time-limited access. + WEBHDFS, ///< Apache Hadoop WebHDFS (Web-based Hadoop Distributed File System) endpoint for + ///< accessing files stored in HDFS over HTTP/HTTPS. + HTTP, ///< Generic HTTP or HTTPS endpoint for accessing files from web servers. This is used for + ///< standard web resources that do not fit the other specific categories. }; /** diff --git a/docs/source/api.rst b/docs/source/api.rst index 7ae724dec5..e11f4cf55b 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -51,6 +51,8 @@ RemoteFile ---------- .. currentmodule:: kvikio.remote_file +.. autoclass:: RemoteEndpointType + .. 
autoclass:: RemoteFile :members: diff --git a/python/kvikio/kvikio/__init__.py b/python/kvikio/kvikio/__init__.py index 5c6d8b6353..124698206e 100644 --- a/python/kvikio/kvikio/__init__.py +++ b/python/kvikio/kvikio/__init__.py @@ -16,7 +16,7 @@ from kvikio._version import __git_commit__, __version__ from kvikio.cufile import CuFile, clear_page_cache, get_page_cache_info from kvikio.mmap import Mmap -from kvikio.remote_file import RemoteFile, is_remote_file_available +from kvikio.remote_file import RemoteEndpointType, RemoteFile, is_remote_file_available __all__ = [ "__git_commit__", @@ -26,5 +26,6 @@ "Mmap", "get_page_cache_info", "is_remote_file_available", + "RemoteEndpointType", "RemoteFile", ] diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index 17222685ae..edfd59e78d 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -7,17 +7,25 @@ from typing import Optional from cython.operator cimport dereference as deref -from libc.stdint cimport uintptr_t +from libc.stdint cimport uint8_t, uintptr_t from libcpp.memory cimport make_unique, unique_ptr +from libcpp.optional cimport nullopt, optional from libcpp.pair cimport pair from libcpp.string cimport string from libcpp.utility cimport move, pair +from libcpp.vector cimport vector from kvikio._lib.arr cimport parse_buffer_argument from kvikio._lib.future cimport IOFuture, _wrap_io_future, future -cdef extern from "" nogil: +cdef extern from "" namespace "kvikio" nogil: + cpdef enum class RemoteEndpointType(uint8_t): + AUTO = 0 + S3 = 1 + S3_PRESIGNED_URL = 2 + WEBHDFS = 3 + HTTP = 4 cdef cppclass cpp_RemoteEndpoint "kvikio::RemoteEndpoint": string str() except + @@ -40,7 +48,8 @@ cdef extern from "" nogil: unique_ptr[cpp_RemoteEndpoint] endpoint, size_t nbytes ) except + cpp_RemoteHandle(unique_ptr[cpp_RemoteEndpoint] endpoint) except + - size_t nbytes() except + + RemoteEndpointType remote_endpoint_type() noexcept + size_t nbytes() noexcept const cpp_RemoteEndpoint& endpoint() except + size_t read( void* buf, @@ -53,6 +62,14 @@ cdef extern from "" nogil: size_t file_offset ) except + + @staticmethod + cpp_RemoteHandle cpp_easy_open "open"( + string url, + RemoteEndpointType remote_endpoint_type, + optional[vector[RemoteEndpointType]] allow_list, + optional[size_t] nbytes + ) except + + cdef extern from "" nogil: cdef cppclass cpp_WebHdfsEndpoint "kvikio::WebHdfsEndpoint"(cpp_RemoteEndpoint): cpp_WebHdfsEndpoint(string url) except + @@ -80,6 +97,28 @@ cdef extern from * nogil: """ cdef unique_ptr[cpp_RemoteEndpoint] cast_to_remote_endpoint[T](T handle) except + +# Helper function for the cpp_RemoteHandle.open method to return +# unique_ptr[cpp_RemoteHandle] instead of cpp_RemoteHandle. Due to lack of a nullary +# constructor, cpp_RemoteHandle cannot be created as a stack variable in Cython. 
+cdef extern from * nogil: + """ + inline std::unique_ptr create_remote_handle_from_open( + std::string url, + kvikio::RemoteEndpointType remote_endpoint_type, + std::optional> allow_list, + std::optional nbytes) + { + return std::make_unique( + kvikio::RemoteHandle::open(url, remote_endpoint_type, allow_list, nbytes) + ); + } + """ + cdef unique_ptr[cpp_RemoteHandle] create_remote_handle_from_open( + string url, + RemoteEndpointType remote_endpoint_type, + optional[vector[RemoteEndpointType]] allow_list, + optional[size_t] nbytes + ) except + cdef class RemoteFile: cdef unique_ptr[cpp_RemoteHandle] _handle @@ -209,12 +248,53 @@ cdef class RemoteFile: nbytes ) + @staticmethod + def open( + url: str, + remote_endpoint_type: RemoteEndpointType, + allow_list: Optional[list], + nbytes: Optional[int] + ): + cdef optional[vector[RemoteEndpointType]] cpp_allow_list + cdef vector[RemoteEndpointType] vec_allow_list + if allow_list is None: + cpp_allow_list = nullopt + else: + for allow_item in allow_list: + vec_allow_list.push_back(allow_item.value) + cpp_allow_list = vec_allow_list + + cdef optional[size_t] cpp_nbytes + if nbytes is None: + cpp_nbytes = nullopt + else: + cpp_nbytes = nbytes + + cdef RemoteFile ret = RemoteFile() + cdef unique_ptr[cpp_RemoteHandle] cpp_handle + cdef string cpp_url = _to_string(url) + with nogil: + cpp_handle = create_remote_handle_from_open( + cpp_url, + remote_endpoint_type, + cpp_allow_list, + cpp_nbytes) + ret._handle = move(cpp_handle) + + return ret + def __str__(self) -> str: cdef string ep_str with nogil: ep_str = deref(self._handle).endpoint().str() return f'<{self.__class__.__name__} "{ep_str.decode()}">' + def remote_endpoint_type(self) -> RemoteEndpointType: + cdef RemoteEndpointType result + with nogil: + result = deref(self._handle).remote_endpoint_type() + return result + def nbytes(self) -> int: cdef size_t result with nogil: diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 105c42e438..a3f73d271a 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -3,6 +3,7 @@ from __future__ import annotations +import enum import functools import urllib.parse from typing import Optional @@ -10,6 +11,50 @@ from kvikio.cufile import IOFuture +class RemoteEndpointType(enum.Enum): + """ + Types of remote file endpoints supported by KvikIO. + + This enum defines the different protocols and services that can be used + to access remote files. It is used to specify or detect the type of + remote endpoint when opening files. + + Attributes + ---------- + AUTO : int + Automatically detect the endpoint type from the URL. KvikIO will + attempt to infer the appropriate protocol based on the URL format. + S3 : int + AWS S3 endpoint using credentials-based authentication. Requires + AWS environment variables (such as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, + AWS_DEFAULT_REGION) to be set. + S3_PRESIGNED_URL : int + AWS S3 endpoint using a presigned URL. No credentials required as + authentication is embedded in the URL with time-limited access. + WEBHDFS : int + Apache Hadoop WebHDFS (Web-based Hadoop Distributed File System) + endpoint for accessing files stored in HDFS over HTTP/HTTPS. + HTTP : int + Generic HTTP or HTTPS endpoint for accessing files from web servers. + This is used for standard web resources that do not fit the other + specific categories. + + See Also + -------- + RemoteFile.open : Factory method that uses this enum to specify endpoint types. 
+ """ + + AUTO = 0 + S3 = 1 + S3_PRESIGNED_URL = 2 + WEBHDFS = 3 + HTTP = 4 + + @staticmethod + def _map_to_internal(remote_endpoint_type: RemoteEndpointType): + return _get_remote_module().RemoteEndpointType[remote_endpoint_type.name] + + @functools.cache def is_remote_file_available() -> bool: """Check if the remote module is available""" @@ -185,6 +230,122 @@ def open_webhdfs( """ return RemoteFile(_get_remote_module().RemoteFile.open_webhdfs(url, nbytes)) + @classmethod + def open( + cls, + url: str, + remote_endpoint_type: RemoteEndpointType = RemoteEndpointType.AUTO, + allow_list: Optional[list] = None, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """ + Create a remote file handle from a URL. + + This function creates a RemoteFile for reading data from various remote + endpoints including HTTP/HTTPS servers, AWS S3 buckets, S3 presigned URLs, + and WebHDFS. The endpoint type can be automatically detected from the URL + or explicitly specified. + + Parameters + ---------- + url : str + The URL of the remote file. Supported formats include: + + - S3 with credentials + - S3 presigned URL + - WebHDFS + - HTTP/HTTPS + remote_endpoint_type : RemoteEndpointType, optional + The type of remote endpoint. Default is :class:`RemoteEndpointType.AUTO` + which automatically detects the endpoint type from the URL. Can be + explicitly set to :class:`RemoteEndpointType.S3`, + :class:`RemoteEndpointType.S3_PRESIGNED_URL`, + :class:`RemoteEndpointType.WEBHDFS`, or :class:`RemoteEndpointType.HTTP` + to force a specific endpoint type. + allow_list : list of RemoteEndpointType, optional + List of allowed endpoint types. If provided: + + - If remote_endpoint_type is :class:`RemoteEndpointType.AUTO`, types are + tried in the exact order specified until a match is found. + - In explicit mode, the specified type must be in this list, otherwise an + exception is thrown. + + If not provided, defaults to all supported types in this order: + :class:`RemoteEndpointType.S3`, + :class:`RemoteEndpointType.S3_PRESIGNED_URL`, + :class:`RemoteEndpointType.WEBHDFS`, and :class:`RemoteEndpointType.HTTP`. + nbytes : int, optional + File size in bytes. If not provided, the function sends an additional + request to the server to query the file size. + + Returns + ------- + RemoteFile + A RemoteFile object that can be used to read data from the remote file. + + Raises + ------ + RuntimeError + - If the URL is malformed or missing required components. + - :class:`RemoteEndpointType.AUTO` mode is used and the URL does not match + any supported endpoint type. + - The specified endpoint type is not in the `allow_list`. + - The URL is invalid for the specified endpoint type. + - Unable to connect to the remote server or determine file size + (when nbytes not provided). + + Examples + -------- + - Auto-detect endpoint type from URL: + + .. code-block:: + + handle = RemoteFile.open( + "https://bucket.s3.amazonaws.com/object?X-Amz-Algorithm=AWS4-HMAC-SHA256" + "&X-Amz-Credential=...&X-Amz-Signature=..." + ) + + - Open S3 file with explicit endpoint type: + + .. code-block:: + + handle = RemoteFile.open( + "https://my-bucket.s3.us-east-1.amazonaws.com/data.bin", + remote_endpoint_type=RemoteEndpointType.S3 + ) + + - Restrict endpoint type candidates: + + .. code-block:: + + handle = RemoteFile.open( + user_provided_url, + remote_endpoint_type=RemoteEndpointType.AUTO, + allow_list=[ + RemoteEndpointType.HTTP, + RemoteEndpointType.S3_PRESIGNED_URL + ] + ) + + - Provide known file size to skip HEAD request: + + .. 
code-block:: + + handle = RemoteFile.open( + "https://example.com/large-file.bin", + remote_endpoint_type=RemoteEndpointType.HTTP, + nbytes=1024 * 1024 * 100 # 100 MB + ) + """ + return RemoteFile( + _get_remote_module().RemoteFile.open( + url, + RemoteEndpointType._map_to_internal(remote_endpoint_type), + allow_list, + nbytes, + ) + ) + def close(self) -> None: """Close the file""" pass @@ -198,6 +359,15 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None: def __str__(self) -> str: return str(self._handle) + def remote_endpoint_type(self) -> RemoteEndpointType: + """Get the type of the remote file. + + Returns + ------- + The type of the remote file. + """ + return RemoteEndpointType[self._handle.remote_endpoint_type().name] + def nbytes(self) -> int: """Get the file size. diff --git a/python/kvikio/tests/test_s3_io.py b/python/kvikio/tests/test_s3_io.py index 2256bc1284..510940d284 100644 --- a/python/kvikio/tests/test_s3_io.py +++ b/python/kvikio/tests/test_s3_io.py @@ -156,3 +156,64 @@ def test_read_with_file_offset(s3_base, xp, start, end): b = xp.zeros(shape=(end - start,), dtype=xp.int64) assert f.read(b, file_offset=start * a.itemsize) == b.nbytes xp.testing.assert_array_equal(a[start:end], b) + + +@pytest.mark.parametrize("scheme", ["S3"]) +@pytest.mark.parametrize( + "remote_endpoint_type", + [kvikio.RemoteEndpointType.S3.AUTO, kvikio.RemoteEndpointType.S3], +) +@pytest.mark.parametrize("allow_list", [None, [kvikio.RemoteEndpointType.S3]]) +@pytest.mark.parametrize("nbytes", [None, 1]) +def test_open_valid(s3_base, scheme, remote_endpoint_type, allow_list, nbytes): + bucket_name = "bucket_name" + object_name = "object_name" + data = b"file content" + with s3_context( + s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(data)} + ) as server_address: + if scheme == "S3": + url = f"{scheme}://{bucket_name}/{object_name}" + else: + url = f"{server_address}/{bucket_name}/{object_name}" + + if nbytes is None: + expected_file_size = len(data) + else: + expected_file_size = nbytes + + with kvikio.RemoteFile.open(url, remote_endpoint_type, allow_list, nbytes) as f: + assert f.nbytes() == expected_file_size + assert f.remote_endpoint_type() == kvikio.RemoteEndpointType.S3 + + +def test_open_invalid(s3_base): + bucket_name = "bucket_name" + object_name = "object_name" + data = b"file content" + with s3_context( + s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(data)} + ) as server_address: + # Missing scheme + url = f"://{bucket_name}/{object_name}" + with pytest.raises(RuntimeError, match="Bad scheme"): + kvikio.RemoteFile.open(url) + + # Unsupported type + url = f"unsupported://{bucket_name}/{object_name}" + with pytest.raises(RuntimeError, match="Unsupported endpoint URL"): + kvikio.RemoteFile.open(url) + + # Specified URL not in the allowlist + url = f"{server_address}/{bucket_name}/{object_name}" + with pytest.raises(RuntimeError, match="not in the allowlist"): + kvikio.RemoteFile.open( + url, kvikio.RemoteEndpointType.S3, [kvikio.RemoteEndpointType.WEBHDFS] + ) + + # Invalid URLs + url = f"s3://{bucket_name}" + with pytest.raises(RuntimeError, match="Unsupported endpoint URL"): + kvikio.RemoteFile.open(url) + with pytest.raises(RuntimeError, match="Invalid URL"): + kvikio.RemoteFile.open(url, kvikio.RemoteEndpointType.S3) From 2ec42b4a565872a781765f7690b91b41535b1897 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 5 Sep 2025 14:54:32 -0500 Subject: [PATCH 28/40] Use branch-25.10 again (#812) Contributes to 
https://github.com/rapidsai/build-planning/issues/208 Now that https://github.com/rapidsai/shared-workflows/pull/413 is merged, this converts all GitHub Actions references from `@cuda13.0` back to `branch-25.10`. ## Notes for Reviewers This is safe to admin-merge because the change is a no-op... configs on those 2 branches are identical. --- .github/workflows/build.yaml | 16 ++++++------ .github/workflows/pr.yaml | 26 +++++++++---------- .github/workflows/test.yaml | 6 ++--- .../trigger-breaking-change-alert.yaml | 2 +- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index e119ba8ac1..0070274f37 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,7 +34,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -44,7 +44,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -54,7 +54,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -76,7 +76,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -89,7 +89,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,7 +101,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml 
index 4afc5ca548..f610dce204 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -27,7 +27,7 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.10 if: always() with: needs: ${{ toJSON(needs) }} @@ -43,7 +43,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.10 with: files_yaml: | test_cpp: @@ -86,20 +86,20 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.10 with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_cpp.sh conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -107,7 +107,7 @@ jobs: conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -117,14 +117,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_python.sh conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -132,7 +132,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -142,7 +142,7 @@ jobs: devcontainer: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 with: arch: '["amd64", "arm64"]' cuda: '["13.0"]' @@ -160,7 +160,7 @@ jobs: sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; wheel-cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: 
pull-request @@ -170,7 +170,7 @@ jobs: wheel-python-build: needs: wheel-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_wheel_python.sh @@ -179,7 +179,7 @@ jobs: wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4c4ff60385..5f618498b3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 72751d071b..48bf37afc4 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@cuda13.0 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.10 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} From f4e022e9fcb768421c6e6ab5dddb0611b746fac8 Mon Sep 17 00:00:00 2001 From: ahoyle-nvidia Date: Tue, 9 Sep 2025 06:30:04 -0700 Subject: [PATCH 29/40] Skip max_device_cache_size setter when BAR1 memory isn't present on the GPUs in the system (#814) We've seen multiple issues over the months from DGX Spark users when it comes to this specific file. This PR address these issues by applying a skip for the max_device_cache_size (cuFileDriverSetMaxCacheSize) setter by examining the output of nvidia-smi. Authors: - https://github.com/ahoyle-nvidia - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/814 --- python/kvikio/tests/test_cufile_driver.py | 34 +++++++++++++++++------ 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/python/kvikio/tests/test_cufile_driver.py b/python/kvikio/tests/test_cufile_driver.py index d85cd35d3e..ed715b859a 100644 --- a/python/kvikio/tests/test_cufile_driver.py +++ b/python/kvikio/tests/test_cufile_driver.py @@ -1,11 +1,21 @@ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. +import subprocess + import pytest import kvikio.cufile_driver +def has_bar_memory() -> bool: + try: + output = subprocess.check_output(["nvidia-smi"], text=True) + return "Not Supported" not in output + except Exception: + return False + + def test_version(): major, minor = kvikio.cufile_driver.libcufile_version() assert major >= 0 @@ -56,16 +66,22 @@ def test_property_accessor(): max_device_cache_size_default = kvikio.cufile_driver.get( "max_device_cache_size" ) - with kvikio.cufile_driver.set( - {"poll_mode": True, "max_device_cache_size": 2048} - ): - assert kvikio.cufile_driver.get("poll_mode") and ( - kvikio.cufile_driver.get("max_device_cache_size") == 2048 + if has_bar_memory(): + with kvikio.cufile_driver.set( + {"poll_mode": True, "max_device_cache_size": 2048} + ): + assert kvikio.cufile_driver.get("poll_mode") and ( + kvikio.cufile_driver.get("max_device_cache_size") == 2048 + ) + assert (kvikio.cufile_driver.get("poll_mode") == poll_mode_default) and ( + kvikio.cufile_driver.get("max_device_cache_size") + == max_device_cache_size_default ) - assert (kvikio.cufile_driver.get("poll_mode") == poll_mode_default) and ( - kvikio.cufile_driver.get("max_device_cache_size") - == max_device_cache_size_default - ) + else: + with kvikio.cufile_driver.set("poll_mode", True): + assert kvikio.cufile_driver.get("poll_mode") + assert kvikio.cufile_driver.get("poll_mode") == poll_mode_default + except RuntimeError as e: if "KvikIO not compiled with cuFile.h" in str(e): pytest.skip("KvikIO not compiled with cuFile.h, skipping cuFile tests") From c87a02201829053019f033a8afb7aa99f7518987 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 10 Sep 2025 18:11:52 -0500 Subject: [PATCH 30/40] Added KVIKIO_REMOTE_VERBOSE option (#815) Our HTTP library, libcurl, includes a [`CURLOPT_VERBOSE`](https://curl.se/libcurl/c/CURLOPT_VERBOSE.html) setting that can be useful for debugging. To help our users debug things, I've added a new `KVIKIO_REMOTE_VERBOSE` option that configures this. By default, it's off (no change). If the user sets `KVIKIO_REMOTE_VERBOSE=1` then information from the HTTP requests and responses will be printed to stderr. 
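For example, a minimal sketch of turning this on from Python (not part of the change itself; the URL is a placeholder and any reachable object would do — the variable just needs to be set before the first request is issued):

```python
import os

# Assumption for this sketch: libcurl reads the option when each request is made,
# so the variable must be set before any remote I/O is performed.
os.environ["KVIKIO_REMOTE_VERBOSE"] = "1"

import numpy as np
import kvikio

# Placeholder URL; with verbose mode on, libcurl prints request/response details
# (headers, connection and TLS information) to stderr.
with kvikio.RemoteFile.open("https://example.com/data.bin") as f:
    buf = np.empty(f.nbytes(), dtype=np.uint8)
    f.read(buf)
```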
Authors: - Tom Augspurger (https://github.com/TomAugspurger) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/kvikio/pull/815 --- cpp/doxygen/main_page.md | 7 +++++++ cpp/src/shim/libcurl.cpp | 4 ++++ docs/source/runtime_settings.rst | 11 +++++++++++ python/kvikio/kvikio/defaults.py | 3 ++- 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index cd9fe05557..a74bad75d6 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -137,6 +137,13 @@ Note that if you're reading a large file that has been split into multiple reads These settings can also be controlled by `defaults::http_max_attempts()`, `defaults::http_max_attempts_reset()`, `defaults::http_status_codes()`, and `defaults::http_status_codes_reset()`. +#### Remote Verbose (KVIKIO_REMOTE_VERBOSE) +For debugging HTTP requests, you can enable verbose output that shows detailed information about HTTP communication including headers, request/response bodies, connection details, and SSL handshake information. + +Set the environment variable `KVIKIO_REMOTE_VERBOSE` to `true`, `on`, `yes`, or `1` (case-insensitive) to enable verbose output. Otherwise, verbose output is disabled by default. + +**Warning** this may show sensitive contents from headers and data. + ## Example ```cpp diff --git a/cpp/src/shim/libcurl.cpp b/cpp/src/shim/libcurl.cpp index 613dad32f8..feddf33254 100644 --- a/cpp/src/shim/libcurl.cpp +++ b/cpp/src/shim/libcurl.cpp @@ -112,6 +112,10 @@ CurlHandle::CurlHandle(LibCurl::UniqueHandlePtr handle, // Make requests time out after `value` seconds. setopt(CURLOPT_TIMEOUT, kvikio::defaults::http_timeout()); + + // Optionally enable verbose output if it's configured. + auto const verbose = getenv_or("KVIKIO_REMOTE_VERBOSE", false); + if (verbose) { setopt(CURLOPT_VERBOSE, 1L); } } CurlHandle::~CurlHandle() noexcept { LibCurl::instance().retain_handle(std::move(_handle)); } diff --git a/docs/source/runtime_settings.rst b/docs/source/runtime_settings.rst index bb347ba23c..0e304c9ed5 100644 --- a/docs/source/runtime_settings.rst +++ b/docs/source/runtime_settings.rst @@ -53,3 +53,14 @@ KvikIO will retry a request should any of the HTTP status code in ``KVIKIO_HTTP_ The maximum number of attempts to make before throwing an exception is controlled by ``KVIKIO_HTTP_MAX_ATTEMPTS``. The default value is 3. This setting can also be accessed using :py:func:`kvikio.defaults.http_max_attempts` (getter) and :py:func:`kvikio.defaults.set` (setter). The maximum duration of each HTTP request is controlled by ``KVIKIO_HTTP_TIMEOUT``. The default value is 60, which is the duration in seconds to allow. This setting can also be accessed using :py:func:`kvikio.defaults.http_timoeout` (getter) and :py:func:`kvikio.defaults.set` (setter). + +HTTP Verbose ``KVIKIO_REMOTE_VERBOSE`` +-------------------------------------- + +For debugging HTTP requests, you can enable verbose output that shows detailed information about HTTP communication including headers, request/response bodies, connection details, and SSL handshake information. + +Set the environment variable ``KVIKIO_REMOTE_VERBOSE`` to ``true``, ``on``, ``yes``, or ``1`` (case-insensitive) to enable verbose output. Otherwise, verbose output is disabled by default. + +.. warning:: + + This may show sensitive contents from headers and data. 
diff --git a/python/kvikio/kvikio/defaults.py b/python/kvikio/kvikio/defaults.py index be57d2739c..ee0ebf5f95 100644 --- a/python/kvikio/kvikio/defaults.py +++ b/python/kvikio/kvikio/defaults.py @@ -124,7 +124,7 @@ def set(*config) -> ConfigContextManager: - ``"bounce_buffer_size"`` - ``"http_max_attempts"`` - ``"http_status_codes"`` - - ``*http_timeout*`` + - ``"http_timeout"`` Returns ------- @@ -167,6 +167,7 @@ def get(config_name: str) -> Any: - ``"bounce_buffer_size"`` - ``"http_max_attempts"`` - ``"http_status_codes"`` + - ``"http_timeout"`` Returns ------- From d637c6ecb0d32259e9a5e2d8137f875c7de569e2 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 12 Sep 2025 09:15:32 -0400 Subject: [PATCH 31/40] Use C++20 for KvikIO main library (#819) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous PR https://github.com/rapidsai/kvikio/pull/749 forgets to bring the entrée to the table: Only the C++ code in tests and benchmarks use C++20, but not the main library. This PR fixes this oversight. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/819 --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6f9c249cfa..f94409c57e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -205,7 +205,7 @@ set_target_properties( kvikio PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" - CXX_STANDARD 17 + CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON From dde14359d9461707da865d1c65bebc268954a03e Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 15 Sep 2025 09:02:55 -0400 Subject: [PATCH 32/40] Explicitly manage TLS/SSL CA paths for remote I/O (#817) ## Background `libcurl` have two path parameters related to the certificate authority (CA): - `CURLOPT_CAINFO`, which specifies the CA bundle file path. - `CURLOPT_CAPATH`, which specifies the directory of individual CA certificates with hash-based naming. The default paths are determined at compile-time, which can cause issues if the Linux distributions where `libcurl` is built and run are different (e.g. on Rocky Linux vs Ubuntu as in our CI vs our lab system), and the certificates files are likely at different locations. This problem has been observed in KvikIO's wheel distribution, where HTTPS would fail with the message: >error setting certificate verify locations: CAfile: /etc/pki/tls/certs/ca-bundle.crt CApath: /etc/ssl/certs ## This PR This PR addresses this problem. The certificate path is now explicitly searched for in the following order. The compile-time parameters, if any, are still used but treated with lowest priority. - CA bundle file: Check env vars `CURL_CA_BUNDLE`, and `SSL_CERT_FILE` - CA directory: Check env vars `SSL_CERT_DIR` - CA bundle file: Search a set of distribution-specific locations for accessible bundle - CA directory: Search a set of distribution-specific locations for accessible directory - CA bundle file: Check if the compile-time path is given and accessible - CA directory: Check if the compile-time parameter is given and accessible Depends on https://github.com/rapidsai/kvikio/pull/819 for the use of `static` structured binding which is only available in C++ >=20 Closes #711 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/817 --- cpp/CMakeLists.txt | 2 +- cpp/include/kvikio/detail/tls.hpp | 58 ++++++++++++ cpp/src/detail/tls.cpp | 150 ++++++++++++++++++++++++++++++ cpp/src/shim/libcurl.cpp | 3 + cpp/tests/CMakeLists.txt | 1 + cpp/tests/test_tls.cpp | 48 ++++++++++ docs/source/runtime_settings.rst | 10 ++ 7 files changed, 271 insertions(+), 1 deletion(-) create mode 100644 cpp/include/kvikio/detail/tls.hpp create mode 100644 cpp/src/detail/tls.cpp create mode 100644 cpp/tests/test_tls.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f94409c57e..9c66e51875 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -163,7 +163,7 @@ set(SOURCES if(KvikIO_REMOTE_SUPPORT) list(APPEND SOURCES "src/hdfs.cpp" "src/remote_handle.cpp" "src/detail/remote_handle.cpp" - "src/detail/url.cpp" "src/shim/libcurl.cpp" + "src/detail/tls.cpp" "src/detail/url.cpp" "src/shim/libcurl.cpp" ) endif() diff --git a/cpp/include/kvikio/detail/tls.hpp b/cpp/include/kvikio/detail/tls.hpp new file mode 100644 index 0000000000..be40eaf5e5 --- /dev/null +++ b/cpp/include/kvikio/detail/tls.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +namespace kvikio::detail { +/** + * @brief Search for the CA bundle file and directory paths + * + * This function searches for the Certificate Authority (CA) paths required for TLS/SSL verification + * in libcurl. The search is performed in the following priority order, returning as soon as either + * a bundle file or a directory is found: + * - CA bundle file: Check env vars CURL_CA_BUNDLE, SSL_CERT_FILE + * - CA directory: Check env var SSL_CERT_DIR + * - CA bundle file: Search distribution-specific locations for accessible bundle + * - CA directory: Search distribution-specific locations for accessible directory + * - CA bundle file: Check if curl's compile-time default bundle path is accessible + * - CA directory: Check if curl's compile-time default directory path is accessible + * + * @return Result containing CA bundle file and CA certificate directory + * + * @exception std::runtime_error if neither CA bundle nor directory is found + * + * @note Environment Variables: + * - CURL_CA_BUNDLE: Override CA bundle file location (curl-specific) + * - SSL_CERT_FILE: Override CA bundle file location (OpenSSL-compatible) + * - SSL_CERT_DIR: Override CA directory location (OpenSSL-compatible) + */ +std::pair, std::optional> get_ca_paths(); + +/** + * @brief Configure curl handle with discovered CA certificate paths + * + * As a performance optimization, the discovered CA certificate paths are cached to avoid repeated + * searching. 
+ * + * @param curl Curl handle to configure with CA certificate paths + */ +void set_up_ca_paths(CurlHandle& curl); +} // namespace kvikio::detail diff --git a/cpp/src/detail/tls.cpp b/cpp/src/detail/tls.cpp new file mode 100644 index 0000000000..0dccb6b612 --- /dev/null +++ b/cpp/src/detail/tls.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace kvikio::detail { + +namespace { +/** + * @brief Search for a CA certificate path using environment variables + * + * @param env_vars Environment variable names to check in order + * @return Path string if found in any environment variable, std::nullopt otherwise + */ +std::optional find_ca_path_from_env_var(std::vector const& env_vars) +{ + for (auto const& env_var : env_vars) { + auto const* path = std::getenv(env_var.data()); + if (path != nullptr) { return path; } + } + + return std::nullopt; +} + +/** + * @brief Search for a CA certificate path in standard system locations + * + * @param system_paths file system paths to check in order + * @return First accessible path if found, std::nullopt otherwise + */ +std::optional find_ca_path_in_system_locations( + std::vector const& system_paths) +{ + for (auto const& path : system_paths) { + // Check whether the file/directory exists, and whether it grants read permission to the calling + // process's real UID and GID. If the path is a symbolic link, it is dereferenced. 
+ auto const result = access(path.data(), R_OK); + + if (result != -1) { return path; } + } + + return std::nullopt; +} + +/** + * @brief Get CA certificate path from curl's compile-time defaults + * + * @param default_path Path provided by curl_version_info (may be nullptr) + * @return Path string if accessible, std::nullopt otherwise + */ +std::optional get_ca_path_from_curl_defaults(char const* default_path) +{ + if (default_path != nullptr && access(default_path, R_OK) != -1) { return default_path; } + + return std::nullopt; +} +} // namespace + +std::pair, std::optional> get_ca_paths() +{ + auto* version_info = curl_version_info(::CURLVERSION_NOW); + KVIKIO_EXPECT(version_info != nullptr, "Failed to get curl version info", std::runtime_error); + + std::optional ca_bundle_file; + std::optional ca_directory; + + // Priority 1: CA bundle file from environment variables + ca_bundle_file = find_ca_path_from_env_var({ + "CURL_CA_BUNDLE", // curl program + "SSL_CERT_FILE" // OpenSSL + }); + if (ca_bundle_file.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 2: CA directory from environment variables + ca_directory = find_ca_path_from_env_var({ + "SSL_CERT_DIR" // OpenSSL + }); + if (ca_directory.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 3: CA bundle file from system locations + ca_bundle_file = find_ca_path_in_system_locations( + {"/etc/ssl/certs/ca-certificates.crt", // Debian/Ubuntu, Arch, Alpine, Gentoo + "/etc/pki/tls/certs/ca-bundle.crt", // RHEL/CentOS/Rocky/AlmaLinux, Fedora + "/etc/ssl/ca-bundle.pem", // OpenSUSE/SLES + "/etc/pki/tls/cert.pem", // RHEL-based (symlink to ca-bundle.crt) + "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", // Fedora 28+, RHEL 8+ + + // Additional locations mentioned by libcurl: + // https://github.com/curl/curl/blob/master/CMakeLists.txt + "/usr/share/ssl/certs/ca-bundle.crt", + "/usr/local/share/certs/ca-root-nss.crt", + "/etc/ssl/cert.pem"}); + if (ca_bundle_file.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 4: CA directory from system locations + ca_directory = find_ca_path_in_system_locations({ + "/etc/ssl/certs", // Debian/Ubuntu, Arch, Alpine, OpenSUSE, Gentoo + "/etc/pki/tls/certs" // RHEL/CentOS/Rocky/AlmaLinux, Fedora + }); + if (ca_directory.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 5: CA bundle file from curl compile-time defaults + ca_bundle_file = get_ca_path_from_curl_defaults(version_info->cainfo); + if (ca_bundle_file.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 6: CA directory from curl compile-time defaults + ca_directory = get_ca_path_from_curl_defaults(version_info->capath); + if (ca_directory.has_value()) { return {ca_bundle_file, ca_directory}; } + + // At least one path must exist + KVIKIO_EXPECT(ca_bundle_file.has_value() || ca_directory.has_value(), + "Failed to find accessible CA certificates.", + std::runtime_error); + return {ca_bundle_file, ca_directory}; +} + +void set_up_ca_paths(CurlHandle& curl) +{ + static auto const [ca_bundle_file, ca_directory] = get_ca_paths(); + + if (ca_bundle_file.has_value()) { + curl.setopt(CURLOPT_CAINFO, ca_bundle_file->c_str()); + curl.setopt(CURLOPT_CAPATH, nullptr); + } else if (ca_directory.has_value()) { + curl.setopt(CURLOPT_CAINFO, nullptr); + curl.setopt(CURLOPT_CAPATH, ca_directory->c_str()); + } +} +} // namespace kvikio::detail diff --git a/cpp/src/shim/libcurl.cpp b/cpp/src/shim/libcurl.cpp index feddf33254..a78fb33d30 100644 --- 
a/cpp/src/shim/libcurl.cpp +++ b/cpp/src/shim/libcurl.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -116,6 +117,8 @@ CurlHandle::CurlHandle(LibCurl::UniqueHandlePtr handle, // Optionally enable verbose output if it's configured. auto const verbose = getenv_or("KVIKIO_REMOTE_VERBOSE", false); if (verbose) { setopt(CURLOPT_VERBOSE, 1L); } + + detail::set_up_ca_paths(*this); } CurlHandle::~CurlHandle() noexcept { LibCurl::instance().retain_handle(std::move(_handle)); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a6fd2c67e4..afa7e8d97b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -79,6 +79,7 @@ kvikio_add_test(NAME MMAP_TEST SOURCES test_mmap.cpp) if(KvikIO_REMOTE_SUPPORT) kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) kvikio_add_test(NAME HDFS_TEST SOURCES test_hdfs.cpp utils/hdfs_helper.cpp) + kvikio_add_test(NAME TLS_TEST SOURCES test_tls.cpp utils/env.cpp) kvikio_add_test(NAME URL_TEST SOURCES test_url.cpp) endif() diff --git a/cpp/tests/test_tls.cpp b/cpp/tests/test_tls.cpp new file mode 100644 index 0000000000..4b04f10296 --- /dev/null +++ b/cpp/tests/test_tls.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include "utils/env.hpp" + +TEST(TlsTest, get_ca_paths) +{ + std::string const expected_ca_bundle_path{"ca_bundle_path"}; + std::string const expected_ca_directory{"ca_directory"}; + { + // Env var CURL_CA_BUNDLE has the highest priority. Both SSL_CERT_FILE and SSL_CERT_DIR shall be + // skipped + kvikio::test::EnvVarContext env_var_ctx{{"CURL_CA_BUNDLE", expected_ca_bundle_path}, + {"SSL_CERT_FILE", "another_ca_bundle_path"}, + {"SSL_CERT_DIR", expected_ca_directory}}; + auto const& [ca_bundle_file, ca_directory] = kvikio::detail::get_ca_paths(); + + EXPECT_EQ(ca_bundle_file, expected_ca_bundle_path); + EXPECT_EQ(ca_directory, std::nullopt); + } + + { + // Env var CURL_CA_BUNDLE and SSL_CERT_FILE are not specified, SSL_CERT_DIR shall be used + kvikio::test::EnvVarContext env_var_ctx{{"SSL_CERT_DIR", expected_ca_directory}}; + auto const& [ca_bundle_file, ca_directory] = kvikio::detail::get_ca_paths(); + + EXPECT_EQ(ca_bundle_file, std::nullopt); + EXPECT_EQ(ca_directory, expected_ca_directory); + } +} diff --git a/docs/source/runtime_settings.rst b/docs/source/runtime_settings.rst index 0e304c9ed5..e707031720 100644 --- a/docs/source/runtime_settings.rst +++ b/docs/source/runtime_settings.rst @@ -64,3 +64,13 @@ Set the environment variable ``KVIKIO_REMOTE_VERBOSE`` to ``true``, ``on``, ``ye .. warning:: This may show sensitive contents from headers and data. 
+ +CA bundle file and CA directory ``CURL_CA_BUNDLE``, ``SSL_CERT_FILE``, ``SSL_CERT_DIR`` +--------------------------------------------------------------------------------------- + +The Certificate Authority (CA) paths required for TLS/SSL verification in ``libcurl`` can be explicitly specified using the following environment variables in order of overriding priority: + + * ``CURL_CA_BUNDLE`` (also used in the ``curl`` program) or ``SSL_CERT_FILE`` (also used in OpenSSL): Specifies the CA certificate bundle file location. + * ``SSL_CERT_DIR`` (also used in OpenSSL): Specifies the CA certificate directory. + +When neither is specified, KvikIO searches several standard system locations for the CA file and directory, and if the search fails falls back to the libcurl compile-time defaults. From b8626abbe9cfd341e7542b23e1d19bd13a685873 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 15 Sep 2025 08:27:11 -0700 Subject: [PATCH 33/40] Reduce duplication between compat manager and defaults (#816) Some of these APIs were identical but presumably duplicated due to otherwise creating a circular include dependency. Moving the manager out of the compat_mode header resolves that and allows us to remove the duplication. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/816 --- cpp/CMakeLists.txt | 1 + cpp/include/kvikio/compat_mode.hpp | 102 +------------------ cpp/include/kvikio/compat_mode_manager.hpp | 106 ++++++++++++++++++++ cpp/include/kvikio/file_handle.hpp | 4 +- cpp/src/compat_mode.cpp | 99 ------------------- cpp/src/compat_mode_manager.cpp | 110 +++++++++++++++++++++ 6 files changed, 219 insertions(+), 203 deletions(-) create mode 100644 cpp/include/kvikio/compat_mode_manager.hpp create mode 100644 cpp/src/compat_mode_manager.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9c66e51875..5db5fa6f50 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -144,6 +144,7 @@ set(SOURCES "src/bounce_buffer.cpp" "src/buffer.cpp" "src/compat_mode.cpp" + "src/compat_mode_manager.cpp" "src/http_status_codes.cpp" "src/cufile/config.cpp" "src/cufile/driver.cpp" diff --git a/cpp/include/kvikio/compat_mode.hpp b/cpp/include/kvikio/compat_mode.hpp index 03ed443fe0..d25f0ffaef 100644 --- a/cpp/include/kvikio/compat_mode.hpp +++ b/cpp/include/kvikio/compat_mode.hpp @@ -17,9 +17,7 @@ #pragma once #include -#include - -#include +#include namespace kvikio { /** @@ -49,102 +47,4 @@ CompatMode parse_compat_mode_str(std::string_view compat_mode_str); } // namespace detail -// Forward declaration. -class FileHandle; - -/** - * @brief Store and manage the compatibility mode data associated with a FileHandle. - */ -class CompatModeManager { - private: - CompatMode _compat_mode_requested{CompatMode::AUTO}; - bool _is_compat_mode_preferred{true}; - bool _is_compat_mode_preferred_for_async{true}; - - public: - /** - * @brief Construct an empty compatibility mode manager. - */ - CompatModeManager() noexcept = default; - - /** - * @brief Construct a compatibility mode manager associated with a FileHandle. - * - * According to the file path, requested compatibility mode, and the system configuration, the - * compatibility manager: - * - Infers the final compatibility modes for synchronous and asynchronous I/O paths, - * respectively. - * - Initializes the file wrappers and cuFile handle associated with a FileHandle. 
- * - * @param file_path Refer to - * FileHandle::FileHandle(std::string const&, std::string const&, mode_t, CompatMode). - * @param flags Same as above. - * @param mode Same as above. - * @param compat_mode_requested Same as above. - * @param file_handle Pointer to the FileHandle object that owns this compatibility mode manager. - */ - CompatModeManager(std::string const& file_path, - std::string const& flags, - mode_t mode, - CompatMode compat_mode_requested, - FileHandle* file_handle); - - ~CompatModeManager() noexcept = default; - CompatModeManager(const CompatModeManager&) = default; - CompatModeManager& operator=(const CompatModeManager&) = default; - CompatModeManager(CompatModeManager&&) noexcept = default; - CompatModeManager& operator=(CompatModeManager&&) noexcept = default; - - /** - * @brief Functionally identical to defaults::infer_compat_mode_if_auto(CompatMode). - * - * @param compat_mode Compatibility mode. - * @return If the given compatibility mode is CompatMode::AUTO, infer the final compatibility - * mode. - */ - CompatMode infer_compat_mode_if_auto(CompatMode compat_mode) noexcept; - - /** - * @brief Functionally identical to defaults::is_compat_mode_preferred(CompatMode). - * - * @param compat_mode Compatibility mode. - * @return Boolean answer. - */ - bool is_compat_mode_preferred(CompatMode compat_mode) noexcept; - - /** - * @brief Check if the compatibility mode for synchronous I/O of the associated FileHandle is - * expected to be CompatMode::ON. - * - * @return Boolean answer. - */ - bool is_compat_mode_preferred() const noexcept; - - /** - * @brief Check if the compatibility mode for asynchronous I/O of the associated FileHandle is - * expected to be CompatMode::ON. - * - * @return Boolean answer. - */ - bool is_compat_mode_preferred_for_async() const noexcept; - - /** - * @brief Retrieve the original compatibility mode requested. - * - * @return The original compatibility mode requested. - */ - CompatMode compat_mode_requested() const noexcept; - - /** - * @brief Determine if asynchronous I/O can be performed or not (throw exceptions) - * according to the existing compatibility mode data in the manager. - * - * Asynchronous I/O cannot be performed, for instance, when compat_mode_requested() is - * CompatMode::OFF, is_compat_mode_preferred() is CompatMode::OFF, but - * is_compat_mode_preferred_for_async() is CompatMode::ON (due to missing cuFile stream API or - * cuFile configuration file). - */ - void validate_compat_mode_for_async() const; -}; - } // namespace kvikio diff --git a/cpp/include/kvikio/compat_mode_manager.hpp b/cpp/include/kvikio/compat_mode_manager.hpp new file mode 100644 index 0000000000..4d15f2ba9f --- /dev/null +++ b/cpp/include/kvikio/compat_mode_manager.hpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace kvikio { + +// Forward declaration. 
+class FileHandle; + +/** + * @brief Store and manage the compatibility mode data associated with a FileHandle. + */ +class CompatModeManager { + private: + CompatMode _compat_mode_requested{CompatMode::AUTO}; + bool _is_compat_mode_preferred{true}; + bool _is_compat_mode_preferred_for_async{true}; + + public: + /** + * @brief Construct an empty compatibility mode manager. + */ + CompatModeManager() noexcept = default; + + /** + * @brief Construct a compatibility mode manager associated with a FileHandle. + * + * According to the file path, requested compatibility mode, and the system configuration, the + * compatibility manager: + * - Infers the final compatibility modes for synchronous and asynchronous I/O paths, + * respectively. + * - Initializes the file wrappers and cuFile handle associated with a FileHandle. + * + * @param file_path Refer to + * FileHandle::FileHandle(std::string const&, std::string const&, mode_t, CompatMode). + * @param flags Same as above. + * @param mode Same as above. + * @param compat_mode_requested Same as above. + * @param file_handle Pointer to the FileHandle object that owns this compatibility mode manager. + */ + CompatModeManager(std::string const& file_path, + std::string const& flags, + mode_t mode, + CompatMode compat_mode_requested, + FileHandle* file_handle); + + ~CompatModeManager() noexcept = default; + CompatModeManager(const CompatModeManager&) = default; + CompatModeManager& operator=(const CompatModeManager&) = default; + CompatModeManager(CompatModeManager&&) noexcept = default; + CompatModeManager& operator=(CompatModeManager&&) noexcept = default; + + /** + * @brief Check if the compatibility mode for synchronous I/O of the associated FileHandle is + * expected to be CompatMode::ON. + * + * @return Boolean answer. + */ + bool is_compat_mode_preferred() const noexcept; + + /** + * @brief Check if the compatibility mode for asynchronous I/O of the associated FileHandle is + * expected to be CompatMode::ON. + * + * @return Boolean answer. + */ + bool is_compat_mode_preferred_for_async() const noexcept; + + /** + * @brief Retrieve the original compatibility mode requested. + * + * @return The original compatibility mode requested. + */ + CompatMode compat_mode_requested() const noexcept; + + /** + * @brief Determine if asynchronous I/O can be performed or not (throw exceptions) + * according to the existing compatibility mode data in the manager. + * + * Asynchronous I/O cannot be performed, for instance, when compat_mode_requested() is + * CompatMode::OFF, is_compat_mode_preferred() is CompatMode::OFF, but + * is_compat_mode_preferred_for_async() is CompatMode::ON (due to missing cuFile stream API or + * cuFile configuration file). 
+ */ + void validate_compat_mode_for_async() const; +}; + +} // namespace kvikio diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp index 50e1c679c3..78e493c9b0 100644 --- a/cpp/include/kvikio/file_handle.hpp +++ b/cpp/include/kvikio/file_handle.hpp @@ -20,12 +20,10 @@ #include #include -#include -#include -#include #include #include +#include #include #include #include diff --git a/cpp/src/compat_mode.cpp b/cpp/src/compat_mode.cpp index 78a96c66be..d8346253bf 100644 --- a/cpp/src/compat_mode.cpp +++ b/cpp/src/compat_mode.cpp @@ -15,15 +15,11 @@ */ #include -#include #include #include -#include #include -#include #include -#include namespace kvikio { @@ -50,99 +46,4 @@ CompatMode parse_compat_mode_str(std::string_view compat_mode_str) } // namespace detail -CompatMode CompatModeManager::infer_compat_mode_if_auto(CompatMode compat_mode) noexcept -{ - KVIKIO_NVTX_FUNC_RANGE(); - if (compat_mode == CompatMode::AUTO) { - return is_cufile_available() ? CompatMode::OFF : CompatMode::ON; - } - return compat_mode; -} - -bool CompatModeManager::is_compat_mode_preferred(CompatMode compat_mode) noexcept -{ - return compat_mode == CompatMode::ON || - (compat_mode == CompatMode::AUTO && - infer_compat_mode_if_auto(compat_mode) == CompatMode::ON); -} - -bool CompatModeManager::is_compat_mode_preferred() const noexcept -{ - return _is_compat_mode_preferred; -} - -bool CompatModeManager::is_compat_mode_preferred_for_async() const noexcept -{ - return _is_compat_mode_preferred_for_async; -} - -CompatMode CompatModeManager::compat_mode_requested() const noexcept -{ - return _compat_mode_requested; -} - -CompatModeManager::CompatModeManager(std::string const& file_path, - std::string const& flags, - mode_t mode, - CompatMode compat_mode_requested_v, - FileHandle* file_handle) -{ - KVIKIO_NVTX_FUNC_RANGE(); - KVIKIO_EXPECT(file_handle != nullptr, - "The compatibility mode manager does not have a proper owning file handle.", - std::invalid_argument); - - file_handle->_file_direct_off.open(file_path, flags, false, mode); - _is_compat_mode_preferred = is_compat_mode_preferred(compat_mode_requested_v); - - // Nothing to do in compatibility mode - if (_is_compat_mode_preferred) { return; } - - try { - file_handle->_file_direct_on.open(file_path, flags, true, mode); - } catch (...) { - // Try to open the file with the O_DIRECT flag. Fall back to compatibility mode, if it fails. - if (compat_mode_requested_v == CompatMode::AUTO) { - _is_compat_mode_preferred = true; - } else { // CompatMode::OFF - throw; - } - } - - if (_is_compat_mode_preferred) { return; } - - auto error_code = file_handle->_cufile_handle.register_handle(file_handle->_file_direct_on.fd()); - assert(error_code.has_value()); - - // For the AUTO mode, if the first cuFile API call fails, fall back to the compatibility - // mode. 
- if (compat_mode_requested_v == CompatMode::AUTO && error_code.value().err != CU_FILE_SUCCESS) { - _is_compat_mode_preferred = true; - } else { - CUFILE_TRY(error_code.value()); - } - - // Check cuFile async API - static bool const is_extra_symbol_available = is_stream_api_available(); - static bool const is_config_path_empty = config_path().empty(); - _is_compat_mode_preferred_for_async = - _is_compat_mode_preferred || !is_extra_symbol_available || is_config_path_empty; -} - -void CompatModeManager::validate_compat_mode_for_async() const -{ - KVIKIO_NVTX_FUNC_RANGE(); - if (!_is_compat_mode_preferred && _is_compat_mode_preferred_for_async && - _compat_mode_requested == CompatMode::OFF) { - std::string err_msg; - if (!is_stream_api_available()) { err_msg += "Missing the cuFile stream api."; } - - // When checking for availability, we also check if cuFile's config file exists. This is - // because even when the stream API is available, it doesn't work if no config file exists. - if (config_path().empty()) { err_msg += " Missing cuFile configuration file."; } - - KVIKIO_FAIL(err_msg, std::runtime_error); - } -} - } // namespace kvikio diff --git a/cpp/src/compat_mode_manager.cpp b/cpp/src/compat_mode_manager.cpp new file mode 100644 index 0000000000..569e9e73e7 --- /dev/null +++ b/cpp/src/compat_mode_manager.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace kvikio { + +bool CompatModeManager::is_compat_mode_preferred() const noexcept +{ + return _is_compat_mode_preferred; +} + +bool CompatModeManager::is_compat_mode_preferred_for_async() const noexcept +{ + return _is_compat_mode_preferred_for_async; +} + +CompatMode CompatModeManager::compat_mode_requested() const noexcept +{ + return _compat_mode_requested; +} + +CompatModeManager::CompatModeManager(std::string const& file_path, + std::string const& flags, + mode_t mode, + CompatMode compat_mode_requested_v, + FileHandle* file_handle) +{ + KVIKIO_NVTX_FUNC_RANGE(); + KVIKIO_EXPECT(file_handle != nullptr, + "The compatibility mode manager does not have a proper owning file handle.", + std::invalid_argument); + + _compat_mode_requested = compat_mode_requested_v; + file_handle->_file_direct_off.open(file_path, flags, false, mode); + _is_compat_mode_preferred = defaults::is_compat_mode_preferred(compat_mode_requested_v); + + // Nothing to do in compatibility mode + if (_is_compat_mode_preferred) { return; } + + try { + file_handle->_file_direct_on.open(file_path, flags, true, mode); + } catch (...) { + // Try to open the file with the O_DIRECT flag. Fall back to compatibility mode, if it fails. 
+    if (compat_mode_requested_v == CompatMode::AUTO) {
+      _is_compat_mode_preferred = true;
+    } else {  // CompatMode::OFF
+      throw;
+    }
+  }
+
+  if (_is_compat_mode_preferred) { return; }
+
+  auto error_code = file_handle->_cufile_handle.register_handle(file_handle->_file_direct_on.fd());
+  assert(error_code.has_value());
+
+  // For the AUTO mode, if the first cuFile API call fails, fall back to the compatibility
+  // mode.
+  if (compat_mode_requested_v == CompatMode::AUTO && error_code.value().err != CU_FILE_SUCCESS) {
+    _is_compat_mode_preferred = true;
+  } else {
+    CUFILE_TRY(error_code.value());
+  }
+
+  // Check cuFile async API
+  static bool const is_extra_symbol_available = is_stream_api_available();
+  static bool const is_config_path_empty      = config_path().empty();
+  _is_compat_mode_preferred_for_async =
+    _is_compat_mode_preferred || !is_extra_symbol_available || is_config_path_empty;
+}
+
+void CompatModeManager::validate_compat_mode_for_async() const
+{
+  KVIKIO_NVTX_FUNC_RANGE();
+  if (!_is_compat_mode_preferred && _is_compat_mode_preferred_for_async &&
+      _compat_mode_requested == CompatMode::OFF) {
+    std::string err_msg;
+    if (!is_stream_api_available()) { err_msg += "Missing the cuFile stream api."; }
+
+    // When checking for availability, we also check if cuFile's config file exists. This is
+    // because even when the stream API is available, it doesn't work if no config file exists.
+    if (config_path().empty()) { err_msg += " Missing cuFile configuration file."; }
+
+    KVIKIO_FAIL(err_msg, std::runtime_error);
+  }
+}
+
+}  // namespace kvikio

From b69d9aeae79fdae990801c05ba21a2762821af63 Mon Sep 17 00:00:00 2001
From: Tianyu Liu
Date: Tue, 16 Sep 2025 10:13:01 -0400
Subject: [PATCH 34/40] Support access to public S3 (#820)

## Background

KvikIO supports access to private S3 objects that require AWS credentials:

```python
# Method 1
kvikio.RemoteFile.open_s3(bucket, key)

# Method 2
kvikio.RemoteFile.open_s3_url(url)

# Method 3
kvikio.RemoteFile.open(url)
```

For public S3 objects, these functions throw the following exception:

>S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set.

A workaround is to simply use the generic HTTP/HTTPS endpoint:

```python
# Method 1
kvikio.RemoteFile.open_http(http_url)

# Method 2
kvikio.RemoteFile.open(url, RemoteEndpointType.HTTP)
```

However, this workaround loses S3 URL syntax checking.

## This PR

- Adds support for accessing public S3 objects in C++ and Python by introducing a new endpoint type, `S3PublicEndpoint`, which does not require AWS credentials (see the usage sketch below).
- Updates the unified interface `open(url)`, which automatically infers the endpoint type. Under `AUTO` mode, for a syntactically valid S3 URL using the HTTP/HTTPS protocol, KvikIO now checks connectivity using a private S3 endpoint and, if that fails, proceeds to use a public S3 endpoint.
- Updates the comments on each endpoint to further improve clarity.
- Adjusts the Python APIs `kvikio.RemoteFile.open_*` from class method to static method (a breaking change).

Closes https://github.com/rapidsai/kvikio/issues/806
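A minimal usage sketch of the APIs above (the bucket, region, and object names are hypothetical):

```python
from kvikio.remote_file import RemoteFile, RemoteEndpointType

# Hypothetical, publicly readable object; no AWS credentials are required.
url = "https://my-public-bucket.s3.us-east-1.amazonaws.com/data.bin"

# Explicitly request the public S3 endpoint ...
f = RemoteFile.open_s3_public(url)

# ... or force it through the unified interface ...
g = RemoteFile.open(url, remote_endpoint_type=RemoteEndpointType.S3_PUBLIC)

# ... or rely on AUTO mode: KvikIO first tries the credential-based S3 endpoint
# and falls back to the public S3 endpoint if that connectivity check fails.
h = RemoteFile.open(url)
```

Authors:
  - Tianyu Liu (https://github.com/kingcrimsontianyu)

Approvers:
  - Mads R. B.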
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/820 --- cpp/include/kvikio/hdfs.hpp | 5 +- cpp/include/kvikio/remote_handle.hpp | 47 ++++++++++++++-- cpp/src/remote_handle.cpp | 62 ++++++++++++++++++++- cpp/tests/test_remote_handle.cpp | 44 +++++++++++---- python/kvikio/kvikio/_lib/remote_handle.pyx | 30 ++++++++-- python/kvikio/kvikio/remote_file.py | 56 ++++++++++++------- 6 files changed, 201 insertions(+), 43 deletions(-) diff --git a/cpp/include/kvikio/hdfs.hpp b/cpp/include/kvikio/hdfs.hpp index 345051bcbd..9d89d4d2e1 100644 --- a/cpp/include/kvikio/hdfs.hpp +++ b/cpp/include/kvikio/hdfs.hpp @@ -24,8 +24,9 @@ namespace kvikio { /** * @brief A remote endpoint for Apache Hadoop WebHDFS. * - * If KvikIO is run within a Docker, the argument `--network host` needs to be passed to the `docker - * run` command. + * This endpoint is for accessing HDFS files via the WebHDFS REST API over HTTP/HTTPS. If KvikIO is + * run within Docker, pass `--network host` to the `docker run` command to ensure proper name node + * connectivity. */ class WebHdfsEndpoint : public RemoteEndpoint { private: diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 1c6c887281..c48e84e8fb 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -46,6 +46,9 @@ enum class RemoteEndpointType : uint8_t { S3, ///< AWS S3 endpoint using credentials-based authentication. Requires AWS environment ///< variables (such as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION) to be ///< set. + S3_PUBLIC, ///< AWS S3 endpoint for publicly accessible objects. No credentials required as the + ///< objects have public read permissions enabled. Used for open datasets and public + ///< buckets. S3_PRESIGNED_URL, ///< AWS S3 endpoint using a presigned URL. No credentials required as ///< authentication is embedded in the URL with time-limited access. WEBHDFS, ///< Apache Hadoop WebHDFS (Web-based Hadoop Distributed File System) endpoint for @@ -108,7 +111,10 @@ class RemoteEndpoint { }; /** - * @brief A remote endpoint using http. + * @brief A remote endpoint for HTTP/HTTPS resources + * + * This endpoint is for accessing files via standard HTTP/HTTPS protocols without any specialized + * authentication. */ class HttpEndpoint : public RemoteEndpoint { private: @@ -138,7 +144,10 @@ class HttpEndpoint : public RemoteEndpoint { }; /** - * @brief A remote endpoint using AWS's S3 protocol. + * @brief A remote endpoint for AWS S3 storage requiring credentials + * + * This endpoint is for accessing private S3 objects using AWS credentials (access key, secret key, + * region and optional session token). */ class S3Endpoint : public RemoteEndpoint { private: @@ -256,8 +265,38 @@ class S3Endpoint : public RemoteEndpoint { }; /** - * @brief A remote endpoint using AWS's S3 protocol and expecting a presigned URL. File access via - * this type of URL is time-limited and does not require AWS credentials. + * @brief A remote endpoint for publicly accessible S3 objects without authentication + * + * This endpoint is for accessing S3 objects configured with public read permissions, + * requiring no authentication. Supports AWS S3 services with anonymous access enabled. 
+ */ +class S3PublicEndpoint : public RemoteEndpoint { + private: + std::string _url; + + public: + explicit S3PublicEndpoint(std::string url); + + ~S3PublicEndpoint() override = default; + void setopt(CurlHandle& curl) override; + std::string str() const override; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for S3 public endpoints. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; +}; + +/** + * @brief A remote endpoint for AWS S3 storage using presigned URLs. + * + * This endpoint is for accessing S3 objects via presigned URLs, which provide time-limited access + * without requiring AWS credentials on the client side. */ class S3EndpointWithPresignedUrl : public RemoteEndpoint { private: diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index a173804321..cee6bdb700 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -216,6 +216,7 @@ char const* get_remote_endpoint_type_name(RemoteEndpointType remote_endpoint_typ { switch (remote_endpoint_type) { case RemoteEndpointType::S3: return "S3"; + case RemoteEndpointType::S3_PUBLIC: return "S3 public"; case RemoteEndpointType::S3_PRESIGNED_URL: return "S3 with presigned URL"; case RemoteEndpointType::WEBHDFS: return "WebHDFS"; case RemoteEndpointType::HTTP: return "HTTP"; @@ -308,7 +309,9 @@ std::string S3Endpoint::url_from_bucket_and_object(std::string bucket_name, unwrap_or_default(std::move(aws_region), "AWS_DEFAULT_REGION", "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); - // We default to the official AWS url scheme. + // "s3" is a non-standard URI scheme used by AWS CLI and AWS SDK, and cannot be identified by + // libcurl. A valid HTTP/HTTPS URL needs to be constructed for use in libcurl. Here the AWS + // virtual host style is used. ss << "https://" << bucket_name << ".s3." 
<< region << ".amazonaws.com/" << object_name; } else { ss << endpoint_url << "/" << bucket_name << "/" << object_name; @@ -444,6 +447,34 @@ bool S3Endpoint::is_url_valid(std::string const& url) noexcept return false; } +S3PublicEndpoint::S3PublicEndpoint(std::string url) + : RemoteEndpoint{RemoteEndpointType::S3_PUBLIC}, _url{std::move(url)} +{ +} + +void S3PublicEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); } + +std::string S3PublicEndpoint::str() const { return _url; } + +std::size_t S3PublicEndpoint::get_file_size() +{ + KVIKIO_NVTX_FUNC_RANGE(); + return get_file_size_using_head_impl(*this, _url); +} + +void S3PublicEndpoint::setup_range_request(CurlHandle& curl, + std::size_t file_offset, + std::size_t size) +{ + KVIKIO_NVTX_FUNC_RANGE(); + setup_range_request_impl(curl, file_offset, size); +} + +bool S3PublicEndpoint::is_url_valid(std::string const& url) noexcept +{ + return S3Endpoint::is_url_valid(url); +} + S3EndpointWithPresignedUrl::S3EndpointWithPresignedUrl(std::string presigned_url) : RemoteEndpoint{RemoteEndpointType::S3_PRESIGNED_URL}, _url{std::move(presigned_url)} { @@ -558,6 +589,7 @@ RemoteHandle RemoteHandle::open(std::string url, { if (!allow_list.has_value()) { allow_list = {RemoteEndpointType::S3, + RemoteEndpointType::S3_PUBLIC, RemoteEndpointType::S3_PRESIGNED_URL, RemoteEndpointType::WEBHDFS, RemoteEndpointType::HTTP}; @@ -579,6 +611,10 @@ RemoteHandle RemoteHandle::open(std::string url, } return std::make_unique(url); + case RemoteEndpointType::S3_PUBLIC: + if (!S3PublicEndpoint::is_url_valid(url)) { return nullptr; } + return std::make_unique(url); + case RemoteEndpointType::S3_PRESIGNED_URL: if (!S3EndpointWithPresignedUrl::is_url_valid(url)) { return nullptr; } return std::make_unique(url); @@ -601,7 +637,29 @@ RemoteHandle RemoteHandle::open(std::string url, // Try each allowed type in the order of allowlist for (auto const& type : allow_list.value()) { endpoint = create_endpoint(type); - if (endpoint) { break; } + if (endpoint == nullptr) { continue; } + + // If the credential-based S3 endpoint cannot be used to access the URL, try using S3 public + // endpoint instead if it is in the allowlist + if (endpoint->remote_endpoint_type() == RemoteEndpointType::S3) { + try { + // Check connectivity for the credential-based S3 endpoint, and throw an exception if + // failed + endpoint->get_file_size(); + } catch (...) 
{ + auto it = + std::find(allow_list->begin(), allow_list->end(), RemoteEndpointType::S3_PUBLIC); + if (it != allow_list->end()) { + // If S3 public endpoint is in the allowlist, use it and end the search + endpoint = std::make_unique(url); + } else { + continue; + } + } + } + + // At this point, a matching endpoint has been found + break; } KVIKIO_EXPECT(endpoint.get() != nullptr, "Unsupported endpoint URL.", std::runtime_error); } else { diff --git a/cpp/tests/test_remote_handle.cpp b/cpp/tests/test_remote_handle.cpp index a7e3d1e829..f1e25ea34d 100644 --- a/cpp/tests/test_remote_handle.cpp +++ b/cpp/tests/test_remote_handle.cpp @@ -36,18 +36,21 @@ class RemoteHandleTest : public testing::Test { { _sample_urls = { // Endpoint type: S3 - {"s3://bucket-name/object-key-name", kvikio::RemoteEndpointType::S3}, - {"s3://bucket-name/object-key-name-dir/object-key-name-file", kvikio::RemoteEndpointType::S3}, + {"s3://bucket-name/object-key-name", kvikio::RemoteEndpointType::S3_PUBLIC}, + {"s3://bucket-name/object-key-name-dir/object-key-name-file", + kvikio::RemoteEndpointType::S3_PUBLIC}, {"https://bucket-name.s3.region-code.amazonaws.com/object-key-name", - kvikio::RemoteEndpointType::S3}, + kvikio::RemoteEndpointType::S3_PUBLIC}, {"https://s3.region-code.amazonaws.com/bucket-name/object-key-name", - kvikio::RemoteEndpointType::S3}, - {"https://bucket-name.s3.amazonaws.com/object-key-name", kvikio::RemoteEndpointType::S3}, - {"https://s3.amazonaws.com/bucket-name/object-key-name", kvikio::RemoteEndpointType::S3}, + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://bucket-name.s3.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://s3.amazonaws.com/bucket-name/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, {"https://bucket-name.s3-region-code.amazonaws.com/object-key-name", - kvikio::RemoteEndpointType::S3}, + kvikio::RemoteEndpointType::S3_PUBLIC}, {"https://s3-region-code.amazonaws.com/bucket-name/object-key-name", - kvikio::RemoteEndpointType::S3}, + kvikio::RemoteEndpointType::S3_PUBLIC}, // Endpoint type: S3 presigned URL {"https://bucket-name.s3.region-code.amazonaws.com/" @@ -74,9 +77,13 @@ class RemoteHandleTest : public testing::Test { // Test unified interface { - // Here we pass the 1-byte argument to RemoteHandle::open. This prevents the endpoint - // constructor from querying the file size and sending requests to the server, thus - // allowing us to use dummy URLs for testing purpose. + // Here we pass the 1-byte argument to RemoteHandle::open. For all endpoints except + // kvikio::RemoteEndpointType::S3, this prevents the endpoint constructor from querying + // the file size and sending requests to the server, thus allowing us to use dummy URLs + // for testing purpose. + // For kvikio::RemoteEndpointType::S3, RemoteHandle::open sends HEAD request as a + // connectivity check and will fail on the syntactically valid dummy URL. The + // kvikio::RemoteEndpointType::S3_PUBLIC will then be used as the endpoint. 
auto remote_handle = kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); EXPECT_EQ(remote_handle.remote_endpoint_type(), expected_endpoint_type); @@ -150,7 +157,7 @@ TEST_F(RemoteHandleTest, test_s3_url) {"AWS_SECRET_ACCESS_KEY", "my_aws_secrete_access_key"}}; { - test_helper(kvikio::RemoteEndpointType::S3, kvikio::S3Endpoint::is_url_valid); + test_helper(kvikio::RemoteEndpointType::S3_PUBLIC, kvikio::S3Endpoint::is_url_valid); } // Invalid URLs @@ -168,6 +175,19 @@ TEST_F(RemoteHandleTest, test_s3_url) EXPECT_FALSE(kvikio::S3Endpoint::is_url_valid(invalid_url)); } } + + // S3_PUBLIC is not in the allowlist. So when the connectivity check fails on the dummy URL, + // KvikIO cannot fall back to S3_PUBLIC. + { + EXPECT_ANY_THROW({ + kvikio::RemoteHandle::open( + "s3://bucket-name/object-key-name", + kvikio::RemoteEndpointType::AUTO, + std::vector{kvikio::RemoteEndpointType::S3, + kvikio::RemoteEndpointType::HTTP}, + 1); + }); + } } TEST_F(RemoteHandleTest, test_s3_url_with_presigned_url) diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index edfd59e78d..8fae78c534 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -23,9 +23,10 @@ cdef extern from "" namespace "kvikio" nogil: cpdef enum class RemoteEndpointType(uint8_t): AUTO = 0 S3 = 1 - S3_PRESIGNED_URL = 2 - WEBHDFS = 3 - HTTP = 4 + S3_PUBLIC = 2 + S3_PRESIGNED_URL = 3 + WEBHDFS = 4 + HTTP = 5 cdef cppclass cpp_RemoteEndpoint "kvikio::RemoteEndpoint": string str() except + @@ -39,6 +40,9 @@ cdef extern from "" namespace "kvikio" nogil: pair[string, string] cpp_parse_s3_url \ "kvikio::S3Endpoint::parse_s3_url"(string url) except + + cdef cppclass cpp_S3PublicEndpoint "kvikio::S3PublicEndpoint" (cpp_RemoteEndpoint): + cpp_S3PublicEndpoint(string url) except + + cdef cppclass cpp_S3EndpointWithPresignedUrl "kvikio::S3EndpointWithPresignedUrl" \ (cpp_RemoteEndpoint): cpp_S3EndpointWithPresignedUrl(string presigned_url) except + @@ -219,7 +223,25 @@ cdef class RemoteFile: ) @staticmethod - def open_s3_from_http_presigned_url( + def open_s3_public( + url: str, + nbytes: Optional[int], + ): + cdef string cpp_url = _to_string(url) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3PublicEndpoint](cpp_url) + ) + + return RemoteFile._from_endpoint( + move(cpp_endpoint), + nbytes + ) + + @staticmethod + def open_s3_presigned_url( presigned_url: str, nbytes: Optional[int], ): diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index a3f73d271a..2064320914 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -28,6 +28,10 @@ class RemoteEndpointType(enum.Enum): AWS S3 endpoint using credentials-based authentication. Requires AWS environment variables (such as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION) to be set. + S3_PUBLIC : INT + AWS S3 endpoint for publicly accessible objects. No credentials required as the + objects have public read permissions enabled. Used for open datasets and public + buckets. S3_PRESIGNED_URL : int AWS S3 endpoint using a presigned URL. No credentials required as authentication is embedded in the URL with time-limited access. 
@@ -46,9 +50,10 @@ class RemoteEndpointType(enum.Enum): AUTO = 0 S3 = 1 - S3_PRESIGNED_URL = 2 - WEBHDFS = 3 - HTTP = 4 + S3_PUBLIC = 2 + S3_PRESIGNED_URL = 3 + WEBHDFS = 4 + HTTP = 5 @staticmethod def _map_to_internal(remote_endpoint_type: RemoteEndpointType): @@ -102,7 +107,7 @@ def open_http( url: str, nbytes: Optional[int] = None, ) -> RemoteFile: - """Open a http file. + """Open a HTTP/HTTPS file. Parameters ---------- @@ -112,7 +117,7 @@ def open_http( The size of the file. If None, KvikIO will ask the server for the file size. """ - return RemoteFile(_get_remote_module().RemoteFile.open_http(url, nbytes)) + return cls(_get_remote_module().RemoteFile.open_http(url, nbytes)) @classmethod def open_s3( @@ -142,7 +147,7 @@ def open_s3( The size of the file. If None, KvikIO will ask the server for the file size. """ - return RemoteFile( + return cls( _get_remote_module().RemoteFile.open_s3(bucket_name, object_name, nbytes) ) @@ -178,15 +183,27 @@ def open_s3_url( """ parsed_result = urllib.parse.urlparse(url.lower()) if parsed_result.scheme in ("http", "https"): - return RemoteFile( + return cls( _get_remote_module().RemoteFile.open_s3_from_http_url(url, nbytes) ) if parsed_result.scheme == "s3": - return RemoteFile( - _get_remote_module().RemoteFile.open_s3_from_s3_url(url, nbytes) - ) + return cls(_get_remote_module().RemoteFile.open_s3_from_s3_url(url, nbytes)) raise ValueError(f"Unsupported protocol: {url}") + @classmethod + def open_s3_public(cls, url: str, nbytes: Optional[int] = None) -> RemoteFile: + """Open a publicly accessible AWS S3 file. + + Parameters + ---------- + url + URL to the remote file. + nbytes + The size of the file. If None, KvikIO will ask the server + for the file size. + """ + return cls(_get_remote_module().RemoteFile.open_s3_public(url, nbytes)) + @classmethod def open_s3_presigned_url( cls, @@ -203,10 +220,8 @@ def open_s3_presigned_url( The size of the file. If None, KvikIO will ask the server for the file size. """ - return RemoteFile( - _get_remote_module().RemoteFile.open_s3_from_http_presigned_url( - presigned_url, nbytes - ) + return cls( + _get_remote_module().RemoteFile.open_s3_presigned_url(presigned_url, nbytes) ) @classmethod @@ -228,7 +243,7 @@ def open_webhdfs( The size of the file. If None, KvikIO will ask the server for the file size. """ - return RemoteFile(_get_remote_module().RemoteFile.open_webhdfs(url, nbytes)) + return cls(_get_remote_module().RemoteFile.open_webhdfs(url, nbytes)) @classmethod def open( @@ -242,9 +257,9 @@ def open( Create a remote file handle from a URL. This function creates a RemoteFile for reading data from various remote - endpoints including HTTP/HTTPS servers, AWS S3 buckets, S3 presigned URLs, - and WebHDFS. The endpoint type can be automatically detected from the URL - or explicitly specified. + endpoints including HTTP/HTTPS servers, AWS S3 buckets, S3 for public access, + S3 presigned URLs, and WebHDFS. The endpoint type can be automatically detected + from the URL or explicitly specified. Parameters ---------- @@ -252,6 +267,7 @@ def open( The URL of the remote file. Supported formats include: - S3 with credentials + - S3 for public access - S3 presigned URL - WebHDFS - HTTP/HTTPS @@ -259,6 +275,7 @@ def open( The type of remote endpoint. Default is :class:`RemoteEndpointType.AUTO` which automatically detects the endpoint type from the URL. 
Can be explicitly set to :class:`RemoteEndpointType.S3`, + :class:`RemoteEndpointType.S3_PUBLIC`, :class:`RemoteEndpointType.S3_PRESIGNED_URL`, :class:`RemoteEndpointType.WEBHDFS`, or :class:`RemoteEndpointType.HTTP` to force a specific endpoint type. @@ -272,6 +289,7 @@ def open( If not provided, defaults to all supported types in this order: :class:`RemoteEndpointType.S3`, + :class:`RemoteEndpointType.S3_PUBLIC`, :class:`RemoteEndpointType.S3_PRESIGNED_URL`, :class:`RemoteEndpointType.WEBHDFS`, and :class:`RemoteEndpointType.HTTP`. nbytes : int, optional @@ -337,7 +355,7 @@ def open( nbytes=1024 * 1024 * 100 # 100 MB ) """ - return RemoteFile( + return cls( _get_remote_module().RemoteFile.open( url, RemoteEndpointType._map_to_internal(remote_endpoint_type), From 4dea5c4c41c1ade2b8bfccc22c8afa276ca45d19 Mon Sep 17 00:00:00 2001 From: Jake Awe Date: Tue, 16 Sep 2025 13:37:05 -0500 Subject: [PATCH 35/40] configuring repo for automatic release notes generation --- .github/release.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/release.yml diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000000..2c9a85805c --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,27 @@ +# GitHub Auto-Generated Release Notes Configuration for RAPIDS +# This file configures how GitHub automatically generates release notes + +changelog: + exclude: + labels: + - ignore-for-release + - dependencies + authors: + - rapids-bot[bot] + - dependabot[bot] + categories: + - title: 🚨 Breaking Changes + labels: + - breaking + - title: 🐛 Bug Fixes + labels: + - bug + - title: 📖 Documentation + labels: + - doc + - title: 🚀 New Features + labels: + - feature request + - title: 🛠️ Improvements + labels: + - improvement From b5b42326a69cd4f754a019a687fdecfa030d641b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 17 Sep 2025 01:05:42 -0500 Subject: [PATCH 36/40] Fix CUDA 13 handling of libcufile on aarch64 (#827) Fixes an issue where CUDA 13 packages named like `linux-aarch64/libkvikio-25.10.00a43-cuda13_0_250916_b69d9aea.conda` were getting dependencies on `cuda-version >=12.2.0a0,<14.0a0`, which allowed them to be used in CUDA 12 environments. That is not desired and could cause problems. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/kvikio/pull/827 --- conda/recipes/libkvikio/recipe.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index f4164c9611..3b3a0bc1bf 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -93,7 +93,7 @@ outputs: - cuda-version =${{ cuda_version }} - libcurl ==${{ libcurl_version }} run: - - if: x86_64 + - if: x86_64 or (aarch64 and cuda_version >= "13.0") then: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} else: From 987683af02da8847225359786201ef1038ab4394 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 18 Sep 2025 15:01:43 -0700 Subject: [PATCH 37/40] Keep string alive until nvtxDomainResourceCreate (#832) Resolves #830 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Tianyu Liu (https://github.com/kingcrimsontianyu) URL: https://github.com/rapidsai/kvikio/pull/832 --- cpp/src/nvtx.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/nvtx.cpp b/cpp/src/nvtx.cpp index b9d7c3e146..0467e46e38 100644 --- a/cpp/src/nvtx.cpp +++ b/cpp/src/nvtx.cpp @@ -85,7 +85,8 @@ void NvtxManager::rename_current_thread(std::string_view new_name) noexcept attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE; attribs.identifier.ullValue = tid; attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; - attribs.message.ascii = ss.str().c_str(); + auto st = ss.str(); + attribs.message.ascii = st.c_str(); nvtxResourceHandle_t handle = nvtxDomainResourceCreate(nvtx3::domain::get(), &attribs); #endif From d4b7773e0cc4f164b1a69df62f00ddd179226c6c Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 22 Sep 2025 16:43:20 -0400 Subject: [PATCH 38/40] Fix a bug in public S3 inference (#831) `kvikio::RemoteHandle::open()` has started to support public S3 since https://github.com/rapidsai/kvikio/pull/820. When `open()` sees an S3 URL, it first assumes a private S3 object and queries its size. If the query fails, it proceeds to assume that the file is a public S3. During the construction of a private S3 object, the constructor scans the environment variables for AWS credentials. Manual testing of https://github.com/rapidsai/kvikio/pull/820 accidentally includes the env vars all the time and hides a bug: in absence of env vars, the constructor of the private S3 object will throw an exception, which is unhandled, and KvikIO never gets a chance to try with public S3. This PR fixes this bug. Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/kvikio/pull/831 --- cpp/src/remote_handle.cpp | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index cee6bdb700..33b481a74a 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -636,25 +636,23 @@ RemoteHandle RemoteHandle::open(std::string url, if (remote_endpoint_type == RemoteEndpointType::AUTO) { // Try each allowed type in the order of allowlist for (auto const& type : allow_list.value()) { - endpoint = create_endpoint(type); - if (endpoint == nullptr) { continue; } - - // If the credential-based S3 endpoint cannot be used to access the URL, try using S3 public - // endpoint instead if it is in the allowlist - if (endpoint->remote_endpoint_type() == RemoteEndpointType::S3) { - try { + try { + endpoint = create_endpoint(type); + if (endpoint == nullptr) { continue; } + if (type == RemoteEndpointType::S3) { // Check connectivity for the credential-based S3 endpoint, and throw an exception if // failed endpoint->get_file_size(); - } catch (...) { - auto it = - std::find(allow_list->begin(), allow_list->end(), RemoteEndpointType::S3_PUBLIC); - if (it != allow_list->end()) { - // If S3 public endpoint is in the allowlist, use it and end the search - endpoint = std::make_unique(url); - } else { - continue; - } + } + } catch (...) { + // If the credential-based S3 endpoint cannot be used to access the URL, try using S3 public + // endpoint instead if it is in the allowlist + if (type == RemoteEndpointType::S3 && + std::find(allow_list->begin(), allow_list->end(), RemoteEndpointType::S3_PUBLIC) != + allow_list->end()) { + endpoint = std::make_unique(url); + } else { + throw; } } From 1b70488a0ee6ed7590ca16618e8ee5d8e6605853 Mon Sep 17 00:00:00 2001 From: Mike Sarahan Date: Tue, 23 Sep 2025 16:24:14 -0500 Subject: [PATCH 39/40] Empty commit to trigger a build (#840) This is an empty commit to trigger a build. It is used when builds get stuck with an old ABI. Rebuilding updates them to the new one. 
From 9ac0c317a352315bc82d925e09a6c82684ce3695 Mon Sep 17 00:00:00 2001 From: Jake Awe Date: Wed, 8 Oct 2025 15:24:37 +0000 Subject: [PATCH 40/40] Update Changelog [skip ci] --- CHANGELOG.md | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a3aeaa2633..1e6789b685 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,59 @@ +# kvikio 25.10.00 (8 Oct 2025) + +## 🚨 Breaking Changes + +- Support access to public S3 ([#820](https://github.com/rapidsai/kvikio/pull/820)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Reduce duplication between compat manager and defaults ([#816](https://github.com/rapidsai/kvikio/pull/816)) [@vyasr](https://github.com/vyasr) +- Devendor libnvcomp from libkvikio ([#805](https://github.com/rapidsai/kvikio/pull/805)) [@bdice](https://github.com/bdice) +- Remove Python nvCOMP bindings and Zarr 2 support ([#798](https://github.com/rapidsai/kvikio/pull/798)) [@vuule](https://github.com/vuule) + +## 🐛 Bug Fixes + +- Keep string alive until nvtxDomainResourceCreate ([#832](https://github.com/rapidsai/kvikio/pull/832)) [@vyasr](https://github.com/vyasr) +- Fix a bug in public S3 inference ([#831](https://github.com/rapidsai/kvikio/pull/831)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Fix CUDA 13 handling of libcufile on aarch64 ([#827](https://github.com/rapidsai/kvikio/pull/827)) [@bdice](https://github.com/bdice) +- Skip max_device_cache_size setter when BAR1 memory isn't present on the GPUs in the system ([#814](https://github.com/rapidsai/kvikio/pull/814)) [@ahoyle-nvidia](https://github.com/ahoyle-nvidia) +- Fix an S3 parsing bug in the open function. Improve regex usage ([#810](https://github.com/rapidsai/kvikio/pull/810)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Remove remaining nvcomp references ([#801](https://github.com/rapidsai/kvikio/pull/801)) [@vyasr](https://github.com/vyasr) +- Revert "Set compiler versions in context ([#755)" (#784](https://github.com/rapidsai/kvikio/pull/755)" (#784)) [@vyasr](https://github.com/vyasr) +- Relax mmap read requirement. Improve error message. 
([#781](https://github.com/rapidsai/kvikio/pull/781)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +## 🚀 New Features + +- Support access to public S3 ([#820](https://github.com/rapidsai/kvikio/pull/820)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Upgrade to nvCOMP 5.0.0.6 ([#800](https://github.com/rapidsai/kvikio/pull/800)) [@vuule](https://github.com/vuule) +- Remove Python nvCOMP bindings and Zarr 2 support ([#798](https://github.com/rapidsai/kvikio/pull/798)) [@vuule](https://github.com/vuule) +- Support WebHDFS (2/2): Python binding ([#791](https://github.com/rapidsai/kvikio/pull/791)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support file size query for S3 presigned URL ([#789](https://github.com/rapidsai/kvikio/pull/789)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support WebHDFS (1/2): C++ implementation ([#788](https://github.com/rapidsai/kvikio/pull/788)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support file-backed mapping (2/n): host/device read Python binding ([#742](https://github.com/rapidsai/kvikio/pull/742)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +## 🛠️ Improvements + +- Empty commit to trigger a build ([#840](https://github.com/rapidsai/kvikio/pull/840)) [@msarahan](https://github.com/msarahan) +- Configure repo for automatic release notes generation ([#825](https://github.com/rapidsai/kvikio/pull/825)) [@AyodeAwe](https://github.com/AyodeAwe) +- Use C++20 for KvikIO main library ([#819](https://github.com/rapidsai/kvikio/pull/819)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Explicitly manage TLS/SSL CA paths for remote I/O ([#817](https://github.com/rapidsai/kvikio/pull/817)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Reduce duplication between compat manager and defaults ([#816](https://github.com/rapidsai/kvikio/pull/816)) [@vyasr](https://github.com/vyasr) +- Added KVIKIO_REMOTE_VERBOSE option ([#815](https://github.com/rapidsai/kvikio/pull/815)) [@TomAugspurger](https://github.com/TomAugspurger) +- Use branch-25.10 again ([#812](https://github.com/rapidsai/kvikio/pull/812)) [@jameslamb](https://github.com/jameslamb) +- Update rapids-dependency-file-generator ([#809](https://github.com/rapidsai/kvikio/pull/809)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add a unified remote I/O interface that infers the endpoint type from URL (2/2): Python binding ([#808](https://github.com/rapidsai/kvikio/pull/808)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Devendor libnvcomp from libkvikio ([#805](https://github.com/rapidsai/kvikio/pull/805)) [@bdice](https://github.com/bdice) +- Remove more nvcomp packaging for conda ([#804](https://github.com/rapidsai/kvikio/pull/804)) [@bdice](https://github.com/bdice) +- Build and test with CUDA 13.0.0 ([#803](https://github.com/rapidsai/kvikio/pull/803)) [@jameslamb](https://github.com/jameslamb) +- Optionally require zarr>=3.0.0 ([#802](https://github.com/rapidsai/kvikio/pull/802)) [@TomAugspurger](https://github.com/TomAugspurger) +- Use build cluster in devcontainers ([#797](https://github.com/rapidsai/kvikio/pull/797)) [@trxcllnt](https://github.com/trxcllnt) +- Improve KvikIO Python binding performance by releasing GIL wherever deemed necessary ([#796](https://github.com/rapidsai/kvikio/pull/796)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Update rapids_config to handle user defined branch name 
([#794](https://github.com/rapidsai/kvikio/pull/794)) [@robertmaynard](https://github.com/robertmaynard) +- Add a unified remote I/O interface that infers the endpoint type from URL (1/2): C++ implementation ([#793](https://github.com/rapidsai/kvikio/pull/793)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Update rapids-build-backend to 0.4.0 ([#790](https://github.com/rapidsai/kvikio/pull/790)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Fix C++20 warning in the mmap test ([#785](https://github.com/rapidsai/kvikio/pull/785)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Allow latest OS in devcontainers ([#780](https://github.com/rapidsai/kvikio/pull/780)) [@bdice](https://github.com/bdice) +- Update build infra to support new branching strategy ([#776](https://github.com/rapidsai/kvikio/pull/776)) [@robertmaynard](https://github.com/robertmaynard) +- Use GCC 14 in conda builds. ([#756](https://github.com/rapidsai/kvikio/pull/756)) [@vyasr](https://github.com/vyasr) +- Use C++20 standard ([#749](https://github.com/rapidsai/kvikio/pull/749)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + # kvikio 25.08.00 (6 Aug 2025) ## 🚨 Breaking Changes