diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 5d1d536704..4775d28f23 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -13,6 +13,8 @@ ENV DEFAULT_CONDA_ENV=rapids FROM ${PYTHON_PACKAGE_MANAGER}-base +ARG TARGETARCH + ARG CUDA ENV CUDAARCHS="RAPIDS" ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}" @@ -24,7 +26,35 @@ ENV PYTHONSAFEPATH="1" ENV PYTHONUNBUFFERED="1" ENV PYTHONDONTWRITEBYTECODE="1" +ENV HISTFILE="/home/coder/.cache/._bash_history" + +### +# sccache configuration +### +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" -ENV HISTFILE="/home/coder/.cache/._bash_history" +# 2hr (1 minute longer than sccache-dist request timeout) +ENV SCCACHE_IDLE_TIMEOUT=7200 + +### +# sccache-dist configuration +### +# Enable sccache-dist by default +ENV DEVCONTAINER_UTILS_ENABLE_SCCACHE_DIST=1 +# Compile locally if max retries exceeded +ENV SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=true +# Retry transient errors 4 times (for a total of 5 attempts) +ENV SCCACHE_DIST_MAX_RETRIES=4 +ENV SCCACHE_DIST_CONNECT_TIMEOUT=30 +ENV SCCACHE_DIST_CONNECTION_POOL=false +# 1hr 59min (to accommodate debug builds) +ENV SCCACHE_DIST_REQUEST_TIMEOUT=7140 +ENV SCCACHE_DIST_KEEPALIVE_ENABLED=true +ENV SCCACHE_DIST_KEEPALIVE_INTERVAL=20 +ENV SCCACHE_DIST_KEEPALIVE_TIMEOUT=600 +ENV SCCACHE_DIST_URL="https://${TARGETARCH}.linux.sccache.rapids.nvidia.com" + +# Build as much in parallel as possible +ENV INFER_NUM_DEVICE_ARCHITECTURES=1 +ENV MAX_DEVICE_OBJ_TO_COMPILE_IN_PARALLEL=20 diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 7e000efb91..175af7a73c 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -5,17 +5,19 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:25.08-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.9-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index 087acd2900..ff13ba7e3c 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -5,17 +5,19 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:25.08-cpp-cuda12.9-ubuntu22.04" + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda12.9" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.08-cuda12.9-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda12.9-pip", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.8": {} + 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda13.0-conda/devcontainer.json b/.devcontainer/cuda13.0-conda/devcontainer.json new file mode 100644 index 0000000000..21e7bf83c4 --- /dev/null +++ b/.devcontainer/cuda13.0-conda/devcontainer.json @@ -0,0 +1,44 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "13.0", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:25.10-cpp-mambaforge" + } + }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda13.0-conda", + "--ulimit", + "nofile=500000" + ], + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda13.0-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda13.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda13.0-pip/devcontainer.json b/.devcontainer/cuda13.0-pip/devcontainer.json new file mode 100644 index 0000000000..3c035eee05 --- /dev/null +++ b/.devcontainer/cuda13.0-pip/devcontainer.json @@ -0,0 +1,43 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "13.0", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:25.10-cpp-cuda13.0" + } + }, + "runArgs": [ + "--rm", + "--name", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.10-cuda13.0-pip", + "--ulimit", + "nofile=500000" + ], + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.10": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda13.0-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/kvikio,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda13.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000000..2c9a85805c --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,27 @@ +# GitHub Auto-Generated Release Notes Configuration for RAPIDS +# This file configures how GitHub automatically generates release notes + +changelog: + exclude: + labels: + - ignore-for-release + - dependencies + authors: + - rapids-bot[bot] + - dependabot[bot] + categories: + - title: 🚨 Breaking Changes + labels: + - breaking + - title: 🐛 Bug Fixes + labels: + - bug + - title: 📖 Documentation + labels: + - doc + - title: 🚀 New Features + labels: + - feature request + - title: 🛠️ Improvements + labels: + - improvement diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 26013cb954..0070274f37 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,7 +34,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -44,7 +44,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -54,7 +54,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -76,7 +76,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -89,7 +89,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,7 +101,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -112,7 +112,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 6decb7f4ef..f610dce204 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -27,7 +27,7 @@ jobs: - wheel-python-tests - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.10 if: always() with: needs: ${{ toJSON(needs) }} @@ -43,7 +43,7 @@ jobs: repo: kvikio changed-files: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.10 with: files_yaml: | test_cpp: @@ -86,20 +86,20 @@ jobs: checks: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.10 with: ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_cpp.sh conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -107,7 +107,7 @@ jobs: conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -117,14 +117,14 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_python.sh conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request @@ -132,7 +132,7 
@@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: pull-request node_type: "gpu-l4-latest-1" @@ -142,18 +142,25 @@ jobs: devcontainer: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.10 with: - arch: '["amd64"]' - cuda: '["12.9"]' + arch: '["amd64", "arm64"]' + cuda: '["13.0"]' + node_type: "cpu8" + rapids-aux-secret-1: GIST_REPO_READ_ORG_GITHUB_TOKEN + env: | + SCCACHE_DIST_MAX_RETRIES=inf + SCCACHE_SERVER_LOG=sccache=debug + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false + SCCACHE_DIST_AUTH_TOKEN_VAR=RAPIDS_AUX_SECRET_1 build_command: | - sccache -z; - build-all --verbose; + sccache --zero-stats; + build-all --verbose 2>&1 | tee telemetry-artifacts/build.log; python -c "import kvikio; print(kvikio.__version__)"; - sccache -s; + sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; wheel-cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -163,7 +170,7 @@ jobs: wheel-python-build: needs: wheel-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.10 with: build_type: pull-request script: ci/build_wheel_python.sh @@ -172,7 +179,7 @@ jobs: wheel-python-tests: needs: [wheel-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.10 if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2164e3ced8..5f618498b3 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.10 with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 593fcb1086..48bf37afc4 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml 
+++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.08 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.10 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f095cc2807..17ccb948fc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -95,10 +95,10 @@ repos: - id: verify-codeowners args: [--fix, --project-prefix=kvikio] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.19.0 + rev: v1.20.0 hooks: - id: rapids-dependency-file-generator - args: ["--clean"] + args: ["--clean", "--warn-all", "--strict"] - repo: https://github.com/shellcheck-py/shellcheck-py rev: v0.10.0.1 hooks: diff --git a/CHANGELOG.md b/CHANGELOG.md index a3aeaa2633..1e6789b685 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,59 @@ +# kvikio 25.10.00 (8 Oct 2025) + +## 🚨 Breaking Changes + +- Support access to public S3 ([#820](https://github.com/rapidsai/kvikio/pull/820)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Reduce duplication between compat manager and defaults ([#816](https://github.com/rapidsai/kvikio/pull/816)) [@vyasr](https://github.com/vyasr) +- Devendor libnvcomp from libkvikio ([#805](https://github.com/rapidsai/kvikio/pull/805)) [@bdice](https://github.com/bdice) +- Remove Python nvCOMP bindings and Zarr 2 support ([#798](https://github.com/rapidsai/kvikio/pull/798)) [@vuule](https://github.com/vuule) + +## 🐛 Bug Fixes + +- Keep string alive until nvtxDomainResourceCreate ([#832](https://github.com/rapidsai/kvikio/pull/832)) [@vyasr](https://github.com/vyasr) +- Fix a bug in public S3 inference ([#831](https://github.com/rapidsai/kvikio/pull/831)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Fix CUDA 13 handling of libcufile on aarch64 ([#827](https://github.com/rapidsai/kvikio/pull/827)) [@bdice](https://github.com/bdice) +- Skip max_device_cache_size setter when BAR1 memory isn't present on the GPUs in the system ([#814](https://github.com/rapidsai/kvikio/pull/814)) [@ahoyle-nvidia](https://github.com/ahoyle-nvidia) +- Fix an S3 parsing bug in the open function. Improve regex usage ([#810](https://github.com/rapidsai/kvikio/pull/810)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Remove remaining nvcomp references ([#801](https://github.com/rapidsai/kvikio/pull/801)) [@vyasr](https://github.com/vyasr) +- Revert "Set compiler versions in context ([#755)" (#784](https://github.com/rapidsai/kvikio/pull/755)" (#784)) [@vyasr](https://github.com/vyasr) +- Relax mmap read requirement. Improve error message. 
([#781](https://github.com/rapidsai/kvikio/pull/781)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +## 🚀 New Features + +- Support access to public S3 ([#820](https://github.com/rapidsai/kvikio/pull/820)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Upgrade to nvCOMP 5.0.0.6 ([#800](https://github.com/rapidsai/kvikio/pull/800)) [@vuule](https://github.com/vuule) +- Remove Python nvCOMP bindings and Zarr 2 support ([#798](https://github.com/rapidsai/kvikio/pull/798)) [@vuule](https://github.com/vuule) +- Support WebHDFS (2/2): Python binding ([#791](https://github.com/rapidsai/kvikio/pull/791)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support file size query for S3 presigned URL ([#789](https://github.com/rapidsai/kvikio/pull/789)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support WebHDFS (1/2): C++ implementation ([#788](https://github.com/rapidsai/kvikio/pull/788)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Support file-backed mapping (2/n): host/device read Python binding ([#742](https://github.com/rapidsai/kvikio/pull/742)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + +## 🛠️ Improvements + +- Empty commit to trigger a build ([#840](https://github.com/rapidsai/kvikio/pull/840)) [@msarahan](https://github.com/msarahan) +- Configure repo for automatic release notes generation ([#825](https://github.com/rapidsai/kvikio/pull/825)) [@AyodeAwe](https://github.com/AyodeAwe) +- Use C++20 for KvikIO main library ([#819](https://github.com/rapidsai/kvikio/pull/819)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Explicitly manage TLS/SSL CA paths for remote I/O ([#817](https://github.com/rapidsai/kvikio/pull/817)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Reduce duplication between compat manager and defaults ([#816](https://github.com/rapidsai/kvikio/pull/816)) [@vyasr](https://github.com/vyasr) +- Added KVIKIO_REMOTE_VERBOSE option ([#815](https://github.com/rapidsai/kvikio/pull/815)) [@TomAugspurger](https://github.com/TomAugspurger) +- Use branch-25.10 again ([#812](https://github.com/rapidsai/kvikio/pull/812)) [@jameslamb](https://github.com/jameslamb) +- Update rapids-dependency-file-generator ([#809](https://github.com/rapidsai/kvikio/pull/809)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Add a unified remote I/O interface that infers the endpoint type from URL (2/2): Python binding ([#808](https://github.com/rapidsai/kvikio/pull/808)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Devendor libnvcomp from libkvikio ([#805](https://github.com/rapidsai/kvikio/pull/805)) [@bdice](https://github.com/bdice) +- Remove more nvcomp packaging for conda ([#804](https://github.com/rapidsai/kvikio/pull/804)) [@bdice](https://github.com/bdice) +- Build and test with CUDA 13.0.0 ([#803](https://github.com/rapidsai/kvikio/pull/803)) [@jameslamb](https://github.com/jameslamb) +- Optionally require zarr>=3.0.0 ([#802](https://github.com/rapidsai/kvikio/pull/802)) [@TomAugspurger](https://github.com/TomAugspurger) +- Use build cluster in devcontainers ([#797](https://github.com/rapidsai/kvikio/pull/797)) [@trxcllnt](https://github.com/trxcllnt) +- Improve KvikIO Python binding performance by releasing GIL wherever deemed necessary ([#796](https://github.com/rapidsai/kvikio/pull/796)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Update rapids_config to handle user defined branch name 
([#794](https://github.com/rapidsai/kvikio/pull/794)) [@robertmaynard](https://github.com/robertmaynard) +- Add a unified remote I/O interface that infers the endpoint type from URL (1/2): C++ implementation ([#793](https://github.com/rapidsai/kvikio/pull/793)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Update rapids-build-backend to 0.4.0 ([#790](https://github.com/rapidsai/kvikio/pull/790)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Fix C++20 warning in the mmap test ([#785](https://github.com/rapidsai/kvikio/pull/785)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) +- Allow latest OS in devcontainers ([#780](https://github.com/rapidsai/kvikio/pull/780)) [@bdice](https://github.com/bdice) +- Update build infra to support new branching strategy ([#776](https://github.com/rapidsai/kvikio/pull/776)) [@robertmaynard](https://github.com/robertmaynard) +- Use GCC 14 in conda builds. ([#756](https://github.com/rapidsai/kvikio/pull/756)) [@vyasr](https://github.com/vyasr) +- Use C++20 standard ([#749](https://github.com/rapidsai/kvikio/pull/749)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu) + # kvikio 25.08.00 (6 Aug 2025) ## 🚨 Breaking Changes diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH new file mode 100644 index 0000000000..9b1c52d941 --- /dev/null +++ b/RAPIDS_BRANCH @@ -0,0 +1 @@ +branch-25.10 diff --git a/README.md b/README.md index ac2ee2a4bd..549d8bcfc4 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ KvikIO also works efficiently when GDS isn't available and can read/write both h * Concurrent reads and writes using an internal thread pool. * Non-blocking API. * Transparently handles reads and writes to/from memory on both host and device. -* (Deprecated) Provides Python bindings to [nvCOMP](https://docs.nvidia.com/cuda/nvcomp/py_api.html). 
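The feature bullets above map onto the `kvikio::FileHandle` API. A minimal sketch of that usage, modeled on the existing `pread`/`pwrite` interface (the file path is arbitrary and error handling is omitted):

```cpp
#include <cstddef>

#include <cuda_runtime.h>
#include <kvikio/file_handle.hpp>

int main()
{
  constexpr std::size_t size = 1024;
  void* dev_buf{};
  cudaMalloc(&dev_buf, size);

  {
    // Write from device memory. pwrite() returns a std::future and the copy runs on
    // KvikIO's internal thread pool, so the call itself does not block.
    kvikio::FileHandle fw("test-file", "w");
    auto fut = fw.pwrite(dev_buf, size);
    fut.get();  // wait for completion
  }

  {
    // Read back into the same device buffer; a host pointer works the same way.
    kvikio::FileHandle fr("test-file", "r");
    auto fut = fr.pread(dev_buf, size);
    fut.get();
  }

  cudaFree(dev_buf);
  return 0;
}
```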
### Documentation diff --git a/VERSION b/VERSION index 3af4bda020..296e35288d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -25.08.00 +25.10.00 diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index b12ed048ad..700ee25707 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -26,13 +26,33 @@ export RAPIDS_ARTIFACTS_DIR source rapids-rattler-channel-string # Construct the extra variants according to the architecture -cat > variants.yaml << EOF -cuda_version: - - ${RAPIDS_CUDA_VERSION%.*} +if [[ "$(arch)" == "x86_64" ]]; then + cat > variants.yaml << EOF + c_compiler_version: + - 14 + + cxx_compiler_version: + - 14 + + cuda_version: + - ${RAPIDS_CUDA_VERSION%.*} EOF -if [[ "$(arch)" == "aarch64" ]]; then - cat >> variants.yaml << EOF - - 12.1 # The last version to not support cufile +else + cat > variants.yaml << EOF + zip_keys: + - [c_compiler_version, cxx_compiler_version, cuda_version] + + c_compiler_version: + - 12 + - 14 + + cxx_compiler_version: + - 12 + - 14 + + cuda_version: + - 12.1 # The last version to not support cufile + - ${RAPIDS_CUDA_VERSION%.*} EOF fi diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh index 8107f24160..61654c66b5 100755 --- a/ci/build_wheel_cpp.sh +++ b/ci/build_wheel_cpp.sh @@ -27,7 +27,6 @@ rapids-pip-retry install \ # 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) export PIP_NO_BUILD_ISOLATION=0 -export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=OFF" ./ci/build_wheel.sh "${package_name}" "${package_dir}" python -m auditwheel repair \ diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index 55ee0c05a0..312fedc0e7 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -17,12 +17,10 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" LIBKVIKIO_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="libkvikio_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github cpp) echo "libkvikio-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo "${LIBKVIKIO_WHEELHOUSE}"/libkvikio_*.whl)" >> "${PIP_CONSTRAINT}" -export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh "${package_name}" "${package_dir}" python -m auditwheel repair \ --exclude libkvikio.so \ - --exclude libnvcomp.so.4 \ -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ ${package_dir}/dist/* diff --git a/cmake/RAPIDS.cmake b/cmake/RAPIDS.cmake index d112951d3c..ddef819498 100644 --- a/cmake/RAPIDS.cmake +++ b/cmake/RAPIDS.cmake @@ -18,9 +18,9 @@ cmake_minimum_required(VERSION 3.30.4 FATAL_ERROR) # Allow users to control which version is used -if(NOT rapids-cmake-version OR NOT rapids-cmake-version MATCHES [[^([0-9][0-9])\.([0-9][0-9])$]]) +if(NOT (rapids-cmake-branch OR rapids-cmake-version)) message( - FATAL_ERROR "The CMake variable rapids-cmake-version must be defined in the format MAJOR.MINOR." 
+ FATAL_ERROR "The CMake variable `rapids-cmake-branch` or `rapids-cmake-version` must be defined" ) endif() @@ -33,7 +33,7 @@ endif() # Allow users to control which branch is fetched if(NOT rapids-cmake-branch) # Define a default branch if the user doesn't set one - set(rapids-cmake-branch "branch-${rapids-cmake-version}") + set(rapids-cmake-branch "release/${rapids-cmake-version}") endif() # Allow users to control the exact URL passed to FetchContent diff --git a/cmake/rapids_config.cmake b/cmake/rapids_config.cmake index abe468dce8..b2c54a3f27 100644 --- a/cmake/rapids_config.cmake +++ b/cmake/rapids_config.cmake @@ -26,5 +26,19 @@ else() ) endif() -set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") +# Use STRINGS to trim whitespace/newlines +file(STRINGS "${CMAKE_CURRENT_LIST_DIR}/../RAPIDS_BRANCH" _rapids_branch) +if(NOT _rapids_branch) + message( + FATAL_ERROR + "Could not determine branch name to use for checking out rapids-cmake. The file \"${CMAKE_CURRENT_LIST_DIR}/../RAPIDS_BRANCH\" is missing." + ) +endif() + +if(NOT rapids-cmake-version) + set(rapids-cmake-version "${RAPIDS_VERSION_MAJOR_MINOR}") +endif() +if(NOT rapids-cmake-branch) + set(rapids-cmake-branch "${_rapids_branch}") +endif() include("${CMAKE_CURRENT_LIST_DIR}/RAPIDS.cmake") diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 1ba892c85a..279b130e55 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -9,20 +9,18 @@ dependencies: - c-compiler - cmake>=3.30.4 - cuda-nvcc -- cuda-python>=12.6.2,<13.0a0 +- cuda-python>=12.9.2,<13.0a0 - cuda-version=12.9 -- cupy>=12.0.0 +- cupy>=13.6.0 - cxx-compiler - cython>=3.0.0 - doxygen=1.9.1 -- gcc_linux-aarch64=13.* +- gcc_linux-aarch64=14.* - libcufile-dev - libcurl>=8.5.0,<9.0a0 - libnuma -- libnvcomp-dev==4.2.0.11 - moto>=4.0.8 - ninja -- numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc - packaging @@ -33,12 +31,12 @@ dependencies: - pytest-timeout - python>=3.10,<3.14 - rangehttpserver -- rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.8.*,>=0.0.0a0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click - sphinx_rtd_theme - sysroot_linux-aarch64=2.28 -- zarr>=2.0.0,<4.0.0 +- zarr>=3.0.0,<4.0.0 name: all_cuda-129_arch-aarch64 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index d95bcba871..5460d73ab7 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -9,20 +9,18 @@ dependencies: - c-compiler - cmake>=3.30.4 - cuda-nvcc -- cuda-python>=12.6.2,<13.0a0 +- cuda-python>=12.9.2,<13.0a0 - cuda-version=12.9 -- cupy>=12.0.0 +- cupy>=13.6.0 - cxx-compiler - cython>=3.0.0 - doxygen=1.9.1 -- gcc_linux-64=13.* +- gcc_linux-64=14.* - libcufile-dev - libcurl>=8.5.0,<9.0a0 - libnuma -- libnvcomp-dev==4.2.0.11 - moto>=4.0.8 - ninja -- numcodecs !=0.12.0 - numpy>=1.23,<3.0a0 - numpydoc - packaging @@ -33,12 +31,12 @@ dependencies: - pytest-timeout - python>=3.10,<3.14 - rangehttpserver -- rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==25.8.*,>=0.0.0a0 +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 - scikit-build-core>=0.10.0 - sphinx - sphinx-click - sphinx_rtd_theme - sysroot_linux-64=2.28 -- zarr>=2.0.0,<4.0.0 +- zarr>=3.0.0,<4.0.0 name: 
all_cuda-129_arch-x86_64 diff --git a/conda/environments/all_cuda-130_arch-aarch64.yaml b/conda/environments/all_cuda-130_arch-aarch64.yaml new file mode 100644 index 0000000000..1b752cbe8e --- /dev/null +++ b/conda/environments/all_cuda-130_arch-aarch64.yaml @@ -0,0 +1,42 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- conda-forge +dependencies: +- boto3>=1.21.21 +- c-compiler +- cmake>=3.30.4 +- cuda-nvcc +- cuda-python>=13.0.1,<14.0a0 +- cuda-version=13.0 +- cupy>=13.6.0 +- cxx-compiler +- cython>=3.0.0 +- doxygen=1.9.1 +- gcc_linux-aarch64=14.* +- libcufile-dev +- libcurl>=8.5.0,<9.0a0 +- libnuma +- moto>=4.0.8 +- ninja +- numpy>=1.23,<3.0a0 +- numpydoc +- packaging +- pre-commit +- pytest +- pytest-asyncio +- pytest-cov +- pytest-timeout +- python>=3.10,<3.14 +- rangehttpserver +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- scikit-build-core>=0.10.0 +- sphinx +- sphinx-click +- sphinx_rtd_theme +- sysroot_linux-aarch64=2.28 +- zarr>=3.0.0,<4.0.0 +name: all_cuda-130_arch-aarch64 diff --git a/conda/environments/all_cuda-130_arch-x86_64.yaml b/conda/environments/all_cuda-130_arch-x86_64.yaml new file mode 100644 index 0000000000..812bfdad5e --- /dev/null +++ b/conda/environments/all_cuda-130_arch-x86_64.yaml @@ -0,0 +1,42 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- conda-forge +dependencies: +- boto3>=1.21.21 +- c-compiler +- cmake>=3.30.4 +- cuda-nvcc +- cuda-python>=13.0.1,<14.0a0 +- cuda-version=13.0 +- cupy>=13.6.0 +- cxx-compiler +- cython>=3.0.0 +- doxygen=1.9.1 +- gcc_linux-64=14.* +- libcufile-dev +- libcurl>=8.5.0,<9.0a0 +- libnuma +- moto>=4.0.8 +- ninja +- numpy>=1.23,<3.0a0 +- numpydoc +- packaging +- pre-commit +- pytest +- pytest-asyncio +- pytest-cov +- pytest-timeout +- python>=3.10,<3.14 +- rangehttpserver +- rapids-build-backend>=0.4.0,<0.5.0.dev0 +- rapids-dask-dependency==25.10.*,>=0.0.0a0 +- scikit-build-core>=0.10.0 +- sphinx +- sphinx-click +- sphinx_rtd_theme +- sysroot_linux-64=2.28 +- zarr>=3.0.0,<4.0.0 +name: all_cuda-130_arch-x86_64 diff --git a/conda/recipes/kvikio/conda_build_config.yaml b/conda/recipes/kvikio/conda_build_config.yaml index c831f4c3cb..f5f37a39a2 100644 --- a/conda/recipes/kvikio/conda_build_config.yaml +++ b/conda/recipes/kvikio/conda_build_config.yaml @@ -1,8 +1,8 @@ c_compiler_version: - - 13 + - 14 cxx_compiler_version: - - 13 + - 14 cmake_version: - ">=3.30.4" @@ -18,6 +18,3 @@ c_stdlib_version: libcurl_version: - "==8.5.0" - -nvcomp_version: - - "=4.2.0.11" diff --git a/conda/recipes/kvikio/recipe.yaml b/conda/recipes/kvikio/recipe.yaml index 3a7957133e..c4c52b540c 100644 --- a/conda/recipes/kvikio/recipe.yaml +++ b/conda/recipes/kvikio/recipe.yaml @@ -66,22 +66,18 @@ requirements: - cython >=3.0.0 - libcurl ${{ libcurl_version }} - libkvikio =${{ version }} - - libnvcomp-dev ${{ nvcomp_version }} - pip - python =${{ py_version }} - - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - rapids-build-backend >=0.4.0,<0.5.0.dev0 - scikit-build-core >=0.10.0 - cuda-cudart-dev run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - cupy >=12.0.0 + - cupy >=13.6.0 - libkvikio =${{ version }} - # See https://github.com/zarr-developers/numcodecs/pull/475 - - 
numcodecs !=0.12.0 - numpy >=1.23,<3.0a0 - packaging - python - - zarr >=2.0.0,<4.0.0a0 - cuda-cudart ignore_run_exports: by_name: diff --git a/conda/recipes/libkvikio/conda_build_config.yaml b/conda/recipes/libkvikio/conda_build_config.yaml index 1149e0f9fa..b67ab5d118 100644 --- a/conda/recipes/libkvikio/conda_build_config.yaml +++ b/conda/recipes/libkvikio/conda_build_config.yaml @@ -11,4 +11,4 @@ c_stdlib_version: - "2.28" libcurl_version: - - "==8.5.0" + - "8.5.0" diff --git a/conda/recipes/libkvikio/recipe.yaml b/conda/recipes/libkvikio/recipe.yaml index 6d6316593e..3b3a0bc1bf 100644 --- a/conda/recipes/libkvikio/recipe.yaml +++ b/conda/recipes/libkvikio/recipe.yaml @@ -10,8 +10,6 @@ context: # 3. Linux aarch64 with CUDA < 12.2, which does not use libcufile # Each case has different cuda-version constraints as expressed below should_use_cufile: ${{ x86_64 or (aarch64 and cuda_version >= "12.2") }} - c_compiler_version: ${{ 13 if should_use_cufile else 12 }} - cxx_compiler_version: ${{ c_compiler_version }} # When reverting, instances of cuda_key_string can be replaced with cuda_major cuda_key_string: ${{ cuda_version | replace(".", "_") }} #cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} @@ -67,7 +65,7 @@ cache: - ${{ stdlib("c") }} host: - cuda-version =${{ cuda_version }} - - libcurl ${{ libcurl_version }} + - libcurl ==${{ libcurl_version }} - if: should_use_cufile then: - libcufile-dev @@ -93,9 +91,9 @@ outputs: - ${{ compiler("c") }} host: - cuda-version =${{ cuda_version }} - - libcurl ${{ libcurl_version }} + - libcurl ==${{ libcurl_version }} run: - - if: x86_64 + - if: x86_64 or (aarch64 and cuda_version >= "13.0") then: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} else: @@ -110,7 +108,6 @@ outputs: ignore_run_exports: by_name: - cuda-version - - libcurl - if: should_use_cufile then: - libcufile @@ -140,6 +137,7 @@ outputs: - ${{ pin_subpackage("libkvikio", exact=True) }} - cuda-version =${{ cuda_version }} - cuda-cudart-dev + - libcurl ==${{ libcurl_version }} - if: should_use_cufile then: - libcufile-dev @@ -158,7 +156,6 @@ outputs: by_name: - cuda-cudart - cuda-version - - libcurl - libnuma - if: should_use_cufile then: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 37d237e0e3..5db5fa6f50 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -29,6 +29,11 @@ project( set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/") +# For now, disable CMake's automatic module scanning for C++ files. There is an sccache bug in the +# version RAPIDS uses in CI that causes it to handle the resulting -M* flags incorrectly with +# gcc>=14. We can remove this once we upgrade to a newer sccache version. 
+set(CMAKE_CXX_SCAN_FOR_MODULES OFF) + # Write the version header rapids_cmake_write_version_file(include/kvikio/version_config.hpp) @@ -44,7 +49,6 @@ option(KvikIO_BUILD_EXAMPLES "Configure CMake to build examples" ON) option(KvikIO_BUILD_TESTS "Configure CMake to build tests" ON) option(KvikIO_REMOTE_SUPPORT "Configure CMake to build with remote IO support" ON) option(KvikIO_CUDA_SUPPORT "Configure CMake to build with CUDA support" ON) -option(KvikIO_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON) # ################################################################################################## # * conda environment ------------------------------------------------------------------------------ @@ -140,6 +144,7 @@ set(SOURCES "src/bounce_buffer.cpp" "src/buffer.cpp" "src/compat_mode.cpp" + "src/compat_mode_manager.cpp" "src/http_status_codes.cpp" "src/cufile/config.cpp" "src/cufile/driver.cpp" @@ -158,7 +163,9 @@ set(SOURCES ) if(KvikIO_REMOTE_SUPPORT) - list(APPEND SOURCES "src/remote_handle.cpp" "src/shim/libcurl.cpp") + list(APPEND SOURCES "src/hdfs.cpp" "src/remote_handle.cpp" "src/detail/remote_handle.cpp" + "src/detail/tls.cpp" "src/detail/url.cpp" "src/shim/libcurl.cpp" + ) endif() add_library(kvikio ${SOURCES}) @@ -199,7 +206,7 @@ set_target_properties( kvikio PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" - CXX_STANDARD 17 + CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 665d45edfb..fc857fdfdf 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -42,7 +42,14 @@ function(kvikio_add_benchmark) endif() add_executable(${_KVIKIO_NAME} ${_KVIKIO_SOURCES}) - set_target_properties(${_KVIKIO_NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib") + set_target_properties( + ${_KVIKIO_NAME} + PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + ) target_link_libraries(${_KVIKIO_NAME} PUBLIC benchmark::benchmark kvikio::kvikio) diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index 8d9d0e9320..a74bad75d6 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -30,9 +30,14 @@ For convenience we release Conda packages that makes it easy to include KvikIO i We strongly recommend using [mamba](https://github.com/mamba-org/mamba) in place of conda, which we will do throughout the documentation. 
Install the **stable release** from the ``rapidsai`` channel with the following: + ```sh # Install in existing environment mamba install -c rapidsai -c conda-forge libkvikio + +# Create new environment (CUDA 13) +mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=13.0 libkvikio + # Create new environment (CUDA 12) mamba create -n libkvikio-env -c rapidsai -c conda-forge cuda-version=12.9 libkvikio ``` @@ -42,6 +47,10 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel with the f ```sh # Install in existing environment mamba install -c rapidsai-nightly -c conda-forge libkvikio + +# Create new environment (CUDA 13) +mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=13.0 libkvikio + # Create new environment (CUDA 12) mamba create -n libkvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.9 libkvikio ``` @@ -128,6 +137,13 @@ Note that if you're reading a large file that has been split into multiple reads These settings can also be controlled by `defaults::http_max_attempts()`, `defaults::http_max_attempts_reset()`, `defaults::http_status_codes()`, and `defaults::http_status_codes_reset()`. +#### Remote Verbose (KVIKIO_REMOTE_VERBOSE) +For debugging HTTP requests, you can enable verbose output that shows detailed information about HTTP communication including headers, request/response bodies, connection details, and SSL handshake information. + +Set the environment variable `KVIKIO_REMOTE_VERBOSE` to `true`, `on`, `yes`, or `1` (case-insensitive) to enable verbose output. Otherwise, verbose output is disabled by default. + +**Warning** this may show sensitive contents from headers and data. + ## Example ```cpp diff --git a/cpp/examples/downstream/cmake/get_kvikio.cmake b/cpp/examples/downstream/cmake/get_kvikio.cmake index 5e4132f8e5..4e03834e56 100644 --- a/cpp/examples/downstream/cmake/get_kvikio.cmake +++ b/cpp/examples/downstream/cmake/get_kvikio.cmake @@ -30,4 +30,4 @@ function(find_and_configure_kvikio MIN_VERSION) endfunction() -find_and_configure_kvikio("25.08") +find_and_configure_kvikio("25.10") diff --git a/cpp/include/kvikio/compat_mode.hpp b/cpp/include/kvikio/compat_mode.hpp index 03ed443fe0..d25f0ffaef 100644 --- a/cpp/include/kvikio/compat_mode.hpp +++ b/cpp/include/kvikio/compat_mode.hpp @@ -17,9 +17,7 @@ #pragma once #include -#include - -#include +#include namespace kvikio { /** @@ -49,102 +47,4 @@ CompatMode parse_compat_mode_str(std::string_view compat_mode_str); } // namespace detail -// Forward declaration. -class FileHandle; - -/** - * @brief Store and manage the compatibility mode data associated with a FileHandle. - */ -class CompatModeManager { - private: - CompatMode _compat_mode_requested{CompatMode::AUTO}; - bool _is_compat_mode_preferred{true}; - bool _is_compat_mode_preferred_for_async{true}; - - public: - /** - * @brief Construct an empty compatibility mode manager. - */ - CompatModeManager() noexcept = default; - - /** - * @brief Construct a compatibility mode manager associated with a FileHandle. - * - * According to the file path, requested compatibility mode, and the system configuration, the - * compatibility manager: - * - Infers the final compatibility modes for synchronous and asynchronous I/O paths, - * respectively. - * - Initializes the file wrappers and cuFile handle associated with a FileHandle. - * - * @param file_path Refer to - * FileHandle::FileHandle(std::string const&, std::string const&, mode_t, CompatMode). - * @param flags Same as above. 
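Tying together the retry and verbosity knobs described in the main page section above: the function names are the ones listed there (`http_max_attempts_reset`, `http_status_codes_reset`), while the argument types and the `setenv` route for `KVIKIO_REMOTE_VERBOSE` are assumptions for illustration:

```cpp
#include <cstdlib>
#include <vector>

#include <kvikio/defaults.hpp>

void configure_remote_io()
{
  // Attempt each HTTP request up to 5 times before giving up.
  kvikio::defaults::http_max_attempts_reset(5);

  // Only retry on these status codes (assumed to be passed as a vector of ints).
  kvikio::defaults::http_status_codes_reset(std::vector<int>{429, 500, 502, 503, 504});

  // Roughly equivalent to exporting KVIKIO_REMOTE_VERBOSE=1 before starting the
  // process; libcurl then logs headers, connection and TLS details per request.
  setenv("KVIKIO_REMOTE_VERBOSE", "1", /*overwrite=*/1);
}
```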
- * @param mode Same as above. - * @param compat_mode_requested Same as above. - * @param file_handle Pointer to the FileHandle object that owns this compatibility mode manager. - */ - CompatModeManager(std::string const& file_path, - std::string const& flags, - mode_t mode, - CompatMode compat_mode_requested, - FileHandle* file_handle); - - ~CompatModeManager() noexcept = default; - CompatModeManager(const CompatModeManager&) = default; - CompatModeManager& operator=(const CompatModeManager&) = default; - CompatModeManager(CompatModeManager&&) noexcept = default; - CompatModeManager& operator=(CompatModeManager&&) noexcept = default; - - /** - * @brief Functionally identical to defaults::infer_compat_mode_if_auto(CompatMode). - * - * @param compat_mode Compatibility mode. - * @return If the given compatibility mode is CompatMode::AUTO, infer the final compatibility - * mode. - */ - CompatMode infer_compat_mode_if_auto(CompatMode compat_mode) noexcept; - - /** - * @brief Functionally identical to defaults::is_compat_mode_preferred(CompatMode). - * - * @param compat_mode Compatibility mode. - * @return Boolean answer. - */ - bool is_compat_mode_preferred(CompatMode compat_mode) noexcept; - - /** - * @brief Check if the compatibility mode for synchronous I/O of the associated FileHandle is - * expected to be CompatMode::ON. - * - * @return Boolean answer. - */ - bool is_compat_mode_preferred() const noexcept; - - /** - * @brief Check if the compatibility mode for asynchronous I/O of the associated FileHandle is - * expected to be CompatMode::ON. - * - * @return Boolean answer. - */ - bool is_compat_mode_preferred_for_async() const noexcept; - - /** - * @brief Retrieve the original compatibility mode requested. - * - * @return The original compatibility mode requested. - */ - CompatMode compat_mode_requested() const noexcept; - - /** - * @brief Determine if asynchronous I/O can be performed or not (throw exceptions) - * according to the existing compatibility mode data in the manager. - * - * Asynchronous I/O cannot be performed, for instance, when compat_mode_requested() is - * CompatMode::OFF, is_compat_mode_preferred() is CompatMode::OFF, but - * is_compat_mode_preferred_for_async() is CompatMode::ON (due to missing cuFile stream API or - * cuFile configuration file). - */ - void validate_compat_mode_for_async() const; -}; - } // namespace kvikio diff --git a/cpp/include/kvikio/compat_mode_manager.hpp b/cpp/include/kvikio/compat_mode_manager.hpp new file mode 100644 index 0000000000..4d15f2ba9f --- /dev/null +++ b/cpp/include/kvikio/compat_mode_manager.hpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace kvikio { + +// Forward declaration. +class FileHandle; + +/** + * @brief Store and manage the compatibility mode data associated with a FileHandle. 
+ */ +class CompatModeManager { + private: + CompatMode _compat_mode_requested{CompatMode::AUTO}; + bool _is_compat_mode_preferred{true}; + bool _is_compat_mode_preferred_for_async{true}; + + public: + /** + * @brief Construct an empty compatibility mode manager. + */ + CompatModeManager() noexcept = default; + + /** + * @brief Construct a compatibility mode manager associated with a FileHandle. + * + * According to the file path, requested compatibility mode, and the system configuration, the + * compatibility manager: + * - Infers the final compatibility modes for synchronous and asynchronous I/O paths, + * respectively. + * - Initializes the file wrappers and cuFile handle associated with a FileHandle. + * + * @param file_path Refer to + * FileHandle::FileHandle(std::string const&, std::string const&, mode_t, CompatMode). + * @param flags Same as above. + * @param mode Same as above. + * @param compat_mode_requested Same as above. + * @param file_handle Pointer to the FileHandle object that owns this compatibility mode manager. + */ + CompatModeManager(std::string const& file_path, + std::string const& flags, + mode_t mode, + CompatMode compat_mode_requested, + FileHandle* file_handle); + + ~CompatModeManager() noexcept = default; + CompatModeManager(const CompatModeManager&) = default; + CompatModeManager& operator=(const CompatModeManager&) = default; + CompatModeManager(CompatModeManager&&) noexcept = default; + CompatModeManager& operator=(CompatModeManager&&) noexcept = default; + + /** + * @brief Check if the compatibility mode for synchronous I/O of the associated FileHandle is + * expected to be CompatMode::ON. + * + * @return Boolean answer. + */ + bool is_compat_mode_preferred() const noexcept; + + /** + * @brief Check if the compatibility mode for asynchronous I/O of the associated FileHandle is + * expected to be CompatMode::ON. + * + * @return Boolean answer. + */ + bool is_compat_mode_preferred_for_async() const noexcept; + + /** + * @brief Retrieve the original compatibility mode requested. + * + * @return The original compatibility mode requested. + */ + CompatMode compat_mode_requested() const noexcept; + + /** + * @brief Determine if asynchronous I/O can be performed or not (throw exceptions) + * according to the existing compatibility mode data in the manager. + * + * Asynchronous I/O cannot be performed, for instance, when compat_mode_requested() is + * CompatMode::OFF, is_compat_mode_preferred() is CompatMode::OFF, but + * is_compat_mode_preferred_for_async() is CompatMode::ON (due to missing cuFile stream API or + * cuFile configuration file). + */ + void validate_compat_mode_for_async() const; +}; + +} // namespace kvikio diff --git a/cpp/include/kvikio/detail/remote_handle.hpp b/cpp/include/kvikio/detail/remote_handle.hpp new file mode 100644 index 0000000000..f6bd55c4c0 --- /dev/null +++ b/cpp/include/kvikio/detail/remote_handle.hpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +namespace kvikio::detail { +/** + * @brief Callback for `CURLOPT_WRITEFUNCTION` that copies received data into a `std::string`. + * + * @param data Received data + * @param size Curl internal implementation always sets this parameter to 1 + * @param num_bytes Number of bytes received + * @param userdata Must be cast from `std::string*` + * @return The number of bytes consumed by the callback + */ +std::size_t callback_get_string_response(char* data, + std::size_t size, + std::size_t num_bytes, + void* userdata); +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/detail/tls.hpp b/cpp/include/kvikio/detail/tls.hpp new file mode 100644 index 0000000000..be40eaf5e5 --- /dev/null +++ b/cpp/include/kvikio/detail/tls.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +namespace kvikio::detail { +/** + * @brief Search for the CA bundle file and directory paths + * + * This function searches for the Certificate Authority (CA) paths required for TLS/SSL verification + * in libcurl. The search is performed in the following priority order, returning as soon as either + * a bundle file or a directory is found: + * - CA bundle file: Check env vars CURL_CA_BUNDLE, SSL_CERT_FILE + * - CA directory: Check env var SSL_CERT_DIR + * - CA bundle file: Search distribution-specific locations for accessible bundle + * - CA directory: Search distribution-specific locations for accessible directory + * - CA bundle file: Check if curl's compile-time default bundle path is accessible + * - CA directory: Check if curl's compile-time default directory path is accessible + * + * @return Result containing CA bundle file and CA certificate directory + * + * @exception std::runtime_error if neither CA bundle nor directory is found + * + * @note Environment Variables: + * - CURL_CA_BUNDLE: Override CA bundle file location (curl-specific) + * - SSL_CERT_FILE: Override CA bundle file location (OpenSSL-compatible) + * - SSL_CERT_DIR: Override CA directory location (OpenSSL-compatible) + */ +std::pair, std::optional> get_ca_paths(); + +/** + * @brief Configure curl handle with discovered CA certificate paths + * + * As a performance optimization, the discovered CA certificate paths are cached to avoid repeated + * searching. + * + * @param curl Curl handle to configure with CA certificate paths + */ +void set_up_ca_paths(CurlHandle& curl); +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/detail/url.hpp b/cpp/include/kvikio/detail/url.hpp new file mode 100644 index 0000000000..e57d2c4c94 --- /dev/null +++ b/cpp/include/kvikio/detail/url.hpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
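For context on the `callback_get_string_response` declaration earlier in this hunk: with the stock libcurl easy API (not KvikIO's `CurlHandle` wrapper), a write callback of this shape is wired up roughly as follows; the helper names here are illustrative:

```cpp
#include <cstddef>
#include <string>

#include <curl/curl.h>

// Same shape as the callback declared above: append the received chunk to a std::string.
static std::size_t write_to_string(char* data, std::size_t size, std::size_t nmemb, void* userdata)
{
  auto* out = static_cast<std::string*>(userdata);
  out->append(data, size * nmemb);
  return size * nmemb;  // report the whole chunk as consumed
}

std::string fetch(std::string const& url)
{
  std::string response;
  CURL* curl = curl_easy_init();
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_to_string);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
  curl_easy_perform(curl);
  curl_easy_cleanup(curl);
  return response;
}
```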
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace kvikio::detail { +/** + * @brief RAII wrapper for libcurl's URL handle (CURLU) + * + * This class provides automatic resource management for libcurl URL handles, + * ensuring proper cleanup when the handle goes out of scope. The class is + * move-only to prevent accidental sharing of the underlying resource. + */ +class CurlUrlHandle { + private: + CURLU* _handle{nullptr}; + + public: + /** + * @brief Create a new libcurl URL handle + * + * @exception std::runtime_error if libcurl cannot allocate the handle (usually due to out of + * memory) + */ + CurlUrlHandle(); + + /** + * @brief Clean up the underlying URL handle + */ + ~CurlUrlHandle() noexcept; + + CurlUrlHandle(CurlUrlHandle const&) = delete; + CurlUrlHandle& operator=(CurlUrlHandle const&) = delete; + + CurlUrlHandle(CurlUrlHandle&& other) noexcept; + CurlUrlHandle& operator=(CurlUrlHandle&& other) noexcept; + + /** + * @brief Get the underlying libcurl URL handle + * + * @return Pointer to the underlying libcurl URL handle + * @note The returned pointer should not be freed manually as it is managed by this class + */ + CURLU* get() const; +}; + +/** + * @brief URL parsing utility using libcurl's URL API + * + * This class provides static methods for parsing URLs into their constituent + * components (scheme, host, port, path, query, fragment). + * + * @note This class uses libcurl's URL parsing which follows RFC 3986 plus. See + * https://curl.se/docs/url-syntax.html + * + * Example: + * @code{.cpp} + * auto components = UrlParser::parse("https://example.com:8080/path?query=1#frag"); + * if (components.scheme.has_value()) { + * std::cout << "Scheme: " << components.scheme.value() << std::endl; + * } + * if (components.host.has_value()) { + * std::cout << "Host: " << components.host.value() << std::endl; + * } + * @endcode + */ +class UrlParser { + public: + /** + * @brief Container for parsed URL components + */ + struct UrlComponents { + /** + * @brief The URL scheme (e.g., "http", "https", "ftp"). May be empty for scheme-relative URLs + * or paths. + */ + std::optional scheme; + + /** + * @brief The hostname or IP address. May be empty for URLs without an authority component + * (e.g., "file:///path"). + */ + std::optional host; + + /** + * @brief The port number as a string. Will be empty if no explicit port is specified in the + * URL. + * @note Default ports (e.g., 80 for HTTP, 443 for HTTPS) are not automatically filled in. + */ + std::optional port; + + /** + * @brief The path component of the URL. Libcurl ensures that the path component is always + * present, even if empty (will be "/" for URLs like "http://example.com"). + */ + std::optional path; + + /** + * @brief The query string (without the leading "?"). Empty if no query parameters are present. + */ + std::optional query; + + /** + * @brief The fragment identifier (without the leading "#"). Empty if no fragment is present. + */ + std::optional fragment; + }; + + /** + * @brief Parses the given URL according to RFC 3986 plus and extracts its components. 
+ * + * @param url The URL string to parse + * @param bitmask_url_flags Optional flags for URL parsing. Common flags include: + * - CURLU_DEFAULT_SCHEME: Allows URLs without schemes + * - CURLU_NON_SUPPORT_SCHEME: Accept non-supported schemes + * - CURLU_URLENCODE: URL encode the path + * @param bitmask_component_flags Optional flags for component extraction. Common flags include: + * - CURLU_URLDECODE: URL decode the component + * - CURLU_PUNYCODE: Return host as punycode + * + * @return UrlComponents structure containing the parsed URL components + * + * @throw std::runtime_error if the URL cannot be parsed or if component extraction fails + * + * Example: + * @code{.cpp} + * // Basic parsing + * auto components = UrlParser::parse("https://api.example.com/v1/users?page=1"); + * + * // Parsing with URL decoding + * auto decoded = UrlParser::parse( + * "https://example.com/hello%20world", + * std::nullopt, + * CURLU_URLDECODE + * ); + * + * // Allow non-standard schemes + * auto custom = UrlParser::parse( + * "myscheme://example.com", + * CURLU_NON_SUPPORT_SCHEME + * ); + * @endcode + */ + static UrlComponents parse(std::string const& url, + std::optional bitmask_url_flags = std::nullopt, + std::optional bitmask_component_flags = std::nullopt); + + /** + * @brief Extract a specific component from a CurlUrlHandle + * + * @param handle The CurlUrlHandle containing the parsed URL + * @param part The URL part to extract (e.g., CURLUPART_SCHEME) + * @param bitmask_component_flags Flags controlling extraction behavior + * @param allowed_err_code Optional error code to treat as valid (e.g., CURLUE_NO_SCHEME) + * @return The extracted component as a string, or std::nullopt if not present + * @throw std::runtime_error if extraction fails with an unexpected error + */ + static std::optional extract_component( + CurlUrlHandle const& handle, + CURLUPart part, + std::optional bitmask_component_flags = std::nullopt, + std::optional allowed_err_code = std::nullopt); + + /** + * @brief Extract a specific component from a URL string + * + * @param url The URL string from which to extract a component + * @param part The URL part to extract + * @param bitmask_url_flags Optional flags for URL parsing. + * @param bitmask_component_flags Flags controlling extraction behavior + * @param allowed_err_code Optional error code to treat as valid + * @return The extracted component as a string, or std::nullopt if not present + * @throw std::runtime_error if extraction fails with an unexpected error + */ + static std::optional extract_component( + std::string const& url, + CURLUPart part, + std::optional bitmask_url_flags = std::nullopt, + std::optional bitmask_component_flags = std::nullopt, + std::optional allowed_err_code = std::nullopt); +}; +} // namespace kvikio::detail diff --git a/cpp/include/kvikio/file_handle.hpp b/cpp/include/kvikio/file_handle.hpp index 50e1c679c3..78e493c9b0 100644 --- a/cpp/include/kvikio/file_handle.hpp +++ b/cpp/include/kvikio/file_handle.hpp @@ -20,12 +20,10 @@ #include #include -#include -#include -#include #include #include +#include #include #include #include diff --git a/cpp/include/kvikio/hdfs.hpp b/cpp/include/kvikio/hdfs.hpp new file mode 100644 index 0000000000..9d89d4d2e1 --- /dev/null +++ b/cpp/include/kvikio/hdfs.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace kvikio { + +/** + * @brief A remote endpoint for Apache Hadoop WebHDFS. + * + * This endpoint is for accessing HDFS files via the WebHDFS REST API over HTTP/HTTPS. If KvikIO is + * run within Docker, pass `--network host` to the `docker run` command to ensure proper name node + * connectivity. + */ +class WebHdfsEndpoint : public RemoteEndpoint { + private: + std::string _url; + std::optional _username; + + public: + /** + * @brief Create an WebHDFS endpoint from a url. + * + * @param url The WebHDFS HTTP/HTTPS url to the remote file. + */ + explicit WebHdfsEndpoint(std::string url); + + /** + * @brief Create an WebHDFS endpoint from the host, port, file path and optionally username. + * + * @param host Host + * @param port Port + * @param remote_file_path Remote file path + * @param username User name + */ + explicit WebHdfsEndpoint(std::string host, + std::string port, + std::string remote_file_path, + std::optional username = std::nullopt); + + ~WebHdfsEndpoint() override = default; + void setopt(CurlHandle& curl) override; + std::string str() const override; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for the WebHDFS endpoints. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; +}; +} // namespace kvikio diff --git a/cpp/include/kvikio/mmap.hpp b/cpp/include/kvikio/mmap.hpp index f5245858ef..b5730727f1 100644 --- a/cpp/include/kvikio/mmap.hpp +++ b/cpp/include/kvikio/mmap.hpp @@ -59,7 +59,6 @@ class MmapHandle { * * @exception std::out_of_range if the read region specified by `offset` and `size` is * outside the initial region specified when the mapping handle was constructed - * @exception std::invalid_argument if the size is given but is 0 * @exception std::runtime_error if the mapping handle is closed */ std::size_t validate_and_adjust_read_args(std::optional const& size, @@ -81,11 +80,16 @@ class MmapHandle { * - "w": "open for writing, truncating the file first" * - "a": "open for writing, appending to the end of file if it exists" * - "+": "open for updating (reading and writing)" - * @param initial_map_size Size in bytes of the mapped region. If not specified, map the region - * starting from `initial_map_offset` to the end of file + * @param initial_map_size Size in bytes of the mapped region. Must be greater than 0. If not + * specified, map the region starting from `initial_map_offset` to the end of file * @param initial_map_offset File offset of the mapped region * @param mode Access mode * @param map_flags Flags to be passed to the system call `mmap`. 
See `mmap(2)` for details + * @exception std::out_of_range if `initial_map_offset` (left bound of the mapped region) is equal + * to or greater than the file size + * @exception std::out_of_range if the sum of `initial_map_offset` and `initial_map_size` (right + * bound of the mapped region) is greater than the file size + * @exception std::invalid_argument if `initial_map_size` is given but is 0 */ MmapHandle(std::string const& file_path, std::string const& flags = "r", @@ -147,14 +151,13 @@ class MmapHandle { * destination buffer `buf` * * @param buf Address of the host or device memory (destination buffer) - * @param size Size in bytes to read. If not specified, read starts from `offset` to the end - * of file + * @param size Size in bytes to read. Can be 0 in which case nothing will be read. If not + * specified, read starts from `offset` to the end of file * @param offset File offset * @return Number of bytes that have been read * * @exception std::out_of_range if the read region specified by `offset` and `size` is * outside the initial region specified when the mapping handle was constructed - * @exception std::invalid_argument if the size is given but is 0 * @exception std::runtime_error if the mapping handle is closed */ std::size_t read(void* buf, @@ -166,15 +169,14 @@ class MmapHandle { * destination buffer `buf` * * @param buf Address of the host or device memory (destination buffer) - * @param size Size in bytes to read. If not specified, read starts from `offset` to the end - * of file + * @param size Size in bytes to read. Can be 0 in which case nothing will be read. If not + * specified, read starts from `offset` to the end of file * @param offset File offset * @param task_size Size of each task in bytes * @return Future that on completion returns the size of bytes that were successfully read. * * @exception std::out_of_range if the read region specified by `offset` and `size` is * outside the initial region specified when the mapping handle was constructed - * @exception std::invalid_argument if the size is given but is 0 * @exception std::runtime_error if the mapping handle is closed * * @note The `std::future` object's `wait()` or `get()` should not be called after the lifetime of diff --git a/cpp/include/kvikio/remote_handle.hpp b/cpp/include/kvikio/remote_handle.hpp index 7c197ea2b0..c48e84e8fb 100644 --- a/cpp/include/kvikio/remote_handle.hpp +++ b/cpp/include/kvikio/remote_handle.hpp @@ -18,12 +18,8 @@ #include #include #include -#include #include #include -#include -#include -#include #include #include @@ -38,6 +34,29 @@ namespace kvikio { class CurlHandle; // Prototype +/** + * @brief Types of remote file endpoints supported by KvikIO. + * + * This enum defines the different protocols and services that can be used to access remote files. + * It is used to specify or detect the type of remote endpoint when opening files. + */ +enum class RemoteEndpointType : uint8_t { + AUTO, ///< Automatically detect the endpoint type from the URL. KvikIO will attempt to infer the + ///< appropriate protocol based on the URL format. + S3, ///< AWS S3 endpoint using credentials-based authentication. Requires AWS environment + ///< variables (such as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION) to be + ///< set. + S3_PUBLIC, ///< AWS S3 endpoint for publicly accessible objects. No credentials required as the + ///< objects have public read permissions enabled. Used for open datasets and public + ///< buckets. 
+ S3_PRESIGNED_URL, ///< AWS S3 endpoint using a presigned URL. No credentials required as + ///< authentication is embedded in the URL with time-limited access. + WEBHDFS, ///< Apache Hadoop WebHDFS (Web-based Hadoop Distributed File System) endpoint for + ///< accessing files stored in HDFS over HTTP/HTTPS. + HTTP, ///< Generic HTTP or HTTPS endpoint for accessing files from web servers. This is used for + ///< standard web resources that do not fit the other specific categories. +}; + /** * @brief Abstract base class for remote endpoints. * @@ -47,7 +66,13 @@ class CurlHandle; // Prototype * its own ctor that takes communication protocol specific arguments. */ class RemoteEndpoint { + protected: + RemoteEndpointType _remote_endpoint_type{RemoteEndpointType::AUTO}; + RemoteEndpoint(RemoteEndpointType remote_endpoint_type); + public: + virtual ~RemoteEndpoint() = default; + /** * @brief Set needed connection options on a curl handle. * @@ -64,11 +89,32 @@ class RemoteEndpoint { */ virtual std::string str() const = 0; - virtual ~RemoteEndpoint() = default; + /** + * @brief Get the size of the remote file. + * + * @return The file size + */ + virtual std::size_t get_file_size() = 0; + + /** + * @brief Set up the range request in order to read part of a file given the file offset and read + * size. + */ + virtual void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) = 0; + + /** + * @brief Get the type of the remote file. + * + * @return The type of the remote file. + */ + [[nodiscard]] RemoteEndpointType remote_endpoint_type() const noexcept; }; /** - * @brief A remote endpoint using http. + * @brief A remote endpoint for HTTP/HTTPS resources + * + * This endpoint is for accessing files via standard HTTP/HTTPS protocols without any specialized + * authentication. */ class HttpEndpoint : public RemoteEndpoint { private: @@ -81,13 +127,27 @@ class HttpEndpoint : public RemoteEndpoint { * @param url The full http url to the remote file. */ HttpEndpoint(std::string url); + + ~HttpEndpoint() override = default; void setopt(CurlHandle& curl) override; std::string str() const override; - ~HttpEndpoint() override = default; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for HTTP/HTTPS endpoints. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; }; /** - * @brief A remote endpoint using AWS's S3 protocol. + * @brief A remote endpoint for AWS S3 storage requiring credentials + * + * This endpoint is for accessing private S3 objects using AWS credentials (access key, secret key, + * region and optional session token). */ class S3Endpoint : public RemoteEndpoint { private: @@ -189,9 +249,75 @@ class S3Endpoint : public RemoteEndpoint { std::optional aws_endpoint_url = std::nullopt, std::optional aws_session_token = std::nullopt); + ~S3Endpoint() override; void setopt(CurlHandle& curl) override; std::string str() const override; - ~S3Endpoint() override; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for S3 endpoints (excluding presigned URL). + * + * @param url A URL. + * @return Boolean answer. 
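+ * + * @note Both the "s3://bucket/object" form and AWS-style HTTP/HTTPS URLs (virtual-hosted or + * path style, e.g. the illustrative "https://my-bucket.s3.us-east-1.amazonaws.com/data.bin") are + * accepted; presigned URLs are excluded and handled by S3EndpointWithPresignedUrl instead.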
+ */ + static bool is_url_valid(std::string const& url) noexcept; +}; + +/** + * @brief A remote endpoint for publicly accessible S3 objects without authentication + * + * This endpoint is for accessing S3 objects configured with public read permissions, + * requiring no authentication. Supports AWS S3 services with anonymous access enabled. + */ +class S3PublicEndpoint : public RemoteEndpoint { + private: + std::string _url; + + public: + explicit S3PublicEndpoint(std::string url); + + ~S3PublicEndpoint() override = default; + void setopt(CurlHandle& curl) override; + std::string str() const override; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for S3 public endpoints. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; +}; + +/** + * @brief A remote endpoint for AWS S3 storage using presigned URLs. + * + * This endpoint is for accessing S3 objects via presigned URLs, which provide time-limited access + * without requiring AWS credentials on the client side. + */ +class S3EndpointWithPresignedUrl : public RemoteEndpoint { + private: + std::string _url; + + public: + explicit S3EndpointWithPresignedUrl(std::string presigned_url); + + ~S3EndpointWithPresignedUrl() override = default; + void setopt(CurlHandle& curl) override; + std::string str() const override; + std::size_t get_file_size() override; + void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override; + + /** + * @brief Whether the given URL is valid for S3 endpoints with presigned URL. + * + * @param url A URL. + * @return Boolean answer. + */ + static bool is_url_valid(std::string const& url) noexcept; +}; /** @@ -203,6 +329,88 @@ class RemoteHandle { std::size_t _nbytes; public: + /** + * @brief Create a remote file handle from a URL. + * + * This function creates a RemoteHandle for reading data from various remote endpoints + * including HTTP/HTTPS servers, AWS S3 buckets, S3 presigned URLs, and WebHDFS. + * The endpoint type can be automatically detected from the URL or explicitly specified. + * + * @param url The URL of the remote file. Supported formats include: + * - S3 with credentials + * - S3 public (anonymous access) + * - S3 presigned URL + * - WebHDFS + * - HTTP/HTTPS + * @param remote_endpoint_type The type of remote endpoint. Default is RemoteEndpointType::AUTO, + * which automatically detects the endpoint type from the URL. Can be explicitly set to + * RemoteEndpointType::S3, RemoteEndpointType::S3_PUBLIC, RemoteEndpointType::S3_PRESIGNED_URL, + * RemoteEndpointType::WEBHDFS, or RemoteEndpointType::HTTP to force a specific endpoint type. + * @param allow_list Optional list of allowed endpoint types. If provided: + * - If remote_endpoint_type is RemoteEndpointType::AUTO, the types are tried in the exact order + * specified until a match is found. + * - If remote_endpoint_type is explicitly specified, it must be in this list; otherwise an + * exception is thrown. + * + * If not provided, defaults to all supported types in this order: RemoteEndpointType::S3, + * RemoteEndpointType::S3_PUBLIC, RemoteEndpointType::S3_PRESIGNED_URL, + * RemoteEndpointType::WEBHDFS, and RemoteEndpointType::HTTP. + * @param nbytes Optional file size in bytes. If not provided, the function sends an additional + * request to the server to query the file size. + * @return A RemoteHandle object that can be used to read data from the remote file.
+ * @exception std::runtime_error If: + * - If the URL is malformed or missing required components. + * - RemoteEndpointType::AUTO mode is used and the URL doesn't match any supported endpoint + * type. + * - The specified endpoint type is not in the `allow_list`. + * - The URL is invalid for the specified endpoint type. + * - Unable to connect to the remote server or determine file size (when nbytes not provided). + * + * Example: + * - Auto-detect endpoint type from URL + * @code{.cpp} + * auto handle = kvikio::RemoteHandle::open( + * "https://bucket.s3.amazonaws.com/object?X-Amz-Algorithm=AWS4-HMAC-SHA256" + * "&X-Amz-Credential=...&X-Amz-Signature=..." + * ); + * @endcode + * + * - Open S3 file with explicit endpoint type + * @code{.cpp} + * + * auto handle = kvikio::RemoteHandle::open( + * "https://my-bucket.s3.us-east-1.amazonaws.com/data.bin", + * kvikio::RemoteEndpointType::S3 + * ); + * @endcode + * + * - Restrict endpoint type candidates + * @code{.cpp} + * std::vector allow_list = { + * kvikio::RemoteEndpointType::HTTP, + * kvikio::RemoteEndpointType::S3_PRESIGNED_URL + * }; + * auto handle = kvikio::RemoteHandle::open( + * user_provided_url, + * kvikio::RemoteEndpointType::AUTO, + * allow_list + * ); + * @endcode + * + * - Provide known file size to skip HEAD request + * @code{.cpp} + * auto handle = kvikio::RemoteHandle::open( + * "https://example.com/large-file.bin", + * kvikio::RemoteEndpointType::HTTP, + * std::nullopt, + * 1024 * 1024 * 100 // 100 MB + * ); + * @endcode + */ + static RemoteHandle open(std::string url, + RemoteEndpointType remote_endpoint_type = RemoteEndpointType::AUTO, + std::optional> allow_list = std::nullopt, + std::optional nbytes = std::nullopt); + /** * @brief Create a new remote handle from an endpoint and a file size. * @@ -226,10 +434,18 @@ class RemoteHandle { RemoteHandle(RemoteHandle const&) = delete; RemoteHandle& operator=(RemoteHandle const&) = delete; + /** + * @brief Get the type of the remote file. + * + * @return The type of the remote file. + */ + [[nodiscard]] RemoteEndpointType remote_endpoint_type() const noexcept; + /** * @brief Get the file size. * - * Note, this is very fast, no communication needed. + * Note, the file size is retrieved at construction so this method is very fast, no communication + * needed. * * @return The number of bytes. */ diff --git a/cpp/src/compat_mode.cpp b/cpp/src/compat_mode.cpp index 78a96c66be..d8346253bf 100644 --- a/cpp/src/compat_mode.cpp +++ b/cpp/src/compat_mode.cpp @@ -15,15 +15,11 @@ */ #include -#include #include #include -#include #include -#include #include -#include namespace kvikio { @@ -50,99 +46,4 @@ CompatMode parse_compat_mode_str(std::string_view compat_mode_str) } // namespace detail -CompatMode CompatModeManager::infer_compat_mode_if_auto(CompatMode compat_mode) noexcept -{ - KVIKIO_NVTX_FUNC_RANGE(); - if (compat_mode == CompatMode::AUTO) { - return is_cufile_available() ? 
CompatMode::OFF : CompatMode::ON; - } - return compat_mode; -} - -bool CompatModeManager::is_compat_mode_preferred(CompatMode compat_mode) noexcept -{ - return compat_mode == CompatMode::ON || - (compat_mode == CompatMode::AUTO && - infer_compat_mode_if_auto(compat_mode) == CompatMode::ON); -} - -bool CompatModeManager::is_compat_mode_preferred() const noexcept -{ - return _is_compat_mode_preferred; -} - -bool CompatModeManager::is_compat_mode_preferred_for_async() const noexcept -{ - return _is_compat_mode_preferred_for_async; -} - -CompatMode CompatModeManager::compat_mode_requested() const noexcept -{ - return _compat_mode_requested; -} - -CompatModeManager::CompatModeManager(std::string const& file_path, - std::string const& flags, - mode_t mode, - CompatMode compat_mode_requested_v, - FileHandle* file_handle) -{ - KVIKIO_NVTX_FUNC_RANGE(); - KVIKIO_EXPECT(file_handle != nullptr, - "The compatibility mode manager does not have a proper owning file handle.", - std::invalid_argument); - - file_handle->_file_direct_off.open(file_path, flags, false, mode); - _is_compat_mode_preferred = is_compat_mode_preferred(compat_mode_requested_v); - - // Nothing to do in compatibility mode - if (_is_compat_mode_preferred) { return; } - - try { - file_handle->_file_direct_on.open(file_path, flags, true, mode); - } catch (...) { - // Try to open the file with the O_DIRECT flag. Fall back to compatibility mode, if it fails. - if (compat_mode_requested_v == CompatMode::AUTO) { - _is_compat_mode_preferred = true; - } else { // CompatMode::OFF - throw; - } - } - - if (_is_compat_mode_preferred) { return; } - - auto error_code = file_handle->_cufile_handle.register_handle(file_handle->_file_direct_on.fd()); - assert(error_code.has_value()); - - // For the AUTO mode, if the first cuFile API call fails, fall back to the compatibility - // mode. - if (compat_mode_requested_v == CompatMode::AUTO && error_code.value().err != CU_FILE_SUCCESS) { - _is_compat_mode_preferred = true; - } else { - CUFILE_TRY(error_code.value()); - } - - // Check cuFile async API - static bool const is_extra_symbol_available = is_stream_api_available(); - static bool const is_config_path_empty = config_path().empty(); - _is_compat_mode_preferred_for_async = - _is_compat_mode_preferred || !is_extra_symbol_available || is_config_path_empty; -} - -void CompatModeManager::validate_compat_mode_for_async() const -{ - KVIKIO_NVTX_FUNC_RANGE(); - if (!_is_compat_mode_preferred && _is_compat_mode_preferred_for_async && - _compat_mode_requested == CompatMode::OFF) { - std::string err_msg; - if (!is_stream_api_available()) { err_msg += "Missing the cuFile stream api."; } - - // When checking for availability, we also check if cuFile's config file exists. This is - // because even when the stream API is available, it doesn't work if no config file exists. - if (config_path().empty()) { err_msg += " Missing cuFile configuration file."; } - - KVIKIO_FAIL(err_msg, std::runtime_error); - } -} - } // namespace kvikio diff --git a/cpp/src/compat_mode_manager.cpp b/cpp/src/compat_mode_manager.cpp new file mode 100644 index 0000000000..569e9e73e7 --- /dev/null +++ b/cpp/src/compat_mode_manager.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace kvikio { + +bool CompatModeManager::is_compat_mode_preferred() const noexcept +{ + return _is_compat_mode_preferred; +} + +bool CompatModeManager::is_compat_mode_preferred_for_async() const noexcept +{ + return _is_compat_mode_preferred_for_async; +} + +CompatMode CompatModeManager::compat_mode_requested() const noexcept +{ + return _compat_mode_requested; +} + +CompatModeManager::CompatModeManager(std::string const& file_path, + std::string const& flags, + mode_t mode, + CompatMode compat_mode_requested_v, + FileHandle* file_handle) +{ + KVIKIO_NVTX_FUNC_RANGE(); + KVIKIO_EXPECT(file_handle != nullptr, + "The compatibility mode manager does not have a proper owning file handle.", + std::invalid_argument); + + _compat_mode_requested = compat_mode_requested_v; + file_handle->_file_direct_off.open(file_path, flags, false, mode); + _is_compat_mode_preferred = defaults::is_compat_mode_preferred(compat_mode_requested_v); + + // Nothing to do in compatibility mode + if (_is_compat_mode_preferred) { return; } + + try { + file_handle->_file_direct_on.open(file_path, flags, true, mode); + } catch (...) { + // Try to open the file with the O_DIRECT flag. Fall back to compatibility mode, if it fails. + if (compat_mode_requested_v == CompatMode::AUTO) { + _is_compat_mode_preferred = true; + } else { // CompatMode::OFF + throw; + } + } + + if (_is_compat_mode_preferred) { return; } + + auto error_code = file_handle->_cufile_handle.register_handle(file_handle->_file_direct_on.fd()); + assert(error_code.has_value()); + + // For the AUTO mode, if the first cuFile API call fails, fall back to the compatibility + // mode. + if (compat_mode_requested_v == CompatMode::AUTO && error_code.value().err != CU_FILE_SUCCESS) { + _is_compat_mode_preferred = true; + } else { + CUFILE_TRY(error_code.value()); + } + + // Check cuFile async API + static bool const is_extra_symbol_available = is_stream_api_available(); + static bool const is_config_path_empty = config_path().empty(); + _is_compat_mode_preferred_for_async = + _is_compat_mode_preferred || !is_extra_symbol_available || is_config_path_empty; +} + +void CompatModeManager::validate_compat_mode_for_async() const +{ + KVIKIO_NVTX_FUNC_RANGE(); + if (!_is_compat_mode_preferred && _is_compat_mode_preferred_for_async && + _compat_mode_requested == CompatMode::OFF) { + std::string err_msg; + if (!is_stream_api_available()) { err_msg += "Missing the cuFile stream api."; } + + // When checking for availability, we also check if cuFile's config file exists. This is + // because even when the stream API is available, it doesn't work if no config file exists. 
+ if (config_path().empty()) { err_msg += " Missing cuFile configuration file."; } + + KVIKIO_FAIL(err_msg, std::runtime_error); + } +} + +} // namespace kvikio diff --git a/cpp/src/detail/remote_handle.cpp b/cpp/src/detail/remote_handle.cpp new file mode 100644 index 0000000000..b023859ef8 --- /dev/null +++ b/cpp/src/detail/remote_handle.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace kvikio::detail { +std::size_t callback_get_string_response(char* data, + std::size_t size, + std::size_t num_bytes, + void* userdata) +{ + auto new_data_size = size * num_bytes; + auto* response = reinterpret_cast(userdata); + response->append(data, new_data_size); + return new_data_size; +} +} // namespace kvikio::detail diff --git a/cpp/src/detail/tls.cpp b/cpp/src/detail/tls.cpp new file mode 100644 index 0000000000..0dccb6b612 --- /dev/null +++ b/cpp/src/detail/tls.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace kvikio::detail { + +namespace { +/** + * @brief Search for a CA certificate path using environment variables + * + * @param env_vars Environment variable names to check in order + * @return Path string if found in any environment variable, std::nullopt otherwise + */ +std::optional find_ca_path_from_env_var(std::vector const& env_vars) +{ + for (auto const& env_var : env_vars) { + auto const* path = std::getenv(env_var.data()); + if (path != nullptr) { return path; } + } + + return std::nullopt; +} + +/** + * @brief Search for a CA certificate path in standard system locations + * + * @param system_paths file system paths to check in order + * @return First accessible path if found, std::nullopt otherwise + */ +std::optional find_ca_path_in_system_locations( + std::vector const& system_paths) +{ + for (auto const& path : system_paths) { + // Check whether the file/directory exists, and whether it grants read permission to the calling + // process's real UID and GID. If the path is a symbolic link, it is dereferenced. 
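+ // Note: access() returns 0 on success and -1 on failure, so any result other than -1 means + // the path exists and is readable by the calling process.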
+ auto const result = access(path.data(), R_OK); + + if (result != -1) { return path; } + } + + return std::nullopt; +} + +/** + * @brief Get CA certificate path from curl's compile-time defaults + * + * @param default_path Path provided by curl_version_info (may be nullptr) + * @return Path string if accessible, std::nullopt otherwise + */ +std::optional get_ca_path_from_curl_defaults(char const* default_path) +{ + if (default_path != nullptr && access(default_path, R_OK) != -1) { return default_path; } + + return std::nullopt; +} +} // namespace + +std::pair, std::optional> get_ca_paths() +{ + auto* version_info = curl_version_info(::CURLVERSION_NOW); + KVIKIO_EXPECT(version_info != nullptr, "Failed to get curl version info", std::runtime_error); + + std::optional ca_bundle_file; + std::optional ca_directory; + + // Priority 1: CA bundle file from environment variables + ca_bundle_file = find_ca_path_from_env_var({ + "CURL_CA_BUNDLE", // curl program + "SSL_CERT_FILE" // OpenSSL + }); + if (ca_bundle_file.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 2: CA directory from environment variables + ca_directory = find_ca_path_from_env_var({ + "SSL_CERT_DIR" // OpenSSL + }); + if (ca_directory.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 3: CA bundle file from system locations + ca_bundle_file = find_ca_path_in_system_locations( + {"/etc/ssl/certs/ca-certificates.crt", // Debian/Ubuntu, Arch, Alpine, Gentoo + "/etc/pki/tls/certs/ca-bundle.crt", // RHEL/CentOS/Rocky/AlmaLinux, Fedora + "/etc/ssl/ca-bundle.pem", // OpenSUSE/SLES + "/etc/pki/tls/cert.pem", // RHEL-based (symlink to ca-bundle.crt) + "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", // Fedora 28+, RHEL 8+ + + // Additional locations mentioned by libcurl: + // https://github.com/curl/curl/blob/master/CMakeLists.txt + "/usr/share/ssl/certs/ca-bundle.crt", + "/usr/local/share/certs/ca-root-nss.crt", + "/etc/ssl/cert.pem"}); + if (ca_bundle_file.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 4: CA directory from system locations + ca_directory = find_ca_path_in_system_locations({ + "/etc/ssl/certs", // Debian/Ubuntu, Arch, Alpine, OpenSUSE, Gentoo + "/etc/pki/tls/certs" // RHEL/CentOS/Rocky/AlmaLinux, Fedora + }); + if (ca_directory.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 5: CA bundle file from curl compile-time defaults + ca_bundle_file = get_ca_path_from_curl_defaults(version_info->cainfo); + if (ca_bundle_file.has_value()) { return {ca_bundle_file, ca_directory}; } + + // Priority 6: CA directory from curl compile-time defaults + ca_directory = get_ca_path_from_curl_defaults(version_info->capath); + if (ca_directory.has_value()) { return {ca_bundle_file, ca_directory}; } + + // At least one path must exist + KVIKIO_EXPECT(ca_bundle_file.has_value() || ca_directory.has_value(), + "Failed to find accessible CA certificates.", + std::runtime_error); + return {ca_bundle_file, ca_directory}; +} + +void set_up_ca_paths(CurlHandle& curl) +{ + static auto const [ca_bundle_file, ca_directory] = get_ca_paths(); + + if (ca_bundle_file.has_value()) { + curl.setopt(CURLOPT_CAINFO, ca_bundle_file->c_str()); + curl.setopt(CURLOPT_CAPATH, nullptr); + } else if (ca_directory.has_value()) { + curl.setopt(CURLOPT_CAINFO, nullptr); + curl.setopt(CURLOPT_CAPATH, ca_directory->c_str()); + } +} +} // namespace kvikio::detail diff --git a/cpp/src/detail/url.cpp b/cpp/src/detail/url.cpp new file mode 100644 index 0000000000..64f5b8fde9 --- 
/dev/null +++ b/cpp/src/detail/url.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +#define CHECK_CURL_URL_ERR(err_code) check_curl_url_err(err_code, __LINE__, __FILE__) + +namespace kvikio::detail { +namespace { +void check_curl_url_err(CURLUcode err_code, int line_number, char const* filename) +{ + if (err_code == CURLUcode::CURLUE_OK) { return; } + + std::stringstream ss; + ss << "KvikIO detects an URL error at: " << filename << ":" << line_number << ": "; + char const* msg = curl_url_strerror(err_code); + if (msg == nullptr) { + ss << "(no message)"; + } else { + ss << msg; + } + throw std::runtime_error(ss.str()); +} +} // namespace + +CurlUrlHandle::CurlUrlHandle() : _handle(curl_url()) +{ + KVIKIO_EXPECT(_handle != nullptr, + "Libcurl is unable to allocate a URL handle (likely out of memory)."); +} + +CurlUrlHandle::~CurlUrlHandle() noexcept +{ + if (_handle) { curl_url_cleanup(_handle); } +} + +CurlUrlHandle::CurlUrlHandle(CurlUrlHandle&& other) noexcept + : _handle{std::exchange(other._handle, nullptr)} +{ +} + +CurlUrlHandle& CurlUrlHandle::operator=(CurlUrlHandle&& other) noexcept +{ + if (this != &other) { + if (_handle) { curl_url_cleanup(_handle); } + _handle = std::exchange(other._handle, nullptr); + } + + return *this; +} + +CURLU* CurlUrlHandle::get() const { return _handle; } + +std::optional UrlParser::extract_component( + CurlUrlHandle const& handle, + CURLUPart part, + std::optional bitmask_component_flags, + std::optional allowed_err_code) +{ + if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; } + + char* value{}; + auto err_code = curl_url_get(handle.get(), part, &value, bitmask_component_flags.value()); + + if (err_code == CURLUcode::CURLUE_OK && value != nullptr) { + std::string result{value}; + curl_free(value); + return result; + } + + if (allowed_err_code.has_value() && allowed_err_code.value() == err_code) { return std::nullopt; } + + // Throws an exception and explains the reason. 
+ CHECK_CURL_URL_ERR(err_code); + return std::nullopt; +} + +std::optional UrlParser::extract_component( + std::string const& url, + CURLUPart part, + std::optional bitmask_url_flags, + std::optional bitmask_component_flags, + std::optional allowed_err_code) +{ + if (!bitmask_url_flags.has_value()) { bitmask_url_flags = 0U; } + if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; } + + CurlUrlHandle handle; + CHECK_CURL_URL_ERR( + curl_url_set(handle.get(), CURLUPART_URL, url.c_str(), bitmask_url_flags.value())); + + return extract_component(handle, part, bitmask_component_flags, allowed_err_code); +} + +UrlParser::UrlComponents UrlParser::parse(std::string const& url, + std::optional bitmask_url_flags, + std::optional bitmask_component_flags) +{ + if (!bitmask_url_flags.has_value()) { bitmask_url_flags = 0U; } + if (!bitmask_component_flags.has_value()) { bitmask_component_flags = 0U; } + + CurlUrlHandle handle; + CHECK_CURL_URL_ERR( + curl_url_set(handle.get(), CURLUPART_URL, url.c_str(), bitmask_url_flags.value())); + + UrlComponents components; + CURLUcode err_code{}; + + components.scheme = extract_component( + handle, CURLUPART_SCHEME, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_SCHEME); + components.host = extract_component( + handle, CURLUPART_HOST, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_HOST); + components.port = extract_component( + handle, CURLUPART_PORT, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_PORT); + components.path = extract_component(handle, CURLUPART_PATH, bitmask_component_flags.value()); + components.query = extract_component( + handle, CURLUPART_QUERY, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_QUERY); + components.fragment = extract_component( + handle, CURLUPART_FRAGMENT, bitmask_component_flags.value(), CURLUcode::CURLUE_NO_FRAGMENT); + + return components; +} +} // namespace kvikio::detail diff --git a/cpp/src/hdfs.cpp b/cpp/src/hdfs.cpp new file mode 100644 index 0000000000..2aa5f53278 --- /dev/null +++ b/cpp/src/hdfs.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +namespace kvikio { + +WebHdfsEndpoint::WebHdfsEndpoint(std::string url) : RemoteEndpoint{RemoteEndpointType::WEBHDFS} +{ + // todo: Use libcurl URL API for more secure and idiomatic parsing. + // Split the URL into two parts: one without query and one with. + std::regex static const pattern{R"(^([^?]+)\?([^#]*))"}; + // Regex meaning: + // ^: From the start of the line + // [^?]+: Matches non-question-mark characters one or more times. The question mark ushers in the + // URL query component. + // \?: Matches the question mark, which needs to be escaped. + // [^#]*: Matches the non-pound characters zero or more times. The pound sign ushers in the URL + // fragment component. It is very likely that this part does not exist. 
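+ // For example, with the illustrative URL + // "http://namenode:9870/webhdfs/v1/data/file.bin?user.name=alice", capture group 1 is + // "http://namenode:9870/webhdfs/v1/data/file.bin" and capture group 2 is "user.name=alice".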
+ std::smatch match_results; + bool found = std::regex_search(url, match_results, pattern); + // If the match is not found, the URL contains no query. + if (!found) { + _url = url; + return; + } + + _url = match_results[1].str(); + auto query = match_results[2].str(); + + { + // Extract user name if provided. In WebHDFS, user name is specified as the key=value pair in + // the query + std::regex static const pattern{R"(user.name=([^&]+))"}; + // Regex meaning: + // [^&]+: Matches the non-ampersand character one or more times. The ampersand delimits + // different parameters. + std::smatch match_results; + if (std::regex_search(query, match_results, pattern)) { _username = match_results[1].str(); } + } +} + +WebHdfsEndpoint::WebHdfsEndpoint(std::string host, + std::string port, + std::string file_path, + std::optional username) + : RemoteEndpoint{RemoteEndpointType::WEBHDFS}, _username{std::move(username)} +{ + std::stringstream ss; + ss << "http://" << host << ":" << port << "/webhdfs/v1" << file_path; + _url = ss.str(); +} + +std::string WebHdfsEndpoint::str() const { return _url; } + +void WebHdfsEndpoint::setopt(CurlHandle& curl) +{ + KVIKIO_NVTX_FUNC_RANGE(); + curl.setopt(CURLOPT_URL, _url.c_str()); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); +} + +std::size_t WebHdfsEndpoint::get_file_size() +{ + KVIKIO_NVTX_FUNC_RANGE(); + + std::stringstream ss; + ss << _url << "?"; + if (_username.has_value()) { ss << "user.name=" << _username.value() << "&"; } + ss << "op=GETFILESTATUS"; + + auto curl = create_curl_handle(); + curl.setopt(CURLOPT_URL, ss.str().c_str()); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + + std::string response; + curl.setopt(CURLOPT_WRITEDATA, static_cast(&response)); + curl.setopt(CURLOPT_WRITEFUNCTION, detail::callback_get_string_response); + + curl.perform(); + + long http_status_code{}; + curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code); + KVIKIO_EXPECT(http_status_code == 200, "HTTP response is not successful."); + + // The response is in JSON format. The file size is given by `"length":`. + std::regex static const pattern{R"("length"\s*:\s*(\d+)[^\d])"}; + // Regex meaning: + // \s*: Matches the space character zero or more times. + // \d+: Matches the digit one or more times. + // [^\d]: Matches a non-digit character. + std::smatch match_results; + bool found = std::regex_search(response, match_results, pattern); + KVIKIO_EXPECT( + found, "Regular expression search failed. Cannot extract file length from the JSON response."); + return std::stoull(match_results[1].str()); +} + +void WebHdfsEndpoint::setup_range_request(CurlHandle& curl, + std::size_t file_offset, + std::size_t size) +{ + // WebHDFS does not support CURLOPT_RANGE. The range is specified as query parameters in the URL. + KVIKIO_NVTX_FUNC_RANGE(); + std::stringstream ss; + ss << _url << "?"; + if (_username.has_value()) { ss << "user.name=" << _username.value() << "&"; } + ss << "op=OPEN&offset=" << file_offset << "&length=" << size; + curl.setopt(CURLOPT_URL, ss.str().c_str()); +} + +bool WebHdfsEndpoint::is_url_valid(std::string const& url) noexcept +{ + try { + std::regex static const pattern(R"(^https?://[^/]+:\d+/webhdfs/v1/.+$)", + std::regex_constants::icase); + return std::regex_match(url, pattern); + } catch (...) 
{ + return false; + } +} +} // namespace kvikio diff --git a/cpp/src/http_status_codes.cpp b/cpp/src/http_status_codes.cpp index 9b9cd3d793..da4a4cc379 100644 --- a/cpp/src/http_status_codes.cpp +++ b/cpp/src/http_status_codes.cpp @@ -31,13 +31,13 @@ std::vector parse_http_status_codes(std::string_view env_var_name, std::string const& status_codes) { // Ensure `status_codes` consists only of 3-digit integers separated by commas, allowing spaces. - std::regex const check_pattern(R"(^\s*\d{3}\s*(\s*,\s*\d{3}\s*)*$)"); + std::regex static const check_pattern(R"(^\s*\d{3}\s*(\s*,\s*\d{3}\s*)*$)"); KVIKIO_EXPECT(std::regex_match(status_codes, check_pattern), std::string{env_var_name} + ": invalid format, expected comma-separated integers.", std::invalid_argument); // Match every integer in `status_codes`. - std::regex const number_pattern(R"(\d+)"); + std::regex static const number_pattern(R"(\d+)"); // For each match, we push_back `std::stoi(match.str())` into `ret`. std::vector ret; diff --git a/cpp/src/mmap.cpp b/cpp/src/mmap.cpp index 11b0416c29..ee41a55d3d 100644 --- a/cpp/src/mmap.cpp +++ b/cpp/src/mmap.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -217,7 +218,9 @@ void read_impl(void* dst_buf, &attrs, attrs_idxs, static_cast(1) /* num_attrs */, +#if CUDA_VERSION < 13000 static_cast(nullptr), +#endif stream)); } else { // Fall back to the conventional H2D copy if the batch copy API is not available. @@ -273,17 +276,33 @@ MmapHandle::MmapHandle(std::string const& file_path, std::size_t initial_map_offset, mode_t mode, std::optional map_flags) - : _initial_map_offset(initial_map_offset), - _initialized{true}, - _file_wrapper(file_path, flags, false /* o_direct */, mode) + : _initial_map_offset(initial_map_offset), _initialized{true} { KVIKIO_NVTX_FUNC_RANGE(); - _file_size = get_file_size(_file_wrapper.fd()); + switch (flags[0]) { + case 'r': { + _map_protection = PROT_READ; + break; + } + case 'w': { + KVIKIO_FAIL("File-backed mmap write is not supported yet", std::invalid_argument); + } + default: { + KVIKIO_FAIL("Unknown file open flag", std::invalid_argument); + } + } + + _file_wrapper = FileWrapper(file_path, flags, false /* o_direct */, mode); + _file_size = get_file_size(_file_wrapper.fd()); if (_file_size == 0) { return; } - KVIKIO_EXPECT( - _initial_map_offset < _file_size, "Offset is past the end of file", std::out_of_range); + { + std::stringstream ss; + ss << "Offset must be less than the file size. initial_map_offset: " << _initial_map_offset + << ", file size: " << _file_size << "\n"; + KVIKIO_EXPECT(_initial_map_offset < _file_size, ss.str(), std::out_of_range); + } // An initial size of std::nullopt is a shorthand for "starting from _initial_map_offset to the // end of file". @@ -292,30 +311,20 @@ MmapHandle::MmapHandle(std::string const& file_path, KVIKIO_EXPECT( _initial_map_size > 0, "Mapped region should not be zero byte", std::invalid_argument); - KVIKIO_EXPECT(_initial_map_offset + _initial_map_size <= _file_size, - "Mapped region is past the end of file", - std::out_of_range); + + { + std::stringstream ss; + ss << "Mapped region is past the end of file. 
initial map offset: " << _initial_map_offset + << ", initial map size: " << _initial_map_size << ", file size: " << _file_size << "\n"; + KVIKIO_EXPECT( + _initial_map_offset + _initial_map_size <= _file_size, ss.str(), std::out_of_range); + } auto const page_size = get_page_size(); _map_offset = detail::align_down(_initial_map_offset, page_size); auto const offset_delta = _initial_map_offset - _map_offset; _map_size = _initial_map_size + offset_delta; - - switch (flags[0]) { - case 'r': { - _map_protection = PROT_READ; - break; - } - case 'w': { - KVIKIO_FAIL("File-backed mmap write is not supported yet", std::invalid_argument); - } - default: { - KVIKIO_FAIL("Unknown file open flag", std::invalid_argument); - } - } - - _map_flags = map_flags.has_value() ? map_flags.value() : MAP_PRIVATE; - + _map_flags = map_flags.has_value() ? map_flags.value() : MAP_PRIVATE; _map_addr = mmap(nullptr, _map_size, _map_protection, _map_flags, _file_wrapper.fd(), _map_offset); SYSCALL_CHECK(_map_addr, "Cannot create memory mapping", MAP_FAILED); @@ -401,6 +410,7 @@ std::size_t MmapHandle::read(void* buf, std::optional size, std::si KVIKIO_NVTX_FUNC_RANGE(); auto actual_size = validate_and_adjust_read_args(size, offset); + if (actual_size == 0) { return actual_size; } auto const is_dst_buf_host_mem = is_host_memory(buf); CUcontext ctx{}; @@ -420,6 +430,7 @@ std::future MmapHandle::pread(void* buf, KVIKIO_EXPECT(task_size <= defaults::bounce_buffer_size(), "bounce buffer size cannot be less than task size."); auto actual_size = validate_and_adjust_read_args(size, offset); + if (actual_size == 0) { return make_ready_future(actual_size); } auto& [nvtx_color, call_idx] = detail::get_next_color_and_call_idx(); KVIKIO_NVTX_FUNC_RANGE(actual_size, nvtx_color); @@ -454,14 +465,27 @@ std::future MmapHandle::pread(void* buf, std::size_t MmapHandle::validate_and_adjust_read_args(std::optional const& size, std::size_t offset) { - KVIKIO_EXPECT(!closed(), "Cannot read from a closed MmapHandle", std::runtime_error); - KVIKIO_EXPECT(offset < _file_size, "Offset is past the end of file", std::out_of_range); + { + std::stringstream ss; + KVIKIO_EXPECT(!closed(), "Cannot read from a closed MmapHandle", std::runtime_error); + + ss << "Offset is past the end of file. offset: " << offset << ", file size: " << _file_size + << "\n"; + KVIKIO_EXPECT(offset <= _file_size, ss.str(), std::out_of_range); + } + auto actual_size = size.has_value() ? size.value() : _file_size - offset; - KVIKIO_EXPECT(actual_size > 0, "Read size must be greater than 0", std::invalid_argument); - KVIKIO_EXPECT(offset >= _initial_map_offset && - offset + actual_size <= _initial_map_offset + _initial_map_size, - "Read is out of bound", - std::out_of_range); + + { + std::stringstream ss; + ss << "Read is out of bound. 
offset: " << offset << ", actual size to read: " << actual_size + << ", initial map offset: " << _initial_map_offset + << ", initial map size: " << _initial_map_size << "\n"; + KVIKIO_EXPECT(offset >= _initial_map_offset && + offset + actual_size <= _initial_map_offset + _initial_map_size, + ss.str(), + std::out_of_range); + } return actual_size; } diff --git a/cpp/src/nvtx.cpp b/cpp/src/nvtx.cpp index b9d7c3e146..0467e46e38 100644 --- a/cpp/src/nvtx.cpp +++ b/cpp/src/nvtx.cpp @@ -85,7 +85,8 @@ void NvtxManager::rename_current_thread(std::string_view new_name) noexcept attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE; attribs.identifier.ullValue = tid; attribs.messageType = NVTX_MESSAGE_TYPE_ASCII; - attribs.message.ascii = ss.str().c_str(); + auto st = ss.str(); + attribs.message.ascii = st.c_str(); nvtxResourceHandle_t handle = nvtxDomainResourceCreate(nvtx3::domain::get(), &attribs); #endif diff --git a/cpp/src/remote_handle.cpp b/cpp/src/remote_handle.cpp index 485e0739ac..33b481a74a 100644 --- a/cpp/src/remote_handle.cpp +++ b/cpp/src/remote_handle.cpp @@ -14,17 +14,22 @@ * limitations under the License. */ +#include #include #include #include #include #include +#include #include #include #include #include +#include +#include #include +#include #include #include #include @@ -133,21 +138,143 @@ class BounceBufferH2D { } }; +/** + * @brief Get the file size when the server permits a `HEAD` request to obtain the + * content-length header. + * + * This function works for the `HttpEndpoint` and `S3Endpoint`, but not for + * `S3EndpointWithPresignedUrl`, which does not allow `HEAD` requests. + * + * @param endpoint The remote endpoint + * @param url The URL of the remote file + * @return The file size + */ +std::size_t get_file_size_using_head_impl(RemoteEndpoint& endpoint, std::string const& url) +{ + auto curl = create_curl_handle(); + + endpoint.setopt(curl); + curl.setopt(CURLOPT_NOBODY, 1L); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + curl.perform(); + curl_off_t cl; + curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl); + KVIKIO_EXPECT( + cl >= 0, + "cannot get size of " + endpoint.str() + ", content-length not provided by the server", + std::runtime_error); + return static_cast(cl); +} + +/** + * @brief Set up the range request for libcurl. Use this method when the HTTP range request is + * supported. + * + * @param curl A curl handle + * @param file_offset File offset + * @param size Read size + */ +void setup_range_request_impl(CurlHandle& curl, std::size_t file_offset, std::size_t size) +{ + std::string const byte_range = + std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); + curl.setopt(CURLOPT_RANGE, byte_range.c_str()); +} + +/** + * @brief Whether the given URL is compatible with the S3 endpoint (including credential-based + * access and presigned URLs) that uses HTTP/HTTPS. + * + * @param url A URL. + * @return Boolean answer.
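+ * + * @note Matching is case-insensitive. For example, the illustrative URLs + * "https://my-bucket.s3.us-east-1.amazonaws.com/data.bin" (virtual-hosted style) and + * "https://s3.us-east-1.amazonaws.com/my-bucket/data.bin" (path style) both match.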
+ */ +bool url_has_aws_s3_http_format(std::string const& url) +{ + // Currently KvikIO supports the following AWS S3 HTTP URL formats: + static std::array const s3_patterns = { + // Virtual host style: https://.s3..amazonaws.com/ + std::regex(R"(https?://[^/]+\.s3\.[^.]+\.amazonaws\.com/.+$)", std::regex_constants::icase), + + // Path style (deprecated but still popular): + // https://s3..amazonaws.com// + std::regex(R"(https?://s3\.[^.]+\.amazonaws\.com/[^/]+/.+$)", std::regex_constants::icase), + + // Legacy global endpoint: no region code + std::regex(R"(https?://[^/]+\.s3\.amazonaws\.com/.+$)", std::regex_constants::icase), + std::regex(R"(https?://s3\.amazonaws\.com/[^/]+/.+$)", std::regex_constants::icase), + + // Legacy regional endpoint: s3 and region code are delimited by - instead of . + std::regex(R"(https?://[^/]+\.s3-[^.]+\.amazonaws\.com/.+$)", std::regex_constants::icase), + std::regex(R"(https?://s3-[^.]+\.amazonaws\.com/[^/]+/.+$)", std::regex_constants::icase)}; + + return std::any_of(s3_patterns.begin(), s3_patterns.end(), [&url = url](auto const& pattern) { + std::smatch match_result; + return std::regex_match(url, match_result, pattern); + }); +} + +char const* get_remote_endpoint_type_name(RemoteEndpointType remote_endpoint_type) +{ + switch (remote_endpoint_type) { + case RemoteEndpointType::S3: return "S3"; + case RemoteEndpointType::S3_PUBLIC: return "S3 public"; + case RemoteEndpointType::S3_PRESIGNED_URL: return "S3 with presigned URL"; + case RemoteEndpointType::WEBHDFS: return "WebHDFS"; + case RemoteEndpointType::HTTP: return "HTTP"; + case RemoteEndpointType::AUTO: return "AUTO"; + default: + // Unreachable + KVIKIO_FAIL("Unknown RemoteEndpointType: " + + std::to_string(static_cast(remote_endpoint_type))); + return "UNKNOWN"; + } +} } // namespace -HttpEndpoint::HttpEndpoint(std::string url) : _url{std::move(url)} {} +RemoteEndpoint::RemoteEndpoint(RemoteEndpointType remote_endpoint_type) + : _remote_endpoint_type{remote_endpoint_type} +{ +} + +RemoteEndpointType RemoteEndpoint::remote_endpoint_type() const noexcept +{ + return _remote_endpoint_type; +} + +HttpEndpoint::HttpEndpoint(std::string url) + : RemoteEndpoint{RemoteEndpointType::HTTP}, _url{std::move(url)} +{ +} std::string HttpEndpoint::str() const { return _url; } -void HttpEndpoint::setopt(CurlHandle& curl) +std::size_t HttpEndpoint::get_file_size() { KVIKIO_NVTX_FUNC_RANGE(); - curl.setopt(CURLOPT_URL, _url.c_str()); + return get_file_size_using_head_impl(*this, _url); +} + +void HttpEndpoint::setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) +{ + setup_range_request_impl(curl, file_offset, size); +} + +bool HttpEndpoint::is_url_valid(std::string const& url) noexcept +{ + try { + auto parsed_url = detail::UrlParser::parse(url); + if ((parsed_url.scheme != "http") && (parsed_url.scheme != "https")) { return false; }; + + // Check whether the file path exists, excluding the leading "/" + return parsed_url.path->length() > 1; + } catch (...) 
{ + return false; + } } +void HttpEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); } + void S3Endpoint::setopt(CurlHandle& curl) { - KVIKIO_NVTX_FUNC_RANGE(); curl.setopt(CURLOPT_URL, _url.c_str()); curl.setopt(CURLOPT_AWS_SIGV4, _aws_sigv4.c_str()); curl.setopt(CURLOPT_USERPWD, _aws_userpwd.c_str()); @@ -182,7 +309,9 @@ std::string S3Endpoint::url_from_bucket_and_object(std::string bucket_name, unwrap_or_default(std::move(aws_region), "AWS_DEFAULT_REGION", "S3: must provide `aws_region` if AWS_DEFAULT_REGION isn't set."); - // We default to the official AWS url scheme. + // "s3" is a non-standard URI scheme used by AWS CLI and AWS SDK, and cannot be identified by + // libcurl. A valid HTTP/HTTPS URL needs to be constructed for use in libcurl. Here the AWS + // virtual host style is used. ss << "https://" << bucket_name << ".s3." << region << ".amazonaws.com/" << object_name; } else { ss << endpoint_url << "/" << bucket_name << "/" << object_name; @@ -194,7 +323,7 @@ std::pair S3Endpoint::parse_s3_url(std::string const& { KVIKIO_NVTX_FUNC_RANGE(); // Regular expression to match s3:/// - std::regex const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase}; + std::regex static const pattern{R"(^s3://([^/]+)/(.+))", std::regex_constants::icase}; std::smatch matches; if (std::regex_match(s3_url, matches, pattern)) { return {matches[1].str(), matches[2].str()}; } KVIKIO_FAIL("Input string does not match the expected S3 URL format.", std::invalid_argument); @@ -206,11 +335,11 @@ S3Endpoint::S3Endpoint(std::string url, std::optional aws_access_key, std::optional aws_secret_access_key, std::optional aws_session_token) - : _url{std::move(url)} + : RemoteEndpoint{RemoteEndpointType::S3}, _url{std::move(url)} { KVIKIO_NVTX_FUNC_RANGE(); // Regular expression to match http[s]:// - std::regex pattern{R"(^https?://.*)", std::regex_constants::icase}; + std::regex static const pattern{R"(^https?://.*)", std::regex_constants::icase}; KVIKIO_EXPECT(std::regex_search(_url, pattern), "url must start with http:// or https://", std::invalid_argument); @@ -286,31 +415,289 @@ S3Endpoint::~S3Endpoint() { curl_slist_free_all(_curl_header_list); } std::string S3Endpoint::str() const { return _url; } -RemoteHandle::RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) - : _endpoint{std::move(endpoint)}, _nbytes{nbytes} +std::size_t S3Endpoint::get_file_size() { KVIKIO_NVTX_FUNC_RANGE(); + return get_file_size_using_head_impl(*this, _url); } -RemoteHandle::RemoteHandle(std::unique_ptr endpoint) +void S3Endpoint::setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) +{ + KVIKIO_NVTX_FUNC_RANGE(); + setup_range_request_impl(curl, file_offset, size); +} + +bool S3Endpoint::is_url_valid(std::string const& url) noexcept +{ + try { + auto parsed_url = detail::UrlParser::parse(url, CURLU_NON_SUPPORT_SCHEME); + + if (parsed_url.scheme == "s3") { + if (!parsed_url.host.has_value()) { return false; } + if (!parsed_url.path.has_value()) { return false; } + + // Check whether the S3 object key exists + std::regex static const pattern(R"(^/.+$)"); + return std::regex_search(parsed_url.path.value(), pattern); + } else if ((parsed_url.scheme == "http") || (parsed_url.scheme == "https")) { + return url_has_aws_s3_http_format(url) && !S3EndpointWithPresignedUrl::is_url_valid(url); + } + } catch (...) 
{ + } + return false; +} + +S3PublicEndpoint::S3PublicEndpoint(std::string url) + : RemoteEndpoint{RemoteEndpointType::S3_PUBLIC}, _url{std::move(url)} +{ +} + +void S3PublicEndpoint::setopt(CurlHandle& curl) { curl.setopt(CURLOPT_URL, _url.c_str()); } + +std::string S3PublicEndpoint::str() const { return _url; } + +std::size_t S3PublicEndpoint::get_file_size() +{ + KVIKIO_NVTX_FUNC_RANGE(); + return get_file_size_using_head_impl(*this, _url); +} + +void S3PublicEndpoint::setup_range_request(CurlHandle& curl, + std::size_t file_offset, + std::size_t size) +{ + KVIKIO_NVTX_FUNC_RANGE(); + setup_range_request_impl(curl, file_offset, size); +} + +bool S3PublicEndpoint::is_url_valid(std::string const& url) noexcept +{ + return S3Endpoint::is_url_valid(url); +} + +S3EndpointWithPresignedUrl::S3EndpointWithPresignedUrl(std::string presigned_url) + : RemoteEndpoint{RemoteEndpointType::S3_PRESIGNED_URL}, _url{std::move(presigned_url)} +{ +} + +void S3EndpointWithPresignedUrl::setopt(CurlHandle& curl) +{ + curl.setopt(CURLOPT_URL, _url.c_str()); +} + +std::string S3EndpointWithPresignedUrl::str() const { return _url; } + +namespace { +/** + * @brief Callback for the `CURLOPT_HEADERFUNCTION` parameter in libcurl + * + * The header callback is called once for each header and only complete header lines are passed on + * to the callback. The provided header line is not null-terminated. + * + * @param data Transfer buffer where new data is received + * @param size Curl internal implementation always sets this parameter to 1 + * @param num_bytes The size of new data received + * @param userdata User-defined data + * @return The number of bytes consumed by the callback + * @exception std::invalid_argument if the server does not know the file size, thereby using "*" as + * the filler text in the content-range header of the HTTP message. + */ +std::size_t callback_header(char* data, std::size_t size, std::size_t num_bytes, void* userdata) +{ + auto new_data_size = size * num_bytes; + auto* file_size = reinterpret_cast(userdata); + + // The header line is not null-terminated. This constructor overload ensures header_line.data() is + // null-terminated. + std::string const header_line{data, new_data_size}; + + // The content-range header has the format + // Content-Range: / + // Content-Range: /* + // Content-Range: */ + std::regex static const pattern(R"(Content-Range:[^/]+/(.*))", std::regex::icase); + std::smatch match_result; + bool found = std::regex_search(header_line, match_result, pattern); + if (found) { + // If the file size is unknown (represented by "*" in the content-range header), string-to-long + // conversion will throw an `std::invalid_argument` exception. The exception message from + // `std::stol` is usually too concise to be useful (being simply a string of "stol"), so a + // custom exception is used instead. + try { + *file_size = std::stol(match_result[1].str()); + } catch (...) { + KVIKIO_FAIL("File size information missing on the server side.", std::invalid_argument); + } + } + return new_data_size; +} +} // namespace + +std::size_t S3EndpointWithPresignedUrl::get_file_size() { + // Usually the `HEAD` request is used to obtain the content-length (file size). However, AWS S3 + // does not allow it for presigned URL. The workaround here is to send the `GET` request with + // 1-byte range, so that we can still obtain the header information at a negligible cost. 
Since + // the content-length header is now at a fixed value of 1, we instead extract the file size value + // from content-range. + KVIKIO_NVTX_FUNC_RANGE(); + auto curl = create_curl_handle(); + curl.setopt(CURLOPT_URL, _url.c_str()); + + // 1-byte range, specified in the format "-"" + std::string my_range{"0-0"}; + curl.setopt(CURLOPT_RANGE, my_range.c_str()); + + long file_size{}; + curl.setopt(CURLOPT_HEADERDATA, static_cast(&file_size)); + curl.setopt(CURLOPT_HEADERFUNCTION, callback_header); - endpoint->setopt(curl); - curl.setopt(CURLOPT_NOBODY, 1L); - curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); curl.perform(); - curl_off_t cl; - curl.getinfo(CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl); - KVIKIO_EXPECT( - cl >= 0, - "cannot get size of " + endpoint->str() + ", content-length not provided by the server", - std::runtime_error); - _nbytes = cl; + return file_size; +} + +void S3EndpointWithPresignedUrl::setup_range_request(CurlHandle& curl, + std::size_t file_offset, + std::size_t size) +{ + KVIKIO_NVTX_FUNC_RANGE(); + setup_range_request_impl(curl, file_offset, size); +} + +bool S3EndpointWithPresignedUrl::is_url_valid(std::string const& url) noexcept +{ + try { + if (!url_has_aws_s3_http_format(url)) { return false; } + + auto parsed_url = detail::UrlParser::parse(url); + if (!parsed_url.query.has_value()) { return false; } + + // Reference: https://docs.aws.amazon.com/AmazonS3/latest/API/sigv4-query-string-auth.html + return parsed_url.query->find("X-Amz-Algorithm") != std::string::npos && + parsed_url.query->find("X-Amz-Signature") != std::string::npos; + } catch (...) { + return false; + } +} + +RemoteHandle RemoteHandle::open(std::string url, + RemoteEndpointType remote_endpoint_type, + std::optional> allow_list, + std::optional nbytes) +{ + if (!allow_list.has_value()) { + allow_list = {RemoteEndpointType::S3, + RemoteEndpointType::S3_PUBLIC, + RemoteEndpointType::S3_PRESIGNED_URL, + RemoteEndpointType::WEBHDFS, + RemoteEndpointType::HTTP}; + } + + auto const scheme = + detail::UrlParser::extract_component(url, CURLUPART_SCHEME, CURLU_NON_SUPPORT_SCHEME); + KVIKIO_EXPECT(scheme.has_value(), "Missing scheme in URL."); + + // Helper to create endpoint based on type + auto create_endpoint = + [&url = url, &scheme = scheme](RemoteEndpointType type) -> std::unique_ptr { + switch (type) { + case RemoteEndpointType::S3: + if (!S3Endpoint::is_url_valid(url)) { return nullptr; } + if (scheme.value() == "s3") { + auto const [bucket, object] = S3Endpoint::parse_s3_url(url); + return std::make_unique(std::pair{bucket, object}); + } + return std::make_unique(url); + + case RemoteEndpointType::S3_PUBLIC: + if (!S3PublicEndpoint::is_url_valid(url)) { return nullptr; } + return std::make_unique(url); + + case RemoteEndpointType::S3_PRESIGNED_URL: + if (!S3EndpointWithPresignedUrl::is_url_valid(url)) { return nullptr; } + return std::make_unique(url); + + case RemoteEndpointType::WEBHDFS: + if (!WebHdfsEndpoint::is_url_valid(url)) { return nullptr; } + return std::make_unique(url); + + case RemoteEndpointType::HTTP: + if (!HttpEndpoint::is_url_valid(url)) { return nullptr; } + return std::make_unique(url); + + default: return nullptr; + } + }; + + std::unique_ptr endpoint; + + if (remote_endpoint_type == RemoteEndpointType::AUTO) { + // Try each allowed type in the order of allowlist + for (auto const& type : allow_list.value()) { + try { + endpoint = create_endpoint(type); + if (endpoint == nullptr) { continue; } + if (type == RemoteEndpointType::S3) { + // Check connectivity for the credential-based 
S3 endpoint, and throw an exception if + // failed + endpoint->get_file_size(); + } + } catch (...) { + // If the credential-based S3 endpoint cannot be used to access the URL, try using S3 public + // endpoint instead if it is in the allowlist + if (type == RemoteEndpointType::S3 && + std::find(allow_list->begin(), allow_list->end(), RemoteEndpointType::S3_PUBLIC) != + allow_list->end()) { + endpoint = std::make_unique(url); + } else { + throw; + } + } + + // At this point, a matching endpoint has been found + break; + } + KVIKIO_EXPECT(endpoint.get() != nullptr, "Unsupported endpoint URL.", std::runtime_error); + } else { + // Validate it is in the allow list + KVIKIO_EXPECT( + std::find(allow_list->begin(), allow_list->end(), remote_endpoint_type) != allow_list->end(), + std::string{get_remote_endpoint_type_name(remote_endpoint_type)} + + " is not in the allowlist.", + std::runtime_error); + + // Create the specific type + endpoint = create_endpoint(remote_endpoint_type); + KVIKIO_EXPECT(endpoint.get() != nullptr, + std::string{"Invalid URL for "} + + get_remote_endpoint_type_name(remote_endpoint_type) + " endpoint", + std::runtime_error); + } + + return nbytes.has_value() ? RemoteHandle(std::move(endpoint), nbytes.value()) + : RemoteHandle(std::move(endpoint)); +} + +RemoteHandle::RemoteHandle(std::unique_ptr endpoint, std::size_t nbytes) + : _endpoint{std::move(endpoint)}, _nbytes{nbytes} +{ + KVIKIO_NVTX_FUNC_RANGE(); +} + +RemoteHandle::RemoteHandle(std::unique_ptr endpoint) +{ + KVIKIO_NVTX_FUNC_RANGE(); + _nbytes = endpoint->get_file_size(); _endpoint = std::move(endpoint); } +RemoteEndpointType RemoteHandle::remote_endpoint_type() const noexcept +{ + return _endpoint->remote_endpoint_type(); +} + std::size_t RemoteHandle::nbytes() const noexcept { return _nbytes; } RemoteEndpoint const& RemoteHandle::endpoint() const noexcept { return *_endpoint; } @@ -397,10 +784,7 @@ std::size_t RemoteHandle::read(void* buf, std::size_t size, std::size_t file_off bool const is_host_mem = is_host_memory(buf); auto curl = create_curl_handle(); _endpoint->setopt(curl); - - std::string const byte_range = - std::to_string(file_offset) + "-" + std::to_string(file_offset + size - 1); - curl.setopt(CURLOPT_RANGE, byte_range.c_str()); + _endpoint->setup_range_request(curl, file_offset, size); if (is_host_mem) { curl.setopt(CURLOPT_WRITEFUNCTION, callback_host_memory); diff --git a/cpp/src/shim/cuda.cpp b/cpp/src/shim/cuda.cpp index 8053ea152a..ae69a495aa 100644 --- a/cpp/src/shim/cuda.cpp +++ b/cpp/src/shim/cuda.cpp @@ -56,7 +56,7 @@ cudaAPI::cudaAPI() CUDA_DRIVER_TRY(DriverGetVersion(&driver_version)); #if CUDA_VERSION >= 12080 - // cuMemcpyBatchAsync was introduced in CUDA 12.8. + // cuMemcpyBatchAsync was introduced in CUDA 12.8, and its parameters were changed in CUDA 13.0. try { decltype(cuMemcpyBatchAsync)* fp; get_symbol(fp, lib, KVIKIO_STRINGIFY(cuMemcpyBatchAsync)); diff --git a/cpp/src/shim/libcurl.cpp b/cpp/src/shim/libcurl.cpp index 613dad32f8..a78fb33d30 100644 --- a/cpp/src/shim/libcurl.cpp +++ b/cpp/src/shim/libcurl.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -112,6 +113,12 @@ CurlHandle::CurlHandle(LibCurl::UniqueHandlePtr handle, // Make requests time out after `value` seconds. setopt(CURLOPT_TIMEOUT, kvikio::defaults::http_timeout()); + + // Optionally enable verbose output if it's configured. 
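+  // Per docs/source/runtime_settings.rst, "true", "on", "yes" or "1" (case-insensitive) count as
+  // enabled; anything else leaves verbose output off.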
+ auto const verbose = getenv_or("KVIKIO_REMOTE_VERBOSE", false); + if (verbose) { setopt(CURLOPT_VERBOSE, 1L); } + + detail::set_up_ca_paths(*this); } CurlHandle::~CurlHandle() noexcept { LibCurl::instance().retain_handle(std::move(_handle)); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a6b8391928..afa7e8d97b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -50,16 +50,14 @@ function(kvikio_add_test) set_target_properties( ${_KVIKIO_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" - CXX_STANDARD 17 + CXX_STANDARD 20 CXX_STANDARD_REQUIRED ON - # For std:: support of __int128_t. Can be removed once using cuda::std - CXX_EXTENSIONS ON - CUDA_STANDARD 17 + CUDA_STANDARD 20 CUDA_STANDARD_REQUIRED ON ) target_link_libraries( ${_KVIKIO_NAME} PRIVATE kvikio::kvikio GTest::gmock GTest::gmock_main GTest::gtest - GTest::gtest_main CUDA::cudart + GTest::gtest_main CUDA::cudart $ ) rapids_test_add( @@ -78,6 +76,11 @@ kvikio_add_test(NAME ERROR_TEST SOURCES test_error.cpp) kvikio_add_test(NAME MMAP_TEST SOURCES test_mmap.cpp) -kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) +if(KvikIO_REMOTE_SUPPORT) + kvikio_add_test(NAME REMOTE_HANDLE_TEST SOURCES test_remote_handle.cpp utils/env.cpp) + kvikio_add_test(NAME HDFS_TEST SOURCES test_hdfs.cpp utils/hdfs_helper.cpp) + kvikio_add_test(NAME TLS_TEST SOURCES test_tls.cpp utils/env.cpp) + kvikio_add_test(NAME URL_TEST SOURCES test_url.cpp) +endif() rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/tests/libkvikio) diff --git a/cpp/tests/test_basic_io.cpp b/cpp/tests/test_basic_io.cpp index d72ba8841f..141fe386c3 100644 --- a/cpp/tests/test_basic_io.cpp +++ b/cpp/tests/test_basic_io.cpp @@ -27,15 +27,16 @@ class BasicIOTest : public testing::Test { TempDir tmp_dir{false}; _filepath = tmp_dir.path() / "test"; - _dev_a = std::move(DevBuffer::arange(100)); - _dev_b = std::move(DevBuffer::zero_like(_dev_a)); + _dev_a = std::move(DevBuffer::arange(100)); + _dev_b = std::move(DevBuffer::zero_like(_dev_a)); } void TearDown() override {} std::filesystem::path _filepath; - DevBuffer _dev_a; - DevBuffer _dev_b; + using value_type = std::int64_t; + DevBuffer _dev_a; + DevBuffer _dev_b; }; TEST_F(BasicIOTest, write_read) diff --git a/cpp/tests/test_defaults.cpp b/cpp/tests/test_defaults.cpp index 89bbe7399c..a74f38c86f 100644 --- a/cpp/tests/test_defaults.cpp +++ b/cpp/tests/test_defaults.cpp @@ -19,9 +19,9 @@ #include #include +#include #include -#include "kvikio/compat_mode.hpp" #include "utils/env.hpp" using ::testing::HasSubstr; diff --git a/cpp/tests/test_hdfs.cpp b/cpp/tests/test_hdfs.cpp new file mode 100644 index 0000000000..354c8c4aca --- /dev/null +++ b/cpp/tests/test_hdfs.cpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include + +#include "utils/hdfs_helper.hpp" +#include "utils/utils.hpp" + +// This test makes the following assumptions: +// - This KvikIO unit test is run on the "name node" of a WebHDFS server. +// - Port 9870 (default for WebHDFS) is being used to listen to the requests. +// - The environment variable `KVIKIO_USER` is specified prior to the test. It contains a valid user +// name that has been granted access to the HDFS. +// - The user has the proper permission to create a file under the `/tmp` directory on the HDFS. +// - If the unit test is run within a Docker. The following arguments are passed to the `docker run` +// command: +// - `--network host` +// - `--env KVIKIO_USER=` +// +// If any of these assumptions is not satisfied, this unit test is expected to be skipped +// gracefully. + +using value_type = double; + +namespace kvikio::test { +struct Config { + std::size_t num_elements{1024ull * 1024ull}; + std::vector host_buf; + kvikio::test::DevBuffer dev_buf; + std::string host; + std::string port; + std::string _username; + std::string remote_file_path; + bool file_created{false}; +}; +} // namespace kvikio::test + +class WebHdfsTest : public testing::Test { + protected: + static void SetUpTestSuite() + { + config.num_elements = 1024ull * 1024ull; + config.host_buf.resize(config.num_elements); + std::iota(config.host_buf.begin(), config.host_buf.end(), 0); + + config.dev_buf = kvikio::test::DevBuffer{config.host_buf}; + + config.host = "localhost"; + config.port = "9870"; + + config.remote_file_path = "/tmp/kvikio-test-webhdfs.bin"; + + auto res = std::getenv("KVIKIO_USER"); + if (res) { + config._username = res; + } else { + GTEST_SKIP() << "Environment variable KVIKIO_USER is not set for this test."; + } + + webhdfs_helper = + std::make_unique(config.host, config.port, config._username); + + if (!webhdfs_helper->can_connect()) { + GTEST_SKIP() << "Cannot connect to WebHDFS. Skipping all tests for this fixture."; + } + + std::span buffer{reinterpret_cast(config.host_buf.data()), + config.host_buf.size() * sizeof(value_type)}; + if (!webhdfs_helper->upload_data(buffer, config.remote_file_path)) { + GTEST_SKIP() + << "Failed to upload test data using WebHDFS. 
Skipping all tests for this fixture."; + }; + + config.file_created = true; + } + + static void TearDownTestSuite() + { + if (config.file_created) { webhdfs_helper->delete_data(config.remote_file_path); } + } + + static kvikio::test::Config config; + static std::unique_ptr webhdfs_helper; +}; + +kvikio::test::Config WebHdfsTest::config{}; +std::unique_ptr WebHdfsTest::webhdfs_helper{}; + +TEST_F(WebHdfsTest, constructor) +{ + auto do_test = [&](kvikio::RemoteHandle& remote_handle) { + kvikio::test::DevBuffer out_device_buf(config.num_elements); + auto read_size = remote_handle.read(out_device_buf.ptr, remote_handle.nbytes()); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = 0; i < config.num_elements; ++i) { + EXPECT_EQ(config.host_buf[i], out_host_buf[i]); + } + EXPECT_EQ(read_size, remote_handle.nbytes()); + }; + + std::stringstream ss; + ss << "http://" << config.host << ":" << config.port << "/webhdfs/v1" << config.remote_file_path + << "?user.name=" << config._username; + std::vector remote_handles; + + remote_handles.emplace_back(std::make_unique(ss.str())); + remote_handles.emplace_back(std::make_unique( + config.host, config.port, config.remote_file_path, config._username)); + + for (auto& remote_handle : remote_handles) { + do_test(remote_handle); + } +} + +TEST_F(WebHdfsTest, read_parallel) +{ + auto do_test = [&](std::string const& url, + std::size_t num_elements_to_skip, + std::size_t num_elements_to_read, + std::size_t task_size) { + kvikio::RemoteHandle remote_handle{std::make_unique(url)}; + auto const offset = num_elements_to_skip * sizeof(value_type); + auto const expected_read_size = num_elements_to_read * sizeof(value_type); + + // host + { + std::vector out_host_buf(num_elements_to_read, {}); + auto fut = remote_handle.pread(out_host_buf.data(), expected_read_size, offset, task_size); + auto const read_size = fut.get(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(config.host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + + // device + { + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + auto fut = remote_handle.pread(out_device_buf.ptr, expected_read_size, offset, task_size); + auto const read_size = fut.get(); + auto out_host_buf = out_device_buf.to_vector(); + for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { + EXPECT_EQ(config.host_buf[i], out_host_buf[i - num_elements_to_skip]); + } + EXPECT_EQ(read_size, expected_read_size); + } + }; + + std::stringstream ss; + ss << "http://" << config.host << ":" << config.port << "/webhdfs/v1" << config.remote_file_path + << "?user.name=" << config._username; + std::vector task_sizes{256, 1024, kvikio::defaults::task_size()}; + + for (const auto& task_size : task_sizes) { + for (const auto& num_elements_to_read : {10, 9999}) { + for (const auto& num_elements_to_skip : {0, 10, 100, 1000, 9999}) { + do_test(ss.str(), num_elements_to_skip, num_elements_to_read, task_size); + } + } + } +} diff --git a/cpp/tests/test_mmap.cpp b/cpp/tests/test_mmap.cpp index b52730ee74..1c6e3a54b3 100644 --- a/cpp/tests/test_mmap.cpp +++ b/cpp/tests/test_mmap.cpp @@ -40,7 +40,7 @@ class MmapTest : public testing::Test { _filepath = tmp_dir.path() / "test.bin"; std::size_t num_elements = 1024ull * 1024ull; _host_buf = CreateTempFile(_filepath, num_elements); - _dev_buf = kvikio::test::DevBuffer{_host_buf}; + _dev_buf = kvikio::test::DevBuffer{_host_buf}; _page_size = 
kvikio::get_page_size(); } @@ -62,16 +62,15 @@ class MmapTest : public testing::Test { std::size_t _file_size; std::size_t _page_size; std::vector _host_buf; - kvikio::test::DevBuffer _dev_buf; - using value_type = decltype(_host_buf)::value_type; + kvikio::test::DevBuffer _dev_buf; }; TEST_F(MmapTest, invalid_file_open_flag) { // Empty file open flag EXPECT_THAT( - [=] { + [&] { { kvikio::MmapHandle(_filepath, ""); } @@ -80,7 +79,7 @@ TEST_F(MmapTest, invalid_file_open_flag) // Invalid file open flag EXPECT_THAT( - [=] { + [&] { { kvikio::MmapHandle(_filepath, "z"); } @@ -91,7 +90,7 @@ TEST_F(MmapTest, invalid_file_open_flag) TEST_F(MmapTest, invalid_mmap_flag) { EXPECT_THAT( - [=] { + [&] { { int invalid_flag{-1}; kvikio::MmapHandle(_filepath, "r", std::nullopt, 0, kvikio::FileHandle::m644, invalid_flag); @@ -107,12 +106,13 @@ TEST_F(MmapTest, constructor_invalid_range) ThrowsMessage(HasSubstr("Mapped region is past the end of file"))); // init_file_offset is too large (by 1 char) - EXPECT_THAT([=] { kvikio::MmapHandle(_filepath, "r", std::nullopt, _file_size); }, - ThrowsMessage(HasSubstr("Offset is past the end of file"))); + EXPECT_THAT( + [&] { kvikio::MmapHandle(_filepath, "r", std::nullopt, _file_size); }, + ThrowsMessage(HasSubstr("Offset must be less than the file size"))); // init_size is 0 EXPECT_THAT( - [=] { kvikio::MmapHandle(_filepath, "r", 0); }, + [&] { kvikio::MmapHandle(_filepath, "r", 0); }, ThrowsMessage(HasSubstr("Mapped region should not be zero byte"))); } @@ -134,29 +134,36 @@ TEST_F(MmapTest, read_invalid_range) std::size_t const initial_file_offset{512}; std::vector out_host_buf(_file_size / sizeof(value_type), {}); - // file_offset is too large + // Right bound is too large EXPECT_THAT( [&] { kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); mmap_handle.read(out_host_buf.data(), initial_map_size, _file_size); }, - ThrowsMessage(HasSubstr("Offset is past the end of file"))); + ThrowsMessage(HasSubstr("Read is out of bound"))); - // file_offset is too small + // Left bound is too large EXPECT_THAT( [&] { kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); - mmap_handle.read(out_host_buf.data(), initial_map_size, initial_file_offset - 128); + mmap_handle.read(out_host_buf.data(), 0, initial_file_offset + initial_map_size + 1); }, ThrowsMessage(HasSubstr("Read is out of bound"))); - // size is 0 + EXPECT_THAT( + [&] { + kvikio::MmapHandle mmap_handle(_filepath, "r"); + mmap_handle.read(out_host_buf.data(), 0, _file_size + 1); + }, + ThrowsMessage(HasSubstr("Offset is past the end of file"))); + + // Left bound is too small EXPECT_THAT( [&] { kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); - mmap_handle.read(out_host_buf.data(), 0, initial_file_offset); + mmap_handle.read(out_host_buf.data(), initial_map_size, initial_file_offset - 128); }, - ThrowsMessage(HasSubstr("Read size must be greater than 0"))); + ThrowsMessage(HasSubstr("Read is out of bound"))); // size is too large EXPECT_THAT( @@ -167,6 +174,24 @@ TEST_F(MmapTest, read_invalid_range) ThrowsMessage(HasSubstr("Read is out of bound"))); } +TEST_F(MmapTest, read_valid_range) +{ + std::size_t const initial_map_size{1024}; + std::size_t const initial_file_offset{512}; + std::vector out_host_buf(_file_size / sizeof(value_type), {}); + + // size is 0 + EXPECT_NO_THROW({ + kvikio::MmapHandle mmap_handle(_filepath, "r", initial_map_size, initial_file_offset); + mmap_handle.read(out_host_buf.data(), 0, 
initial_file_offset + initial_map_size); + }); + + EXPECT_NO_THROW({ + kvikio::MmapHandle mmap_handle(_filepath, "r"); + mmap_handle.read(out_host_buf.data(), 0, _file_size); + }); +} + TEST_F(MmapTest, read_seq) { auto do_test = [&](std::size_t num_elements_to_skip, std::size_t num_elements_to_read) { @@ -186,7 +211,7 @@ TEST_F(MmapTest, read_seq) // device { - kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); auto const read_size = mmap_handle.read(out_device_buf.ptr, expected_read_size, offset); auto out_host_buf = out_device_buf.to_vector(); for (std::size_t i = num_elements_to_skip; i < num_elements_to_read; ++i) { @@ -224,7 +249,7 @@ TEST_F(MmapTest, read_parallel) // device { - kvikio::test::DevBuffer out_device_buf(num_elements_to_read); + kvikio::test::DevBuffer out_device_buf(num_elements_to_read); auto fut = mmap_handle.pread(out_device_buf.ptr, expected_read_size, offset); auto const read_size = fut.get(); auto out_host_buf = out_device_buf.to_vector(); @@ -274,7 +299,7 @@ TEST_F(MmapTest, read_with_default_arguments) // device { - kvikio::test::DevBuffer out_device_buf(num_elements); + kvikio::test::DevBuffer out_device_buf(num_elements); { auto const read_size = mmap_handle.read(out_device_buf.ptr); diff --git a/cpp/tests/test_remote_handle.cpp b/cpp/tests/test_remote_handle.cpp index 918479b0f0..f1e25ea34d 100644 --- a/cpp/tests/test_remote_handle.cpp +++ b/cpp/tests/test_remote_handle.cpp @@ -14,12 +14,109 @@ * limitations under the License. */ +#include +#include +#include +#include +#include + +#include #include +#include #include #include "utils/env.hpp" -TEST(RemoteHandleTest, s3_endpoint_constructor) +using ::testing::HasSubstr; +using ::testing::ThrowsMessage; + +class RemoteHandleTest : public testing::Test { + protected: + void SetUp() override + { + _sample_urls = { + // Endpoint type: S3 + {"s3://bucket-name/object-key-name", kvikio::RemoteEndpointType::S3_PUBLIC}, + {"s3://bucket-name/object-key-name-dir/object-key-name-file", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://bucket-name.s3.region-code.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://s3.region-code.amazonaws.com/bucket-name/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://bucket-name.s3.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://s3.amazonaws.com/bucket-name/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://bucket-name.s3-region-code.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + {"https://s3-region-code.amazonaws.com/bucket-name/object-key-name", + kvikio::RemoteEndpointType::S3_PUBLIC}, + + // Endpoint type: S3 presigned URL + {"https://bucket-name.s3.region-code.amazonaws.com/" + "object-key-name?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Signature=sig&X-Amz-Credential=cred&" + "X-Amz-SignedHeaders=host", + kvikio::RemoteEndpointType::S3_PRESIGNED_URL}, + + // Endpoint type: WebHDFS + {"https://host:1234/webhdfs/v1/data.bin", kvikio::RemoteEndpointType::WEBHDFS}, + }; + } + + void TearDown() override {} + + void test_helper(kvikio::RemoteEndpointType expected_endpoint_type, + std::function url_validity_checker) + { + for (auto const& [url, endpoint_type] : _sample_urls) { + if (endpoint_type == expected_endpoint_type) { + // Given that the URL is the expected endpoint type + + // Test URL validity checker + EXPECT_TRUE(url_validity_checker(url)); + + // Test unified 
interface + { + // Here we pass the 1-byte argument to RemoteHandle::open. For all endpoints except + // kvikio::RemoteEndpointType::S3, this prevents the endpoint constructor from querying + // the file size and sending requests to the server, thus allowing us to use dummy URLs + // for testing purpose. + // For kvikio::RemoteEndpointType::S3, RemoteHandle::open sends HEAD request as a + // connectivity check and will fail on the syntactically valid dummy URL. The + // kvikio::RemoteEndpointType::S3_PUBLIC will then be used as the endpoint. + auto remote_handle = + kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); + EXPECT_EQ(remote_handle.remote_endpoint_type(), expected_endpoint_type); + } + + // Test explicit endpoint type specification + { + EXPECT_NO_THROW({ + auto remote_handle = + kvikio::RemoteHandle::open(url, expected_endpoint_type, std::nullopt, 1); + }); + } + } else { + // Given that the URL is NOT the expected endpoint type + + // Test URL validity checker + EXPECT_FALSE(url_validity_checker(url)); + + // Test explicit endpoint type specification + { + EXPECT_ANY_THROW({ + auto remote_handle = + kvikio::RemoteHandle::open(url, expected_endpoint_type, std::nullopt, 1); + }); + } + } + } + } + + std::vector> _sample_urls; +}; + +TEST_F(RemoteHandleTest, s3_endpoint_constructor) { kvikio::test::EnvVarContext env_var_ctx{{"AWS_DEFAULT_REGION", "my_aws_default_region"}, {"AWS_ACCESS_KEY_ID", "my_aws_access_key_id"}, @@ -37,3 +134,158 @@ TEST(RemoteHandleTest, s3_endpoint_constructor) EXPECT_EQ(s1.str(), s2.str()); } + +TEST_F(RemoteHandleTest, test_http_url) +{ + // Invalid URLs + { + std::vector const invalid_urls{// Incorrect scheme + "s3://example.com", + "hdfs://example.com", + // Missing file path + "http://example.com"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::HttpEndpoint::is_url_valid(invalid_url)); + } + } +} + +TEST_F(RemoteHandleTest, test_s3_url) +{ + kvikio::test::EnvVarContext env_var_ctx{{"AWS_DEFAULT_REGION", "my_aws_default_region"}, + {"AWS_ACCESS_KEY_ID", "my_aws_access_key_id"}, + {"AWS_SECRET_ACCESS_KEY", "my_aws_secrete_access_key"}}; + + { + test_helper(kvikio::RemoteEndpointType::S3_PUBLIC, kvikio::S3Endpoint::is_url_valid); + } + + // Invalid URLs + { + std::vector const invalid_urls{ + // Lack object-name + "s3://bucket-name", + "https://bucket-name.s3.region-code.amazonaws.com", + // Presigned URL + "https://bucket-name.s3.region-code.amazonaws.com/" + "object-key-name?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Signature=sig&X-Amz-Credential=" + "cred&" + "X-Amz-SignedHeaders=host"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::S3Endpoint::is_url_valid(invalid_url)); + } + } + + // S3_PUBLIC is not in the allowlist. So when the connectivity check fails on the dummy URL, + // KvikIO cannot fall back to S3_PUBLIC. 
+ { + EXPECT_ANY_THROW({ + kvikio::RemoteHandle::open( + "s3://bucket-name/object-key-name", + kvikio::RemoteEndpointType::AUTO, + std::vector{kvikio::RemoteEndpointType::S3, + kvikio::RemoteEndpointType::HTTP}, + 1); + }); + } +} + +TEST_F(RemoteHandleTest, test_s3_url_with_presigned_url) +{ + { + test_helper(kvikio::RemoteEndpointType::S3_PRESIGNED_URL, + kvikio::S3EndpointWithPresignedUrl::is_url_valid); + } + + // Invalid URLs + { + std::vector const invalid_urls{ + // Presigned URL should not use S3 scheme + "s3://bucket-name/object-key-name", + + // Completely missing query + "https://bucket-name.s3.region-code.amazonaws.com/object-key-name", + + // Missing key parameters ("X-Amz-..."") in query + "https://bucket-name.s3.region-code.amazonaws.com/object-key-name?k0=v0&k1=v2"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::S3EndpointWithPresignedUrl::is_url_valid(invalid_url)); + } + } +} + +TEST_F(RemoteHandleTest, test_webhdfs_url) +{ + { + test_helper(kvikio::RemoteEndpointType::WEBHDFS, kvikio::WebHdfsEndpoint::is_url_valid); + } + + // Invalid URLs + { + std::vector const invalid_urls{// Missing file + "https://host:1234/webhdfs/v1", + "https://host:1234/webhdfs/v1/", + + // Missing WebHDFS identifier + "https://host:1234/data.bin", + + // Missing port number + "https://host/webhdfs/v1/data.bin"}; + for (auto const& invalid_url : invalid_urls) { + EXPECT_FALSE(kvikio::WebHdfsEndpoint::is_url_valid(invalid_url)); + } + } +} + +TEST_F(RemoteHandleTest, test_open) +{ + // Missing scheme + { + std::vector const urls{ + "example.com/path", "example.com:8080/path", "//example.com/path", "://example.com/path"}; + for (auto const& url : urls) { + EXPECT_THROW( + { kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); }, + std::runtime_error); + } + } + + // Unsupported type + { + std::string const url{"unsupported://example.com/path"}; + EXPECT_THAT( + [&] { kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::AUTO, std::nullopt, 1); }, + ThrowsMessage(HasSubstr("Unsupported endpoint URL"))); + } + + // Specified URL not in the allowlist + { + std::string const url{"https://host:1234/webhdfs/v1/data.bin"}; + std::vector> const wrong_allowlists{ + {}, + {kvikio::RemoteEndpointType::S3}, + }; + for (auto const& wrong_allowlist : wrong_allowlists) { + EXPECT_THAT( + [&] { + kvikio::RemoteHandle::open(url, kvikio::RemoteEndpointType::WEBHDFS, wrong_allowlist, 1); + }, + ThrowsMessage(HasSubstr("is not in the allowlist"))); + } + } + + // Invalid URLs + { + std::vector> const invalid_urls{ + {"s3://bucket-name", kvikio::RemoteEndpointType::S3}, + {"https://bucket-name.s3.region-code.amazonaws.com/object-key-name", + kvikio::RemoteEndpointType::S3_PRESIGNED_URL}, + {"https://host:1234/webhdfs/v1", kvikio::RemoteEndpointType::WEBHDFS}, + {"http://example.com", kvikio::RemoteEndpointType::HTTP}, + }; + for (auto const& [invalid_url, endpoint_type] : invalid_urls) { + EXPECT_THAT([&] { kvikio::RemoteHandle::open(invalid_url, endpoint_type, std::nullopt, 1); }, + ThrowsMessage(HasSubstr("Invalid URL"))); + } + } +} diff --git a/cpp/tests/test_tls.cpp b/cpp/tests/test_tls.cpp new file mode 100644 index 0000000000..4b04f10296 --- /dev/null +++ b/cpp/tests/test_tls.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include "utils/env.hpp" + +TEST(TlsTest, get_ca_paths) +{ + std::string const expected_ca_bundle_path{"ca_bundle_path"}; + std::string const expected_ca_directory{"ca_directory"}; + { + // Env var CURL_CA_BUNDLE has the highest priority. Both SSL_CERT_FILE and SSL_CERT_DIR shall be + // skipped + kvikio::test::EnvVarContext env_var_ctx{{"CURL_CA_BUNDLE", expected_ca_bundle_path}, + {"SSL_CERT_FILE", "another_ca_bundle_path"}, + {"SSL_CERT_DIR", expected_ca_directory}}; + auto const& [ca_bundle_file, ca_directory] = kvikio::detail::get_ca_paths(); + + EXPECT_EQ(ca_bundle_file, expected_ca_bundle_path); + EXPECT_EQ(ca_directory, std::nullopt); + } + + { + // Env var CURL_CA_BUNDLE and SSL_CERT_FILE are not specified, SSL_CERT_DIR shall be used + kvikio::test::EnvVarContext env_var_ctx{{"SSL_CERT_DIR", expected_ca_directory}}; + auto const& [ca_bundle_file, ca_directory] = kvikio::detail::get_ca_paths(); + + EXPECT_EQ(ca_bundle_file, std::nullopt); + EXPECT_EQ(ca_directory, expected_ca_directory); + } +} diff --git a/cpp/tests/test_url.cpp b/cpp/tests/test_url.cpp new file mode 100644 index 0000000000..ce419ed5a5 --- /dev/null +++ b/cpp/tests/test_url.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +using ::testing::HasSubstr; +using ::testing::ThrowsMessage; + +TEST(UrlTest, parse_scheme) +{ + { + std::vector invalid_scheme_urls{ + "invalid_scheme://host", + // The S3 scheme is not supported by libcurl. Without the CURLU_NON_SUPPORT_SCHEME flag, an + // exception is expected. + "s3://host"}; + + for (auto const& invalid_scheme_url : invalid_scheme_urls) { + EXPECT_THAT([&] { kvikio::detail::UrlParser::parse(invalid_scheme_url); }, + ThrowsMessage(HasSubstr("KvikIO detects an URL error"))); + } + } + + // With the CURLU_NON_SUPPORT_SCHEME flag, the S3 scheme is now accepted. 
+ { + std::vector schemes{"s3", "S3"}; + for (auto const& scheme : schemes) { + auto parsed_url = + kvikio::detail::UrlParser::parse(scheme + "://host", CURLU_NON_SUPPORT_SCHEME); + EXPECT_EQ(parsed_url.scheme.value(), "s3"); // Lowercase due to CURL's normalization + } + } +} + +TEST(UrlTest, parse_host) +{ + std::vector invalid_host_urls{"http://host with spaces.com", + "http://host[brackets].com", + "http://host{braces}.com", + "http://host.com", + R"(http://host\backslash.com)", + "http://host^caret.com", + "http://host`backtick.com"}; + for (auto const& invalid_host_url : invalid_host_urls) { + EXPECT_THROW({ kvikio::detail::UrlParser::parse(invalid_host_url); }, std::runtime_error); + } +} diff --git a/cpp/tests/utils/hdfs_helper.cpp b/cpp/tests/utils/hdfs_helper.cpp new file mode 100644 index 0000000000..2bcbc7fed2 --- /dev/null +++ b/cpp/tests/utils/hdfs_helper.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hdfs_helper.hpp" + +#include +#include +#include +#include +#include + +#include +#include + +namespace kvikio::test { + +namespace { + +/** + * @brief Helper struct that wraps a buffer view and tracks how many data have been processed via an + * offset value. + */ +struct tracked_buffer_t { + std::span buffer; + std::size_t offset; +}; + +/** + * @brief Callback for `CURLOPT_READFUNCTION` to upload data. + * + * @param data + * @param size Curl internal implementation always sets this parameter to 1 + * @param num_bytes_max The maximum number of bytes that can be uploaded + * @param userdata Must be cast from `tracked_buffer_t*` + * @return The number of bytes that have been copied to the transfer buffer. + */ +std::size_t callback_upload(char* data, std::size_t size, std::size_t num_bytes_max, void* userdata) +{ + auto new_data_size_max = size * num_bytes_max; + auto* tracked_buffer = reinterpret_cast(userdata); + + // All data have been uploaded. Nothing more to do. 
+ if (tracked_buffer->offset >= tracked_buffer->buffer.size()) { return 0; } + + auto copy_size = + std::min(new_data_size_max, tracked_buffer->buffer.size() - tracked_buffer->offset); + std::memcpy(data, tracked_buffer->buffer.data() + tracked_buffer->offset, copy_size); + tracked_buffer->offset += copy_size; + + return copy_size; +} +} // namespace + +WebHdfsTestHelper::WebHdfsTestHelper(std::string const& host, + std::string const& port, + std::string const& username) + : _host{host}, _port{port}, _username{username} +{ + std::stringstream ss; + ss << "http://" << host << ":" << port << "/webhdfs/v1"; + _url_before_path = ss.str(); +} + +bool WebHdfsTestHelper::can_connect() noexcept +{ + try { + auto curl = create_curl_handle(); + + std::stringstream ss; + ss << _url_before_path << "/?user.name=" << _username << "&op=GETHOMEDIRECTORY"; + + curl.setopt(CURLOPT_URL, ss.str().c_str()); + + std::string response{}; + curl.setopt(CURLOPT_WRITEDATA, &response); + curl.setopt(CURLOPT_WRITEFUNCTION, kvikio::detail::callback_get_string_response); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + curl.perform(); + return true; + } catch (std::exception const& e) { + std::cout << e.what() << "\n"; + return false; + } +} + +bool WebHdfsTestHelper::upload_data(std::span buffer, + std::string const& remote_file_path) noexcept +{ + try { + // Official reference on how to create and write to a file: + // https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html#Create_and_Write_to_a_File + std::string redirect_url; + + { + // Step 1: Submit a HTTP PUT request without automatically following redirects and without + // sending the file data. + auto curl = create_curl_handle(); + + std::stringstream ss; + ss << _url_before_path << remote_file_path << "?user.name=" << _username << "&op=CREATE"; + std::string redirect_data_node_location{}; + + curl.setopt(CURLOPT_URL, ss.str().c_str()); + curl.setopt(CURLOPT_FOLLOWLOCATION, 0L); + curl.setopt(CURLOPT_CUSTOMREQUEST, "PUT"); + + std::string response{}; + curl.setopt(CURLOPT_HEADERDATA, &response); + curl.setopt(CURLOPT_HEADERFUNCTION, kvikio::detail::callback_get_string_response); + + curl.perform(); + + long http_status_code{}; + curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code); + KVIKIO_EXPECT(http_status_code == 307, "Redirection from name node to data node failed."); + + std::regex const pattern{R"(Location:\s*(.*)\s*)"}; + std::smatch match_results; + bool found = std::regex_search(response, match_results, pattern); + KVIKIO_EXPECT(found, + "Regular expression search failed. Cannot extract redirect location from the " + "JSON response."); + redirect_url = match_results[1].str(); + } + + { + // Step 2: Submit another HTTP PUT request using the URL in the Location header with the file + // data to be written. 
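+      // For illustration only (host names, port numbers and paths below are hypothetical), the
+      // two-step exchange verified by the status-code checks in this function looks roughly like:
+      //   PUT http://namenode:9870/webhdfs/v1/tmp/file.bin?user.name=alice&op=CREATE
+      //     -> 307 Temporary Redirect, "Location: http://datanode:9864/webhdfs/v1/tmp/file.bin?..."
+      //   PUT <redirect URL> with the file contents as the request body
+      //     -> 201 Created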
+ auto curl = create_curl_handle(); + curl.setopt(CURLOPT_URL, redirect_url.c_str()); + curl.setopt(CURLOPT_UPLOAD, 1L); + + tracked_buffer_t tracked_buffer{.buffer = buffer, .offset = 0}; + curl.setopt(CURLOPT_READDATA, &tracked_buffer); + curl.setopt(CURLOPT_READFUNCTION, callback_upload); + curl.setopt(CURLOPT_INFILESIZE_LARGE, static_cast(buffer.size())); + + curl.perform(); + + long http_status_code{}; + curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code); + KVIKIO_EXPECT(http_status_code == 201, "File creation failed."); + } + + return true; + } catch (std::exception const& e) { + std::cout << e.what() << "\n"; + return false; + } +} + +bool WebHdfsTestHelper::delete_data(std::string const& remote_file_path) noexcept +{ + try { + // Official reference on how to delete a file: + // https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html#Delete_a_File.2FDirectory + auto curl = create_curl_handle(); + + std::stringstream ss; + ss << _url_before_path << remote_file_path << "?user.name=" << _username << "&op=DELETE"; + std::string const url = ss.str(); + std::string redirect_data_node_location{}; + + curl.setopt(CURLOPT_URL, url.c_str()); + curl.setopt(CURLOPT_FOLLOWLOCATION, 1L); + curl.setopt(CURLOPT_CUSTOMREQUEST, "DELETE"); + + std::string response{}; + curl.setopt(CURLOPT_HEADERDATA, &response); + curl.setopt(CURLOPT_HEADERFUNCTION, kvikio::detail::callback_get_string_response); + + curl.perform(); + + long http_status_code{}; + curl.getinfo(CURLINFO_RESPONSE_CODE, &http_status_code); + KVIKIO_EXPECT(http_status_code == 200, "File deletion failed."); + + return true; + } catch (std::exception const& e) { + std::cout << e.what() << "\n"; + return false; + } +} +} // namespace kvikio::test diff --git a/cpp/tests/utils/hdfs_helper.hpp b/cpp/tests/utils/hdfs_helper.hpp new file mode 100644 index 0000000000..7cc5da377c --- /dev/null +++ b/cpp/tests/utils/hdfs_helper.hpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace kvikio::test { + +/** + * @brief Helper class to create and upload a file on WebHDFS so as to enable read testing. + */ +class WebHdfsTestHelper { + private: + std::string _host; + std::string _port; + std::string _username; + std::string _url_before_path; + + public: + WebHdfsTestHelper(std::string const& host, std::string const& port, std::string const& username); + + /** + * @brief Whether KvikIO can connect to the WebHDFS server. + * + * @return A boolean answer. + */ + bool can_connect() noexcept; + + /** + * @brief Copy the data from a host buffer to a remote file on the WebHDFS server. + * + * @param buffer View to the host buffer whose data will be copied to the WebHDFS server + * @param remote_file_path Remote file path + * @return True if the file has been successfully uploaded; false otherwise. 
+ */ + bool upload_data(std::span buffer, std::string const& remote_file_path) noexcept; + + /** + * @brief Delete a remote file on the WebHDFS server. + * + * @param remote_file_path Remote file path + * @return True if the file has been successfully deleted; false otherwise. + */ + bool delete_data(std::string const& remote_file_path) noexcept; +}; + +} // namespace kvikio::test diff --git a/cpp/tests/utils/utils.hpp b/cpp/tests/utils/utils.hpp index bb50d07c13..7d733621a7 100644 --- a/cpp/tests/utils/utils.hpp +++ b/cpp/tests/utils/utils.hpp @@ -108,6 +108,7 @@ class TempDir { /** * @brief Help class for creating and comparing buffers. */ +template class DevBuffer { public: std::size_t nelem; @@ -116,12 +117,12 @@ class DevBuffer { DevBuffer() : nelem{0}, nbytes{0} {}; - DevBuffer(std::size_t nelem) : nelem{nelem}, nbytes{nelem * sizeof(std::int64_t)} + DevBuffer(std::size_t nelem) : nelem{nelem}, nbytes{nelem * sizeof(T)} { KVIKIO_CHECK_CUDA(cudaMalloc(&ptr, nbytes)); KVIKIO_CHECK_CUDA(cudaMemset(ptr, 0, nbytes)); } - DevBuffer(std::vector const& host_buffer) : DevBuffer{host_buffer.size()} + DevBuffer(std::vector const& host_buffer) : DevBuffer{host_buffer.size()} { KVIKIO_CHECK_CUDA(cudaMemcpy(ptr, host_buffer.data(), nbytes, cudaMemcpyHostToDevice)); } @@ -143,9 +144,9 @@ class DevBuffer { ~DevBuffer() noexcept { cudaFree(ptr); } - [[nodiscard]] static DevBuffer arange(std::size_t nelem, std::int64_t start = 0) + [[nodiscard]] static DevBuffer arange(std::size_t nelem, T start = 0) { - std::vector host_buffer(nelem); + std::vector host_buffer(nelem); std::iota(host_buffer.begin(), host_buffer.end(), start); return DevBuffer{host_buffer}; } @@ -157,9 +158,9 @@ class DevBuffer { return ret; } - [[nodiscard]] std::vector to_vector() const + [[nodiscard]] std::vector to_vector() const { - std::vector ret(nelem); + std::vector ret(nelem); KVIKIO_CHECK_CUDA(cudaMemcpy(ret.data(), this->ptr, nbytes, cudaMemcpyDeviceToHost)); return ret; } @@ -177,7 +178,8 @@ class DevBuffer { /** * @brief Check that two buffers are equal */ -inline void expect_equal(DevBuffer const& a, DevBuffer const& b) +template +inline void expect_equal(DevBuffer const& a, DevBuffer const& b) { EXPECT_EQ(a.nbytes, b.nbytes); auto a_vec = a.to_vector(); diff --git a/dependencies.yaml b/dependencies.yaml index d753e75ded..ce1b6f1b8d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["12.9"] + cuda: ["12.9", "13.0"] arch: [aarch64, x86_64] includes: - build-universal @@ -13,7 +13,6 @@ files: - cuda - cuda_version - depends_on_cupy - - depends_on_libnvcomp - docs - py_version - rapids_build_skbuild @@ -66,10 +65,6 @@ files: table: project includes: - depends_on_cupy - # TODO: restore runtime dependency when we no longer vendor nvcomp - # (when nvcomp ships C++ wheels) - # https://github.com/rapidsai/build-planning/issues/171 - # - depends_on_libnvcomp - depends_on_libkvikio - run py_rapids_build_libkvikio: @@ -91,6 +86,14 @@ files: - build-cpp - build-py-wrapper - build-use-libkvikio-wheel + py_optional_zarr: + output: pyproject + pyproject_dir: python/kvikio + extras: + table: project.optional-dependencies + key: zarr + includes: + - zarr py_optional_test: output: pyproject pyproject_dir: python/kvikio @@ -99,6 +102,7 @@ files: key: test includes: - test_python + - zarr test_java: output: none includes: @@ -124,6 +128,7 @@ dependencies: - output_types: conda packages: - c-compiler + - cuda-nvcc - cxx-compiler - libcurl>=8.5.0,<9.0a0 specific: @@ -131,27 
+136,19 @@ dependencies: matrices: - matrix: arch: x86_64 - cuda: "12.*" packages: - - gcc_linux-64=13.* + - gcc_linux-64=14.* - sysroot_linux-64=2.28 - matrix: arch: aarch64 - cuda: "12.*" packages: - - gcc_linux-aarch64=13.* + - gcc_linux-aarch64=14.* - sysroot_linux-aarch64=2.28 - - output_types: conda - matrices: - - matrix: - cuda: "12.*" - packages: - - cuda-nvcc build-use-libkvikio-wheel: common: - output_types: conda packages: &libkvikio_packages - - libkvikio==25.8.*,>=0.0.0a0 + - libkvikio==25.10.*,>=0.0.0a0 specific: - output_types: [requirements, pyproject] matrices: @@ -159,7 +156,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.8.*,>=0.0.0a0 + - libkvikio-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - libkvikio-cu13==25.10.*,>=0.0.0a0 - {matrix: null, packages: *libkvikio_packages} build-py-wrapper: common: @@ -195,45 +197,44 @@ dependencies: cuda: "12.9" packages: - cuda-version=12.9 - cuda: - specific: - - output_types: conda - matrices: - matrix: - cuda: "12.*" + cuda: "13.0" packages: + - cuda-version=13.0 + cuda: + common: - output_types: conda - matrices: - - matrix: - cuda: "12.*" - packages: - - libcufile-dev - - libnuma + packages: + - libcufile-dev + - libnuma depends_on_cupy: common: - output_types: conda packages: - - cupy>=12.0.0 + - &cupy_unsuffixed cupy>=13.6.0 specific: - output_types: [requirements, pyproject] matrices: - - matrix: # All CUDA 12 versions + - matrix: + cuda: "12.*" packages: - - cupy-cuda12x>=12.0.0 - depends_on_libnvcomp: - common: - - output_types: conda - packages: - - libnvcomp-dev==4.2.0.11 + - cupy-cuda12x>=13.6.0 + - matrix: + cuda: "13.*" + packages: + - &cupy_cu13 cupy-cuda13x>=13.6.0 + - matrix: + packages: + - *cupy_cu13 depends_on_libkvikio: common: - output_types: conda packages: - - &libkvikio_unsuffixed libkvikio==25.8.*,>=0.0.0a0 + - &libkvikio_unsuffixed libkvikio==25.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file - # This index is needed for libkvikio-cu12. + # This index is needed for libkvikio-cu{12,13}. 
- --extra-index-url=https://pypi.nvidia.com - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: @@ -243,13 +244,19 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libkvikio-cu12==25.8.*,>=0.0.0a0 + - libkvikio-cu12==25.10.*,>=0.0.0a0 + - matrix: + cuda: "13.*" + cuda_suffixed: "true" + packages: + - libkvikio-cu13==25.10.*,>=0.0.0a0 - {matrix: null, packages: [*libkvikio_unsuffixed]} docs: common: - output_types: [conda, requirements] packages: - numpydoc + - zarr>=3.0.0,<4.0.0 - sphinx - sphinx-click - sphinx_rtd_theme @@ -283,7 +290,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-build-backend>=0.3.0,<0.4.0.dev0 + - rapids-build-backend>=0.4.0,<0.5.0.dev0 - output_types: conda packages: - scikit-build-core>=0.10.0 @@ -295,22 +302,24 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - numpy>=1.23,<3.0a0 - - zarr>=2.0.0,<4.0.0 - # See https://github.com/zarr-developers/numcodecs/pull/475 - - numcodecs !=0.12.0 - packaging + zarr: + common: + - output_types: [requirements, pyproject] + packages: + - "zarr>=3.0.0,<4.0.0; python_version >= '3.11'" test_libkvikio: common: - output_types: conda packages: - - libkvikio==25.8.*,>=0.0.0a0 - - libkvikio-tests==25.8.*,>=0.0.0a0 + - libkvikio==25.10.*,>=0.0.0a0 + - libkvikio-tests==25.10.*,>=0.0.0a0 test_kvikio: common: - output_types: conda packages: - - libkvikio==25.8.*,>=0.0.0a0 - - kvikio==25.8.*,>=0.0.0a0 + - libkvikio==25.10.*,>=0.0.0a0 + - kvikio==25.10.*,>=0.0.0a0 test_cpp: common: - output_types: conda @@ -320,7 +329,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==25.8.*,>=0.0.0a0 + - rapids-dask-dependency==25.10.*,>=0.0.0a0 - pytest - pytest-asyncio - pytest-cov @@ -334,11 +343,28 @@ dependencies: packages: - moto>=4.0.8 specific: + - output_types: [conda] + matrices: + # zarr 3 is not supported on Python 3.10 + - matrix: + py: "3.1[123]" + packages: + - zarr>=3.0.0,<4.0.0 + - matrix: + packages: - output_types: [conda, requirements, pyproject] matrices: - - matrix: # All CUDA 12 versions + - matrix: + cuda: "12.*" + packages: + - cuda-python>=12.9.2,<13.0a0 + - matrix: + cuda: "13.*" + packages: + - &cuda_python_cu13 cuda-python>=13.0.1,<14.0a0 + - matrix: packages: - - cuda-python>=12.6.2,<13.0a0 + - *cuda_python_cu13 test_java: common: - output_types: conda diff --git a/docs/source/api.rst b/docs/source/api.rst index 1e19f12bdc..e11f4cf55b 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -33,6 +33,13 @@ CuFile driver .. autofunction:: initialize +Mmap +---- +.. currentmodule:: kvikio.mmap + +.. autoclass:: Mmap + :members: + Zarr ---- .. currentmodule:: kvikio.zarr @@ -44,6 +51,8 @@ RemoteFile ---------- .. currentmodule:: kvikio.remote_file +.. autoclass:: RemoteEndpointType + .. 
autoclass:: RemoteFile :members: diff --git a/docs/source/install.rst b/docs/source/install.rst index 5642b137af..91cc39af1f 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -15,6 +15,10 @@ Install the **stable release** from the ``rapidsai`` channel like: # Install in existing environment mamba install -c rapidsai -c conda-forge kvikio + + # Create new environment (CUDA 13) + mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=13.0 kvikio + # Create new environment (CUDA 12) mamba create -n kvikio-env -c rapidsai -c conda-forge python=3.13 cuda-version=12.9 kvikio @@ -24,10 +28,13 @@ Install the **nightly release** from the ``rapidsai-nightly`` channel like: # Install in existing environment mamba install -c rapidsai-nightly -c conda-forge kvikio + + # Create new environment (CUDA 13) + mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=13.0 kvikio + # Create new environment (CUDA 12) mamba create -n kvikio-env -c rapidsai-nightly -c conda-forge python=3.13 cuda-version=12.9 kvikio - .. note:: If the nightly install doesn't work, set ``channel_priority: flexible`` in your ``.condarc``. @@ -40,8 +47,11 @@ KvikIO is also available on PyPI. Install the latest release like: .. code-block:: - pip install kvikio-cu12 # for CUDA 12 + pip install kvikio-cu13 # for CUDA 13 + +.. code-block:: + pip install kvikio-cu12 # for CUDA 12 Build from source ----------------- @@ -50,8 +60,8 @@ In order to setup a development environment, we recommend Conda: .. code-block:: - # CUDA 12 - mamba env create --name kvikio-dev --file conda/environments/all_cuda-128_arch-x86_64.yaml + # CUDA 13 + mamba env create --name kvikio-dev --file conda/environments/all_cuda-130_arch-x86_64.yaml The Python library depends on the C++ library, thus we build and install both: diff --git a/docs/source/runtime_settings.rst b/docs/source/runtime_settings.rst index bb347ba23c..e707031720 100644 --- a/docs/source/runtime_settings.rst +++ b/docs/source/runtime_settings.rst @@ -53,3 +53,24 @@ KvikIO will retry a request should any of the HTTP status code in ``KVIKIO_HTTP_ The maximum number of attempts to make before throwing an exception is controlled by ``KVIKIO_HTTP_MAX_ATTEMPTS``. The default value is 3. This setting can also be accessed using :py:func:`kvikio.defaults.http_max_attempts` (getter) and :py:func:`kvikio.defaults.set` (setter). The maximum duration of each HTTP request is controlled by ``KVIKIO_HTTP_TIMEOUT``. The default value is 60, which is the duration in seconds to allow. This setting can also be accessed using :py:func:`kvikio.defaults.http_timoeout` (getter) and :py:func:`kvikio.defaults.set` (setter). + +HTTP Verbose ``KVIKIO_REMOTE_VERBOSE`` +-------------------------------------- + +For debugging HTTP requests, you can enable verbose output that shows detailed information about HTTP communication including headers, request/response bodies, connection details, and SSL handshake information. + +Set the environment variable ``KVIKIO_REMOTE_VERBOSE`` to ``true``, ``on``, ``yes``, or ``1`` (case-insensitive) to enable verbose output. Otherwise, verbose output is disabled by default. + +.. warning:: + + This may show sensitive contents from headers and data. 
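As a minimal sketch of how this fits together (the URL is a placeholder, and the ``<kvikio/remote_handle.hpp>`` include path and the in-process ``setenv`` call are assumptions made to keep the example self-contained), verbose output can be combined with the unified ``RemoteHandle::open`` API added in this change:

.. code-block:: cpp

    #include <cstdlib>
    #include <optional>
    #include <string>
    #include <vector>

    #include <kvikio/remote_handle.hpp>

    int main()
    {
      // Normally exported in the shell before launching the program; set here only so the
      // sketch is self-contained.
      setenv("KVIKIO_REMOTE_VERBOSE", "1", 1);

      // Placeholder URL. AUTO infers the endpoint type from the URL; std::nullopt keeps the
      // default allowlist and lets KvikIO query the file size from the server.
      std::string const url = "https://example.com/data.bin";
      auto handle = kvikio::RemoteHandle::open(
        url, kvikio::RemoteEndpointType::AUTO, std::nullopt, std::nullopt);

      std::vector<char> buf(handle.nbytes());
      handle.read(buf.data(), buf.size(), 0);
      return 0;
    }

With verbose output enabled, libcurl prints connection, header, and TLS details for both the initial size query and the subsequent ranged read.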
+ +CA bundle file and CA directory ``CURL_CA_BUNDLE``, ``SSL_CERT_FILE``, ``SSL_CERT_DIR`` +--------------------------------------------------------------------------------------- + +The Certificate Authority (CA) paths required for TLS/SSL verification in ``libcurl`` can be explicitly specified using the following environment variables in order of overriding priority: + + * ``CURL_CA_BUNDLE`` (also used in the ``curl`` program) or ``SSL_CERT_FILE`` (also used in OpenSSL): Specifies the CA certificate bundle file location. + * ``SSL_CERT_DIR`` (also used in OpenSSL): Specifies the CA certificate directory. + +When neither is specified, KvikIO searches several standard system locations for the CA file and directory, and if the search fails falls back to the libcurl compile-time defaults. diff --git a/docs/source/zarr.rst b/docs/source/zarr.rst index 019eff2767..baa48fa3e2 100644 --- a/docs/source/zarr.rst +++ b/docs/source/zarr.rst @@ -3,18 +3,16 @@ Zarr `Zarr `_ is a binary file format for chunked, compressed, N-Dimensional array. It is used throughout the PyData ecosystem and especially for climate and biological science applications. - `Zarr-Python `_ is the official Python package for reading and writing Zarr arrays. Its main feature is a NumPy-like array that translates array operations into file IO seamlessly. KvikIO provides a GPU backend to Zarr-Python that enables `GPUDirect Storage (GDS) `_ seamlessly. -KvikIO supports either zarr-python 2.x or zarr-python 3.x. -However, the API provided in :mod:`kvikio.zarr` differs based on which version of zarr you have, following the differences between zarr-python 2.x and zarr-python 3.x. - +If the optional zarr-python dependency is installed, then ``kvikio.zarr`` will be available. +KvikIO supports zarr-python 3.x. -Zarr Python 3.x ---------------- +Usage +----- -Zarr-python includes native support for reading Zarr chunks into device memory if you `configure Zarr `__ to use GPUs. +Zarr-Python includes native support for reading Zarr chunks into device memory if you `configure Zarr `__ to use GPUs. You can use any store, but KvikIO provides :py:class:`kvikio.zarr.GDSStore` to efficiently load data directly into GPU memory. .. code-block:: python @@ -28,16 +26,3 @@ You can use any store, but KvikIO provides :py:class:`kvikio.zarr.GDSStore` to e ... ) >>> type(z[:10, :10]) cupy.ndarray - - - -Zarr Python 2.x ---------------- - - -The following uses zarr-python 2.x, and is an example of how to use the convenience function :py:meth:`kvikio.zarr.open_cupy_array` -to create a new Zarr array and how to open an existing Zarr array. - - -.. 
literalinclude:: ../../python/kvikio/examples/zarr_cupy_nvcomp.py - :language: python diff --git a/java/pom.xml b/java/pom.xml index 228c82a3d5..7e00c88661 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ ai.rapids.kvikio cufile - 25.08.0-SNAPSHOT + 25.10.0-SNAPSHOT cuFile diff --git a/notebooks/nvcomp_batch_codec.ipynb b/notebooks/nvcomp_batch_codec.ipynb deleted file mode 100644 index f4f4689f3c..0000000000 --- a/notebooks/nvcomp_batch_codec.ipynb +++ /dev/null @@ -1,357 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "b543ae63", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "import numcodecs\n", - "\n", - "import numpy as np\n", - "\n", - "import zarr\n", - "\n", - "from IPython.display import display\n", - "\n", - "np.set_printoptions(precision=4, suppress=True)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1a8e55d5", - "metadata": {}, - "source": [ - "### Basic usage\n", - "\n", - "Get nvCOMP codec from numcodecs registry:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "75524650", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "NvCompBatchCodec(algorithm='lz4', options={})" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "NVCOMP_CODEC_ID = \"nvcomp_batch\"\n", - "\n", - "# Currently supported algorithms.\n", - "LZ4_ALGO = \"LZ4\"\n", - "GDEFLATE_ALGO = \"Gdeflate\"\n", - "SNAPPY_ALGO = \"snappy\"\n", - "ZSTD_ALGO = \"zstd\"\n", - "\n", - "codec = numcodecs.registry.get_codec(dict(id=NVCOMP_CODEC_ID, algorithm=LZ4_ALGO))\n", - "# To pass algorithm-specific options, use options parameter:\n", - "# codec = numcodecs.registry.get_codec(dict(id=NVCOMP_CODEC_ID, algo=LZ4_ALGO, options={\"data_type\": 1}))\n", - "\n", - "display(codec)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "46641ccb", - "metadata": {}, - "source": [ - "Create data:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "12a4fffd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[ 1.6243, -0.6118, -0.5282, ..., 0.0436, -0.62 , 0.698 ],\n", - " [-0.4471, 1.2245, 0.4035, ..., 0.4203, 0.811 , 1.0444],\n", - " [-0.4009, 0.824 , -0.5623, ..., 0.7848, -0.9554, 0.5859],\n", - " ...,\n", - " [ 1.3797, 0.1387, 1.2255, ..., 1.8051, 0.3722, 0.1253],\n", - " [ 0.7348, -0.7115, -0.1248, ..., -1.9533, -0.7684, -0.5345],\n", - " [ 0.2183, -0.8654, 0.8886, ..., -1.0141, -0.0627, -1.4379]],\n", - " dtype=float32)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Typezarr.core.Array
Data typefloat32
Shape(100, 100)
Chunk shape(10, 10)
OrderC
Read-onlyFalse
CompressorNvCompBatchCodec(algorithm='lz4', options={})
Store typezarr.storage.KVStore
No. bytes40000 (39.1K)
No. bytes stored41006 (40.0K)
Storage ratio1.0
Chunks initialized100/100
" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : NvCompBatchCodec(algorithm='lz4', options={})\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 41006 (40.0K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "shape = (100, 100)\n", - "chunks = (10, 10)\n", - "\n", - "np.random.seed(1)\n", - "\n", - "x = zarr.array(np.random.randn(*shape).astype(np.float32), chunks=chunks, compressor=codec)\n", - "display(x[:])\n", - "display(x.info)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c15cbdff", - "metadata": {}, - "source": [ - "Store and load back the data:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "730cde85", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'chunks': [10, 10],\n", - " 'compressor': {'algorithm': 'lz4', 'id': 'nvcomp_batch', 'options': {}},\n", - " 'dtype': 'Typezarr.core.ArrayData typefloat32Shape(100, 100)Chunk shape(10, 10)OrderCRead-onlyFalseCompressorNvCompBatchCodec(algorithm='lz4', options={})Store typezarr.storage.KVStoreNo. bytes40000 (39.1K)No. bytes stored41006 (40.0K)Storage ratio1.0Chunks initialized100/100" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : NvCompBatchCodec(algorithm='lz4', options={})\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 41006 (40.0K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "y = zarr.open_array(zarr_store)\n", - "display(y.info)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "5b6cc2ca", - "metadata": {}, - "outputs": [], - "source": [ - "# Test the roundtrip.\n", - "np.testing.assert_equal(y[:], x[:])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "1a8eea79", - "metadata": {}, - "source": [ - "### CPU compression / GPU decompression\n", - "\n", - "Some algorithms, such as LZ4, can be used interchangeably on CPU and GPU. For example, the data might be created using CPU LZ4 codec and then decompressed using GPU version of LZ4 codec." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "87d25b76", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'chunks': [10, 10],\n", - " 'compressor': {'acceleration': 1, 'id': 'lz4'},\n", - " 'dtype': 'Typezarr.core.ArrayData typefloat32Shape(100, 100)Chunk shape(10, 10)OrderCRead-onlyFalseCompressorLZ4(acceleration=1)Store typezarr.storage.KVStoreNo. bytes40000 (39.1K)No. bytes stored40973 (40.0K)Storage ratio1.0Chunks initialized100/100" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : LZ4(acceleration=1)\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 40973 (40.0K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Typezarr.core.Array
Data typefloat32
Shape(100, 100)
Chunk shape(10, 10)
OrderC
Read-onlyFalse
CompressorNvCompBatchCodec(algorithm='lz4', options={})
Store typezarr.storage.KVStore
No. bytes40000 (39.1K)
No. bytes stored40883 (39.9K)
Storage ratio1.0
Chunks initialized100/100
" - ], - "text/plain": [ - "Type : zarr.core.Array\n", - "Data type : float32\n", - "Shape : (100, 100)\n", - "Chunk shape : (10, 10)\n", - "Order : C\n", - "Read-only : False\n", - "Compressor : NvCompBatchCodec(algorithm='lz4', options={})\n", - "Store type : zarr.storage.KVStore\n", - "No. bytes : 40000 (39.1K)\n", - "No. bytes stored : 40883 (39.9K)\n", - "Storage ratio : 1.0\n", - "Chunks initialized : 100/100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Get default (CPU) implementation of LZ4 codec.\n", - "cpu_codec = numcodecs.registry.get_codec({\"id\": \"lz4\"})\n", - "\n", - "x = zarr.array(np.random.randn(*shape).astype(np.float32), chunks=chunks, compressor=cpu_codec)\n", - "# Define a simple, dictionary-based store. In real scenarios this can be a filesystem or some other persistent store.\n", - "store = {}\n", - "zarr.save_array(store, x, compressor=cpu_codec)\n", - "\n", - "# Check that the data was written by the expected codec.\n", - "meta = json.loads(store[\".zarray\"])\n", - "display(meta)\n", - "assert meta[\"compressor\"][\"id\"] == \"lz4\"\n", - "\n", - "# Change codec to GPU/nvCOMP-based.\n", - "meta[\"compressor\"] = {\"id\": NVCOMP_CODEC_ID, \"algorithm\": LZ4_ALGO}\n", - "store[\".zarray\"] = json.dumps(meta).encode()\n", - "\n", - "y = zarr.open_array(store, compressor=codec)\n", - "\n", - "display(x.info)\n", - "display(y.info)\n", - "\n", - "np.testing.assert_equal(x[:], y[:])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9294992", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/nvcomp_vs_zarr_lz4.ipynb b/notebooks/nvcomp_vs_zarr_lz4.ipynb deleted file mode 100644 index 3b6d947ac3..0000000000 --- a/notebooks/nvcomp_vs_zarr_lz4.ipynb +++ /dev/null @@ -1,2832 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 42, - "id": "f95b6759-533a-470b-8f08-5f91ebcea625", - "metadata": {}, - "outputs": [], - "source": [ - "import cupy as cp\n", - "import numpy as np\n", - "import pandas as pd\n", - "import time\n", - "import zarr\n", - "\n", - "import kvikio.nvcomp\n" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "d1e60a9b-0bca-4c66-b2f0-829acc3b1ba2", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "# conda install -c conda-forge zarr" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "2cedb529-c0fa-4883-a2fd-78b1ad3c1a59", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2013929216, 1006964608, 503482304, 251741152, 125870576, 62935288, 31467644, 15733822, 7866911, 3933455, 1966727, 983363, 491681, 245840, 122920, 61460, 30730, 15365, 7682, 3841]\n" - ] - } - ], - "source": [ - "HOST_LZ4_MAX = 2013929216 # 2113929216\n", - "sizes = list(map(lambda x: HOST_LZ4_MAX//(2**x), np.arange(20)))\n", - "print(sizes)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "39483573-e79b-4dca-aee3-13bf392da3a7", - "metadata": {}, - "outputs": [], - "source": [ - "input_size = []\n", - 
"cascaded_size = []\n", - "cascaded_temp_size = []\n", - "cascaded_round_trip_time = []\n", - "lz4_gpu_size = []\n", - "lz4_gpu_temp_size = []\n", - "lz4_gpu_round_trip_time = []\n", - "bitcomp_gpu_size = []\n", - "bitcomp_gpu_temp_size = []\n", - "bitcomp_gpu_round_trip_time = []\n", - "lz4_size = []\n", - "lz4_round_trip_time = []" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "ccd9b1e7-b607-4948-8256-73bedf1ec7a8", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "821.61s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2023-09-29 13:44:43-- http://textfiles.com/etext/NONFICTION/kjv10.txt\n", - "Resolving textfiles.com (textfiles.com)... 208.86.224.90\n", - "Connecting to textfiles.com (textfiles.com)|208.86.224.90|:80... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 4432803 (4.2M) [text/plain]\n", - "Saving to: ‘kjv10.txt.2’\n", - "\n", - "kjv10.txt.2 100%[===================>] 4.23M 316KB/s in 14s \n", - "\n", - "2023-09-29 13:44:58 (304 KB/s) - ‘kjv10.txt.2’ saved [4432803/4432803]\n", - "\n" - ] - } - ], - "source": [ - "!wget http://textfiles.com/etext/NONFICTION/kjv10.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "4c9a5c4c-4c49-4834-8dc2-3e6fc11ea930", - "metadata": {}, - "outputs": [], - "source": [ - "text = open('kjv10.txt').read()\n", - "bib = np.frombuffer(bytes(text, 'utf-8'), dtype=np.int8)\n", - "data_buffer = np.tile(bib, 500)" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "74740819-b987-4012-ba6c-ed3d3b9afd60", - "metadata": {}, - "outputs": [], - "source": [ - "# One of the three below keys, this will set the arrangement of test data for a full run of the notebook.\n", - "TARGET = \"Ascending\"\n", - "DTYPE = cp.int32" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "0a1307ed-034c-4943-a7e1-36665cba8ad5", - "metadata": {}, - "outputs": [], - "source": [ - "data = {\n", - " \"Ascending\": np.arange(0, HOST_LZ4_MAX, dtype=np.int32),\n", - " \"Random\": np.random.randint(0, 100, HOST_LZ4_MAX, dtype=np.int32),\n", - " \"Text\": data_buffer\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "68adbb33-ddb7-4603-8863-fdd25b8bdc51", - "metadata": {}, - "outputs": [], - "source": [ - "def get_host_data(offset, dtype):\n", - " exemplar = np.array([1], dtype=dtype)\n", - " print(offset)\n", - " print(exemplar.itemsize)\n", - " print(data[TARGET].itemsize)\n", - " index = offset // data[TARGET].itemsize\n", - " index = index - (index % exemplar.itemsize)\n", - " print(index)\n", - " return data[TARGET][0:index].view(dtype)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "id": "f067cdc2-ee14-4258-b89d-0bb4a224c698", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2013929216\n", - "4\n", - "4\n", - "503482304\n", - "-----\n", - "Input size: 2013929216\n", - "Cascaded GPU compressor output size: 33434464\n", - "Cascaded GPU decompressor output size: 2013929216\n", - "Cascaded GPU compress/decompress round trip time: 0.1076362133026123\n", - "2021826967\n", - "Lz4 zarr time: 4.681669235229492\n", - "Lz4 compressed size: 2021826967\n", - "1006964608\n", - "4\n", - "4\n", - "251741152\n", - "-----\n", - "Input size: 1006964608\n", - "Cascaded GPU compressor output size: 16717276\n", - "Cascaded GPU 
decompressor output size: 1006964608\n", - "Cascaded GPU compress/decompress round trip time: 0.11769247055053711\n", - "1010913478\n", - "Lz4 zarr time: 2.57978892326355\n", - "Lz4 compressed size: 1010913478\n", - "503482304\n", - "4\n", - "4\n", - "125870576\n", - "-----\n", - "Input size: 503482304\n", - "Cascaded GPU compressor output size: 8358716\n", - "Cascaded GPU decompressor output size: 503482304\n", - "Cascaded GPU compress/decompress round trip time: 0.05775332450866699\n", - "505456734\n", - "Lz4 zarr time: 1.2365527153015137\n", - "Lz4 compressed size: 505456734\n", - "251741152\n", - "4\n", - "4\n", - "62935288\n", - "-----\n", - "Input size: 251741152\n", - "Cascaded GPU compressor output size: 4179436\n", - "Cascaded GPU decompressor output size: 251741152\n", - "Cascaded GPU compress/decompress round trip time: 0.0284881591796875\n", - "252728362\n", - "Lz4 zarr time: 0.5986642837524414\n", - "Lz4 compressed size: 252728362\n", - "125870576\n", - "4\n", - "4\n", - "31467644\n", - "-----\n", - "Input size: 125870576\n", - "Cascaded GPU compressor output size: 2089796\n", - "Cascaded GPU decompressor output size: 125870576\n", - "Cascaded GPU compress/decompress round trip time: 0.01472783088684082\n", - "126364175\n", - "Lz4 zarr time: 0.30330395698547363\n", - "Lz4 compressed size: 126364175\n", - "62935288\n", - "4\n", - "4\n", - "15733820\n", - "-----\n", - "Input size: 62935280\n", - "Cascaded GPU compressor output size: 1044976\n", - "Cascaded GPU decompressor output size: 62935280\n", - "Cascaded GPU compress/decompress round trip time: 0.007399559020996094\n", - "63182074\n", - "Lz4 zarr time: 0.1610257625579834\n", - "Lz4 compressed size: 63182074\n", - "31467644\n", - "4\n", - "4\n", - "7866908\n", - "-----\n", - "Input size: 31467632\n", - "Cascaded GPU compressor output size: 522532\n", - "Cascaded GPU decompressor output size: 31467632\n", - "Cascaded GPU compress/decompress round trip time: 0.004503726959228516\n", - "31591024\n", - "Lz4 zarr time: 0.1471562385559082\n", - "Lz4 compressed size: 31591024\n", - "15733822\n", - "4\n", - "4\n", - "3933452\n", - "-----\n", - "Input size: 15733808\n", - "Cascaded GPU compressor output size: 261344\n", - "Cascaded GPU decompressor output size: 15733808\n", - "Cascaded GPU compress/decompress round trip time: 0.0025734901428222656\n", - "15795499\n", - "Lz4 zarr time: 0.03436875343322754\n", - "Lz4 compressed size: 15795499\n", - "7866911\n", - "4\n", - "4\n", - "1966724\n", - "-----\n", - "Input size: 7866896\n", - "Cascaded GPU compressor output size: 130716\n", - "Cascaded GPU decompressor output size: 7866896\n", - "Cascaded GPU compress/decompress round trip time: 0.0018618106842041016\n", - "7897736\n", - "Lz4 zarr time: 0.010539531707763672\n", - "Lz4 compressed size: 7897736\n", - "3933455\n", - "4\n", - "4\n", - "983360\n", - "-----\n", - "Input size: 3933440\n", - "Cascaded GPU compressor output size: 65436\n", - "Cascaded GPU decompressor output size: 3933440\n", - "Cascaded GPU compress/decompress round trip time: 0.0017323493957519531\n", - "3948855\n", - "Lz4 zarr time: 0.028203964233398438\n", - "Lz4 compressed size: 3948855\n", - "1966727\n", - "4\n", - "4\n", - "491680\n", - "-----\n", - "Input size: 1966720\n", - "Cascaded GPU compressor output size: 32796\n", - "Cascaded GPU decompressor output size: 1966720\n", - "Cascaded GPU compress/decompress round trip time: 0.0020630359649658203\n", - "1974422\n", - "Lz4 zarr time: 0.002621889114379883\n", - "Lz4 compressed size: 1974422\n", - "983363\n", - 
"4\n", - "4\n", - "245840\n", - "-----\n", - "Input size: 983360\n", - "Cascaded GPU compressor output size: 16476\n", - "Cascaded GPU decompressor output size: 983360\n", - "Cascaded GPU compress/decompress round trip time: 0.0014410018920898438\n", - "987206\n", - "Lz4 zarr time: 0.0007197856903076172\n", - "Lz4 compressed size: 987206\n", - "491681\n", - "4\n", - "4\n", - "122920\n", - "-----\n", - "Input size: 491680\n", - "Cascaded GPU compressor output size: 8316\n", - "Cascaded GPU decompressor output size: 491680\n", - "Cascaded GPU compress/decompress round trip time: 0.0011644363403320312\n", - "493597\n", - "Lz4 zarr time: 0.000965118408203125\n", - "Lz4 compressed size: 493597\n", - "245840\n", - "4\n", - "4\n", - "61460\n", - "-----\n", - "Input size: 245840\n", - "Cascaded GPU compressor output size: 4236\n", - "Cascaded GPU decompressor output size: 245840\n", - "Cascaded GPU compress/decompress round trip time: 0.0015044212341308594\n", - "246793\n", - "Lz4 zarr time: 0.0004220008850097656\n", - "Lz4 compressed size: 246793\n", - "122920\n", - "4\n", - "4\n", - "30728\n", - "-----\n", - "Input size: 122912\n", - "Cascaded GPU compressor output size: 2184\n", - "Cascaded GPU decompressor output size: 122912\n", - "Cascaded GPU compress/decompress round trip time: 0.0011115074157714844\n", - "123383\n", - "Lz4 zarr time: 0.0002646446228027344\n", - "Lz4 compressed size: 123383\n", - "61460\n", - "4\n", - "4\n", - "15364\n", - "-----\n", - "Input size: 61456\n", - "Cascaded GPU compressor output size: 1148\n", - "Cascaded GPU decompressor output size: 61456\n", - "Cascaded GPU compress/decompress round trip time: 0.0009233951568603516\n", - "61678\n", - "Lz4 zarr time: 0.00020623207092285156\n", - "Lz4 compressed size: 61678\n", - "30730\n", - "4\n", - "4\n", - "7680\n", - "-----\n", - "Input size: 30720\n", - "Cascaded GPU compressor output size: 632\n", - "Cascaded GPU decompressor output size: 30720\n", - "Cascaded GPU compress/decompress round trip time: 0.001186370849609375\n", - "30822\n", - "Lz4 zarr time: 0.00011777877807617188\n", - "Lz4 compressed size: 30822\n", - "15365\n", - "4\n", - "4\n", - "3840\n", - "-----\n", - "Input size: 15360\n", - "Cascaded GPU compressor output size: 360\n", - "Cascaded GPU decompressor output size: 15360\n", - "Cascaded GPU compress/decompress round trip time: 0.001523733139038086\n", - "15401\n", - "Lz4 zarr time: 0.0003781318664550781\n", - "Lz4 compressed size: 15401\n", - "7682\n", - "4\n", - "4\n", - "1920\n", - "-----\n", - "Input size: 7680\n", - "Cascaded GPU compressor output size: 224\n", - "Cascaded GPU decompressor output size: 7680\n", - "Cascaded GPU compress/decompress round trip time: 0.0012781620025634766\n", - "7699\n", - "Lz4 zarr time: 0.0001780986785888672\n", - "Lz4 compressed size: 7699\n", - "3841\n", - "4\n", - "4\n", - "960\n", - "-----\n", - "Input size: 3840\n", - "Cascaded GPU compressor output size: 156\n", - "Cascaded GPU decompressor output size: 3840\n", - "Cascaded GPU compress/decompress round trip time: 0.001318216323852539\n", - "3852\n", - "Lz4 zarr time: 0.00019931793212890625\n", - "Lz4 compressed size: 3852\n" - ] - } - ], - "source": [ - "input_size = []\n", - "cascaded_size = []\n", - "cascaded_temp_size = []\n", - "cascaded_round_trip_time = []\n", - "lz4_gpu_size = []\n", - "lz4_gpu_temp_size = []\n", - "lz4_gpu_round_trip_time = []\n", - "lz4_size = []\n", - "lz4_round_trip_time = []\n", - "for size in sizes:\n", - " data_host = get_host_data(size, DTYPE)\n", - " data_gpu = 
cp.array(data_host)\n", - " \"\"\"Cascaded GPU\"\"\"\n", - " t_gpu = time.time()\n", - " compressor = kvikio.nvcomp.CascadedManager(dtype=data_gpu.dtype)\n", - " compressed = compressor.compress(data_gpu)\n", - " output_size = compressed.nbytes\n", - "\n", - " decompressed = compressor.decompress(compressed)\n", - " decompressed_size = decompressed.size * decompressed.itemsize\n", - " input_size.append(data_gpu.size * data_gpu.itemsize)\n", - " cascaded_round_trip_time.append(time.time() - t_gpu)\n", - " cascaded_size.append(output_size)\n", - " print('-----')\n", - " print('Input size: ', data_gpu.size * data_gpu.itemsize)\n", - " print('Cascaded GPU compressor output size: ', output_size)\n", - " print('Cascaded GPU decompressor output size: ', decompressed_size)\n", - " print('Cascaded GPU compress/decompress round trip time: ',time.time() - t_gpu)\n", - " \n", - " del compressor\n", - " \n", - " \"\"\"LZ4 Host\"\"\"\n", - " lz4 = zarr.LZ4()\n", - " t_host = time.time()\n", - " host_compressed = lz4.encode(data_gpu.get())\n", - " del data_gpu\n", - " print(len(host_compressed))\n", - " host_compressed = host_compressed[:2113929216]\n", - " host_decompressed = lz4.decode(host_compressed)\n", - " print('Lz4 zarr time: ', time.time() - t_host)\n", - " print('Lz4 compressed size: ', len(host_compressed))\n", - " lz4_size.append(len(host_compressed))\n", - " lz4_round_trip_time.append(time.time() - t_host)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "c981e8bc-e96a-4af4-9fe1-414aa2ff4c99", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2013929216\n", - "4\n", - "4\n", - "503482304\n", - "lz4 GPU compressor output size: 2022340697\n", - "lz4 GPU decompressor output size: 2013929216\n", - "lz4 GPU compress/decompress round trip time: 0.7271463871002197\n", - "1006964608\n", - "4\n", - "4\n", - "251741152\n", - "lz4 GPU compressor output size: 1011170371\n", - "lz4 GPU decompressor output size: 1006964608\n", - "lz4 GPU compress/decompress round trip time: 0.36713171005249023\n", - "503482304\n", - "4\n", - "4\n", - "125870576\n", - "lz4 GPU compressor output size: 505585200\n", - "lz4 GPU decompressor output size: 503482304\n", - "lz4 GPU compress/decompress round trip time: 0.1900792121887207\n", - "251741152\n", - "4\n", - "4\n", - "62935288\n", - "lz4 GPU compressor output size: 252792621\n", - "lz4 GPU decompressor output size: 251741152\n", - "lz4 GPU compress/decompress round trip time: 0.09049177169799805\n", - "125870576\n", - "4\n", - "4\n", - "31467644\n", - "lz4 GPU compressor output size: 126396327\n", - "lz4 GPU decompressor output size: 125870576\n", - "lz4 GPU compress/decompress round trip time: 0.04643416404724121\n", - "62935288\n", - "4\n", - "4\n", - "15733820\n", - "lz4 GPU compressor output size: 63198181\n", - "lz4 GPU decompressor output size: 62935280\n", - "lz4 GPU compress/decompress round trip time: 0.02284073829650879\n", - "31467644\n", - "4\n", - "4\n", - "7866908\n", - "lz4 GPU compressor output size: 31599109\n", - "lz4 GPU decompressor output size: 31467632\n", - "lz4 GPU compress/decompress round trip time: 0.015845537185668945\n", - "15733822\n", - "4\n", - "4\n", - "3933452\n", - "lz4 GPU compressor output size: 15799573\n", - "lz4 GPU decompressor output size: 15733808\n", - "lz4 GPU compress/decompress round trip time: 0.009501934051513672\n", - "7866911\n", - "4\n", - "4\n", - "1966724\n", - "lz4 GPU compressor output size: 7899801\n", - "lz4 GPU decompressor output size: 
7866896\n", - "lz4 GPU compress/decompress round trip time: 0.011568546295166016\n", - "3933455\n", - "4\n", - "4\n", - "983360\n", - "lz4 GPU compressor output size: 3949915\n", - "lz4 GPU decompressor output size: 3933440\n", - "lz4 GPU compress/decompress round trip time: 0.00696110725402832\n", - "1966727\n", - "4\n", - "4\n", - "491680\n", - "lz4 GPU compressor output size: 1974981\n", - "lz4 GPU decompressor output size: 1966720\n", - "lz4 GPU compress/decompress round trip time: 0.012327194213867188\n", - "983363\n", - "4\n", - "4\n", - "245840\n", - "lz4 GPU compressor output size: 987514\n", - "lz4 GPU decompressor output size: 983360\n", - "lz4 GPU compress/decompress round trip time: 0.006538867950439453\n", - "491681\n", - "4\n", - "4\n", - "122920\n", - "lz4 GPU compressor output size: 493774\n", - "lz4 GPU decompressor output size: 491680\n", - "lz4 GPU compress/decompress round trip time: 0.012677907943725586\n", - "245840\n", - "4\n", - "4\n", - "61460\n", - "lz4 GPU compressor output size: 246904\n", - "lz4 GPU decompressor output size: 245840\n", - "lz4 GPU compress/decompress round trip time: 0.006706953048706055\n", - "122920\n", - "4\n", - "4\n", - "30728\n", - "lz4 GPU compressor output size: 123459\n", - "lz4 GPU decompressor output size: 122912\n", - "lz4 GPU compress/decompress round trip time: 0.010996580123901367\n", - "61460\n", - "4\n", - "4\n", - "15364\n", - "lz4 GPU compressor output size: 61745\n", - "lz4 GPU decompressor output size: 61456\n", - "lz4 GPU compress/decompress round trip time: 0.006911039352416992\n", - "30730\n", - "4\n", - "4\n", - "7680\n", - "lz4 GPU compressor output size: 30907\n", - "lz4 GPU decompressor output size: 30720\n", - "lz4 GPU compress/decompress round trip time: 0.004134178161621094\n", - "15365\n", - "4\n", - "4\n", - "3840\n", - "lz4 GPU compressor output size: 15498\n", - "lz4 GPU decompressor output size: 15360\n", - "lz4 GPU compress/decompress round trip time: 0.0048847198486328125\n", - "7682\n", - "4\n", - "4\n", - "1920\n", - "lz4 GPU compressor output size: 7787\n", - "lz4 GPU decompressor output size: 7680\n", - "lz4 GPU compress/decompress round trip time: 0.0031135082244873047\n", - "3841\n", - "4\n", - "4\n", - "960\n", - "lz4 GPU compressor output size: 3940\n", - "lz4 GPU decompressor output size: 3840\n", - "lz4 GPU compress/decompress round trip time: 0.0027506351470947266\n" - ] - } - ], - "source": [ - "lz4_gpu_size = []\n", - "lz4_gpu_temp_size = []\n", - "lz4_gpu_round_trip_time = []\n", - "for size in sizes:\n", - " data_host = get_host_data(size, DTYPE)\n", - " data_gpu = cp.array(data_host)\n", - "\n", - " \"\"\"LZ4 GPU\"\"\"\n", - " data_gpu = cp.array(data_host)\n", - " t_gpu = time.time()\n", - " compressor = kvikio.nvcomp.LZ4Manager(dtype=data_gpu.dtype)\n", - " compressed = compressor.compress(data_gpu)\n", - " output_size = compressed.nbytes\n", - "\n", - " decompressed = compressor.decompress(compressed)\n", - " decompressed_size = decompressed.size * decompressed.itemsize\n", - " lz4_gpu_round_trip_time.append(time.time() - t_gpu)\n", - " lz4_gpu_size.append(output_size)\n", - " print('lz4 GPU compressor output size: ', output_size)\n", - " print('lz4 GPU decompressor output size: ', decompressed_size)\n", - " print('lz4 GPU compress/decompress round trip time: ',time.time() - t_gpu)" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "0b9e6efb-439b-4d9e-b221-1a728adee7d6", - "metadata": {}, - "outputs": [], - "source": [ - "# zarr lz4 max buffer size is 264241152 
int64s\n", - "# zarr lz4 max buffer size is 2113929216 bytes\n", - "# cascaded max buffer size is 2147483640 bytes\n", - "# cascaded max buffer size is 268435456 int64s" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "id": "9cd69f83-88de-4929-b760-b8ebfb916b8f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2013929216, 1006964608, 503482304, 251741152, 125870576, 62935280, 31467632, 15733808, 7866896, 3933440, 1966720, 983360, 491680, 245840, 122912, 61456, 30720, 15360, 7680, 3840]\n", - "[33434464, 16717276, 8358716, 4179436, 2089796, 1044976, 522532, 261344, 130716, 65436, 32796, 16476, 8316, 4236, 2184, 1148, 632, 360, 224, 156]\n", - "[]\n", - "[0.10751104354858398, 0.11756682395935059, 0.05767321586608887, 0.028416156768798828, 0.014620304107666016, 0.007331132888793945, 0.004427194595336914, 0.0025060176849365234, 0.0017902851104736328, 0.0016641616821289062, 0.001974821090698242, 0.0013790130615234375, 0.0011060237884521484, 0.0014438629150390625, 0.0010533332824707031, 0.0008640289306640625, 0.001127481460571289, 0.0014081001281738281, 0.0011692047119140625, 0.0012063980102539062]\n", - "[2022340697, 1011170371, 505585200, 252792621, 126396327, 63198181, 31599109, 15799573, 7899801, 3949915, 1974981, 987514, 493774, 246904, 123459, 61745, 30907, 15498, 7787, 3940]\n", - "[]\n", - "[0.7270452976226807, 0.3670234680175781, 0.18999958038330078, 0.09043264389038086, 0.04634451866149902, 0.022789478302001953, 0.015785932540893555, 0.009443283081054688, 0.011508703231811523, 0.00690460205078125, 0.012271881103515625, 0.00648951530456543, 0.012626171112060547, 0.006663322448730469, 0.010945320129394531, 0.00687098503112793, 0.004094123840332031, 0.004844188690185547, 0.0030717849731445312, 0.0027098655700683594]\n", - "[2021826967, 1010913478, 505456734, 252728362, 126364175, 63182074, 31591024, 15795499, 7897736, 3948855, 1974422, 987206, 493597, 246793, 123383, 61678, 30822, 15401, 7699, 3852]\n", - "[4.681788921356201, 2.579982280731201, 1.2367866039276123, 0.5987403392791748, 0.3033754825592041, 0.16110515594482422, 0.1472797393798828, 0.03442859649658203, 0.010602712631225586, 0.028273344039916992, 0.0026633739471435547, 0.0007534027099609375, 0.0009970664978027344, 0.0004544258117675781, 0.0002968311309814453, 0.0002384185791015625, 0.00015044212341308594, 0.00044274330139160156, 0.00023889541625976562, 0.00026869773864746094]\n" - ] - } - ], - "source": [ - "print(input_size)\n", - "print(cascaded_size)\n", - "print(cascaded_temp_size)\n", - "print(cascaded_round_trip_time)\n", - "print(lz4_gpu_size)\n", - "print(lz4_gpu_temp_size)\n", - "print(lz4_gpu_round_trip_time)\n", - "print(lz4_size)\n", - "print(lz4_round_trip_time)\n", - "df = pd.DataFrame({\n", - " 'Input Size (Bytes)': input_size,\n", - " 'cascaded_size': cascaded_size,\n", - " 'cascaded_round_trip_time': cascaded_round_trip_time,\n", - " 'lz4_gpu_size': lz4_gpu_size,\n", - " 'lz4_gpu_round_trip_time': lz4_gpu_round_trip_time,\n", - " 'lz4_size': lz4_size,\n", - " 'lz4_round_trip_time': lz4_round_trip_time\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "c7a23383-a073-4156-9be6-9da6b8c9026e", - "metadata": {}, - "outputs": [], - "source": [ - "### You'll need the following to display the upcoming plots. 
###\n", - "\n", - "# !conda install -c conda-forge plotly\n", - "# !npm install require" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "8a7d2c60-79d7-4840-a5fb-c7e1eb42f829", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['Input Size (Bytes)', 'cascaded_size', 'cascaded_round_trip_time',\n", - " 'lz4_gpu_size', 'lz4_gpu_round_trip_time', 'lz4_size',\n", - " 'lz4_round_trip_time', 'Cascaded Compression Ratio',\n", - " 'Lz4 Gpu Compression Ratio', 'Lz4 Host Compression Ratio',\n", - " 'Cascaded Speedup', 'Lz4 Gpu Speedup'],\n", - " dtype='object')\n" - ] - } - ], - "source": [ - "df['Cascaded Compression Ratio'] = df['Input Size (Bytes)'] / df['cascaded_size']\n", - "df['Lz4 Gpu Compression Ratio'] = df['Input Size (Bytes)'] / df['lz4_gpu_size']\n", - "df['Lz4 Host Compression Ratio'] = df['Input Size (Bytes)'] / df['lz4_size']\n", - "df['Cascaded Speedup'] = df['lz4_round_trip_time'] / df['cascaded_round_trip_time']\n", - "df['Lz4 Gpu Speedup'] = df['lz4_round_trip_time'] / df['lz4_gpu_round_trip_time']\n", - "print(df.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "8c6f225a-61e6-42b2-a991-6eeab56aae48", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "hovertemplate": "variable=Cascaded Speedup
Input Size (Bytes)=%{x}
Multiple Faster=%{y}", - "legendgroup": "Cascaded Speedup", - "line": { - "color": "#636efa", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Cascaded Speedup", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 43.54705123144407, - 21.94481567030547, - 21.444731065444667, - 21.07041934455389, - 20.75028537882, - 21.975478877361866, - 33.267057999892295, - 13.738369327371325, - 5.922359834864829, - 16.989541547277938, - 1.3486659422914402, - 0.5463347164591977, - 0.9014873895236042, - 0.31472919418758255, - 0.2818017202354006, - 0.27593818984547464, - 0.13343201522520617, - 0.3144260074500508, - 0.20432300163132136, - 0.22272727272727272 - ], - "yaxis": "y" - }, - { - "hovertemplate": "variable=Lz4 Gpu Speedup
Input Size (Bytes)=%{x}
Multiple Faster=%{y}", - "legendgroup": "Lz4 Gpu Speedup", - "line": { - "color": "#EF553B", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Lz4 Gpu Speedup", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 6.439473491768513, - 7.029474967000302, - 6.5094175546732655, - 6.620843022182852, - 6.5460919936414195, - 7.069277927730003, - 9.329809246197762, - 3.645829125429206, - 0.9212777858341448, - 4.094854972375691, - 0.21703061858874728, - 0.11609537455453911, - 0.07896823898183467, - 0.06819808215256906, - 0.027119456303912173, - 0.03469933030292515, - 0.036745865362217564, - 0.09139679102273846, - 0.0777708786091276, - 0.09915537568185817 - ], - "yaxis": "y" - } - ], - "layout": { - "legend": { - "title": { - "text": "variable" - }, - "tracegroupgap": 0 - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 
0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "fillpattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - 
"color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": 
"white", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "Gpu Acceleration over Zarr Lz4 - Ascending " - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Input Size (Bytes)" - }, - "type": "category" - }, - "yaxis": { - "anchor": "x", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Multiple Faster" - } - } - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import plotly.express as px\n", - "title = 'Gpu Acceleration over Zarr Lz4 - ' + TARGET + \" \" + str(DTYPE)\n", - "subtitle = 'Includes host->gpu copy time'\n", - "fig = px.line(df, x='Input Size (Bytes)',\n", - " y=['Cascaded Speedup', 'Lz4 Gpu Speedup'],\n", - " labels={'value': 'Multiple Faster'},\n", - " title=title)\n", - "fig.update_xaxes(type='category')\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "e3d57a90-ca86-41da-9747-696151d66184", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "hovertemplate": "variable=Lz4 Gpu Compression Ratio
Input Size (Bytes)=%{x}
Compression Factor=%{y}", - "legendgroup": "Lz4 Gpu Compression Ratio", - "line": { - "color": "#636efa", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Lz4 Gpu Compression Ratio", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 0.9958407201059258, - 0.9958406979470327, - 0.995840669386683, - 0.9958405866601621, - 0.9958404566613712, - 0.9958400543205508, - 0.9958392181247895, - 0.9958375457362044, - 0.9958347052033336, - 0.9958290241688745, - 0.9958171749500375, - 0.9957934773582957, - 0.9957591934771778, - 0.9956906327965525, - 0.9955693793081104, - 0.9953194590655113, - 0.9939495907076067, - 0.9910956252419667, - 0.9862591498651598, - 0.9746192893401016 - ], - "yaxis": "y" - }, - { - "hovertemplate": "variable=Cascaded Compression Ratio
Input Size (Bytes)=%{x}
Compression Factor=%{y}", - "legendgroup": "Cascaded Compression Ratio", - "line": { - "color": "#EF553B", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Cascaded Compression Ratio", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 60.23512792069884, - 60.23496938137529, - 60.23440729413465, - 60.23328315112374, - 60.231034990975196, - 60.226531518427215, - 60.2214448110355, - 60.20344067589078, - 60.183114538388566, - 60.11125374411639, - 59.96828881570923, - 59.68438941490653, - 59.12457912457913, - 58.035882908404155, - 56.27838827838828, - 53.53310104529617, - 48.607594936708864, - 42.666666666666664, - 34.285714285714285, - 24.615384615384617 - ], - "yaxis": "y" - }, - { - "hovertemplate": "variable=Lz4 Host Compression Ratio
Input Size (Bytes)=%{x}
Compression Factor=%{y}", - "legendgroup": "Lz4 Host Compression Ratio", - "line": { - "color": "#00cc96", - "dash": "solid" - }, - "marker": { - "symbol": "circle" - }, - "mode": "lines", - "name": "Lz4 Host Compression Ratio", - "orientation": "v", - "showlegend": true, - "type": "scatter", - "x": [ - 2013929216, - 1006964608, - 503482304, - 251741152, - 125870576, - 62935280, - 31467632, - 15733808, - 7866896, - 3933440, - 1966720, - 983360, - 491680, - 245840, - 122912, - 61456, - 30720, - 15360, - 7680, - 3840 - ], - "xaxis": "x", - "y": [ - 0.9960937552377597, - 0.9960937606571312, - 0.9960937705105339, - 0.9960937902173401, - 0.9960938375136782, - 0.9960939237290628, - 0.9960940803944817, - 0.9960943937257063, - 0.9960950834517639, - 0.996096336786233, - 0.9960991115374525, - 0.9961041565792752, - 0.9961162648881577, - 0.9961384642190013, - 0.9961826183509884, - 0.9964006615000487, - 0.996690675491532, - 0.9973378352055061, - 0.997532147032082, - 0.9968847352024922 - ], - "yaxis": "y" - } - ], - "layout": { - "legend": { - "title": { - "text": "variable" - }, - "tracegroupgap": 0 - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - 
"#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "fillpattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - 
}, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - 
"zerolinecolor": "white", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "Compression - Ascending " - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Input Size (Bytes)" - }, - "type": "category" - }, - "yaxis": { - "anchor": "x", - "domain": [ - 0, - 1 - ], - "title": { - "text": "Compression Factor" - } - } - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import plotly.express as px\n", - "title = 'Compression - ' + TARGET + \" \" + str(DTYPE)\n", - "fig = px.line(df, x='Input Size (Bytes)',\n", - " y=[\n", - " 'Lz4 Gpu Compression Ratio',\n", - " 'Cascaded Compression Ratio',\n", - " 'Lz4 Host Compression Ratio'\n", - " ],\n", - " labels={'value': 'Compression Factor'},\n", - " title=title)\n", - "fig.update_xaxes(type='category')\n", - "fig.show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/zarr.ipynb b/notebooks/zarr.ipynb deleted file mode 100644 index 33a981ebf5..0000000000 --- a/notebooks/zarr.ipynb +++ /dev/null @@ -1,364 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 23, - "id": "7a060f7d-9a0c-4763-98df-7dc82409c6ba", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "In this tutorial, we will show how to use KvikIO to read and write GPU memory directly to/from Zarr files.\n", - "\"\"\"\n", - "import json\n", - "import shutil\n", - "import numpy\n", - "import cupy\n", - "import zarr\n", - "import kvikio\n", - "import kvikio.zarr\n", - "from kvikio.nvcomp_codec import NvCompBatchCodec\n", - "from numcodecs import LZ4" - ] - }, - { - "cell_type": "markdown", - "id": "99f4d25b-2006-4026-8629-1accafb338ef", - "metadata": {}, - "source": [ - "We need to set three Zarr arguments: \n", - " - `meta_array`: in order to make Zarr read into GPU memory (instead of CPU memory), we set the `meta_array` argument to an empty CuPy array. \n", - " - `store`: we need to use a GPU compatible Zarr Store, which will be KvikIO’s GDS store in our case. \n", - " - `compressor`: finally, we need to use a GPU compatible compressor (or `None`). KvikIO provides a nvCOMP compressor `kvikio.nvcomp_codec.NvCompBatchCodec` that we will use." 
- ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "c179c24a-766e-4e09-83c5-349868042576", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(,\n", - " NvCompBatchCodec(algorithm='lz4', options={}),\n", - " )" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Let's create a new Zarr array using KvikIO's GDS store and LZ4 compression\n", - "z = zarr.array(\n", - " cupy.arange(10), \n", - " chunks=2, \n", - " store=kvikio.zarr.GDSStore(\"my-zarr-file.zarr\"), \n", - " meta_array=cupy.empty(()),\n", - " compressor=NvCompBatchCodec(\"lz4\"),\n", - " overwrite=True,\n", - ")\n", - "z, z.compressor, z.store" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "cupy.ndarray" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# And because we set the `meta_array` argument, reading the Zarr array returns a CuPy array\n", - "type(z[:])" - ] - }, - { - "cell_type": "markdown", - "id": "549ded39-1053-4f82-a8a7-5a2ee999a4a1", - "metadata": {}, - "source": [ - "From this point onwards, `z` can be used just like any other Zarr array." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "8221742d-f15c-450a-9701-dc8c05326126", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([1, 2, 3, 4, 5, 6, 7, 8])" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "z[1:9]" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "f0c451c1-a240-4b26-a5ef-6e70a5bbeb55", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([42, 43, 44, 45, 46, 47, 48, 49, 50, 51])" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "z[:] + 42" - ] - }, - { - "cell_type": "markdown", - "id": "7797155f-40f4-4c50-b704-2356ca64cba3", - "metadata": {}, - "source": [ - "### GPU compression / CPU decompression" - ] - }, - { - "cell_type": "markdown", - "id": "a0029deb-19b9-4dbb-baf0-ce4b199605a5", - "metadata": {}, - "source": [ - "In order to read GPU-written Zarr file into a NumPy array, we simply open that file **without** setting the `meta_array` argument:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "399f23f7-4475-496a-a537-a7163a35c888", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(numpy.ndarray,\n", - " kvikio.nvcomp_codec.NvCompBatchCodec,\n", - " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "z = zarr.open_array(kvikio.zarr.GDSStore(\"my-zarr-file.zarr\"))\n", - "type(z[:]), type(z.compressor), z[:]" - ] - }, - { - "cell_type": "markdown", - "id": "8e9f31d5", - "metadata": {}, - "source": [ - "And we don't need to use `kvikio.zarr.GDSStore` either:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "4b1f46b2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(numpy.ndarray,\n", - " kvikio.nvcomp_codec.NvCompBatchCodec,\n", - " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "z = zarr.open_array(\"my-zarr-file.zarr\")\n", - "type(z[:]), type(z.compressor), z[:]" - ] - }, - { - "cell_type": "markdown", - "id": 
"f10fd704-35f7-46b7-aabe-ea68fb2bf88d", - "metadata": {}, - "source": [ - "However, the above use `NvCompBatchCodec(\"lz4\")` for decompression. In the following, we will show how to read Zarr file written and compressed using a GPU on the CPU.\n", - "\n", - "Some algorithms, such as LZ4, can be used interchangeably on CPU and GPU but Zarr will always use the compressor used to write the Zarr file. We are working with the Zarr team to fix this shortcoming but for now, we will use a workaround where we _patch_ the metadata manually." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "d980361a-e132-4f29-ab13-cbceec5bbbb5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(numpy.ndarray, numcodecs.lz4.LZ4, array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Read the Zarr metadata and replace the compressor with a CPU implementation of LZ4\n", - "store = zarr.DirectoryStore(\"my-zarr-file.zarr\") # We could also have used kvikio.zarr.GDSStore\n", - "meta = json.loads(store[\".zarray\"])\n", - "meta[\"compressor\"] = LZ4().get_config()\n", - "store[\".zarray\"] = json.dumps(meta).encode() # NB: this changes the Zarr metadata on disk\n", - "\n", - "# And then open the file as usually\n", - "z = zarr.open_array(store)\n", - "type(z[:]), type(z.compressor), z[:]" - ] - }, - { - "cell_type": "markdown", - "id": "8ea73705", - "metadata": {}, - "source": [ - "### CPU compression / GPU decompression\n", - "\n", - "Now, let's try the otherway around." - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "c9b2d56a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(,\n", - " LZ4(acceleration=1),\n", - " )" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numcodecs\n", - "# Let's create a new Zarr array using the default compression.\n", - "z = zarr.array(\n", - " numpy.arange(10), \n", - " chunks=2, \n", - " store=\"my-zarr-file.zarr\", \n", - " overwrite=True,\n", - " # The default (CPU) implementation of LZ4 codec.\n", - " compressor=numcodecs.registry.get_codec({\"id\": \"lz4\"})\n", - ")\n", - "z, z.compressor, z.store" - ] - }, - { - "cell_type": "markdown", - "id": "dedd4623", - "metadata": {}, - "source": [ - "Again, we will use a workaround where we _patch_ the metadata manually." 
- ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "ac3f30b1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(cupy.ndarray,\n", - " kvikio.nvcomp_codec.NvCompBatchCodec,\n", - " array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Read the Zarr metadata and replace the compressor with a GPU implementation of LZ4\n", - "store = kvikio.zarr.GDSStore(\"my-zarr-file.zarr\") # We could also have used zarr.DirectoryStore\n", - "meta = json.loads(store[\".zarray\"])\n", - "meta[\"compressor\"] = NvCompBatchCodec(\"lz4\").get_config()\n", - "store[\".zarray\"] = json.dumps(meta).encode() # NB: this changes the Zarr metadata on disk\n", - "\n", - "# And then open the file as usually\n", - "z = zarr.open_array(store, meta_array=cupy.empty(()))\n", - "type(z[:]), type(z.compressor), z[:]" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "80682922-b7b0-4b08-b595-228c2b446a78", - "metadata": {}, - "outputs": [], - "source": [ - "# Clean up\n", - "shutil.rmtree(\"my-zarr-file.zarr\", ignore_errors=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/python/kvikio/CMakeLists.txt b/python/kvikio/CMakeLists.txt index 3e5af8c857..480e485d2b 100644 --- a/python/kvikio/CMakeLists.txt +++ b/python/kvikio/CMakeLists.txt @@ -26,8 +26,6 @@ project( LANGUAGES CXX CUDA ) -option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF) - find_package(kvikio REQUIRED "${RAPIDS_VERSION}") find_package(CUDAToolkit REQUIRED) @@ -35,8 +33,6 @@ find_package(CUDAToolkit REQUIRED) include(rapids-cython-core) rapids_cython_init() -add_subdirectory(cmake) - set(cython_lib_dir kvikio) add_subdirectory(kvikio/_lib) diff --git a/python/kvikio/cli/gpu_compressor.py b/python/kvikio/cli/gpu_compressor.py deleted file mode 100755 index ac34b15e8f..0000000000 --- a/python/kvikio/cli/gpu_compressor.py +++ /dev/null @@ -1,136 +0,0 @@ -# NVIDIA 2022 - -import argparse -import os -import sys -import time - -import cupy - -import kvikio -import kvikio.nvcomp as nvcomp - - -def get_parser(): - class NvcompParser(argparse.ArgumentParser): - """ - Handle special case and show help on invalid argument - """ - - def error(self, message): - sys.stderr.write("\nERROR: {}\n\n".format(message)) - self.print_help() - sys.exit(2) - - parser = NvcompParser() - parser.add_argument("-v", "--verbose", action="store_true", help="Verbose Output") - parser.add_argument( - "-o", - "--out_file", - action="store", - dest="out_file", - help="Output filename", - ) - parser.add_argument( - "-c", - choices=["ans", "bitcomp", "cascaded", "gdeflate", "lz4", "snappy"], - action="store", - dest="compression", - help="Which GPU algorithm to use for compression.", - ) - parser.add_argument( - "-d", - action="store_true", - help="Decompress the incoming file", - ) - parser.add_argument(action="store", dest="filename", help="Relative Filename") - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - print("GPU Compression Initialized") if 
args.verbose else None - - file_size = os.path.getsize(args.filename) - """ test - data = cupy.arange(10000, dtype="uint8") - """ - data = cupy.zeros(file_size, dtype=cupy.int8) - t = time.time() - f = kvikio.CuFile(args.filename, "r") - f.read(data) - f.close() - read_time = time.time() - t - print(f"File read time: {read_time:.3} seconds.") if args.verbose else None - - if args.d: - compressor = nvcomp.ManagedDecompressionManager(data) - elif args.compression == "ans": - compressor = nvcomp.ANSManager() - elif args.compression == "bitcomp": - compressor = nvcomp.BitcompManager() - elif args.compression == "cascaded": - compressor = nvcomp.CascadedManager() - elif args.compression == "gdeflate": - compressor = nvcomp.GdeflateManager() - elif args.compression == "snappy": - compressor = nvcomp.SnappyManager() - else: - compressor = nvcomp.LZ4Manager(chunk_size=1 << 16) - - if args.d is True: - print(f"Decompressing {file_size} bytes") if args.verbose else None - t = time.time() - converted = compressor.decompress(data) - decompress_time = time.time() - t - print( - f"Decompression time: {decompress_time:.3} seconds" - ) if args.verbose else None - - if not args.out_file: - raise ValueError("Must specify filename with -o for decompression.") - - t = time.time() - o = kvikio.CuFile(args.out_file, "w") - o.write(converted) - o.close() - io_time = time.time() - t - print(f"File write time: {io_time:.3} seconds") if args.verbose else None - - print( - f"Decompressed file size {os.path.getsize(args.out_file)}" - ) if args.verbose else None - else: - file_size = os.path.getsize(args.filename) - - print(f"Compressing {file_size} bytes") if args.verbose else None - t = time.time() - converted = compressor.compress(data) - compress_time = time.time() - t - print(f"Compression time: {compress_time:.3} seconds") if args.verbose else None - - t = time.time() - if args.out_file: - o = kvikio.CuFile(args.out_file, "w") - else: - o = kvikio.CuFile(args.filename + ".gpc", "w") - o.write(converted) - o.close() - io_time = time.time() - t - print(f"File write time: {io_time:.3} seconds") if args.verbose else None - - print( - f"Compressed file size {compressor.get_compressed_output_size(converted)}" - ) if args.verbose else None - - if args.out_file: - end_name = args.out_file - else: - end_name = args.filename + ".gpc" - print(f"Created file {end_name}") if args.verbose else None - - -if __name__ == "__main__": - main() diff --git a/python/kvikio/cmake/CMakeLists.txt b/python/kvikio/cmake/CMakeLists.txt deleted file mode 100644 index d3882b5ab3..0000000000 --- a/python/kvikio/cmake/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -include(thirdparty/get_nvcomp.cmake) diff --git a/python/kvikio/cmake/thirdparty/get_nvcomp.cmake b/python/kvikio/cmake/thirdparty/get_nvcomp.cmake deleted file mode 100644 index a2c6326e76..0000000000 --- a/python/kvikio/cmake/thirdparty/get_nvcomp.cmake +++ /dev/null @@ -1,33 +0,0 @@ -# ============================================================================= -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(KVIKIO_USE_PROPRIETARY_BINARY ON) - -# This function finds nvcomp and sets any additional necessary environment variables. -function(find_and_configure_nvcomp) - - include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - set(export_args) - if(KvikIO_EXPORT_NVCOMP) - set(export_args BUILD_EXPORT_SET kvikio-exports INSTALL_EXPORT_SET kvikio-exports) - endif() - rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${KVIKIO_USE_PROPRIETARY_BINARY}) - - # Per-thread default stream - if(TARGET nvcomp AND PER_THREAD_DEFAULT_STREAM) - target_compile_definitions(nvcomp PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM) - endif() -endfunction() - -find_and_configure_nvcomp() diff --git a/python/kvikio/examples/zarr_cupy_nvcomp.py b/python/kvikio/examples/zarr_cupy_nvcomp.py deleted file mode 100644 index 9f05f7874a..0000000000 --- a/python/kvikio/examples/zarr_cupy_nvcomp.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -import cupy -import numpy -import zarr - -import kvikio -import kvikio.zarr - - -def main(path): - a = cupy.arange(20) - - # Let's use KvikIO's convenience function `open_cupy_array()` to create - # a new Zarr file on disk. Its semantic is the same as `zarr.open_array()` - # but uses a GDS file store, nvCOMP compression, and CuPy arrays. - z = kvikio.zarr.open_cupy_array(store=path, mode="w", shape=(20,), chunks=(5,)) - - # `z` is a regular Zarr Array that we can write to as usual - z[0:10] = numpy.arange(0, 10) - # but it also support direct reads and writes of CuPy arrays - z[10:20] = cupy.arange(10, 20) - - # Reading `z` returns a CuPy array - assert isinstance(z[:], cupy.ndarray) - assert (a == z[:]).all() - - # Normally, we cannot assume that GPU and CPU compressors are compatible. - # E.g., `open_cupy_array()` uses nvCOMP's Snappy GPU compression by default, - # which, as far as we know, isn’t compatible with any CPU compressor. Thus, - # let's re-write our Zarr array using a CPU and GPU compatible compressor. - # - # Warning: it isn't possible to use `CompatCompressor` as a compressor argument - # in Zarr directly. It is only meant for `open_cupy_array()`. However, - # in an example further down, we show how to write using regular Zarr. 
- z = kvikio.zarr.open_cupy_array( - store=path, - mode="w", - shape=(20,), - chunks=(5,), - compressor=kvikio.zarr.CompatCompressor.lz4(), - ) - z[:] = a - - # Because we are using a CompatCompressor, it is now possible to open the file - # using Zarr's built-in LZ4 decompressor that uses the CPU. - z = zarr.open_array(path) - # `z` is now read as a regular NumPy array - assert isinstance(z[:], numpy.ndarray) - assert (a.get() == z[:]).all() - # and we can write to is as usual - z[:] = numpy.arange(20, 40) - - # And we can read the Zarr file back into a CuPy array. - z = kvikio.zarr.open_cupy_array(store=path, mode="r") - assert isinstance(z[:], cupy.ndarray) - assert (cupy.arange(20, 40) == z[:]).all() - - # Similarly, we can also open a file written by regular Zarr. - # Let's write the file without any compressor. - ary = numpy.arange(10) - z = zarr.open(store=path, mode="w", shape=ary.shape, compressor=None) - z[:] = ary - # This works as before where the file is read as a CuPy array - z = kvikio.zarr.open_cupy_array(store=path) - assert isinstance(z[:], cupy.ndarray) - assert (z[:] == cupy.asarray(ary)).all() - - # Using a compressor is a bit more tricky since not all CPU compressors - # are GPU compatible. To make sure we use a compable compressor, we use - # the CPU-part of `CompatCompressor.lz4()`. - ary = numpy.arange(10) - z = zarr.open( - store=path, - mode="w", - shape=ary.shape, - compressor=kvikio.zarr.CompatCompressor.lz4().cpu, - ) - z[:] = ary - # This works as before where the file is read as a CuPy array - z = kvikio.zarr.open_cupy_array(store=path) - assert isinstance(z[:], cupy.ndarray) - assert (z[:] == cupy.asarray(ary)).all() - - -if __name__ == "__main__": - main("/tmp/zarr-cupy-nvcomp") diff --git a/python/kvikio/kvikio/__init__.py b/python/kvikio/kvikio/__init__.py index 9208d4e3ce..124698206e 100644 --- a/python/kvikio/kvikio/__init__.py +++ b/python/kvikio/kvikio/__init__.py @@ -15,14 +15,17 @@ from kvikio._lib.defaults import CompatMode # noqa: F401 from kvikio._version import __git_commit__, __version__ from kvikio.cufile import CuFile, clear_page_cache, get_page_cache_info -from kvikio.remote_file import RemoteFile, is_remote_file_available +from kvikio.mmap import Mmap +from kvikio.remote_file import RemoteEndpointType, RemoteFile, is_remote_file_available __all__ = [ "__git_commit__", "__version__", "clear_page_cache", "CuFile", + "Mmap", "get_page_cache_info", "is_remote_file_available", + "RemoteEndpointType", "RemoteFile", ] diff --git a/python/kvikio/kvikio/_lib/CMakeLists.txt b/python/kvikio/kvikio/_lib/CMakeLists.txt index 1ea9b85dff..b46d59c960 100644 --- a/python/kvikio/kvikio/_lib/CMakeLists.txt +++ b/python/kvikio/kvikio/_lib/CMakeLists.txt @@ -14,7 +14,7 @@ # Set the list of Cython files to build, one .so per file set(cython_modules arr.pyx buffer.pyx defaults.pyx cufile_driver.pyx file_handle.pyx future.pyx - libnvcomp.pyx libnvcomp_ll.pyx + mmap.pyx ) if(KvikIO_REMOTE_SUPPORT) @@ -30,15 +30,5 @@ endif() rapids_cython_create_modules( CXX SOURCE_FILES "${cython_modules}" - LINKED_LIBRARIES kvikio::kvikio nvcomp::nvcomp + LINKED_LIBRARIES kvikio::kvikio ) -if(USE_NVCOMP_RUNTIME_WHEEL) - set(rpaths "$ORIGIN/../../nvidia/nvcomp") - foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) - set_property( - TARGET ${tgt} - PROPERTY INSTALL_RPATH ${rpaths} - APPEND - ) - endforeach() -endif() diff --git a/python/kvikio/kvikio/_lib/buffer.pyx b/python/kvikio/kvikio/_lib/buffer.pyx index 3b90f09816..a7b638eaf4 100644 --- 
a/python/kvikio/kvikio/_lib/buffer.pyx +++ b/python/kvikio/kvikio/_lib/buffer.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. # distutils: language = c++ @@ -17,14 +17,16 @@ def memory_register(buf) -> None: if not isinstance(buf, Array): buf = Array(buf) cdef Array arr = buf - cpp_memory_register(arr.ptr) + with nogil: + cpp_memory_register(arr.ptr) def memory_deregister(buf) -> None: if not isinstance(buf, Array): buf = Array(buf) cdef Array arr = buf - cpp_memory_deregister(arr.ptr) + with nogil: + cpp_memory_deregister(arr.ptr) cdef extern from "" nogil: @@ -32,4 +34,7 @@ cdef extern from "" nogil: def bounce_buffer_free() -> int: - return cpp_alloc_retain_clear() + cdef size_t result + with nogil: + result = cpp_alloc_retain_clear() + return result diff --git a/python/kvikio/kvikio/_lib/cufile_driver.pyx b/python/kvikio/kvikio/_lib/cufile_driver.pyx index 0488eb3b20..0bc44b2066 100644 --- a/python/kvikio/kvikio/_lib/cufile_driver.pyx +++ b/python/kvikio/kvikio/_lib/cufile_driver.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. # distutils: language = c++ @@ -15,15 +15,20 @@ cdef extern from "" nogil: def libcufile_version() -> int: - return cpp_libcufile_version() + cdef int version + with nogil: + version = cpp_libcufile_version() + return version def driver_open(): - cpp_driver_open() + with nogil: + cpp_driver_open() def driver_close(): - cpp_driver_close() + with nogil: + cpp_driver_close() cdef extern from "" nogil: @@ -49,55 +54,90 @@ cdef class DriverProperties: @property def is_gds_available(self) -> bool: + cdef bool result try: - return self._handle.is_gds_available() + with nogil: + result = self._handle.is_gds_available() + return result except RuntimeError: return False @property def major_version(self) -> bool: - return self._handle.get_nvfs_major_version() + cdef unsigned int version + with nogil: + version = self._handle.get_nvfs_major_version() + return version @property def minor_version(self) -> bool: - return self._handle.get_nvfs_minor_version() + cdef unsigned int version + with nogil: + version = self._handle.get_nvfs_minor_version() + return version @property def allow_compat_mode(self) -> bool: - return self._handle.get_nvfs_allow_compat_mode() + cdef bool result + with nogil: + result = self._handle.get_nvfs_allow_compat_mode() + return result @property def poll_mode(self) -> bool: - return self._handle.get_nvfs_poll_mode() + cdef bool result + with nogil: + result = self._handle.get_nvfs_poll_mode() + return result @poll_mode.setter def poll_mode(self, enable: bool) -> None: - self._handle.set_nvfs_poll_mode(enable) + cdef bool cpp_enable = enable + with nogil: + self._handle.set_nvfs_poll_mode(cpp_enable) @property def poll_thresh_size(self) -> int: - return self._handle.get_nvfs_poll_thresh_size() + cdef size_t size + with nogil: + size = self._handle.get_nvfs_poll_thresh_size() + return size @poll_thresh_size.setter def poll_thresh_size(self, size_in_kb: int) -> None: - self._handle.set_nvfs_poll_thresh_size(size_in_kb) + cdef size_t size = size_in_kb + with nogil: + self._handle.set_nvfs_poll_thresh_size(size) @property def max_device_cache_size(self) -> int: - return self._handle.get_max_device_cache_size() + cdef size_t size + with nogil: + size = self._handle.get_max_device_cache_size() 
+ return size @max_device_cache_size.setter def max_device_cache_size(self, size_in_kb: int) -> None: - self._handle.set_max_device_cache_size(size_in_kb) + cdef size_t size = size_in_kb + with nogil: + self._handle.set_max_device_cache_size(size) @property def per_buffer_cache_size(self) -> int: - return self._handle.get_per_buffer_cache_size() + cdef size_t size + with nogil: + size = self._handle.get_per_buffer_cache_size() + return size @property def max_pinned_memory_size(self) -> int: - return self._handle.get_max_pinned_memory_size() + cdef size_t size + with nogil: + size = self._handle.get_max_pinned_memory_size() + return size @max_pinned_memory_size.setter def max_pinned_memory_size(self, size_in_kb: int) -> None: - self._handle.set_max_pinned_memory_size(size_in_kb) + cdef size_t size = size_in_kb + with nogil: + self._handle.set_max_pinned_memory_size(size) diff --git a/python/kvikio/kvikio/_lib/defaults.pyx b/python/kvikio/kvikio/_lib/defaults.pyx index 00f1de4ec1..2fb6065680 100644 --- a/python/kvikio/kvikio/_lib/defaults.pyx +++ b/python/kvikio/kvikio/_lib/defaults.pyx @@ -37,75 +37,113 @@ cdef extern from "" namespace "kvikio" nogil: vector[int] cpp_http_status_codes "kvikio::defaults::http_status_codes"() except + void cpp_set_http_status_codes \ "kvikio::defaults::set_http_status_codes"(vector[int] status_codes) except + - long cpp_http_timeout "kvikio::defaults::http_timeout"() except + void cpp_set_http_timeout\ "kvikio::defaults::set_http_timeout"(long timeout_seconds) except + def is_compat_mode_preferred() -> bool: - return cpp_is_compat_mode_preferred() + cdef bool result + with nogil: + result = cpp_is_compat_mode_preferred() + return result def compat_mode() -> CompatMode: - return cpp_compat_mode() + cdef CompatMode result + with nogil: + result = cpp_compat_mode() + return result def set_compat_mode(compat_mode: CompatMode) -> None: - cpp_set_compat_mode(compat_mode) + with nogil: + cpp_set_compat_mode(compat_mode) def thread_pool_nthreads() -> int: - return cpp_thread_pool_nthreads() + cdef unsigned int result + with nogil: + result = cpp_thread_pool_nthreads() + return result def set_thread_pool_nthreads(nthreads: int) -> None: - cpp_set_thread_pool_nthreads(nthreads) + cdef unsigned int cpp_nthreads = nthreads + with nogil: + cpp_set_thread_pool_nthreads(cpp_nthreads) def task_size() -> int: - return cpp_task_size() + cdef size_t result + with nogil: + result = cpp_task_size() + return result def set_task_size(nbytes: int) -> None: - cpp_set_task_size(nbytes) + cdef size_t cpp_nbytes = nbytes + with nogil: + cpp_set_task_size(cpp_nbytes) def gds_threshold() -> int: - return cpp_gds_threshold() + cdef size_t result + with nogil: + result = cpp_gds_threshold() + return result def set_gds_threshold(nbytes: int) -> None: - cpp_set_gds_threshold(nbytes) + cdef size_t cpp_nbytes = nbytes + with nogil: + cpp_set_gds_threshold(cpp_nbytes) def bounce_buffer_size() -> int: - return cpp_bounce_buffer_size() + cdef size_t result + with nogil: + result = cpp_bounce_buffer_size() + return result def set_bounce_buffer_size(nbytes: int) -> None: - cpp_set_bounce_buffer_size(nbytes) + cdef size_t cpp_nbytes = nbytes + with nogil: + cpp_set_bounce_buffer_size(cpp_nbytes) def http_max_attempts() -> int: - return cpp_http_max_attempts() + cdef size_t result + with nogil: + result = cpp_http_max_attempts() + return result def set_http_max_attempts(attempts: int) -> None: - cpp_set_http_max_attempts(attempts) + cdef size_t cpp_attempts = attempts + with nogil: + 
cpp_set_http_max_attempts(cpp_attempts) def http_timeout() -> int: - return cpp_http_timeout() + cdef long result + with nogil: + result = cpp_http_timeout() + return result def set_http_timeout(timeout: int) -> None: - return cpp_set_http_timeout(timeout) + cdef long cpp_timeout = timeout + with nogil: + cpp_set_http_timeout(cpp_timeout) def http_status_codes() -> list[int]: + # Cannot use nogil here because we need the GIL for list creation return cpp_http_status_codes() def set_http_status_codes(status_codes: list[int]) -> None: - return cpp_set_http_status_codes(status_codes) + # Cannot use nogil here because we need the GIL for list conversion + cpp_set_http_status_codes(status_codes) diff --git a/python/kvikio/kvikio/_lib/file_handle.pyx b/python/kvikio/kvikio/_lib/file_handle.pyx index c0d71f36a7..062d0d9fb5 100644 --- a/python/kvikio/kvikio/_lib/file_handle.pyx +++ b/python/kvikio/kvikio/_lib/file_handle.pyx @@ -6,7 +6,6 @@ import io import os -import pathlib from typing import Optional, Union from posix cimport fcntl @@ -95,88 +94,127 @@ cdef class CuFile: cdef FileHandle _handle def __init__(self, file_path, flags="r"): - self._handle = move( - FileHandle( - str.encode(str(pathlib.Path(file_path))), - str.encode(str(flags)) + cdef string cpp_file_path = os.fsencode(file_path) + cdef string cpp_flags = str(flags).encode() + with nogil: + self._handle = move( + FileHandle( + cpp_file_path, + cpp_flags + ) ) - ) def close(self) -> None: - self._handle.close() + with nogil: + self._handle.close() def closed(self) -> bool: - return self._handle.closed() + cdef bool result + with nogil: + result = self._handle.closed() + return result def fileno(self) -> int: - return self._handle.fd() + cdef int result + with nogil: + result = self._handle.fd() + return result def open_flags(self) -> int: - return self._handle.fd_open_flags() + cdef int result + with nogil: + result = self._handle.fd_open_flags() + return result def pread(self, buf, size: Optional[int], file_offset: int, task_size) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return _wrap_io_future( - self._handle.pread( + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_task_size = task_size if task_size else defaults.task_size() + cdef future[size_t] fut + with nogil: + fut = self._handle.pread( info.first, info.second, - file_offset, - task_size if task_size else defaults.task_size() + cpp_file_offset, + cpp_task_size ) - ) + return _wrap_io_future(fut) def pwrite(self, buf, size: Optional[int], file_offset: int, task_size) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return _wrap_io_future( - self._handle.pwrite( + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_task_size = task_size if task_size else defaults.task_size() + cdef future[size_t] fut + with nogil: + fut = self._handle.pwrite( info.first, info.second, - file_offset, - task_size if task_size else defaults.task_size() + cpp_file_offset, + cpp_task_size ) - ) + return _wrap_io_future(fut) def read(self, buf, size: Optional[int], file_offset: int, dev_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return self._handle.read( - info.first, - info.second, - file_offset, - dev_offset, - ) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef size_t result + with nogil: + result = self._handle.read( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + ) + return 
result def write(self, buf, size: Optional[int], file_offset: int, dev_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return self._handle.write( - info.first, - info.second, - file_offset, - dev_offset, - ) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef size_t result + with nogil: + result = self._handle.write( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + ) + return result def read_async(self, buf, size: Optional[int], file_offset: int, dev_offset: int, st: uintptr_t) -> IOFutureStream: - stream = st + cdef CUstream stream = st cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return _wrap_stream_future(self._handle.read_async( - info.first, - info.second, - file_offset, - dev_offset, - stream, - )) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef cpp_StreamFuture fut + with nogil: + fut = self._handle.read_async( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + stream, + ) + return _wrap_stream_future(fut) def write_async(self, buf, size: Optional[int], file_offset: int, dev_offset: int, st: uintptr_t) -> IOFutureStream: - stream = st + cdef CUstream stream = st cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, False) - return _wrap_stream_future(self._handle.write_async( - info.first, - info.second, - file_offset, - dev_offset, - stream, - )) + cdef size_t cpp_file_offset = file_offset + cdef size_t cpp_dev_offset = dev_offset + cdef cpp_StreamFuture fut + with nogil: + fut = self._handle.write_async( + info.first, + info.second, + cpp_file_offset, + cpp_dev_offset, + stream, + ) + return _wrap_stream_future(fut) cdef extern from "" nogil: pair[size_t, size_t] cpp_get_page_cache_info_str \ @@ -192,21 +230,37 @@ cdef extern from "" nogil: def get_page_cache_info(file: Union[os.PathLike, str, int, io.IOBase]) \ -> tuple[int, int]: + cdef pair[size_t, size_t] result + cdef string path_bytes + cdef int fd + if isinstance(file, os.PathLike) or isinstance(file, str): # file is a path or a string object - path_bytes = str(pathlib.Path(file)).encode() - return cpp_get_page_cache_info_str(path_bytes) + path_bytes = os.fsencode(file) + with nogil: + result = cpp_get_page_cache_info_str(path_bytes) + return result elif isinstance(file, int): # file is a file descriptor - return cpp_get_page_cache_info_int(file) + fd = file + with nogil: + result = cpp_get_page_cache_info_int(fd) + return result elif isinstance(file, io.IOBase): # file is a file object # pass its file descriptor to the underlying C++ function - return cpp_get_page_cache_info_int(file.fileno()) + fd = file.fileno() + with nogil: + result = cpp_get_page_cache_info_int(fd) + return result else: raise ValueError("The type of `file` must be `os.PathLike`, `str`, `int`, " "or `io.IOBase`") -def clear_page_cache(reclaim_dentries_and_inodes: bool, clear_dirty_pages: bool): - return cpp_clear_page_cache(reclaim_dentries_and_inodes, clear_dirty_pages) +def clear_page_cache(reclaim_dentries_and_inodes: bool, + clear_dirty_pages: bool) -> bool: + cdef bool result + with nogil: + result = cpp_clear_page_cache(reclaim_dentries_and_inodes, clear_dirty_pages) + return result diff --git a/python/kvikio/kvikio/_lib/future.pyx b/python/kvikio/kvikio/_lib/future.pyx index da6ab308dc..94d9dd6d9c 100644 --- a/python/kvikio/kvikio/_lib/future.pyx +++ b/python/kvikio/kvikio/_lib/future.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, 
NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. # distutils: language = c++ @@ -15,7 +15,10 @@ cdef extern from "" namespace "kvikio" nogil: cdef class IOFutureStream: """Wrap a C++ StreamFuture in a Python object""" def check_bytes_done(self) -> int: - return self._handle.check_bytes_done() + cdef size_t bytes_done + with nogil: + bytes_done = self._handle.check_bytes_done() + return bytes_done cdef IOFutureStream _wrap_stream_future(cpp_StreamFuture &fut): @@ -34,7 +37,10 @@ cdef class IOFuture: return ret def done(self) -> bool: - return is_future_done(self._handle) + cdef bool result + with nogil: + result = is_future_done(self._handle) + return result cdef IOFuture _wrap_io_future(future[size_t] &fut): diff --git a/python/kvikio/kvikio/_lib/libnvcomp.pyx b/python/kvikio/kvikio/_lib/libnvcomp.pyx deleted file mode 100644 index dc5359e9b3..0000000000 --- a/python/kvikio/kvikio/_lib/libnvcomp.pyx +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) 2022 Carson Swope -# Use, modification, and distribution is subject to the MIT License -# https://github.com/carsonswope/py-nvcomp/blob/main/LICENSE) -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: MIT -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. 
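(Note on the Cython changes above: the CuFile, defaults, and future bindings now move their blocking C++ calls under `with nogil`, so the parallel I/O path no longer holds the GIL while a request is in flight. A minimal usage sketch from the Python side, assuming a CUDA-capable environment and an existing input file; the file name "data.bin" is only a placeholder:)

    import cupy
    import kvikio

    buf = cupy.empty(1 << 20, dtype=cupy.uint8)   # destination buffer in GPU memory
    with kvikio.CuFile("data.bin", "r") as f:     # placeholder path for illustration
        fut = f.pread(buf)                        # returns an IOFuture; the C++ read runs with the GIL released
        nbytes = fut.get()                        # block until the parallel read completes
    print(nbytes)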
- -from enum import Enum - -from libc.stdint cimport uint8_t, uintptr_t -from libcpp cimport nullptr -from libcpp.memory cimport make_shared, shared_ptr -from libcpp.utility cimport move - -from kvikio._lib.arr cimport Array -from kvikio._lib.nvcomp_cxx_api cimport ( - ANSManager, - BitcompManager, - CascadedManager, - CompressionConfig, - DecompressionConfig, - GdeflateManager, - LZ4Manager, - SnappyManager, - create_manager, - nvcompBatchedANSDefaultOpts, - nvcompBatchedANSOpts_t, - nvcompBatchedBitcompFormatOpts, - nvcompBatchedCascadedDefaultOpts, - nvcompBatchedCascadedOpts_t, - nvcompBatchedGdeflateOpts_t, - nvcompBatchedLZ4Opts_t, - nvcompBatchedSnappyDefaultOpts, - nvcompBatchedSnappyOpts_t, - nvcompManagerBase, - nvcompType_t, -) - - -class pyNvcompType_t(Enum): - pyNVCOMP_TYPE_CHAR = nvcompType_t.NVCOMP_TYPE_CHAR - pyNVCOMP_TYPE_UCHAR = nvcompType_t.NVCOMP_TYPE_UCHAR - pyNVCOMP_TYPE_SHORT = nvcompType_t.NVCOMP_TYPE_SHORT - pyNVCOMP_TYPE_USHORT = nvcompType_t.NVCOMP_TYPE_USHORT - pyNVCOMP_TYPE_INT = nvcompType_t.NVCOMP_TYPE_INT - pyNVCOMP_TYPE_UINT = nvcompType_t.NVCOMP_TYPE_UINT - pyNVCOMP_TYPE_LONGLONG = nvcompType_t.NVCOMP_TYPE_LONGLONG - pyNVCOMP_TYPE_ULONGLONG = nvcompType_t.NVCOMP_TYPE_ULONGLONG - pyNVCOMP_TYPE_BITS = nvcompType_t.NVCOMP_TYPE_BITS - - -cdef class _nvcompManager: - # Temporary storage for factory allocated manager to prevent cleanup - cdef shared_ptr[nvcompManagerBase] _mgr - cdef nvcompManagerBase* _impl - cdef shared_ptr[CompressionConfig] _compression_config - cdef shared_ptr[DecompressionConfig] _decompression_config - - def __dealloc__(self): - # `ManagedManager` uses a temporary object, self._mgr - # to retain a reference count to the Manager created by - # create_manager. If it is present, then the `shared_ptr` - # system will free self._impl. Otherwise, we need to free - # self._iNonempl - if self._mgr == nullptr: - del self._impl - - def configure_compression(self, decomp_buffer_size): - cdef shared_ptr[CompressionConfig] partial = make_shared[ - CompressionConfig]( - self._impl.configure_compression(decomp_buffer_size) - ) - self._compression_config = make_shared[CompressionConfig]( - (move(partial.get()[0])) - ) - cdef const CompressionConfig* compression_config_ptr = \ - self._compression_config.get() - return { - "uncompressed_buffer_size": compression_config_ptr. - uncompressed_buffer_size, - "max_compressed_buffer_size": compression_config_ptr. 
- max_compressed_buffer_size, - "num_chunks": compression_config_ptr.num_chunks - } - - def compress(self, Array decomp_buffer, Array comp_buffer): - cdef uintptr_t comp_buffer_ptr = comp_buffer.ptr - self._impl.compress( - decomp_buffer.ptr, - comp_buffer_ptr, - self._compression_config.get()[0] - ) - size = self._impl.get_compressed_output_size( - comp_buffer_ptr - ) - return size - - def configure_decompression_with_compressed_buffer( - self, - Array comp_buffer - ) -> dict: - cdef shared_ptr[DecompressionConfig] partial = make_shared[ - DecompressionConfig](self._impl.configure_decompression( - comp_buffer.ptr - ) - ) - self._decompression_config = make_shared[DecompressionConfig]( - (move(partial.get()[0])) - ) - cdef const DecompressionConfig* decompression_config_ptr = \ - self._decompression_config.get() - return { - "decomp_data_size": decompression_config_ptr.decomp_data_size, - "num_chunks": decompression_config_ptr.num_chunks - } - - def decompress( - self, - Array decomp_buffer, - Array comp_buffer, - ): - self._impl.decompress( - decomp_buffer.ptr, - comp_buffer.ptr, - self._decompression_config.get()[0] - ) - - def get_compressed_output_size(self, Array comp_buffer): - return self._impl.get_compressed_output_size( - comp_buffer.ptr - ) - - -cdef class _ANSManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - ): - self._impl = new ANSManager( - uncomp_chunk_size, - nvcompBatchedANSDefaultOpts - ) - - -cdef class _BitcompManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - nvcompType_t data_type, - int bitcomp_algo, - ): - cdef opts = nvcompBatchedBitcompFormatOpts(bitcomp_algo, data_type) - self._impl = new BitcompManager( - uncomp_chunk_size, - opts - ) - - -cdef class _CascadedManager(_nvcompManager): - def __cinit__( - self, - _options, - ): - self._impl = new CascadedManager( - _options["chunk_size"], - nvcompBatchedCascadedDefaultOpts - ) - - -cdef class _GdeflateManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - int algo, - ): - cdef opts = nvcompBatchedGdeflateOpts_t(algo) - self._impl = new GdeflateManager( - uncomp_chunk_size, - opts - ) - - -cdef class _LZ4Manager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - nvcompType_t data_type, - ): - # TODO: Doesn't work with user specified streams passed down - # from anywhere up. I'm not going to rabbit hole on it until - # everything else works. - # cdef cudaStream_t stream = user_stream - cdef opts = nvcompBatchedLZ4Opts_t(data_type) - self._impl = new LZ4Manager( - uncomp_chunk_size, - opts - ) - - -cdef class _SnappyManager(_nvcompManager): - def __cinit__( - self, - size_t uncomp_chunk_size, - ): - # TODO: Doesn't work with user specified streams passed down - # from anywhere up. I'm not going to rabbit hole on it until - # everything else works. - self._impl = new SnappyManager( - uncomp_chunk_size, - nvcompBatchedSnappyDefaultOpts - ) - - -cdef class _ManagedManager(_nvcompManager): - def __init__(self, compressed_buffer): - cdef shared_ptr[nvcompManagerBase] _mgr = create_manager( - compressed_buffer.ptr - ) - self._mgr = _mgr - self._impl = move(_mgr).get() diff --git a/python/kvikio/kvikio/_lib/libnvcomp_ll.pyx b/python/kvikio/kvikio/_lib/libnvcomp_ll.pyx deleted file mode 100644 index 46c7b399a9..0000000000 --- a/python/kvikio/kvikio/_lib/libnvcomp_ll.pyx +++ /dev/null @@ -1,1182 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
- -from __future__ import annotations - -from abc import ABC, abstractmethod -from enum import IntEnum - -from libc.stdint cimport uint32_t, uintptr_t - -from kvikio._lib.nvcomp_ll_cxx_api cimport cudaStream_t, nvcompStatus_t, nvcompType_t - -import cupy - - -class nvCompStatus(IntEnum): - Success = nvcompStatus_t.nvcompSuccess, - ErrorInvalidValue = nvcompStatus_t.nvcompErrorInvalidValue, - ErrorNotSupported = nvcompStatus_t.nvcompErrorNotSupported, - ErrorCannotDecompress = nvcompStatus_t.nvcompErrorCannotDecompress, - ErrorBadChecksum = nvcompStatus_t.nvcompErrorBadChecksum, - ErrorCannotVerifyChecksums = nvcompStatus_t.nvcompErrorCannotVerifyChecksums, - ErrorCudaError = nvcompStatus_t.nvcompErrorCudaError, - ErrorInternal = nvcompStatus_t.nvcompErrorInternal, - - -class nvCompType(IntEnum): - CHAR = nvcompType_t.NVCOMP_TYPE_CHAR - UCHAR = nvcompType_t.NVCOMP_TYPE_UCHAR - SHORT = nvcompType_t.NVCOMP_TYPE_SHORT - USHORT = nvcompType_t.NVCOMP_TYPE_USHORT - INT = nvcompType_t.NVCOMP_TYPE_INT - UINT = nvcompType_t.NVCOMP_TYPE_UINT - LONGLONG = nvcompType_t.NVCOMP_TYPE_LONGLONG - ULONGLONG = nvcompType_t.NVCOMP_TYPE_ULONGLONG - BITS = nvcompType_t.NVCOMP_TYPE_BITS - - -class nvCompBatchAlgorithm(ABC): - """Abstract class that provides interface to nvCOMP batched algorithms.""" - - # TODO(akamenev): it might be possible to have a simpler implementation that - # eilminates the need to have a separate implementation class for each algorithm, - # potentially using fused types in Cython (similar to C++ templates), - # but I could not figure out how to do that (e.g. each algorithm API set has - # a different type for the options and so on). - - def get_compress_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - """Get temporary space required for compression. - - Parameters - ---------- - batch_size: int - The number of items in the batch. - max_uncompressed_chunk_bytes: int - The maximum size in bytes of a chunk in the batch. - - Returns - ------- - int - The size in bytes of the required GPU workspace for compression. - """ - err, temp_size = self._get_comp_temp_size( - batch_size, - max_uncompressed_chunk_bytes - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get compress temp buffer size, " - f"error: {nvCompStatus(err)!r}." - ) - return temp_size - - @abstractmethod - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - """Algorithm-specific implementation.""" - ... - - def get_compress_chunk_size(self, size_t max_uncompressed_chunk_bytes): - """Get the maximum size any chunk could compress to in the batch. - - Parameters - ---------- - max_uncompressed_chunk_bytes: int - The maximum size in bytes of a chunk in the batch. - - Returns - ------- - int - The maximum compressed size in bytes of the largest chunk. That is, - the minimum amount of output memory required to be given to - the corresponding *CompressAsync function. - """ - err, comp_chunk_size = self._get_comp_chunk_size(max_uncompressed_chunk_bytes) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get output buffer size, " - f"error: {nvCompStatus(err)!r}." - ) - return comp_chunk_size - - @abstractmethod - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - """Algorithm-specific implementation.""" - ... 
- - def compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ): - """Perform compression. - - Parameters - ---------- - uncomp_chunks: cp.ndarray[uintp] - The pointers on the GPU, to uncompressed batched items. - uncomp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each uncompressed batch item on the GPU. - max_uncomp_chunk_bytes: int - The maximum size in bytes of the largest chunk in the batch. - batch_size: int - The number of chunks to compress. - temp_buf: cp.ndarray - The temporary GPU workspace. - comp_chunks: cp.ndarray[uintp] - (output) The list of pointers on the GPU, to the output location for each - compressed batch item. - comp_chunk_sizes: cp.ndarray[uint64] - (output) The compressed size in bytes of each chunk. - stream: cp.cuda.Stream - CUDA stream. - """ - - err = self._compress( - uncomp_chunks, - uncomp_chunk_sizes, - max_uncomp_chunk_bytes, - batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError(f"Compression failed, error: {nvCompStatus(err)!r}.") - - @abstractmethod - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - """Algorithm-specific implementation.""" - ... - - def get_decompress_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - """Get the amount of temp space required on the GPU for decompression. - - Parameters - ---------- - batch_size: int - The number of items in the batch. - max_uncompressed_chunk_bytes: int - The size in bytes of the largest chunk when uncompressed. - - Returns - ------- - int - The amount of temporary GPU space in bytes that will be - required to decompress. - """ - err, temp_size = self._get_decomp_temp_size( - batch_size, - max_uncompressed_chunk_bytes - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get decompress temp buffer size, " - f"error: {nvCompStatus(err)!r}." - ) - - return temp_size - - @abstractmethod - def _get_decomp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - """Algorithm-specific implementation.""" - ... - - def get_decompress_size( - self, - comp_chunks, - comp_chunk_sizes, - stream, - ): - """Get the amount of space required on the GPU for decompression. - - Parameters - ---------- - comp_chunks: cp.ndarray[uintp] - The pointers on the GPU, to compressed batched items. - comp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each compressed batch item. - stream: cp.cuda.Stream - CUDA stream. - - Returns - ------- - cp.ndarray[uint64] - The amount of GPU space in bytes that will be required - to decompress each chunk. - """ - - assert len(comp_chunks) == len(comp_chunk_sizes) - batch_size = len(comp_chunks) - - # nvCOMP requires all buffers to be in GPU memory. - uncomp_chunk_sizes = cupy.empty_like(comp_chunk_sizes) - - err = self._get_decomp_size( - comp_chunks, - comp_chunk_sizes, - batch_size, - uncomp_chunk_sizes, - stream, - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError( - f"Could not get decompress buffer size, error: {nvCompStatus(err)!r}." 
- ) - - return uncomp_chunk_sizes - - @abstractmethod - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - """Algorithm-specific implementation.""" - ... - - def decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - """Perform decompression. - - Parameters - ---------- - comp_chunks: cp.ndarray[uintp] - The pointers on the GPU, to compressed batched items. - comp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each compressed batch item. - batch_size: int - The number of chunks to decompress. - temp_buf: cp.ndarray - The temporary GPU workspace. - uncomp_chunks: cp.ndarray[uintp] - (output) The pointers on the GPU, to the output location for each - decompressed batch item. - uncomp_chunk_sizes: cp.ndarray[uint64] - The size in bytes of each decompress chunk location on the GPU. - actual_uncomp_chunk_sizes: cp.ndarray[uint64] - (output) The actual decompressed size in bytes of each chunk on the GPU. - statuses: cp.ndarray - (output) The status for each chunk of whether it was decompressed or not. - stream: cp.cuda.Stream - CUDA stream. - """ - - err = self._decompress( - comp_chunks, - comp_chunk_sizes, - batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ) - if err != nvcompStatus_t.nvcompSuccess: - raise RuntimeError(f"Decompression failed, error: {nvCompStatus(err)!r}.") - - @abstractmethod - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - """Algorithm-specific implementation.""" - ... - - -cdef uintptr_t to_ptr(buf): - return buf.data.ptr - - -cdef cudaStream_t to_stream(stream): - return stream.ptr - - -# -# LZ4 algorithm. -# - -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedLZ4CompressAsync, - nvcompBatchedLZ4CompressGetMaxOutputChunkSize, - nvcompBatchedLZ4CompressGetTempSize, - nvcompBatchedLZ4DecompressAsync, - nvcompBatchedLZ4DecompressGetTempSize, - nvcompBatchedLZ4DefaultOpts, - nvcompBatchedLZ4GetDecompressSizeAsync, - nvcompBatchedLZ4Opts_t, -) - - -class nvCompBatchAlgorithmLZ4(nvCompBatchAlgorithm): - """LZ4 algorithm implementation.""" - - algo_id: str = "lz4" - - options: nvcompBatchedLZ4Opts_t - - HEADER_SIZE_BYTES: size_t = sizeof(uint32_t) - - def __init__(self, data_type: int = None, has_header: bool = True): - """Initialize the codec. - - Parameters - ---------- - data_type: int or None - Source data type. If None, uses nvcomp default options. - has_header: bool - Whether the compressed data has a header. - This enables data compatibility between numcodecs LZ4 codec, - which has the header and nvCOMP LZ4 codec which does not - require the header. - """ - if data_type is None: - self.options = nvcompBatchedLZ4DefaultOpts - else: - self.options = nvcompBatchedLZ4Opts_t(data_type) - - self.has_header = has_header - - # Note on LZ4 header structure: numcodecs LZ4 codec prepends - # a 4-byte (uint32_t) header to each compressed chunk. - # The header stores the size of the original (uncompressed) data: - # https://github.com/zarr-developers/numcodecs/blob/cb155432e36536e17a2d054c8c24b7bf6f4a7347/numcodecs/lz4.pyx#L89 - # - # The following CUDA kernels read / write chunk header by - # casting the chunk pointer to a pointer to unsigned int. 
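(For context on the header layout described in the comment above: the 4-byte size prefix that numcodecs' LZ4 codec prepends can be inspected directly from the CPU side. A small sketch, assuming numcodecs is installed and a little-endian platform, which matches the `unsigned int` cast used by the CUDA kernels below:)

    import struct
    import numpy
    from numcodecs import LZ4

    data = numpy.arange(1000, dtype=numpy.uint32).tobytes()
    blob = LZ4().encode(data)                  # numcodecs prepends the uncompressed size
    (size,) = struct.unpack("<I", blob[:4])    # read the 4-byte header as a little-endian uint32
    assert size == len(data)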
- - # CUDA kernel that copies uncompressed chunk size from the chunk header. - self._get_size_from_header_kernel = cupy.ElementwiseKernel( - "uint64 comp_chunk_ptr", - "uint64 uncomp_chunk_size", - "uncomp_chunk_size = *((unsigned int *)comp_chunk_ptr)", - "get_size_from_header", - ) - - # CUDA kernel that copies uncompressed chunk size to the chunk header. - self._set_chunk_size_header_kernel = cupy.ElementwiseKernel( - "uint64 uncomp_chunk_size", - "uint64 comp_chunk_ptr", - "((unsigned int *)comp_chunk_ptr)[0] = (unsigned int)uncomp_chunk_size", - "set_chunk_size_header", - no_return=True, - ) - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedLZ4CompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - # Add header size, if needed. - if err == nvcompStatus_t.nvcompSuccess and self.has_header: - max_compressed_bytes += self.HEADER_SIZE_BYTES - - return (err, max_compressed_bytes) - - def compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ): - if self.has_header: - # If there is a header, we need to: - # 1. Copy the uncompressed chunk size to the compressed chunk header. - # 2. Update target pointers in comp_chunks to skip the header portion, - # which is not compressed. - # - self._set_chunk_size_header_kernel(uncomp_chunk_sizes, comp_chunks) - # Update chunk pointer to skip the header. - comp_chunks += self.HEADER_SIZE_BYTES - - super().compress( - uncomp_chunks, - uncomp_chunk_sizes, - max_uncomp_chunk_bytes, - batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream, - ) - - if self.has_header: - # Update chunk pointer and size to include the header. - comp_chunks -= self.HEADER_SIZE_BYTES - comp_chunk_sizes += self.HEADER_SIZE_BYTES - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. 
- return nvcompBatchedLZ4CompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedLZ4DecompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def get_decompress_size( - self, - comp_chunks, - comp_chunk_sizes, - stream, - ): - if not self.has_header: - return super().get_decompress_size( - comp_chunks, - comp_chunk_sizes, - stream, - ) - - return self._get_size_from_header_kernel(comp_chunks) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedLZ4GetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - if self.has_header: - # Update chunk pointer and size to exclude the header. - comp_chunks += self.HEADER_SIZE_BYTES - comp_chunk_sizes -= self.HEADER_SIZE_BYTES - - super().decompress( - comp_chunks, - comp_chunk_sizes, - batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. - return nvcompBatchedLZ4DecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - NULL, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - NULL, - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}(data_type={self.options['data_type']})" - - -# -# Gdeflate algorithm. 
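Because comp_chunks above is a device array of raw addresses (uintp), the header handling is pure pointer arithmetic: offsetting every element by HEADER_SIZE_BYTES makes nvCOMP write the payload just past the 4-byte header, and the offset is undone (with the header size added back to the reported chunk sizes) afterwards. A minimal sketch of that adjustment with made-up addresses (NumPy standing in for the CuPy arrays used above):

    import numpy as np

    HEADER_SIZE_BYTES = 4

    # Hypothetical device addresses and sizes of three output chunks.
    comp_chunks = np.array([0x7F0000000000, 0x7F0000001000, 0x7F0000002000], dtype=np.uintp)
    comp_chunk_sizes = np.array([100, 120, 90], dtype=np.uint64)

    comp_chunks += HEADER_SIZE_BYTES        # compressor writes just past the header
    # ... batched compression runs here ...
    comp_chunks -= HEADER_SIZE_BYTES        # restore the original base addresses
    comp_chunk_sizes += HEADER_SIZE_BYTES   # account for the header in the final sizes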
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedGdeflateCompressAsync, - nvcompBatchedGdeflateCompressGetMaxOutputChunkSize, - nvcompBatchedGdeflateCompressGetTempSize, - nvcompBatchedGdeflateDecompressAsync, - nvcompBatchedGdeflateDecompressGetTempSize, - nvcompBatchedGdeflateDefaultOpts, - nvcompBatchedGdeflateGetDecompressSizeAsync, - nvcompBatchedGdeflateOpts_t, -) - - -class nvCompBatchAlgorithmGdeflate(nvCompBatchAlgorithm): - """Gdeflate algorithm implementation.""" - - algo_id: str = "gdeflate" - - options: nvcompBatchedGdeflateOpts_t - - def __init__(self, algo: int = None): - if algo is None: - self.options = nvcompBatchedGdeflateDefaultOpts - else: - self.options = nvcompBatchedGdeflateOpts_t(algo) - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedGdeflateCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedGdeflateCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - return nvcompBatchedGdeflateCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedGdeflateDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedGdeflateGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - return nvcompBatchedGdeflateDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - NULL, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - NULL, - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}(algo={self.options['algo']})" - - -# -# zstd algorithm. 
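Each wrapper class above maps its nvCOMP options struct onto constructor keyword arguments. A brief sketch of constructing the LZ4 and Gdeflate wrappers with non-default options (illustrative only; this module is removed by this change):

    from kvikio._lib.libnvcomp_ll import (
        nvCompBatchAlgorithmGdeflate,
        nvCompBatchAlgorithmLZ4,
    )

    lz4_numcodecs = nvCompBatchAlgorithmLZ4()               # 4-byte size header enabled (default)
    lz4_raw = nvCompBatchAlgorithmLZ4(has_header=False)     # plain nvCOMP LZ4 framing
    gdeflate = nvCompBatchAlgorithmGdeflate(algo=0)         # explicit Gdeflate variant

    print(lz4_raw, gdeflate)  # __repr__ reports the selected options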
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedZstdCompressAsync, - nvcompBatchedZstdCompressGetMaxOutputChunkSize, - nvcompBatchedZstdCompressGetTempSize, - nvcompBatchedZstdDecompressAsync, - nvcompBatchedZstdDecompressGetTempSize, - nvcompBatchedZstdDefaultOpts, - nvcompBatchedZstdGetDecompressSizeAsync, - nvcompBatchedZstdOpts_t, -) - - -class nvCompBatchAlgorithmZstd(nvCompBatchAlgorithm): - """zstd algorithm implementation.""" - - algo_id: str = "zstd" - - options: nvcompBatchedZstdOpts_t - - def __init__(self): - self.options = nvcompBatchedZstdDefaultOpts - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedZstdCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedZstdCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - return nvcompBatchedZstdCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedZstdDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedZstdGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - return nvcompBatchedZstdDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - to_ptr(actual_uncomp_chunk_sizes), - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - to_ptr(statuses), - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}()" - - -# -# Snappy algorithm. 
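Unlike the LZ4 and Gdeflate wrappers above, which pass NULL for the per-chunk outputs, the zstd wrapper forwards actual_uncomp_chunk_sizes and statuses to nvCOMP, so callers can verify every chunk once the stream completes. A hedged sketch of such a check, assuming statuses holds nvcompStatus_t values where nvcompSuccess is 0:

    import cupy as cp

    def check_batch(statuses: cp.ndarray, actual_sizes: cp.ndarray,
                    expected_sizes: cp.ndarray) -> None:
        # nvcompSuccess == 0, so any non-zero status marks a failed chunk.
        if int(cp.count_nonzero(statuses).get()) != 0:
            raise RuntimeError("one or more chunks failed to decompress")
        if not bool(cp.all(actual_sizes == expected_sizes).get()):
            raise RuntimeError("decompressed sizes do not match the expected sizes")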
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedSnappyCompressAsync, - nvcompBatchedSnappyCompressGetMaxOutputChunkSize, - nvcompBatchedSnappyCompressGetTempSize, - nvcompBatchedSnappyDecompressAsync, - nvcompBatchedSnappyDecompressGetTempSize, - nvcompBatchedSnappyDefaultOpts, - nvcompBatchedSnappyGetDecompressSizeAsync, - nvcompBatchedSnappyOpts_t, -) - - -class nvCompBatchAlgorithmSnappy(nvCompBatchAlgorithm): - """Snappy algorithm implementation.""" - - algo_id: str = "snappy" - - options: nvcompBatchedSnappyOpts_t - - def __init__(self): - self.options = nvcompBatchedSnappyDefaultOpts - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedSnappyCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - return nvcompBatchedSnappyCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedSnappyDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedSnappyGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - return nvcompBatchedSnappyDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - NULL, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - NULL, - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}()" - - -# -# Deflate algorithm. 
-# -from kvikio._lib.nvcomp_ll_cxx_api cimport ( - nvcompBatchedDeflateCompressAsync, - nvcompBatchedDeflateCompressGetMaxOutputChunkSize, - nvcompBatchedDeflateCompressGetTempSize, - nvcompBatchedDeflateDecompressAsync, - nvcompBatchedDeflateDecompressGetTempSize, - nvcompBatchedDeflateDefaultOpts, - nvcompBatchedDeflateGetDecompressSizeAsync, - nvcompBatchedDeflateOpts_t, -) - - -class nvCompBatchAlgorithmDeflate(nvCompBatchAlgorithm): - """Deflate algorithm implementation.""" - - algo_id: str = "deflate" - - options: nvcompBatchedDeflateOpts_t - - def __init__(self, algo: int = None): - if algo is None: - self.options = nvcompBatchedDeflateDefaultOpts - else: - self.options = nvcompBatchedDeflateOpts_t(algo) - - def _get_comp_temp_size( - self, - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - ) -> tuple[nvcompStatus_t, size_t]: - cdef size_t temp_bytes = 0 - - err = nvcompBatchedDeflateCompressGetTempSize( - batch_size, - max_uncompressed_chunk_bytes, - self.options, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_comp_chunk_size(self, size_t max_uncompressed_chunk_bytes): - cdef size_t max_compressed_bytes = 0 - - err = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( - max_uncompressed_chunk_bytes, - self.options, - &max_compressed_bytes - ) - - return (err, max_compressed_bytes) - - def _compress( - self, - uncomp_chunks, - uncomp_chunk_sizes, - size_t max_uncomp_chunk_bytes, - size_t batch_size, - temp_buf, - comp_chunks, - comp_chunk_sizes, - stream - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. - return nvcompBatchedDeflateCompressAsync( - to_ptr(uncomp_chunks), - to_ptr(uncomp_chunk_sizes), - max_uncomp_chunk_bytes, - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - self.options, - to_stream(stream), - ) - - def _get_decomp_temp_size( - self, - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - ): - cdef size_t temp_bytes = 0 - - err = nvcompBatchedDeflateDecompressGetTempSize( - num_chunks, - max_uncompressed_chunk_bytes, - &temp_bytes - ) - - return (err, temp_bytes) - - def _get_decomp_size( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - uncomp_chunk_sizes, - stream, - ): - return nvcompBatchedDeflateGetDecompressSizeAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - batch_size, - to_stream(stream), - ) - - def _decompress( - self, - comp_chunks, - comp_chunk_sizes, - size_t batch_size, - temp_buf, - uncomp_chunks, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - stream, - ): - # Cast buffer pointers that have Python int type to appropriate C types - # suitable for passing to nvCOMP API. 
- return nvcompBatchedDeflateDecompressAsync( - to_ptr(comp_chunks), - to_ptr(comp_chunk_sizes), - to_ptr(uncomp_chunk_sizes), - to_ptr(actual_uncomp_chunk_sizes), - batch_size, - to_ptr(temp_buf), - temp_buf.nbytes, - to_ptr(uncomp_chunks), - to_ptr(statuses), - to_stream(stream), - ) - - def __repr__(self): - return f"{self.__class__.__name__}(algo={self.options['algo']})" - - -SUPPORTED_ALGORITHMS = { - a.algo_id: a for a in [ - nvCompBatchAlgorithmLZ4, - nvCompBatchAlgorithmGdeflate, - nvCompBatchAlgorithmZstd, - nvCompBatchAlgorithmSnappy, - nvCompBatchAlgorithmDeflate, - ] -} diff --git a/python/kvikio/kvikio/_lib/mmap.pyx b/python/kvikio/kvikio/_lib/mmap.pyx new file mode 100644 index 0000000000..46fc3846d0 --- /dev/null +++ b/python/kvikio/kvikio/_lib/mmap.pyx @@ -0,0 +1,144 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +# distutils: language = c++ +# cython: language_level=3 + +import os +from typing import Any, Optional + +from posix cimport fcntl, stat + +from libc.stdint cimport uintptr_t +from libcpp cimport bool +from libcpp.optional cimport nullopt, optional +from libcpp.string cimport string +from libcpp.utility cimport move, pair + +from kvikio._lib.arr cimport parse_buffer_argument +from kvikio._lib.future cimport IOFuture, _wrap_io_future, future + +from kvikio._lib import defaults + + +cdef extern from "" namespace "kvikio" nogil: + cdef cppclass CppMmapHandle "kvikio::MmapHandle": + CppMmapHandle() noexcept + CppMmapHandle(string file_path, string flags, optional[size_t] initial_map_size, + size_t initial_map_offset, fcntl.mode_t mode, + optional[int] map_flags) except + + size_t initial_map_size() noexcept + size_t initial_map_offset() noexcept + size_t file_size() except + + void close() noexcept + bool closed() noexcept + size_t read(void* buf, optional[size_t] size, size_t offset) except + + future[size_t] pread(void* buf, optional[size_t] size, size_t offset, + size_t task_size) except + + +cdef class InternalMmapHandle: + cdef CppMmapHandle _handle + + def __init__(self, file_path: os.PathLike, + flags: str = "r", + initial_map_size: Optional[int] = None, + initial_map_offset: int = 0, + mode: int = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH, + map_flags: Optional[int] = None): + if not os.path.exists(file_path): + raise RuntimeError("Unable to open file") + + cdef string cpp_path_bytes = os.fsencode(file_path) + cdef string cpp_flags_bytes = str(flags).encode() + + cdef optional[size_t] cpp_initial_map_size + if initial_map_size is None: + cpp_initial_map_size = nullopt + else: + cpp_initial_map_size = (initial_map_size) + + cdef size_t cpp_initial_map_offset = initial_map_offset + cdef fcntl.mode_t cpp_mode = mode + + cdef optional[int] cpp_map_flags + if map_flags is None: + cpp_map_flags = nullopt + else: + cpp_map_flags = (map_flags) + + with nogil: + self._handle = move(CppMmapHandle(cpp_path_bytes, + cpp_flags_bytes, + cpp_initial_map_size, + cpp_initial_map_offset, + cpp_mode, + cpp_map_flags)) + + def initial_map_size(self) -> int: + cdef size_t result + with nogil: + result = self._handle.initial_map_size() + return result + + def initial_map_offset(self) -> int: + cdef size_t result + with nogil: + result = self._handle.initial_map_offset() + return result + + def file_size(self) -> int: + cdef size_t result + with nogil: + result = self._handle.file_size() + return result + + def close(self) -> None: + with nogil: + self._handle.close() + + def closed(self) -> bool: + cdef bool 
result + with nogil: + result = self._handle.closed() + return result + + def read(self, buf: Any, size: Optional[int] = None, offset: int = 0) -> int: + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + cdef optional[size_t] cpp_size + if size is None: + cpp_size = nullopt + else: + cpp_size = (size) + cdef size_t cpp_offset = offset + cdef size_t result + with nogil: + result = self._handle.read(info.first, + cpp_size, + cpp_offset) + return result + + def pread(self, buf: Any, size: Optional[int] = None, offset: int = 0, + task_size: Optional[int] = None) -> IOFuture: + cdef optional[size_t] cpp_size + cdef size_t cpp_task_size + + if size is None: + cpp_size = nullopt + else: + cpp_size = (size) + cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) + + cdef size_t cpp_offset = offset + + if task_size is None: + cpp_task_size = defaults.task_size() + else: + cpp_task_size = task_size + + cdef future[size_t] cpp_future + with nogil: + cpp_future = self._handle.pread(info.first, + cpp_size, + cpp_offset, + cpp_task_size) + return _wrap_io_future(cpp_future) diff --git a/python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd b/python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd deleted file mode 100644 index b86797a93f..0000000000 --- a/python/kvikio/kvikio/_lib/nvcomp_cxx_api.pxd +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) 2022 Carson Swope -# Use, modification, and distribution is subject to the MIT License -# https://github.com/carsonswope/py-nvcomp/blob/main/LICENSE) -# SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: MIT -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN THE SOFTWARE. 
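For the new InternalMmapHandle bindings added above, a usage sketch of the low-level wrapper (file path hypothetical; the public kvikio API may expose this differently, and the returned future is assumed to block on get() like other kvikio IO futures):

    import numpy as np
    from kvikio._lib.mmap import InternalMmapHandle

    handle = InternalMmapHandle("/tmp/example.bin", flags="r")  # hypothetical file
    nbytes = handle.file_size()
    buf = np.empty(nbytes, dtype=np.uint8)

    n_read = handle.read(buf, size=nbytes, offset=0)    # synchronous mapped read

    fut = handle.pread(buf, size=nbytes, offset=0)       # parallelized read
    n_read = fut.get()                                   # assumed blocking wait

    handle.close()
    assert handle.closed()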
- -from libc.stdint cimport uint8_t, uint32_t -from libcpp.memory cimport shared_ptr -from libcpp.vector cimport vector - - -cdef extern from "cuda_runtime.h": - ctypedef void* cudaStream_t - -cdef extern from "nvcomp.h": - ctypedef enum nvcompType_t: - NVCOMP_TYPE_CHAR = 0, # 1B - NVCOMP_TYPE_UCHAR = 1, # 1B - NVCOMP_TYPE_SHORT = 2, # 2B - NVCOMP_TYPE_USHORT = 3, # 2B - NVCOMP_TYPE_INT = 4, # 4B - NVCOMP_TYPE_UINT = 5, # 4B - NVCOMP_TYPE_LONGLONG = 6, # 8B - NVCOMP_TYPE_ULONGLONG = 7, # 8B - NVCOMP_TYPE_BITS = 0xff # 1b - - -cdef extern from "nvcomp/shared_types.h": - ctypedef enum nvcompStatus_t: - nvcompSuccess = 0, - nvcompErrorInvalidValue = 10, - nvcompErrorNotSupported = 11, - nvcompErrorCannotDecompress = 12, - nvcompErrorBadChecksum = 13, - nvcompErrorCannotVerifyChecksums = 14, - nvcompErrorCudaError = 1000, - nvcompErrorInternal = 10000, - -# Manager Factory -cdef extern from "nvcomp/nvcompManagerFactory.hpp" namespace 'nvcomp': - cdef shared_ptr[nvcompManagerBase] create_manager "nvcomp::create_manager"( - const uint8_t* comp_buffer - ) except + - - -# Compression Manager -cdef extern from "nvcomp/nvcompManager.hpp" namespace 'nvcomp': - cdef cppclass PinnedPtrPool[T]: - pass - - cdef cppclass CompressionConfig "nvcomp::CompressionConfig": - const size_t uncompressed_buffer_size - const size_t max_compressed_buffer_size - const size_t num_chunks - CompressionConfig( - PinnedPtrPool[nvcompStatus_t]* pool, - size_t uncompressed_buffer_size) except + - nvcompStatus_t* get_status() const - CompressionConfig(CompressionConfig& other) - CompressionConfig& operator=(const CompressionConfig& other) except + - # Commented as Cython doesn't support rvalues, but a user can call - # `move` with the existing operator and generate correct C++ code - # xref: https://github.com/cython/cython/issues/1445 - # CompressionConfig& operator=(CompressionConfig&& other) except + - - cdef cppclass DecompressionConfig "nvcomp::DecompressionConfig": - size_t decomp_data_size - uint32_t num_chunks - DecompressionConfig(PinnedPtrPool[nvcompStatus_t]& pool) except + - nvcompStatus_t* get_status() const - DecompressionConfig(DecompressionConfig& other) - DecompressionConfig& operator=(const DecompressionConfig& other) except + - # Commented as Cython doesn't support rvalues, but a user can call - # `move` with the existing operator and generate correct C++ code - # xref: https://github.com/cython/cython/issues/1445 - # DecompressionConfig& operator=(DecompressionConfig&& other) except + - - cdef cppclass nvcompManagerBase "nvcomp::nvcompManagerBase": - CompressionConfig configure_compression( - const size_t decomp_buffer_size) - void compress( - const uint8_t* decomp_buffer, - uint8_t* comp_buffer, - const CompressionConfig& comp_config) except + - DecompressionConfig configure_decompression( - const uint8_t* comp_buffer) - DecompressionConfig configure_decompression( - const CompressionConfig& comp_config) - void decompress( - uint8_t* decomp_buffer, - const uint8_t* comp_buffer, - const DecompressionConfig& decomp_config) - size_t get_compressed_output_size(uint8_t* comp_buffer) except + - - cdef cppclass PimplManager "nvcomp::PimplManager": - CompressionConfig configure_compression( - const size_t decomp_buffer_size) except + - void compress( - const uint8_t* decomp_buffer, - uint8_t* comp_buffer, - const CompressionConfig& comp_config) except + - DecompressionConfig configure_decompression( - const uint8_t* comp_buffer) - DecompressionConfig configure_decompression( - const CompressionConfig& 
comp_config) - void decompress( - uint8_t* decomp_buffer, - const uint8_t* comp_buffer, - const DecompressionConfig& decomp_config) except + - size_t get_compressed_output_size(uint8_t* comp_buffer) except + - -# C++ Concrete ANS Manager -cdef extern from "nvcomp/ans.h" nogil: - ctypedef enum nvcompANSType_t: - nvcomp_rANS = 0 - - ctypedef struct nvcompBatchedANSOpts_t: - nvcompANSType_t type - cdef nvcompBatchedANSOpts_t nvcompBatchedANSDefaultOpts - -cdef extern from "nvcomp/ans.hpp": - cdef cppclass ANSManager "nvcomp::ANSManager": - ANSManager( - size_t uncomp_chunk_size, - const nvcompBatchedANSOpts_t& format_opts, - ) except + - -# C++ Concrete Bitcomp Manager -cdef extern from "nvcomp/bitcomp.h" nogil: - ctypedef struct nvcompBatchedBitcompFormatOpts: - int algorithm_type - nvcompType_t data_type - cdef nvcompBatchedBitcompFormatOpts nvcompBatchedBitcompDefaultOpts - -cdef extern from "nvcomp/bitcomp.hpp": - cdef cppclass BitcompManager "nvcomp::BitcompManager": - BitcompManager( - size_t uncomp_chunk_size, - const nvcompBatchedBitcompFormatOpts& format_opts, - ) except + - -# C++ Concrete Cascaded Manager -cdef extern from "nvcomp/cascaded.h" nogil: - ctypedef struct nvcompBatchedCascadedOpts_t: - size_t chunk_size - nvcompType_t type - int num_RLEs - int num_deltas - int use_bp - cdef nvcompBatchedCascadedOpts_t nvcompBatchedCascadedDefaultOpts - -cdef extern from "nvcomp/cascaded.hpp" nogil: - cdef cppclass CascadedManager "nvcomp::CascadedManager": - CascadedManager( - size_t uncomp_chunk_size, - const nvcompBatchedCascadedOpts_t& options, - ) - -# C++ Concrete Gdeflate Manager -cdef extern from "nvcomp/gdeflate.h" nogil: - ctypedef struct nvcompBatchedGdeflateOpts_t: - int algo - cdef nvcompBatchedGdeflateOpts_t nvcompBatchedGdeflateDefaultOpts - -cdef extern from "nvcomp/gdeflate.hpp": - cdef cppclass GdeflateManager "nvcomp::GdeflateManager": - GdeflateManager( - int uncomp_chunk_size, - const nvcompBatchedGdeflateOpts_t& format_opts, - ) except + - -# C++ Concrete LZ4 Manager -cdef extern from "nvcomp/gdeflate.h" nogil: - ctypedef struct nvcompBatchedLZ4Opts_t: - nvcompType_t data_type - cdef nvcompBatchedLZ4Opts_t nvcompBatchedLZ4DefaultOpts - -cdef extern from "nvcomp/lz4.hpp": - cdef cppclass LZ4Manager "nvcomp::LZ4Manager": - LZ4Manager( - size_t uncomp_chunk_size, - const nvcompBatchedLZ4Opts_t& format_opts, - ) except + - -# C++ Concrete Snappy Manager -cdef extern from "nvcomp/snappy.h" nogil: - ctypedef struct nvcompBatchedSnappyOpts_t: - int reserved - cdef nvcompBatchedSnappyOpts_t nvcompBatchedSnappyDefaultOpts - -cdef extern from "nvcomp/snappy.hpp": - cdef cppclass SnappyManager "nvcomp::SnappyManager": - SnappyManager( - size_t uncomp_chunk_size, - const nvcompBatchedSnappyOpts_t& format_opts, - ) except + diff --git a/python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd b/python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd deleted file mode 100644 index 6a23eb5cd1..0000000000 --- a/python/kvikio/kvikio/_lib/nvcomp_ll_cxx_api.pxd +++ /dev/null @@ -1,362 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
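The manager classes declared above were surfaced through the kvikio._nvcomp module, which this change also deletes. A round-trip sketch of that (now removed) high-level API, kept here only to document what is being dropped and assuming a CUDA-capable environment with a pre-removal kvikio build:

    import cupy as cp
    from kvikio._nvcomp import LZ4Manager  # module deleted by this change

    data = cp.arange(1 << 16, dtype=cp.uint8)

    manager = LZ4Manager(chunk_size=1 << 16)
    compressed = manager.compress(data)        # GPU buffer of compressed bytes
    restored = manager.decompress(compressed)  # GPU buffer viewed as the input dtype

    assert bool((restored.view(cp.uint8) == data).all())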
- -# distutils: language = c++ -# cython: language_level=3 - -cdef extern from "cuda_runtime.h": - ctypedef void* cudaStream_t - - ctypedef enum cudaMemcpyKind: - cudaMemcpyHostToHost = 0, - cudaMemcpyHostToDevice = 1, - cudaMemcpyDeviceToHost = 2, - cudaMemcpyDeviceToDevice = 3, - cudaMemcpyDefault = 4 - -cdef extern from "nvcomp.h": - ctypedef enum nvcompType_t: - NVCOMP_TYPE_CHAR = 0, # 1B - NVCOMP_TYPE_UCHAR = 1, # 1B - NVCOMP_TYPE_SHORT = 2, # 2B - NVCOMP_TYPE_USHORT = 3, # 2B - NVCOMP_TYPE_INT = 4, # 4B - NVCOMP_TYPE_UINT = 5, # 4B - NVCOMP_TYPE_LONGLONG = 6, # 8B - NVCOMP_TYPE_ULONGLONG = 7, # 8B - NVCOMP_TYPE_BITS = 0xff # 1b - -cdef extern from "nvcomp/shared_types.h": - ctypedef enum nvcompStatus_t: - nvcompSuccess = 0, - nvcompErrorInvalidValue = 10, - nvcompErrorNotSupported = 11, - nvcompErrorCannotDecompress = 12, - nvcompErrorBadChecksum = 13, - nvcompErrorCannotVerifyChecksums = 14, - nvcompErrorCudaError = 1000, - nvcompErrorInternal = 10000, - -# nvCOMP Low-Level Interface. -# https://github.com/NVIDIA/nvcomp/blob/main/doc/lowlevel_c_quickstart.md - -# -# LZ4 batch compression/decompression API. -# -cdef extern from "nvcomp/lz4.h" nogil: - ctypedef struct nvcompBatchedLZ4Opts_t: - nvcompType_t data_type - - cdef nvcompBatchedLZ4Opts_t nvcompBatchedLZ4DefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedLZ4CompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedLZ4Opts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedLZ4CompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedLZ4Opts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedLZ4CompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedLZ4Opts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedLZ4DecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedLZ4GetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedLZ4DecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - -# -# Gdeflate batch compression/decompression API. -# -cdef extern from "nvcomp/gdeflate.h" nogil: - ctypedef struct nvcompBatchedGdeflateOpts_t: - int algo - - cdef nvcompBatchedGdeflateOpts_t nvcompBatchedGdeflateDefaultOpts - - # Compression API. 
- cdef nvcompStatus_t nvcompBatchedGdeflateCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedGdeflateOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedGdeflateCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedGdeflateOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedGdeflateCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedGdeflateOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedGdeflateDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedGdeflateGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedGdeflateDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - -# -# zstd batch compression/decompression API. -# -cdef extern from "nvcomp/zstd.h" nogil: - ctypedef struct nvcompBatchedZstdOpts_t: - int reserved - - cdef nvcompBatchedZstdOpts_t nvcompBatchedZstdDefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedZstdCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedZstdOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedZstdCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedZstdOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedZstdCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedZstdOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedZstdDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedZstdGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedZstdDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - -# -# Snappy batch compression/decompression API. 
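Every batched entry point declared in this file returns an nvcompStatus_t, and the Python wrappers shown earlier raise on anything other than nvcompSuccess. A minimal sketch of that convention (status value taken from the nvcomp/shared_types.h enum above):

    NVCOMP_SUCCESS = 0  # nvcompSuccess

    def check_nvcomp(err: int, what: str) -> None:
        # Mirror the wrappers' error handling: fail loudly on any non-success status.
        if err != NVCOMP_SUCCESS:
            raise RuntimeError(f"{what} failed, nvCOMP error code: {err}")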
-# -cdef extern from "nvcomp/snappy.h" nogil: - ctypedef struct nvcompBatchedSnappyOpts_t: - int reserved - - cdef nvcompBatchedSnappyOpts_t nvcompBatchedSnappyDefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedSnappyCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedSnappyOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedSnappyOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedSnappyCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedSnappyOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. - cdef nvcompStatus_t nvcompBatchedSnappyDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedSnappyGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedSnappyDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) - - -# -# Deflate batch compression/decompression API. -# -cdef extern from "nvcomp/deflate.h" nogil: - ctypedef struct nvcompBatchedDeflateOpts_t: - int algo - - cdef nvcompBatchedDeflateOpts_t nvcompBatchedDeflateDefaultOpts - - # Compression API. - cdef nvcompStatus_t nvcompBatchedDeflateCompressGetTempSize( - size_t batch_size, - size_t max_uncompressed_chunk_bytes, - nvcompBatchedDeflateOpts_t format_opts, - size_t* temp_bytes - ) - - cdef nvcompStatus_t nvcompBatchedDeflateCompressGetMaxOutputChunkSize( - size_t max_uncompressed_chunk_bytes, - nvcompBatchedDeflateOpts_t format_opts, - size_t* max_compressed_bytes - ) - - cdef nvcompStatus_t nvcompBatchedDeflateCompressAsync( - const void* const* device_uncompressed_ptrs, - const size_t* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - nvcompBatchedDeflateOpts_t format_opts, - cudaStream_t stream - ) - - # Decompression API. 
- cdef nvcompStatus_t nvcompBatchedDeflateDecompressGetTempSize( - size_t num_chunks, - size_t max_uncompressed_chunk_bytes, - size_t* temp_bytes - ) - - nvcompStatus_t nvcompBatchedDeflateGetDecompressSizeAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - size_t* device_uncompressed_bytes, - size_t batch_size, - cudaStream_t stream - ) - - nvcompStatus_t nvcompBatchedDeflateDecompressAsync( - const void* const* device_compressed_ptrs, - const size_t* device_compressed_bytes, - const size_t* device_uncompressed_bytes, - size_t* device_actual_uncompressed_bytes, - size_t batch_size, - void* const device_temp_ptr, - size_t temp_bytes, - void* const* device_uncompressed_ptrs, - nvcompStatus_t* device_statuses, - cudaStream_t stream - ) diff --git a/python/kvikio/kvikio/_lib/remote_handle.pyx b/python/kvikio/kvikio/_lib/remote_handle.pyx index dfb662a9fb..8fae78c534 100644 --- a/python/kvikio/kvikio/_lib/remote_handle.pyx +++ b/python/kvikio/kvikio/_lib/remote_handle.pyx @@ -7,17 +7,26 @@ from typing import Optional from cython.operator cimport dereference as deref -from libc.stdint cimport uintptr_t +from libc.stdint cimport uint8_t, uintptr_t from libcpp.memory cimport make_unique, unique_ptr +from libcpp.optional cimport nullopt, optional from libcpp.pair cimport pair from libcpp.string cimport string from libcpp.utility cimport move, pair +from libcpp.vector cimport vector from kvikio._lib.arr cimport parse_buffer_argument from kvikio._lib.future cimport IOFuture, _wrap_io_future, future -cdef extern from "" nogil: +cdef extern from "" namespace "kvikio" nogil: + cpdef enum class RemoteEndpointType(uint8_t): + AUTO = 0 + S3 = 1 + S3_PUBLIC = 2 + S3_PRESIGNED_URL = 3 + WEBHDFS = 4 + HTTP = 5 cdef cppclass cpp_RemoteEndpoint "kvikio::RemoteEndpoint": string str() except + @@ -31,12 +40,20 @@ cdef extern from "" nogil: pair[string, string] cpp_parse_s3_url \ "kvikio::S3Endpoint::parse_s3_url"(string url) except + + cdef cppclass cpp_S3PublicEndpoint "kvikio::S3PublicEndpoint" (cpp_RemoteEndpoint): + cpp_S3PublicEndpoint(string url) except + + + cdef cppclass cpp_S3EndpointWithPresignedUrl "kvikio::S3EndpointWithPresignedUrl" \ + (cpp_RemoteEndpoint): + cpp_S3EndpointWithPresignedUrl(string presigned_url) except + + cdef cppclass cpp_RemoteHandle "kvikio::RemoteHandle": cpp_RemoteHandle( unique_ptr[cpp_RemoteEndpoint] endpoint, size_t nbytes ) except + cpp_RemoteHandle(unique_ptr[cpp_RemoteEndpoint] endpoint) except + - size_t nbytes() except + + RemoteEndpointType remote_endpoint_type() noexcept + size_t nbytes() noexcept const cpp_RemoteEndpoint& endpoint() except + size_t read( void* buf, @@ -49,6 +66,17 @@ cdef extern from "" nogil: size_t file_offset ) except + + @staticmethod + cpp_RemoteHandle cpp_easy_open "open"( + string url, + RemoteEndpointType remote_endpoint_type, + optional[vector[RemoteEndpointType]] allow_list, + optional[size_t] nbytes + ) except + + +cdef extern from "" nogil: + cdef cppclass cpp_WebHdfsEndpoint "kvikio::WebHdfsEndpoint"(cpp_RemoteEndpoint): + cpp_WebHdfsEndpoint(string url) except + cdef string _to_string(str s): """Convert Python object to a C++ string (if None, return the empty string)""" @@ -61,8 +89,9 @@ cdef pair[string, string] _to_string_pair(str s1, str s2): """Wrap two Python string objects in a C++ pair""" return pair[string, string](_to_string(s1), _to_string(s2)) + # Helper function to cast an endpoint to its base class `RemoteEndpoint` -cdef extern from *: +cdef extern from * nogil: """ template 
std::unique_ptr cast_to_remote_endpoint(T endpoint) @@ -72,6 +101,28 @@ cdef extern from *: """ cdef unique_ptr[cpp_RemoteEndpoint] cast_to_remote_endpoint[T](T handle) except + +# Helper function for the cpp_RemoteHandle.open method to return +# unique_ptr[cpp_RemoteHandle] instead of cpp_RemoteHandle. Due to lack of a nullary +# constructor, cpp_RemoteHandle cannot be created as a stack variable in Cython. +cdef extern from * nogil: + """ + inline std::unique_ptr create_remote_handle_from_open( + std::string url, + kvikio::RemoteEndpointType remote_endpoint_type, + std::optional> allow_list, + std::optional nbytes) + { + return std::make_unique( + kvikio::RemoteHandle::open(url, remote_endpoint_type, allow_list, nbytes) + ); + } + """ + cdef unique_ptr[cpp_RemoteHandle] create_remote_handle_from_open( + string url, + RemoteEndpointType remote_endpoint_type, + optional[vector[RemoteEndpointType]] allow_list, + optional[size_t] nbytes + ) except + cdef class RemoteFile: cdef unique_ptr[cpp_RemoteHandle] _handle @@ -82,11 +133,16 @@ cdef class RemoteFile: nbytes: Optional[int], ): cdef RemoteFile ret = RemoteFile() + if nbytes is None: - ret._handle = make_unique[cpp_RemoteHandle](move(ep)) + with nogil: + ret._handle = make_unique[cpp_RemoteHandle](move(ep)) return ret + cdef size_t n = nbytes - ret._handle = make_unique[cpp_RemoteHandle](move(ep), n) + + with nogil: + ret._handle = make_unique[cpp_RemoteHandle](move(ep), n) return ret @staticmethod @@ -94,10 +150,16 @@ cdef class RemoteFile: url: str, nbytes: Optional[int], ): + cdef string cpp_url = _to_string(url) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_HttpEndpoint](cpp_url) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_HttpEndpoint](_to_string(url)) - ), + move(cpp_endpoint), nbytes ) @@ -107,12 +169,18 @@ cdef class RemoteFile: object_name: str, nbytes: Optional[int], ): + cdef pair[string, string] bucket_and_object_names = _to_string_pair( + bucket_name, object_name + ) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint](bucket_and_object_names) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint]( - _to_string_pair(bucket_name, object_name) - ) - ), + move(cpp_endpoint), nbytes ) @@ -121,10 +189,16 @@ cdef class RemoteFile: url: str, nbytes: Optional[int], ): + cdef string cpp_url = _to_string(url) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint](cpp_url) + ) + return RemoteFile._from_endpoint( - cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint](_to_string(url)) - ), + move(cpp_endpoint), nbytes ) @@ -133,35 +207,146 @@ cdef class RemoteFile: url: str, nbytes: Optional[int], ): - cdef pair[string, string] bucket_and_object = cpp_parse_s3_url(_to_string(url)) + cdef string cpp_url = _to_string(url) + cdef pair[string, string] bucket_and_object_names + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + bucket_and_object_names = cpp_parse_s3_url(cpp_url) + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3Endpoint](bucket_and_object_names) + ) + + return RemoteFile._from_endpoint( + move(cpp_endpoint), + nbytes + ) + + @staticmethod + def open_s3_public( + url: str, + nbytes: Optional[int], + ): + cdef string cpp_url = _to_string(url) + cdef 
unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3PublicEndpoint](cpp_url) + ) + + return RemoteFile._from_endpoint( + move(cpp_endpoint), + nbytes + ) + + @staticmethod + def open_s3_presigned_url( + presigned_url: str, + nbytes: Optional[int], + ): + cdef string cpp_url = _to_string(presigned_url) + cdef unique_ptr[cpp_RemoteEndpoint] cpp_endpoint + + with nogil: + cpp_endpoint = cast_to_remote_endpoint( + make_unique[cpp_S3EndpointWithPresignedUrl](cpp_url) + ) + + return RemoteFile._from_endpoint( + move(cpp_endpoint), + nbytes + ) + + @staticmethod + def open_webhdfs( + url: str, + nbytes: Optional[int], + ): return RemoteFile._from_endpoint( cast_to_remote_endpoint( - make_unique[cpp_S3Endpoint](bucket_and_object) + make_unique[cpp_WebHdfsEndpoint](_to_string(url)) ), nbytes ) + @staticmethod + def open( + url: str, + remote_endpoint_type: RemoteEndpointType, + allow_list: Optional[list], + nbytes: Optional[int] + ): + cdef optional[vector[RemoteEndpointType]] cpp_allow_list + cdef vector[RemoteEndpointType] vec_allow_list + if allow_list is None: + cpp_allow_list = nullopt + else: + for allow_item in allow_list: + vec_allow_list.push_back(allow_item.value) + cpp_allow_list = vec_allow_list + + cdef optional[size_t] cpp_nbytes + if nbytes is None: + cpp_nbytes = nullopt + else: + cpp_nbytes = nbytes + + cdef RemoteFile ret = RemoteFile() + cdef unique_ptr[cpp_RemoteHandle] cpp_handle + cdef string cpp_url = _to_string(url) + with nogil: + cpp_handle = create_remote_handle_from_open( + cpp_url, + remote_endpoint_type, + cpp_allow_list, + cpp_nbytes) + ret._handle = move(cpp_handle) + + return ret + def __str__(self) -> str: - cdef string ep_str = deref(self._handle).endpoint().str() + cdef string ep_str + with nogil: + ep_str = deref(self._handle).endpoint().str() return f'<{self.__class__.__name__} "{ep_str.decode()}">' + def remote_endpoint_type(self) -> RemoteEndpointType: + cdef RemoteEndpointType result + with nogil: + result = deref(self._handle).remote_endpoint_type() + return result + def nbytes(self) -> int: - return deref(self._handle).nbytes() + cdef size_t result + with nogil: + result = deref(self._handle).nbytes() + return result def read(self, buf, size: Optional[int], file_offset: int) -> int: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return deref(self._handle).read( - info.first, - info.second, - file_offset, - ) + cdef size_t cpp_file_offset = file_offset + cdef size_t result + + with nogil: + result = deref(self._handle).read( + info.first, + info.second, + cpp_file_offset, + ) + + return result def pread(self, buf, size: Optional[int], file_offset: int) -> IOFuture: cdef pair[uintptr_t, size_t] info = parse_buffer_argument(buf, size, True) - return _wrap_io_future( - deref(self._handle).pread( + cdef size_t cpp_file_offset = file_offset + cdef future[size_t] fut + + with nogil: + fut = deref(self._handle).pread( info.first, info.second, - file_offset, + cpp_file_offset, ) - ) + + return _wrap_io_future(fut) diff --git a/python/kvikio/kvikio/_nvcomp.py b/python/kvikio/kvikio/_nvcomp.py deleted file mode 100644 index 5606ad5ce5..0000000000 --- a/python/kvikio/kvikio/_nvcomp.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
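The new RemoteFile.open staticmethod added above forwards to kvikio::RemoteHandle::open with an optional endpoint-type allow-list. A hedged usage sketch of the low-level wrapper (URL hypothetical; the public kvikio API may wrap this differently):

    import numpy as np
    from kvikio._lib.remote_handle import RemoteEndpointType, RemoteFile

    f = RemoteFile.open(
        "https://example.com/data.bin",   # hypothetical URL
        RemoteEndpointType.AUTO,           # let kvikio detect the endpoint type
        [RemoteEndpointType.HTTP, RemoteEndpointType.S3_PUBLIC],  # optional allow-list
        None,                              # nbytes: query the server for the file size
    )

    print(f, f.remote_endpoint_type(), f.nbytes())

    buf = np.empty(f.nbytes(), dtype=np.uint8)
    n_read = f.read(buf, buf.nbytes, 0)    # read(buf, size, file_offset)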
- -from enum import Enum - -import cupy as cp -import numpy as np - -import kvikio._lib.libnvcomp as _lib -from kvikio._lib.arr import asarray - -_dtype_map = { - cp.dtype("int8"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_CHAR, - cp.dtype("uint8"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_UCHAR, - cp.dtype("int16"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_SHORT, - cp.dtype("uint16"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_USHORT, - cp.dtype("int32"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_INT, - cp.dtype("uint32"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_UINT, - cp.dtype("int64"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_LONGLONG, - cp.dtype("uint64"): _lib.pyNvcompType_t.pyNVCOMP_TYPE_ULONGLONG, -} - - -def cp_to_nvcomp_dtype(in_type: cp.dtype) -> Enum: - """Convert np/cp dtypes to nvcomp integral dtypes. - - Parameters - ---------- - in_type - A type argument that can be used to initialize a cupy/numpy dtype. - - Returns - ------- - int - The value of the NVCOMP_TYPE for supported dtype. - """ - cp_type = cp.dtype(in_type) - return _dtype_map[cp_type] - - -class nvCompManager: - """Base class for nvComp Compression Managers. - - Compression managers compress uncompressed data and decompress the result. - - Child types of nvCompManager implement only their constructor, as they each - take different options to build. The rest of their implementation is - in nvCompManager. - - nvCompManager also keeps all of the options for its child types. - """ - - _manager: _lib._nvcompManager = None - config: dict = {} - decompression_config: dict = {} - - # This is a python option: What type was the data when it was passed in? - # This is used only for returning a decompressed view of the original - # datatype. Untested so far. - input_type = cp.int8 - - # Default options exist for every option type for every class that inherits - # from nvCompManager, which takes advantage of the below property-setting - # code. - chunk_size: int = 1 << 16 - data_type: _lib.pyNvcompType_t = _lib.pyNvcompType_t.pyNVCOMP_TYPE_UCHAR - # Some classes have this defined as type, some as data_type. - type: _lib.pyNvcompType_t = _lib.pyNvcompType_t.pyNVCOMP_TYPE_UCHAR - - # Bitcomp Defaults - bitcomp_algo: int = 0 - - # Gdeflate defaults - algo: int = 0 - - def __init__(self, kwargs): - """Stores the results of all input arguments as class members. - - This code does type correction, fixing inputs to have an expected - shape before calling one of the nvCompManager methods on a child - class. - - Special case: Convert data_type to a _lib.pyNvcompType_t - """ - # data_type will be passed in as a python object. Convert it to - # a C++ nvcompType_t here. - if kwargs.get("data_type"): - if not isinstance(kwargs["data_type"], _lib.pyNvcompType_t): - kwargs["input_type"] = kwargs.get("data_type") - kwargs["data_type"] = cp_to_nvcomp_dtype( - cp.dtype(kwargs["data_type"]).type - ) - # Special case: Convert type to a _lib.pyNvcompType_t - if kwargs.get("type"): - if not isinstance(kwargs["type"], _lib.pyNvcompType_t): - kwargs["input_type"] = kwargs.get("type") - kwargs["type"] = cp_to_nvcomp_dtype(cp.dtype(kwargs["type"]).type) - for k, v in kwargs.items(): - setattr(self, k, v) - - def compress(self, data: cp.ndarray) -> cp.ndarray: - """Compress a buffer. - - Parameters - ---------- - data: cp.ndarray - A GPU buffer of data to compress. - - Returns - ------- - cp.ndarray - A GPU buffer of compressed bytes. - """ - # TODO: An option: check if incoming data size matches the size of the - # last incoming data, and reuse temp and out buffer if so. 
- data_size = data.size * data.itemsize - self.config = self._manager.configure_compression(data_size) - self.compress_out_buffer = cp.empty( - self.config["max_compressed_buffer_size"], dtype="uint8" - ) - size = self._manager.compress(asarray(data), asarray(self.compress_out_buffer)) - return self.compress_out_buffer[0:size] - - def decompress(self, data: cp.ndarray) -> cp.ndarray: - """Decompress a GPU buffer. - - Parameters - ---------- - data: cp.ndarray - A GPU buffer of data to decompress. - - Returns - ------- - cp.ndarray - An array of `self.dtype` produced after decompressing the input argument. - """ - self.decompression_config = ( - self._manager.configure_decompression_with_compressed_buffer(asarray(data)) - ) - decomp_buffer = cp.empty( - self.decompression_config["decomp_data_size"], dtype="uint8" - ) - self._manager.decompress(asarray(decomp_buffer), asarray(data)) - return decomp_buffer.view(self.input_type) - - def configure_compression(self, data_size: int) -> dict: - """Return the compression configuration object. - - Parameters - ---------- - data_size: int - The size of the buffer that is staged to be compressed. - - Returns - ------- - dict { - "uncompressed_buffer_size": The size of the input data - "max_compressed_buffer_size": The maximum size of the compressed data. The - size of the buffer that must be allocated before calling compress. - "num_chunks": The number of configured chunks to compress the data over - } - """ - return self._manager.configure_compression(data_size) - - def configure_decompression_with_compressed_buffer( - self, data: cp.ndarray - ) -> cp.ndarray: - """Return the decompression configuration object. - - Parameters - ---------- - data: cp.ndarray - A GPU buffer of previously compressed data. - - Returns - ------- - dict { - "decomp_data_size": The size of each decompression chunk. - "num_chunks": The number of chunks that the decompressed data is returned - in. - } - """ - return self._manager.configure_decompression_with_compressed_buffer( - asarray(data) - ) - - def get_compressed_output_size(self, comp_buffer: cp.ndarray) -> int: - """Return the actual size of compression result. - - Returns the number of bytes that should be copied out of - `comp_buffer`. - - Parameters - ---------- - comp_buffer: cp.ndarray - A GPU buffer that has been previously compressed. - - Returns - ------- - int - """ - return self._manager.get_compressed_output_size(asarray(comp_buffer)) - - -class ANSManager(nvCompManager): - def __init__(self, **kwargs): - """Initialize an ANSManager object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - Defaults to 4096. - """ - super().__init__(kwargs) - - self._manager = _lib._ANSManager(self.chunk_size) - - -class BitcompManager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU BitcompCompressor object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - Defaults to 4096. - """ - super().__init__(kwargs) - - self._manager = _lib._BitcompManager( - self.chunk_size, - self.data_type.value, - self.bitcomp_algo, - ) - - -class CascadedManager(nvCompManager): - def __init__(self, **kwargs): - """Initialize a CascadedManager for a specific dtype. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. 
- - Parameters - ---------- - chunk_size: int (optional) - Defaults to 4096 and can't currently be changed. - dtype: cp.dtype (optional) - The dtype of the input buffer to be compressed. - num_RLEs: int (optional) - Number of Run-Length Encoders to use, see [algorithms overview.md]( - https://github.com/NVIDIA/nvcomp/blob/main/doc/algorithms_overview.md#run-length-encoding-rle) # noqa: E501 - num_deltas: int (optional) - Number of Delta Encoders to use, see [algorithms overview.md]( - https://github.com/NVIDIA/nvcomp/blob/main/doc/algorithms_overview.md#delta-encoding) # noqa: E501 - use_bp: bool (optional) - Enable Bitpacking, see [algorithms overview.md]( - https://github.com/NVIDIA/nvcomp/blob/main/doc/algorithms_overview.md#bitpacking) # noqa: E501 - """ - super().__init__(kwargs) - default_options = { - "chunk_size": 1 << 12, - "type": np.int32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - } - # Replace any options that may have been excluded, they are not optional. - for k, v in default_options.items(): - try: - getattr(self, k) - except Exception: - setattr(self, k, v) - - self.options = { - "chunk_size": self.chunk_size, - "type": self.type, - "num_RLEs": self.num_RLEs, - "num_deltas": self.num_deltas, - "use_bp": self.use_bp, - } - self._manager = _lib._CascadedManager(default_options) - - -class GdeflateManager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU GdeflateCompressor object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - algo: int (optional) - Integer in the range [0, 1, 2]. Only algorithm #0 is currently - supported. - """ - super().__init__(kwargs) - - self._manager = _lib._GdeflateManager(self.chunk_size, self.algo) - - -class LZ4Manager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU LZ4Compressor object. - - Used to compress and decompress GPU buffers of a specific dtype. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - The size of each chunk of data to decompress indepentently with - LZ4. Must be within the range of [32768, 16777216]. Larger sizes will - result in higher compression, but with decreased parallelism. The - recommended size is 65536. - Defaults to the recommended size. - data_type: pyNVCOMP_TYPE (optional) - The data type returned for decompression. - Defaults to pyNVCOMP_TYPE.UCHAR - """ - super().__init__(kwargs) - self._manager = _lib._LZ4Manager(self.chunk_size, self.data_type.value) - - -class SnappyManager(nvCompManager): - def __init__(self, **kwargs): - """Create a GPU SnappyCompressor object. - - Used to compress and decompress GPU buffers. - All parameters are optional and will be set to usable defaults. - - Parameters - ---------- - chunk_size: int (optional) - """ - super().__init__(kwargs) - self._manager = _lib._SnappyManager(self.chunk_size) - - -class ManagedDecompressionManager(nvCompManager): - def __init__(self, compressed_buffer): - """Create a Managed compressor using the - create_manager factory method. - - This function is used in order to automatically - identify which compression algorithm was used on - an input buffer. - - It returns a ManagedDecompressionManager that can - then be used normally to decompress the unknown - compressed binary data, or compress other data - into the same format. 
- - Parameters - ---------- - compressed_buffer: cp.ndarray - A buffer of compressed bytes of unknown origin. - """ - super().__init__({}) - self._manager = _lib._ManagedManager(asarray(compressed_buffer)) diff --git a/python/kvikio/kvikio/_nvcomp_codec.py b/python/kvikio/kvikio/_nvcomp_codec.py deleted file mode 100644 index dc60d9c7dc..0000000000 --- a/python/kvikio/kvikio/_nvcomp_codec.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -from typing import Any, Mapping, Optional, Sequence - -import cupy as cp -import cupy.typing -from numcodecs.compat import ensure_contiguous_ndarray_like - -from kvikio._lib.libnvcomp_ll import SUPPORTED_ALGORITHMS -from kvikio.numcodecs import BufferLike, CudaCodec - - -class NvCompBatchCodec(CudaCodec): - """Codec that uses batch algorithms from nvCOMP library. - - An algorithm is selected using `algorithm` parameter. - If the algorithm takes additional options, they can be - passed to the algorithm using `options` dictionary. - """ - - # Header stores original uncompressed size. This is required to enable - # data compatibility between existing numcodecs codecs and NvCompBatchCodec. - HEADER_SIZE_BYTES: int = 4 - - codec_id: str = "nvcomp_batch" - algorithm: str - options: Mapping[str, Any] - - def __init__( - self, - algorithm: str, - options: Optional[Mapping[str, Any]] = None, - stream: Optional[cp.cuda.Stream] = None, - ) -> None: - algo_id = algorithm.lower() - algo_t = SUPPORTED_ALGORITHMS.get(algo_id, None) - if algo_t is None: - raise ValueError( - f"{algorithm} is not supported. " - f"Must be one of: {list(SUPPORTED_ALGORITHMS.keys())}" - ) - - self.algorithm = algo_id - self.options = dict(options) if options is not None else {} - - # Create an algorithm. - self._algo = algo_t(**self.options) - # Use default stream, if needed. - self._stream = stream if stream is not None else cp.cuda.Stream.ptds - - def encode(self, buf: BufferLike) -> cupy.typing.NDArray: - return self.encode_batch([buf])[0] - - def encode_batch(self, bufs: Sequence[Any]) -> Sequence[Any]: - """Encode data in `bufs` using nvCOMP. - - Parameters - ---------- - bufs : - Data to be encoded. Each buffer in the list may be any object - supporting the new-style buffer protocol. - - Returns - ------- - List of encoded buffers. Each buffer may be any object supporting - the new-style buffer protocol. - """ - num_chunks = len(bufs) - if num_chunks == 0: - return [] - - bufs = [cp.asarray(ensure_contiguous_ndarray_like(b)) for b in bufs] - buf_sizes = [b.size * b.itemsize for b in bufs] - - max_chunk_size = max(buf_sizes) - - # Get temp and output buffer sizes. - temp_size = self._algo.get_compress_temp_size(num_chunks, max_chunk_size) - comp_chunk_size = self._algo.get_compress_chunk_size(max_chunk_size) - - # Prepare data and size buffers. - # uncomp_chunks is used as a container that stores pointers to actual chunks. - # nvCOMP requires this and sizes buffers to be in GPU memory. - uncomp_chunks = cp.array([b.data.ptr for b in bufs], dtype=cp.uintp) - uncomp_chunk_sizes = cp.array(buf_sizes, dtype=cp.uint64) - - temp_buf = cp.empty(temp_size, dtype=cp.uint8) - - comp_chunks = cp.empty((num_chunks, comp_chunk_size), dtype=cp.uint8) - # Array of pointers to each compressed chunk. - comp_chunk_ptrs = cp.array([c.data.ptr for c in comp_chunks], dtype=cp.uintp) - # Resulting compressed chunk sizes. 
- comp_chunk_sizes = cp.empty(num_chunks, dtype=cp.uint64) - - self._algo.compress( - uncomp_chunks, - uncomp_chunk_sizes, - max_chunk_size, - num_chunks, - temp_buf, - comp_chunk_ptrs, - comp_chunk_sizes, - self._stream, - ) - - res = [] - # Copy to host to subsequently avoid many smaller D2H copies. - comp_chunks = cp.asnumpy(comp_chunks, self._stream) - comp_chunk_sizes = cp.asnumpy(comp_chunk_sizes, self._stream) - self._stream.synchronize() - - for i in range(num_chunks): - res.append(comp_chunks[i, : comp_chunk_sizes[i]].tobytes()) - return res - - def decode(self, buf: BufferLike, out: Optional[BufferLike] = None) -> BufferLike: - return self.decode_batch([buf], [out])[0] - - def decode_batch( - self, bufs: Sequence[Any], out: Optional[Sequence[Any]] = None - ) -> Sequence[Any]: - """Decode data in `bufs` using nvCOMP. - - Parameters - ---------- - bufs : - Encoded data. Each buffer in the list may be any object - supporting the new-style buffer protocol. - out : - List of writeable buffers to store decoded data. - N.B. if provided, each buffer must be exactly the right size - to store the decoded data. - - Returns - ------- - List of decoded buffers. Each buffer may be any object supporting - the new-style buffer protocol. - """ - num_chunks = len(bufs) - if num_chunks == 0: - return [] - - # TODO(akamenev): check only first buffer, assuming they are all - # of the same kind. - is_host_buffer = not hasattr(bufs[0], "__cuda_array_interface__") - if is_host_buffer: - bufs = [cp.asarray(ensure_contiguous_ndarray_like(b)) for b in bufs] - - # Prepare compressed chunks buffers. - comp_chunks = cp.array([b.data.ptr for b in bufs], dtype=cp.uintp) - comp_chunk_sizes = cp.array([b.size for b in bufs], dtype=cp.uint64) - - # Get uncompressed chunk sizes. - uncomp_chunk_sizes = self._algo.get_decompress_size( - comp_chunks, - comp_chunk_sizes, - self._stream, - ) - - # Check whether the uncompressed chunks are all the same size. - # cupy.unique returns sorted sizes. - sorted_chunk_sizes = cp.unique(uncomp_chunk_sizes) - max_chunk_size = sorted_chunk_sizes[-1].item() - is_equal_chunks = sorted_chunk_sizes.shape[0] == 1 - - # Get temp buffer size. - temp_size = self._algo.get_decompress_temp_size(num_chunks, max_chunk_size) - - temp_buf = cp.empty(temp_size, dtype=cp.uint8) - - # Prepare uncompressed chunks buffers. - # First, allocate chunks of max_chunk_size and then - # copy the pointers to a pointer array in GPU memory as required by nvCOMP. - # For performance reasons, we use max_chunk_size so we can create - # a rectangular array with the same pointer increments. - uncomp_chunks = cp.empty((num_chunks, max_chunk_size), dtype=cp.uint8) - p_start = uncomp_chunks.data.ptr - uncomp_chunk_ptrs = cp.uint64(p_start) + ( - cp.arange(0, num_chunks * max_chunk_size, max_chunk_size, dtype=cp.uint64) - ) - - # TODO(akamenev): currently we provide the following 2 buffers to decompress() - # but do not check/use them afterwards since some of the algos - # (e.g. LZ4 and Gdeflate) do not require it and run faster - # without those arguments passed, while other algos (e.g. zstd) require - # these buffers to be valid. - actual_uncomp_chunk_sizes = cp.empty(num_chunks, dtype=cp.uint64) - statuses = cp.empty(num_chunks, dtype=cp.int32) - - self._algo.decompress( - comp_chunks, - comp_chunk_sizes, - num_chunks, - temp_buf, - uncomp_chunk_ptrs, - uncomp_chunk_sizes, - actual_uncomp_chunk_sizes, - statuses, - self._stream, - ) - - # If all chunks are the same size, we can just return uncomp_chunks. 
- if is_equal_chunks and out is None: - return cp.asnumpy(uncomp_chunks) if is_host_buffer else uncomp_chunks - - res = [] - uncomp_chunk_sizes = uncomp_chunk_sizes.get() - for i in range(num_chunks): - ret = uncomp_chunks[i, : uncomp_chunk_sizes[i]] - if out is None or out[i] is None: - res.append(cp.asnumpy(ret) if is_host_buffer else ret) - else: - o = ensure_contiguous_ndarray_like(out[i]) - if hasattr(o, "__cuda_array_interface__"): - cp.copyto(o, ret.view(dtype=o.dtype), casting="no") - else: - cp.asnumpy(ret.view(dtype=o.dtype), out=o, stream=self._stream) - res.append(o) - self._stream.synchronize() - - return res - - def __repr__(self): - return ( - f"{self.__class__.__name__}" - f"(algorithm={self.algorithm!r}, options={self.options!r})" - ) diff --git a/python/kvikio/kvikio/benchmarks/single_node_io.py b/python/kvikio/kvikio/benchmarks/single_node_io.py index e3b152cbaf..ddc6680167 100644 --- a/python/kvikio/kvikio/benchmarks/single_node_io.py +++ b/python/kvikio/kvikio/benchmarks/single_node_io.py @@ -8,7 +8,7 @@ import statistics import tempfile from time import perf_counter as clock -from typing import Any, ContextManager, Dict, Union +from typing import ContextManager, Union import cupy from dask.utils import format_bytes, parse_bytes @@ -19,21 +19,6 @@ from kvikio.benchmarks.utils import parse_directory, pprint_sys_info -def get_zarr_compressors() -> Dict[str, Any]: - """Returns a dict of available Zarr compressors""" - try: - import kvikio.zarr - except ImportError: - return {} - try: - compressors = kvikio.zarr.nvcomp_compressors - except AttributeError: - # zarr-python 3.x - return {} - else: - return {c.__name__.lower(): c for c in compressors} - - def create_data(nbytes): """Return a random uint8 cupy array""" return cupy.arange(nbytes, dtype="uint8") @@ -223,10 +208,6 @@ def run_zarr(args): if not kvikio.zarr.supported: raise RuntimeError(f"requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}") - compressor = None - if args.zarr_compressor is not None: - compressor = get_zarr_compressors()[args.zarr_compressor]() - a = create_data(args.nbytes) shutil.rmtree(str(dir_path), ignore_errors=True) @@ -236,7 +217,6 @@ def run_zarr(args): z = zarr.array( a, chunks=False, - compressor=compressor, store=kvikio.zarr.GDSStore(dir_path), meta_array=cupy.empty(()), ) @@ -277,8 +257,6 @@ def main(args): print(f"directory | {args.dir}") print(f"nthreads | {args.nthreads}") print(f"nruns | {args.nruns}") - if args.zarr_compressor is not None: - print(f"Zarr compressor | {args.zarr_compressor}") print("==================================") # Run each benchmark using the requested APIs @@ -354,16 +332,6 @@ def pprint_api_res(name, samples): choices=tuple(API.keys()) + ("all",), help="List of APIs to use {%(choices)s}", ) - parser.add_argument( - "--zarr-compressor", - metavar="COMPRESSOR", - default=None, - choices=tuple(get_zarr_compressors().keys()), - help=( - "Set a nvCOMP compressor to use with Zarr " - "{%(choices)s} (default: %(default)s)" - ), - ) args = parser.parse_args() if "all" in args.api: diff --git a/python/kvikio/kvikio/benchmarks/zarr_io.py b/python/kvikio/kvikio/benchmarks/zarr_io.py index 7882fcad8c..14c9fb38d3 100644 --- a/python/kvikio/kvikio/benchmarks/zarr_io.py +++ b/python/kvikio/kvikio/benchmarks/zarr_io.py @@ -13,9 +13,9 @@ from typing import ContextManager, Union import cupy -import numcodecs.blosc import numpy import zarr +import zarr.storage from dask.utils import format_bytes, parse_bytes import kvikio @@ -23,62 +23,47 @@ import kvikio.zarr from 
kvikio.benchmarks.utils import drop_vm_cache, parse_directory, pprint_sys_info -if not kvikio.zarr.supported: - raise RuntimeError(f"requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}") - -compressors = { - "none": (None, None), - "lz4": (numcodecs.blosc.Blosc(cname="lz4"), kvikio.zarr.LZ4()), -} - def create_src_data(args): return cupy.random.random(args.nelem, dtype=args.dtype) def run_kvikio(args): - dir_path = args.dir / "kvikio" - shutil.rmtree(str(dir_path), ignore_errors=True) - - # Get the GPU compressor - compressor = compressors[args.compressor][1] - - src = create_src_data(args) - - # Write - if args.drop_vm_cache: - drop_vm_cache() - t0 = clock() - z = zarr.create( - shape=(args.nelem,), - chunks=(args.chunksize,), - dtype=args.dtype, - compressor=compressor, - store=kvikio.zarr.GDSStore(dir_path), - meta_array=cupy.empty(()), - ) - z[:] = src - os.sync() - write_time = clock() - t0 - - # Read - if args.drop_vm_cache: - drop_vm_cache() - t0 = clock() - res = z[:] - read_time = clock() - t0 - assert res.nbytes == args.nbytes - - return read_time, write_time + with zarr.config.enable_gpu(): + dir_path = args.dir / "kvikio" + shutil.rmtree(str(dir_path), ignore_errors=True) + + src = create_src_data(args) + + # Write + if args.drop_vm_cache: + drop_vm_cache() + t0 = clock() + z = zarr.create( + shape=(args.nelem,), + chunks=(args.chunksize,), + dtype=args.dtype, + store=kvikio.zarr.GDSStore(dir_path), + ) + z[:] = src + os.sync() + write_time = clock() - t0 + + # Read + if args.drop_vm_cache: + drop_vm_cache() + t0 = clock() + res = z[:] + read_time = clock() - t0 + assert res.nbytes == args.nbytes + + return read_time, write_time def run_posix(args): dir_path = args.dir / "posix" shutil.rmtree(str(dir_path), ignore_errors=True) - # Get the CPU compressor - compressor = compressors[args.compressor][0] - src = create_src_data(args) # Write @@ -89,9 +74,7 @@ def run_posix(args): shape=(args.nelem,), chunks=(args.chunksize,), dtype=args.dtype, - compressor=compressor, - store=zarr.DirectoryStore(dir_path), - meta_array=numpy.empty(()), + store=zarr.storage.LocalStore(dir_path), ) z[:] = src.get() os.sync() @@ -135,7 +118,6 @@ def main(args): print(f"directory | {args.dir}") print(f"nthreads | {args.nthreads}") print(f"nruns | {args.nruns}") - print(f"compressor | {args.compressor}") print("==================================") # Run each benchmark using the requested APIs @@ -226,16 +208,6 @@ def pprint_api_res(name, samples): choices=tuple(API.keys()) + ("all",), help="List of APIs to use {%(choices)s}", ) - parser.add_argument( - "--compressor", - metavar="COMPRESSOR", - default="none", - choices=tuple(compressors.keys()), - help=( - "Set a nvCOMP compressor to use with Zarr " - "{%(choices)s} (default: %(default)s)" - ), - ) parser.add_argument( "--drop-vm-cache", action="store_true", diff --git a/python/kvikio/kvikio/defaults.py b/python/kvikio/kvikio/defaults.py index be57d2739c..ee0ebf5f95 100644 --- a/python/kvikio/kvikio/defaults.py +++ b/python/kvikio/kvikio/defaults.py @@ -124,7 +124,7 @@ def set(*config) -> ConfigContextManager: - ``"bounce_buffer_size"`` - ``"http_max_attempts"`` - ``"http_status_codes"`` - - ``*http_timeout*`` + - ``"http_timeout"`` Returns ------- @@ -167,6 +167,7 @@ def get(config_name: str) -> Any: - ``"bounce_buffer_size"`` - ``"http_max_attempts"`` - ``"http_status_codes"`` + - ``"http_timeout"`` Returns ------- diff --git a/python/kvikio/kvikio/mmap.py b/python/kvikio/kvikio/mmap.py new file mode 100644 index 0000000000..3b506ded02 --- /dev/null +++ 
b/python/kvikio/kvikio/mmap.py @@ -0,0 +1,171 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import os +import stat +from typing import Any, Optional + +from kvikio._lib.mmap import InternalMmapHandle +from kvikio.cufile import IOFuture + + +class Mmap: + """Handle of a memory-mapped file""" + + def __init__( + self, + file_path: os.PathLike, + flags: str = "r", + initial_map_size: Optional[int] = None, + initial_map_offset: int = 0, + mode: int = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH, + map_flags: Optional[int] = None, + ): + """Construct a new memory-mapped file handle + + Parameters + ---------- + file_path : os.PathLike + File path. + flags : str, optional + + - ``r``: Open for reading (default) + - ``w``: (Not implemented yet) Open for writing, truncating the file first + - ``a``: (Not implemented yet) Open for writing, appending to the end of + file if it exists + - ``+``: (Not implemented yet) Open for updating (reading and writing) + initial_map_size : int, optional + Size in bytes of the mapped region. If not specified, map the region + starting from ``initial_map_offset`` to the end of file. + initial_map_offset : int, optional + File offset of the mapped region. Default is 0. + mode : int, optional + Access mode (permissions) to use if creating a new file. Default is + 0644 (octal), 420 (decimal). + map_flags : int, optional + Flags to be passed to the system call ``mmap``. See `mmap(2)` for details. + """ + self._handle = InternalMmapHandle( + file_path, flags, initial_map_size, initial_map_offset, mode, map_flags + ) + + def initial_map_size(self) -> int: + """Size in bytes of the mapped region when the mapping handle was constructed + + Returns + ------- + int + Initial size of the mapped region. + """ + return self._handle.initial_map_size() + + def initial_map_offset(self) -> int: + """File offset of the mapped region when the mapping handle was constructed + + Returns + ------- + int + Initial file offset of the mapped region. + """ + return self._handle.initial_map_offset() + + def file_size(self) -> int: + """Get the file size if the file is open + + Returns 0 if the file is closed. + + Returns + ------- + int + The file size in bytes. + """ + return self._handle.file_size() + + def close(self) -> None: + """Close the mapping handle if it is open; do nothing otherwise + + Unmaps the memory region and closes the underlying file descriptor. + """ + self._handle.close() + + def closed(self) -> bool: + """Whether the mapping handle is closed + + Returns + ------- + bool + Boolean answer. + """ + return self._handle.closed() + + def read(self, buf: Any, size: Optional[int] = None, offset: int = 0) -> int: + """Sequential read ``size`` bytes from the file to the destination buffer + ``buf`` + + Parameters + ---------- + buf : buffer-like or array-like + Address of the host or device memory (destination buffer). + size : int, optional + Size in bytes to read. If not specified, read starts from ``offset`` + to the end of file. + offset : int, optional + File offset. Default is 0. + + Returns + ------- + int + Number of bytes that have been read. + + Raises + ------ + IndexError + If the read region specified by ``offset`` and ``size`` is outside the + initial region specified when the mapping handle was constructed. + RuntimeError + If the mapping handle is closed. 
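A minimal usage sketch of the new kvikio.Mmap handle added in this file may help readers of the diff; the file path, sizes, and offset are illustrative only, and the calls mirror the read docstring just above, the parallel pread defined just below, and the new test_mmap.py tests later in this diff.

    import pathlib

    import numpy as np

    import kvikio

    # Illustrative file: 1 MiB of bytes written to a scratch location.
    path = pathlib.Path("/tmp/mmap-demo.bin")
    src = np.arange(1024 * 1024, dtype="u1")
    src.tofile(path)

    # Map the whole file read-only and read it sequentially into a host buffer.
    handle = kvikio.Mmap(path, "r")
    dst = np.zeros_like(src)
    assert handle.read(dst) == src.nbytes

    # Parallel read of a sub-range; the range must stay inside the initially
    # mapped region, otherwise an IndexError is raised.
    part = np.zeros(4096, dtype="u1")
    future = handle.pread(part, size=part.nbytes, offset=8192)
    assert future.get() == part.nbytes

    handle.close()
    assert handle.closed()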
+ """ + return self._handle.read(buf, size, offset) + + def pread( + self, + buf: Any, + size: Optional[int] = None, + offset: int = 0, + task_size: Optional[int] = None, + ) -> IOFuture: + """Parallel read ``size`` bytes from the file to the destination buffer ``buf`` + + Parameters + ---------- + buf : buffer-like or array-like + Address of the host or device memory (destination buffer). + size : int, optional + Size in bytes to read. If not specified, read starts from ``offset`` + to the end of file. + offset : int, optional + File offset. Default is 0. + task_size : int, optional + Size of each task in bytes for parallel execution. If None, uses + the default task size from :func:`kvikio.defaults.task_size`. + + Returns + ------- + IOFuture + Future that on completion returns the size of bytes that were successfully + read. + + Raises + ------ + IndexError + If the read region specified by ``offset`` and ``size`` is outside the + initial region specified when the mapping handle was constructed. + RuntimeError + If the mapping handle is closed. + + Notes + ----- + The returned IOFuture object's ``get()`` should not be called after the lifetime + of the MmapHandle object ends. Otherwise, the behavior is undefined. + """ + return IOFuture(self._handle.pread(buf, size, offset, task_size)) diff --git a/python/kvikio/kvikio/nvcomp.py b/python/kvikio/kvikio/nvcomp.py deleted file mode 100644 index 3b62e51e8c..0000000000 --- a/python/kvikio/kvikio/nvcomp.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - - -from kvikio._nvcomp import ( # noqa: F401 - ANSManager, - BitcompManager, - CascadedManager, - GdeflateManager, - LZ4Manager, - ManagedDecompressionManager, - SnappyManager, - cp_to_nvcomp_dtype, - nvCompManager, -) -from kvikio.utils import kvikio_deprecate_module - -kvikio_deprecate_module( - "Use the official nvCOMP API from 'nvidia.nvcomp' instead.", since="25.06" -) diff --git a/python/kvikio/kvikio/nvcomp_codec.py b/python/kvikio/kvikio/nvcomp_codec.py deleted file mode 100644 index ded350cdd5..0000000000 --- a/python/kvikio/kvikio/nvcomp_codec.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - -from kvikio._nvcomp_codec import NvCompBatchCodec # noqa: F401 -from kvikio.utils import kvikio_deprecate_module - -kvikio_deprecate_module( - "Use the official nvCOMP API from 'nvidia.nvcomp' instead.", since="25.06" -) diff --git a/python/kvikio/kvikio/remote_file.py b/python/kvikio/kvikio/remote_file.py index 41ec216e5c..2064320914 100644 --- a/python/kvikio/kvikio/remote_file.py +++ b/python/kvikio/kvikio/remote_file.py @@ -3,6 +3,7 @@ from __future__ import annotations +import enum import functools import urllib.parse from typing import Optional @@ -10,6 +11,55 @@ from kvikio.cufile import IOFuture +class RemoteEndpointType(enum.Enum): + """ + Types of remote file endpoints supported by KvikIO. + + This enum defines the different protocols and services that can be used + to access remote files. It is used to specify or detect the type of + remote endpoint when opening files. + + Attributes + ---------- + AUTO : int + Automatically detect the endpoint type from the URL. KvikIO will + attempt to infer the appropriate protocol based on the URL format. + S3 : int + AWS S3 endpoint using credentials-based authentication. 
Requires + AWS environment variables (such as AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, + AWS_DEFAULT_REGION) to be set. + S3_PUBLIC : int + AWS S3 endpoint for publicly accessible objects. No credentials required as the + objects have public read permissions enabled. Used for open datasets and public + buckets. + S3_PRESIGNED_URL : int + AWS S3 endpoint using a presigned URL. No credentials required as + authentication is embedded in the URL with time-limited access. + WEBHDFS : int + Apache Hadoop WebHDFS (Web-based Hadoop Distributed File System) + endpoint for accessing files stored in HDFS over HTTP/HTTPS. + HTTP : int + Generic HTTP or HTTPS endpoint for accessing files from web servers. + This is used for standard web resources that do not fit the other + specific categories. + + See Also + -------- + RemoteFile.open : Factory method that uses this enum to specify endpoint types. + """ + + AUTO = 0 + S3 = 1 + S3_PUBLIC = 2 + S3_PRESIGNED_URL = 3 + WEBHDFS = 4 + HTTP = 5 + + @staticmethod + def _map_to_internal(remote_endpoint_type: RemoteEndpointType): + return _get_remote_module().RemoteEndpointType[remote_endpoint_type.name] + + @functools.cache def is_remote_file_available() -> bool: """Check if the remote module is available""" @@ -57,7 +107,7 @@ def open_http( url: str, nbytes: Optional[int] = None, ) -> RemoteFile: - """Open a http file. + """Open an HTTP/HTTPS file. Parameters ---------- @@ -67,7 +117,7 @@ def open_http( The size of the file. If None, KvikIO will ask the server for the file size. """ - return RemoteFile(_get_remote_module().RemoteFile.open_http(url, nbytes)) + return cls(_get_remote_module().RemoteFile.open_http(url, nbytes)) @classmethod def open_s3( @@ -97,7 +147,7 @@ def open_s3( The size of the file. If None, KvikIO will ask the server for the file size. """ - return RemoteFile( + return cls( _get_remote_module().RemoteFile.open_s3(bucket_name, object_name, nbytes) ) @@ -133,15 +183,187 @@ def open_s3_url( """ parsed_result = urllib.parse.urlparse(url.lower()) if parsed_result.scheme in ("http", "https"): - return RemoteFile( + return cls( _get_remote_module().RemoteFile.open_s3_from_http_url(url, nbytes) ) if parsed_result.scheme == "s3": - return RemoteFile( - _get_remote_module().RemoteFile.open_s3_from_s3_url(url, nbytes) - ) + return cls(_get_remote_module().RemoteFile.open_s3_from_s3_url(url, nbytes)) raise ValueError(f"Unsupported protocol: {url}") + @classmethod + def open_s3_public(cls, url: str, nbytes: Optional[int] = None) -> RemoteFile: + """Open a publicly accessible AWS S3 file. + + Parameters + ---------- + url + URL to the remote file. + nbytes + The size of the file. If None, KvikIO will ask the server + for the file size. + """ + return cls(_get_remote_module().RemoteFile.open_s3_public(url, nbytes)) + + @classmethod + def open_s3_presigned_url( + cls, + presigned_url: str, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """Open an AWS S3 file from a presigned URL. + + Parameters + ---------- + presigned_url + Presigned URL to the remote file. + nbytes + The size of the file. If None, KvikIO will ask the server + for the file size. + """ + return cls( + _get_remote_module().RemoteFile.open_s3_presigned_url(presigned_url, nbytes) + ) + + @classmethod + def open_webhdfs( + cls, + url: str, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """Open a file on the Apache Hadoop Distributed File System (HDFS) using WebHDFS. 
+ + If KvikIO is run within a Docker, the argument ``--network host`` needs to be + passed to the ``docker run`` command. + + Parameters + ---------- + url + URL to the remote file. + nbytes + The size of the file. If None, KvikIO will ask the server for the file + size. + """ + return cls(_get_remote_module().RemoteFile.open_webhdfs(url, nbytes)) + + @classmethod + def open( + cls, + url: str, + remote_endpoint_type: RemoteEndpointType = RemoteEndpointType.AUTO, + allow_list: Optional[list] = None, + nbytes: Optional[int] = None, + ) -> RemoteFile: + """ + Create a remote file handle from a URL. + + This function creates a RemoteFile for reading data from various remote + endpoints including HTTP/HTTPS servers, AWS S3 buckets, S3 for public access, + S3 presigned URLs, and WebHDFS. The endpoint type can be automatically detected + from the URL or explicitly specified. + + Parameters + ---------- + url : str + The URL of the remote file. Supported formats include: + + - S3 with credentials + - S3 for public access + - S3 presigned URL + - WebHDFS + - HTTP/HTTPS + remote_endpoint_type : RemoteEndpointType, optional + The type of remote endpoint. Default is :class:`RemoteEndpointType.AUTO` + which automatically detects the endpoint type from the URL. Can be + explicitly set to :class:`RemoteEndpointType.S3`, + :class:`RemoteEndpointType.S3_PUBLIC`, + :class:`RemoteEndpointType.S3_PRESIGNED_URL`, + :class:`RemoteEndpointType.WEBHDFS`, or :class:`RemoteEndpointType.HTTP` + to force a specific endpoint type. + allow_list : list of RemoteEndpointType, optional + List of allowed endpoint types. If provided: + + - If remote_endpoint_type is :class:`RemoteEndpointType.AUTO`, types are + tried in the exact order specified until a match is found. + - In explicit mode, the specified type must be in this list, otherwise an + exception is thrown. + + If not provided, defaults to all supported types in this order: + :class:`RemoteEndpointType.S3`, + :class:`RemoteEndpointType.S3_PUBLIC`, + :class:`RemoteEndpointType.S3_PRESIGNED_URL`, + :class:`RemoteEndpointType.WEBHDFS`, and :class:`RemoteEndpointType.HTTP`. + nbytes : int, optional + File size in bytes. If not provided, the function sends an additional + request to the server to query the file size. + + Returns + ------- + RemoteFile + A RemoteFile object that can be used to read data from the remote file. + + Raises + ------ + RuntimeError + - If the URL is malformed or missing required components. + - :class:`RemoteEndpointType.AUTO` mode is used and the URL does not match + any supported endpoint type. + - The specified endpoint type is not in the `allow_list`. + - The URL is invalid for the specified endpoint type. + - Unable to connect to the remote server or determine file size + (when nbytes not provided). + + Examples + -------- + - Auto-detect endpoint type from URL: + + .. code-block:: + + handle = RemoteFile.open( + "https://bucket.s3.amazonaws.com/object?X-Amz-Algorithm=AWS4-HMAC-SHA256" + "&X-Amz-Credential=...&X-Amz-Signature=..." + ) + + - Open S3 file with explicit endpoint type: + + .. code-block:: + + handle = RemoteFile.open( + "https://my-bucket.s3.us-east-1.amazonaws.com/data.bin", + remote_endpoint_type=RemoteEndpointType.S3 + ) + + - Restrict endpoint type candidates: + + .. code-block:: + + handle = RemoteFile.open( + user_provided_url, + remote_endpoint_type=RemoteEndpointType.AUTO, + allow_list=[ + RemoteEndpointType.HTTP, + RemoteEndpointType.S3_PRESIGNED_URL + ] + ) + + - Provide known file size to skip HEAD request: + + .. 
code-block:: + + handle = RemoteFile.open( + "https://example.com/large-file.bin", + remote_endpoint_type=RemoteEndpointType.HTTP, + nbytes=1024 * 1024 * 100 # 100 MB + ) + """ + return cls( + _get_remote_module().RemoteFile.open( + url, + RemoteEndpointType._map_to_internal(remote_endpoint_type), + allow_list, + nbytes, + ) + ) + def close(self) -> None: """Close the file""" pass @@ -155,6 +377,15 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None: def __str__(self) -> str: return str(self._handle) + def remote_endpoint_type(self) -> RemoteEndpointType: + """Get the type of the remote file. + + Returns + ------- + The type of the remote file. + """ + return RemoteEndpointType[self._handle.remote_endpoint_type().name] + def nbytes(self) -> int: """Get the file size. diff --git a/python/kvikio/kvikio/zarr/__init__.py b/python/kvikio/kvikio/zarr/__init__.py index 7ec22c275a..ca6cfde7e9 100644 --- a/python/kvikio/kvikio/zarr/__init__.py +++ b/python/kvikio/kvikio/zarr/__init__.py @@ -1,10 +1,8 @@ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -from importlib import metadata as _metadata +try: + from ._zarr_python_3 import GDSStore +except ImportError as e: + raise ImportError("kvikio.zarr requires the optional 'zarr>=3' dependency") from e -from packaging.version import Version as _Version, parse as _parse - -if _parse(_metadata.version("zarr")) >= _Version("3.0.0"): - from ._zarr_python_3 import * # noqa: F401,F403 -else: - from ._zarr_python_2 import * # type: ignore[assignment] # noqa: F401,F403 +__all__ = ["GDSStore"] diff --git a/python/kvikio/kvikio/zarr/_zarr_python_2.py b/python/kvikio/kvikio/zarr/_zarr_python_2.py deleted file mode 100644 index bd1418e799..0000000000 --- a/python/kvikio/kvikio/zarr/_zarr_python_2.py +++ /dev/null @@ -1,400 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. -from __future__ import annotations - -import contextlib -import os -import os.path -from abc import abstractmethod -from typing import Any, Literal, Mapping, Optional, Sequence, Union - -import cupy -import cupy.typing -import numcodecs -import numpy -import numpy as np -import zarr -import zarr.creation -import zarr.errors -import zarr.storage -from numcodecs.abc import Codec -from numcodecs.compat import ensure_contiguous_ndarray_like -from numcodecs.registry import register_codec -from packaging.version import parse - -import kvikio -import kvikio._nvcomp -import kvikio._nvcomp_codec -import kvikio.zarr -from kvikio._nvcomp_codec import NvCompBatchCodec -from kvikio.numcodecs import BufferLike, CudaCodec - -MINIMUM_ZARR_VERSION = "2.15" - -# Is this version of zarr supported? We depend on the `Context` -# argument introduced in https://github.com/zarr-developers/zarr-python/pull/1131 -# in zarr v2.15. -supported = parse(zarr.__version__) >= parse(MINIMUM_ZARR_VERSION) - - -class GDSStore(zarr.storage.DirectoryStore): # type: ignore[name-defined] - """GPUDirect Storage (GDS) class using directories and files. - - This class works like `zarr.storage.DirectoryStore` but implements - getitems() in order to support direct reading into device memory. - It uses KvikIO for reads and writes, which in turn will use GDS - when applicable. - - Parameters - ---------- - path : string - Location of directory to use as the root of the storage hierarchy. - normalize_keys : bool, optional - If True, all store keys will be normalized to use lower case characters - (e.g. 'foo' and 'FOO' will be treated as equivalent). 
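To complement the docstring examples in the new RemoteFile.open entry point earlier in this diff, here is a short end-to-end sketch; the URL is hypothetical, and nbytes(), pread(), and remote_endpoint_type() are the handle methods shown elsewhere in the diff.

    import cupy as cp

    from kvikio.remote_file import RemoteEndpointType, RemoteFile

    # AUTO lets KvikIO infer the endpoint type; restrict the candidates tried.
    handle = RemoteFile.open(
        "https://example.com/large-file.bin",
        remote_endpoint_type=RemoteEndpointType.AUTO,
        allow_list=[RemoteEndpointType.HTTP, RemoteEndpointType.S3_PUBLIC],
    )
    print(handle.remote_endpoint_type())  # e.g. RemoteEndpointType.HTTP

    # Read the whole object into device memory in parallel.
    buf = cp.empty(handle.nbytes(), dtype="u1")
    assert handle.pread(buf).get() == handle.nbytes()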
This can be - useful to avoid potential discrepancies between case-sensitive and - case-insensitive file system. Default value is False. - dimension_separator : {'.', '/'}, optional - Separator placed between the dimensions of a chunk. - compressor_config_overwrite - If not None, use this `Mapping` to specify what is written to the Zarr metadata - file on disk (`.zarray`). Normally, Zarr writes the configuration[1] given by - the `compressor` argument to the `.zarray` file. Use this argument to overwrite - the normal configuration and use the specified `Mapping` instead. - decompressor_config_overwrite - If not None, use this `Mapping` to specify what compressor configuration[1] is - used for decompressing no matter the configuration found in the Zarr metadata - on disk (the `.zarray` file). - - [1] https://github.com/zarr-developers/numcodecs/blob/cb155432/numcodecs/abc.py#L79 - - Notes - ----- - Atomic writes are used, which means that data are first written to a - temporary file, then moved into place when the write is successfully - completed. Files are only held open while they are being read or written and are - closed immediately afterwards, so there is no need to manually close any files. - - Safe to write in multiple threads or processes. - """ - - # The default output array type used by getitems(). - default_meta_array = numpy.empty(()) - - def __init__( - self, - path, - normalize_keys=False, - dimension_separator=None, - *, - compressor_config_overwrite: Optional[Mapping] = None, - decompressor_config_overwrite: Optional[Mapping] = None, - ) -> None: - if not kvikio.zarr.supported: - raise RuntimeError( - f"GDSStore requires Zarr >={kvikio.zarr.MINIMUM_ZARR_VERSION}" - ) - super().__init__( - path, normalize_keys=normalize_keys, dimension_separator=dimension_separator - ) - self.compressor_config_overwrite = compressor_config_overwrite - self.decompressor_config_overwrite = decompressor_config_overwrite - - def __eq__(self, other): - return isinstance(other, GDSStore) and self.path == other.path - - def _tofile(self, a, fn): - with kvikio.CuFile(fn, "w") as f: - written = f.write(a) - assert written == a.nbytes - - def __getitem__(self, key): - ret = super().__getitem__(key) - if self.decompressor_config_overwrite and key == ".zarray": - meta = self._metadata_class.decode_array_metadata(ret) - if meta["compressor"]: - meta["compressor"] = self.decompressor_config_overwrite - ret = self._metadata_class.encode_array_metadata(meta) - return ret - - def __setitem__(self, key, value): - if self.compressor_config_overwrite and key == ".zarray": - meta = self._metadata_class.decode_array_metadata(value) - if meta["compressor"]: - meta["compressor"] = self.compressor_config_overwrite - value = self._metadata_class.encode_array_metadata(meta) - super().__setitem__(key, value) - - def getitems( - self, - keys: Sequence[str], - *, - contexts: Mapping[str, Mapping] = {}, - ) -> Mapping[str, Any]: - """Retrieve data from multiple keys. - - Parameters - ---------- - keys : Iterable[str] - The keys to retrieve - contexts: Mapping[str, Context] - A mapping of keys to their context. Each context is a mapping of store - specific information. If the "meta_array" key exist, GDSStore use its - values as the output array otherwise GDSStore.default_meta_array is used. - - Returns - ------- - Mapping - A collection mapping the input keys to their results. 
- """ - ret = {} - io_results = [] - - with contextlib.ExitStack() as stack: - for key in keys: - filepath = os.path.join(self.path, key) - if not os.path.isfile(filepath): - continue - try: - meta_array = contexts[key]["meta_array"] - except KeyError: - meta_array = self.default_meta_array - - nbytes = os.path.getsize(filepath) - f = stack.enter_context(kvikio.CuFile(filepath, "r")) - ret[key] = numpy.empty_like(meta_array, shape=(nbytes,), dtype="u1") - io_results.append((f.pread(ret[key]), nbytes)) - - for future, nbytes in io_results: - nbytes_read = future.get() - if nbytes_read != nbytes: - raise RuntimeError( - f"Incomplete read ({nbytes_read}) expected {nbytes}" - ) - return ret - - -class NVCompCompressor(CudaCodec): - """Abstract base class for nvCOMP compressors - - The derived classes must set `codec_id` and implement - `get_nvcomp_manager` - """ - - @abstractmethod - def get_nvcomp_manager(self) -> kvikio.nvcomp.nvCompManager: - """Abstract method that should return the nvCOMP compressor manager""" - pass # TODO: cache Manager - - def encode(self, buf: BufferLike) -> cupy.typing.NDArray: - buf = cupy.asarray(ensure_contiguous_ndarray_like(buf)) - return self.get_nvcomp_manager().compress(buf) - - def decode(self, buf: BufferLike, out: Optional[BufferLike] = None) -> BufferLike: - buf = ensure_contiguous_ndarray_like(buf) - is_host_buffer = not hasattr(buf, "__cuda_array_interface__") - if is_host_buffer: - buf = cupy.asarray(buf) - - ret = self.get_nvcomp_manager().decompress(buf) - - if is_host_buffer: - ret = cupy.asnumpy(ret) - - if out is not None: - out = ensure_contiguous_ndarray_like(out) - if hasattr(out, "__cuda_array_interface__"): - cupy.copyto(out, ret.view(dtype=out.dtype), casting="no") - else: - np.copyto(out, cupy.asnumpy(ret.view(dtype=out.dtype)), casting="no") - return ret - - -class ANS(NVCompCompressor): - codec_id = "nvcomp_ANS" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.ANSManager() - - -class Bitcomp(NVCompCompressor): - codec_id = "nvcomp_Bitcomp" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.BitcompManager() - - -class Cascaded(NVCompCompressor): - codec_id = "nvcomp_Cascaded" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.CascadedManager() - - -class Gdeflate(NVCompCompressor): - codec_id = "nvcomp_Gdeflate" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.GdeflateManager() - - -class LZ4(NVCompCompressor): - codec_id = "nvcomp_LZ4" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.LZ4Manager() - - -class Snappy(NVCompCompressor): - codec_id = "nvcomp_Snappy" - - def get_nvcomp_manager(self): - return kvikio.nvcomp.SnappyManager() - - -# Expose a list of available nvCOMP compressors and register them as Zarr condecs -nvcomp_compressors = [ANS, Bitcomp, Cascaded, Gdeflate, LZ4, Snappy] -for c in nvcomp_compressors: - register_codec(c) - - -class CompatCompressor: - """A pair of compatible compressors one using the CPU and one using the GPU - - Warning - ------- - `CompatCompressor` is only supported by KvikIO's `open_cupy_array()` and - cannot be used as a compressor argument in Zarr functions like `open()` - and `open_array()` directly. However, it is possible to use its `.cpu` - like: `open(..., compressor=CompatCompressor.lz4().cpu)`. - - Parameters - ---------- - cpu - The CPU compressor. - gpu - The GPU compressor. 
- """ - - def __init__(self, cpu: Codec, gpu: CudaCodec) -> None: - self.cpu = cpu - self.gpu = gpu - - @classmethod - def lz4(cls) -> CompatCompressor: - """A compatible pair of LZ4 compressors""" - return cls(cpu=numcodecs.LZ4(), gpu=NvCompBatchCodec("lz4")) - - -def open_cupy_array( - store: Union[os.PathLike, str], - mode: Literal["r", "r+", "a", "w", "w-"] = "a", - compressor: Codec | CompatCompressor = Snappy(), - meta_array=cupy.empty(()), - **kwargs, -) -> zarr.Array: - """Open an Zarr array as a CuPy-like array using file-mode-like semantics. - - This function is a CUDA friendly version of `zarr.open_array` that reads - and writes to CuPy arrays. Beside the arguments listed below, the arguments - have the same semantics as in `zarr.open_array`. - - Parameters - ---------- - store - Path to directory in file system. As opposed to `zarr.open_array`, - Store and path to zip files isn't supported. - mode - Persistence mode: 'r' means read only (must exist); 'r+' means - read/write (must exist); 'a' means read/write (create if doesn't - exist); 'w' means create (overwrite if exists); 'w-' means create - (fail if exists). - compressor - The compressor used when creating a Zarr file or None if no compressor - is to be used. If a `CompatCompressor` is given, `CompatCompressor.gpu` - is used for compression and decompression; and `CompatCompressor.cpu` - is written as the compressor in the Zarr file metadata on disk. - This argument is ignored in "r" and "r+" mode. By default the - Snappy compressor by nvCOMP is used. - meta_array : array-like, optional - An CuPy-like array instance to use for determining arrays to create and - return to users. It must implement `__cuda_array_interface__`. - **kwargs - The rest of the arguments are forwarded to `zarr.open_array` as-is. - - Returns - ------- - Zarr array backed by a GDS file store, nvCOMP compression, and CuPy arrays. - """ - - if not isinstance(store, (str, os.PathLike)): - raise ValueError("store must be a path") - store = str(os.fspath(store)) - if not hasattr(meta_array, "__cuda_array_interface__"): - raise ValueError("meta_array must implement __cuda_array_interface__") - - if mode in ("r", "r+", "a"): - # In order to handle "a", we start by trying to open the file in read mode. - try: - ret = zarr.open_array( - store=kvikio.zarr.GDSStore(path=store), # type: ignore[call-arg] - mode="r+", - meta_array=meta_array, - **kwargs, - ) - except ( - zarr.errors.ContainsGroupError, - zarr.errors.ArrayNotFoundError, # type: ignore[attr-defined] - ): - # If we are reading, this is a genuine error. - if mode in ("r", "r+"): - raise - else: - if ret.compressor is None: - return ret - # If we are reading a LZ4-CPU compressed file, we overwrite the - # metadata on-the-fly to make Zarr use LZ4-GPU for both compression - # and decompression. 
- compat_lz4 = CompatCompressor.lz4() - if ret.compressor == compat_lz4.cpu: - ret = zarr.open_array( - store=kvikio.zarr.GDSStore( # type: ignore[call-arg] - path=store, - compressor_config_overwrite=compat_lz4.cpu.get_config(), - decompressor_config_overwrite=compat_lz4.gpu.get_config(), - ), - mode=mode, - meta_array=meta_array, - **kwargs, - ) - elif not isinstance(ret.compressor, CudaCodec): - raise ValueError( - "The Zarr file was written using a non-CUDA compatible " - f"compressor, {ret.compressor}, please use something " - "like kvikio.zarr.CompatCompressor" - ) - return ret - - # At this point, we known that we are writing a new array - if mode not in ("w", "w-", "a"): - raise ValueError(f"Unknown mode: {mode}") - - if isinstance(compressor, CompatCompressor): - compressor_config_overwrite = compressor.cpu.get_config() - decompressor_config_overwrite = compressor.gpu.get_config() - compressor = compressor.gpu - else: - compressor_config_overwrite = None - decompressor_config_overwrite = None - - return zarr.open_array( - store=kvikio.zarr.GDSStore( # type: ignore[call-arg] - path=store, - compressor_config_overwrite=compressor_config_overwrite, - decompressor_config_overwrite=decompressor_config_overwrite, - ), - mode=mode, - meta_array=meta_array, - compressor=compressor, - **kwargs, - ) diff --git a/python/kvikio/kvikio/zarr/_zarr_python_3.py b/python/kvikio/kvikio/zarr/_zarr_python_3.py index 5305cd9b72..b12ff93125 100644 --- a/python/kvikio/kvikio/zarr/_zarr_python_3.py +++ b/python/kvikio/kvikio/zarr/_zarr_python_3.py @@ -6,18 +6,29 @@ import os from pathlib import Path -import packaging -import zarr.storage -from zarr.abc.store import ( +import packaging.version +import zarr + +_zarr_version = packaging.version.parse(zarr.__version__) + +if _zarr_version < packaging.version.parse("3.0.0"): + # We include this runtime package checking to help users who relied on + # installing kvikio to also include zarr, which is not an optional dependency. + raise ImportError( + f"'zarr>=3' is required, but 'zarr=={_zarr_version}' is installed." + ) + +import zarr.storage # noqa: E402 +from zarr.abc.store import ( # noqa: E402 ByteRequest, OffsetByteRequest, RangeByteRequest, SuffixByteRequest, ) -from zarr.core.buffer import Buffer, BufferPrototype -from zarr.core.buffer.core import default_buffer_prototype +from zarr.core.buffer import Buffer, BufferPrototype # noqa: E402 +from zarr.core.buffer.core import default_buffer_prototype # noqa: E402 -import kvikio +import kvikio # noqa: E402 # The GDSStore implementation follows the `LocalStore` implementation # at https://github.com/zarr-developers/zarr-python/blob/main/src/zarr/storage/_local.py @@ -26,7 +37,7 @@ @functools.cache def _is_ge_zarr_3_0_7(): - return packaging.version.parse(zarr.__version__) >= packaging.version.parse("3.0.7") + return _zarr_version >= packaging.version.parse("3.0.7") def _get( @@ -138,10 +149,3 @@ async def _set(self, key: str, value: Buffer, exclusive: bool = False) -> None: path = self.root / key await asyncio.to_thread(_put, path, value, start=None, exclusive=exclusive) - - -# Matching the check that zarr.__version__ > 2.15 that's -# part of the public API for our zarr 2.x support -# This module is behind a check that zarr.__version__ > 3 -# so we can just assume it's already checked and supported. 
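With the zarr 2.x code path removed and kvikio.zarr now exporting only GDSStore for zarr-python 3, a minimal sketch of the updated usage follows; the path and sizes are illustrative, and the pattern mirrors the rewritten run_kvikio benchmark earlier in this diff.

    import cupy as cp
    import zarr

    import kvikio.zarr

    # GPU-backed buffers plus the GDS-enabled store; requires the optional zarr>=3.
    with zarr.config.enable_gpu():
        store = kvikio.zarr.GDSStore("/tmp/zarr-demo")
        z = zarr.create(
            shape=(1_000_000,), chunks=(100_000,), dtype="f8", store=store
        )
        z[:] = cp.random.random(1_000_000)
        out = z[:]  # read back as a CuPy array

    assert out.nbytes == 8 * 1_000_000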
-supported = True diff --git a/python/kvikio/pyproject.toml b/python/kvikio/pyproject.toml index d71abdb608..8f214c31d9 100644 --- a/python/kvikio/pyproject.toml +++ b/python/kvikio/pyproject.toml @@ -4,7 +4,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -19,12 +19,10 @@ authors = [ license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ - "cupy-cuda12x>=12.0.0", - "libkvikio==25.8.*,>=0.0.0a0", - "numcodecs !=0.12.0", + "cupy-cuda13x>=13.6.0", + "libkvikio==25.10.*,>=0.0.0a0", "numpy>=1.23,<3.0a0", "packaging", - "zarr>=2.0.0,<4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -41,14 +39,18 @@ classifiers = [ [project.optional-dependencies] test = [ "boto3>=1.21.21", - "cuda-python>=12.6.2,<13.0a0", + "cuda-python>=13.0.1,<14.0a0", "moto[server]>=4.0.8", "pytest", "pytest-asyncio", "pytest-cov", "pytest-timeout", "rangehttpserver", - "rapids-dask-dependency==25.8.*,>=0.0.0a0", + "rapids-dask-dependency==25.10.*,>=0.0.0a0", + "zarr>=3.0.0,<4.0.0; python_version >= '3.11'", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +zarr = [ + "zarr>=3.0.0,<4.0.0; python_version >= '3.11'", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] @@ -110,15 +112,6 @@ skip = [ [tool.mypy] ignore_missing_imports = true -exclude = [ - # we type check against zarr-python 3.x - # and ignore modules using 2.x - "python/kvikio/kvikio/zarr/_zarr_python_2.py", - "python/kvikio/tests/test_nvcomp_codec.py", -] - -[project.entry-points."numcodecs.codecs"] -nvcomp_batch = "kvikio.nvcomp_codec:NvCompBatchCodec" [tool.rapids-build-backend] build-backend = "scikit_build_core.build" @@ -127,7 +120,7 @@ matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.30.4", "cython>=3.0.0", - "libkvikio==25.8.*,>=0.0.0a0", + "libkvikio==25.10.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -160,7 +153,6 @@ filterwarnings = [ "error", "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache", "ignore::DeprecationWarning:botocore.*", - "ignore:This module is deprecated since.*Use the official nvCOMP API from 'nvidia.nvcomp' instead.:FutureWarning:.*nvcomp|.*nvcomp_codec", ] markers = [ "cufile: tests to skip if cuFile isn't available e.g. 
run with `pytest -m 'not cufile'`", diff --git a/python/kvikio/tests/conftest.py b/python/kvikio/tests/conftest.py index 07636095eb..06aef52ecc 100644 --- a/python/kvikio/tests/conftest.py +++ b/python/kvikio/tests/conftest.py @@ -57,19 +57,6 @@ def run_cmd(cmd: Iterable[str], cwd, verbose=True): p.kill() -@pytest.fixture() -def managers(): - libnvcomp = pytest.importorskip("kvikio.nvcomp") - return [ - libnvcomp.ANSManager, - libnvcomp.BitcompManager, - libnvcomp.CascadedManager, - libnvcomp.GdeflateManager, - libnvcomp.LZ4Manager, - libnvcomp.SnappyManager, - ] - - @pytest.fixture( params=[("cupy", False), ("cupy", True), ("numpy", False)], ids=["cupy", "cupy_async", "numpy"], diff --git a/python/kvikio/tests/test_benchmarks.py b/python/kvikio/tests/test_benchmarks.py index 8450fdfc25..584f719fc8 100644 --- a/python/kvikio/tests/test_benchmarks.py +++ b/python/kvikio/tests/test_benchmarks.py @@ -7,7 +7,6 @@ from pathlib import Path import pytest -from packaging.version import parse import kvikio @@ -26,25 +25,12 @@ "cufile-mfma", "cufile-mf", "cufile-ma", - "zarr", ], ) @pytest.mark.timeout(30, method="thread") def test_single_node_io(run_cmd, tmp_path, api): """Test benchmarks/single_node_io.py""" - if "zarr" in api: - kz = pytest.importorskip("kvikio.zarr") - import zarr - - if not kz.supported: - pytest.skip(f"requires Zarr >={kz.MINIMUM_ZARR_VERSION}") - - if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip( - "requires Zarr<3", - ) - retcode = run_cmd( cmd=[ sys.executable or "python", @@ -61,44 +47,6 @@ def test_single_node_io(run_cmd, tmp_path, api): assert retcode == 0 -@pytest.mark.parametrize( - "api", - [ - "kvikio", - "posix", - ], -) -@pytest.mark.timeout(30, method="thread") -def test_zarr_io(run_cmd, tmp_path, api): - """Test benchmarks/zarr_io.py""" - - kz = pytest.importorskip("kvikio.zarr") - import zarr - - if not kz.supported: - pytest.skip(f"requires Zarr >={kz.MINIMUM_ZARR_VERSION}") - - if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip( - "requires Zarr<3", - ) - - retcode = run_cmd( - cmd=[ - sys.executable or "python", - "zarr_io.py", - "-n", - "1MiB", - "-d", - str(tmp_path), - "--api", - api, - ], - cwd=benchmarks_path, - ) - assert retcode == 0 - - @pytest.mark.parametrize( "api", [ diff --git a/python/kvikio/tests/test_cufile_driver.py b/python/kvikio/tests/test_cufile_driver.py index d85cd35d3e..ed715b859a 100644 --- a/python/kvikio/tests/test_cufile_driver.py +++ b/python/kvikio/tests/test_cufile_driver.py @@ -1,11 +1,21 @@ # Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. # See file LICENSE for terms. 
+import subprocess + import pytest import kvikio.cufile_driver +def has_bar_memory() -> bool: + try: + output = subprocess.check_output(["nvidia-smi"], text=True) + return "Not Supported" not in output + except Exception: + return False + + def test_version(): major, minor = kvikio.cufile_driver.libcufile_version() assert major >= 0 @@ -56,16 +66,22 @@ def test_property_accessor(): max_device_cache_size_default = kvikio.cufile_driver.get( "max_device_cache_size" ) - with kvikio.cufile_driver.set( - {"poll_mode": True, "max_device_cache_size": 2048} - ): - assert kvikio.cufile_driver.get("poll_mode") and ( - kvikio.cufile_driver.get("max_device_cache_size") == 2048 + if has_bar_memory(): + with kvikio.cufile_driver.set( + {"poll_mode": True, "max_device_cache_size": 2048} + ): + assert kvikio.cufile_driver.get("poll_mode") and ( + kvikio.cufile_driver.get("max_device_cache_size") == 2048 + ) + assert (kvikio.cufile_driver.get("poll_mode") == poll_mode_default) and ( + kvikio.cufile_driver.get("max_device_cache_size") + == max_device_cache_size_default ) - assert (kvikio.cufile_driver.get("poll_mode") == poll_mode_default) and ( - kvikio.cufile_driver.get("max_device_cache_size") - == max_device_cache_size_default - ) + else: + with kvikio.cufile_driver.set("poll_mode", True): + assert kvikio.cufile_driver.get("poll_mode") + assert kvikio.cufile_driver.get("poll_mode") == poll_mode_default + except RuntimeError as e: if "KvikIO not compiled with cuFile.h" in str(e): pytest.skip("KvikIO not compiled with cuFile.h, skipping cuFile tests") diff --git a/python/kvikio/tests/test_examples.py b/python/kvikio/tests/test_examples.py index f32485b6c4..1d08525a23 100644 --- a/python/kvikio/tests/test_examples.py +++ b/python/kvikio/tests/test_examples.py @@ -6,7 +6,6 @@ from pathlib import Path import pytest -from packaging.version import parse import kvikio @@ -21,18 +20,6 @@ def test_hello_world(tmp_path, monkeypatch): import_module("hello_world").main(tmp_path / "test-file") -def test_zarr_cupy_nvcomp(tmp_path, monkeypatch): - """Test examples/zarr_cupy_nvcomp.py""" - - # `examples/zarr_cupy_nvcomp.py` requires the Zarr submodule - zarr = pytest.importorskip("zarr") - if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip(reason="Requires zarr<3") - - monkeypatch.syspath_prepend(str(examples_path)) - import_module("zarr_cupy_nvcomp").main(tmp_path / "test-file") - - def test_http_io(tmp_path, monkeypatch): """Test examples/http_io.py""" diff --git a/python/kvikio/tests/test_hdfs_io.py b/python/kvikio/tests/test_hdfs_io.py new file mode 100644 index 0000000000..eab4d0a5fb --- /dev/null +++ b/python/kvikio/tests/test_hdfs_io.py @@ -0,0 +1,234 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
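For context on the reworked driver-property test in test_cufile_driver.py just above, the accessor pattern it exercises is the context-manager form of kvikio.cufile_driver.set; a sketch, assuming KvikIO was built with cuFile support (otherwise these calls raise RuntimeError):

    import kvikio.cufile_driver

    default_poll_mode = kvikio.cufile_driver.get("poll_mode")

    # Temporarily enable poll mode; the previous value is restored on exit.
    with kvikio.cufile_driver.set("poll_mode", True):
        assert kvikio.cufile_driver.get("poll_mode")

    assert kvikio.cufile_driver.get("poll_mode") == default_poll_mode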
+ +from __future__ import annotations + +import json +import urllib.parse +from http.server import BaseHTTPRequestHandler, HTTPServer +from multiprocessing import Process, Queue +from typing import Any, Generator + +import cupy as cp +import numpy as np +import numpy.typing as npt +import pytest +import utils + +import kvikio.defaults +from kvikio import remote_file + + +class RemoteFileData: + def __init__(self, file_path: str, num_elements: int, dtype: npt.DTypeLike) -> None: + self.file_path = file_path + self.num_elements = num_elements + self.dtype = dtype + self.buf = np.arange(0, self.num_elements, dtype=self.dtype) + self.file_size = self.buf.nbytes + + +@pytest.fixture(scope="module") +def remote_file_data() -> RemoteFileData: + return RemoteFileData( + file_path="/webhdfs/v1/home/test_user/test_file.bin", + num_elements=1024 * 1024, + dtype=np.float64, + ) + + +def run_mock_server(queue: Queue[int], file_size: int, buf: npt.NDArray[Any]) -> None: + """Run HTTP server in a separate process""" + + class WebHdfsHandler(BaseHTTPRequestHandler): + def do_GET(self) -> None: + parsed_url = urllib.parse.urlparse(self.path) + query_dict = urllib.parse.parse_qs(parsed_url.query) + op = query_dict["op"] + + # Client requests file size + if op == ["GETFILESTATUS"]: + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + response = json.dumps({"length": file_size}) + self.wfile.write(response.encode()) + + # Client requests file content + elif op == ["OPEN"]: + offset = int(query_dict["offset"][0]) + length = int(query_dict["length"][0]) + + # Convert byte offsets to element indices + element_size = buf.itemsize + begin_idx = offset // element_size + end_idx = (offset + length) // element_size + range_data = buf[begin_idx:end_idx].tobytes() + + self.send_response(200) + self.send_header("Content-Type", "application/octet-stream") + self.send_header("Content-Length", str(len(range_data))) + self.end_headers() + self.wfile.write(range_data) + else: + self.send_response(400) + self.end_headers() + + def log_message(self, format: str, *args: Any) -> None: + pass + + port = utils.find_free_port() + server = HTTPServer((utils.localhost(), port), WebHdfsHandler) + + # Send port back to parent process + queue.put(port) + + server.serve_forever() + + +@pytest.fixture +def mock_webhdfs_server(remote_file_data: RemoteFileData) -> Generator[str, None, None]: + """Start WebHDFS mock server in a separate process""" + queue: Queue[int] = Queue() + server_process = Process( + target=run_mock_server, + args=( + queue, + remote_file_data.file_size, + remote_file_data.buf, + ), + daemon=True, + ) + server_process.start() + + # Get the port the server is running on + port = queue.get(timeout=5) + + yield f"http://{utils.localhost()}:{port}" + + # Cleanup + server_process.terminate() + server_process.join(timeout=1) + + +class TestWebHdfsOperations: + @pytest.mark.parametrize("url_query", ["", "?op=OPEN"]) + def test_get_file_size( + self, + mock_webhdfs_server: str, + remote_file_data: RemoteFileData, + url_query: str, + ) -> None: + url = f"{mock_webhdfs_server}{remote_file_data.file_path}{url_query}" + handle = remote_file.RemoteFile.open_webhdfs(url) + file_size = handle.nbytes() + assert file_size == remote_file_data.file_size + + def test_parallel_read( + self, mock_webhdfs_server: str, remote_file_data: RemoteFileData, xp: Any + ) -> None: + url = f"{mock_webhdfs_server}{remote_file_data.file_path}" + handle = remote_file.RemoteFile.open_webhdfs(url) + 
result_buf = xp.arange( + 0, remote_file_data.num_elements, dtype=remote_file_data.dtype + ) + fut = handle.pread(result_buf) + read_size = fut.get() + + assert read_size == remote_file_data.file_size + + result_buf_np = result_buf + if isinstance(result_buf, cp.ndarray): + result_buf_np = cp.asnumpy(result_buf) + assert np.array_equal(result_buf_np, remote_file_data.buf) + + @pytest.mark.parametrize("size", [80, 8 * 9999]) + @pytest.mark.parametrize("offset", [0, 800, 8000, 8 * 9999]) + @pytest.mark.parametrize("num_threads", [1, 4]) + @pytest.mark.parametrize("task_size", [1024, 4096]) + def test_parallel_read_partial( + self, + mock_webhdfs_server: str, + remote_file_data: RemoteFileData, + size: int, + offset: int, + num_threads: int, + task_size: int, + xp: Any, + ) -> None: + url = f"{mock_webhdfs_server}{remote_file_data.file_path}" + element_size = remote_file_data.buf.itemsize + begin_idx = offset // element_size + end_idx = (offset + size) // element_size + expected_buf = remote_file_data.buf[begin_idx:end_idx] + + actual_num_elements = size // np.dtype(remote_file_data.dtype).itemsize + with kvikio.defaults.set({"num_threads": num_threads, "task_size": task_size}): + handle = remote_file.RemoteFile.open_webhdfs(url) + result_buf = xp.zeros(actual_num_elements, dtype=remote_file_data.dtype) + fut = handle.pread(result_buf, size, offset) + read_size = fut.get() + + assert read_size == size + + result_buf_np = result_buf + if isinstance(result_buf, cp.ndarray): + result_buf_np = cp.asnumpy(result_buf) + assert np.array_equal(result_buf_np, expected_buf) + + +class TestWebHdfsErrors: + @pytest.fixture + def mock_bad_server( + self, remote_file_data: RemoteFileData + ) -> Generator[str, None, None]: + """Start a bad WebHDFS server that returns invalid JSON""" + + def run_bad_server(queue: Queue[int]) -> None: + class BadHandler(BaseHTTPRequestHandler): + def do_GET(self): + parsed = urllib.parse.urlparse(self.path) + query = urllib.parse.parse_qs(parsed.query) + + if query.get("op") == ["GETFILESTATUS"]: + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + # Missing "length" field + response = json.dumps({}) + self.wfile.write(response.encode()) + else: + self.send_response(400) + self.end_headers() + + def log_message(self, format, *args): + pass + + port = utils.find_free_port() + server = HTTPServer((utils.localhost(), port), BadHandler) + queue.put(port) + server.serve_forever() + + queue: Queue[int] = Queue() + server_process = Process(target=run_bad_server, args=(queue,), daemon=True) + server_process.start() + + port = queue.get(timeout=5) + + yield f"http://{utils.localhost()}:{port}" + + server_process.terminate() + server_process.join(timeout=1) + + def test_missing_file_size( + self, mock_bad_server: str, remote_file_data: RemoteFileData + ) -> None: + url = f"{mock_bad_server}{remote_file_data.file_path}" + + with pytest.raises( + RuntimeError, + match="Regular expression search failed. " + "Cannot extract file length from the JSON response.", + ): + handle = remote_file.RemoteFile.open_webhdfs(url) + handle.nbytes() diff --git a/python/kvikio/tests/test_mmap.py b/python/kvikio/tests/test_mmap.py new file mode 100644 index 0000000000..6ad7468690 --- /dev/null +++ b/python/kvikio/tests/test_mmap.py @@ -0,0 +1,188 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
+ +import contextlib +import os + +import pytest + +import kvikio.defaults + +cupy = pytest.importorskip("cupy") +numpy = pytest.importorskip("numpy") + + +def test_no_file(tmp_path): + nonexistent_file = tmp_path / "nonexistent_file" + with pytest.raises(RuntimeError, match=r".*Unable to open file.*"): + kvikio.Mmap(nonexistent_file) + + +def test_invalid_file_open_flag(tmp_path): + filename = tmp_path / "read-only-test-file" + expected_data = numpy.arange(1024) + expected_data.tofile(filename) + + with pytest.raises(ValueError, match=r".*Unknown file open flag.*"): + kvikio.Mmap(filename, "") + + with pytest.raises(ValueError, match=r".*Unknown file open flag.*"): + kvikio.Mmap(filename, "z") + + +def test_constructor_invalid_range(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + + with pytest.raises(IndexError, match=r".*Offset must be less than the file size.*"): + kvikio.Mmap(filename, "r", None, test_data.nbytes * 2) + + with pytest.raises(IndexError, match=r".*Mapped region is past the end of file.*"): + kvikio.Mmap(filename, "r", test_data.nbytes * 2) + + with pytest.raises(ValueError, match=r".*Mapped region should not be zero byte.*"): + kvikio.Mmap(filename, "r", 0) + + +def test_read_invalid_range(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + output_data = xp.zeros_like(test_data) + + initial_size = 1024 + initial_file_offset = 512 + + with pytest.raises(IndexError, match=r".*Offset is past the end of file.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size, test_data.nbytes + 1) + + with pytest.raises(IndexError, match=r".*Read is out of bound.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size, initial_file_offset + 1) + + with pytest.raises(IndexError, match=r".*Read is out of bound.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size, initial_file_offset - 128) + + with pytest.raises(IndexError, match=r".*Read is out of bound.*"): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, initial_size + 128, initial_file_offset) + + +def test_read_valid_range(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + output_data = xp.zeros_like(test_data) + + initial_size = 1024 + initial_file_offset = 512 + + with contextlib.nullcontext(): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + mmap_handle.read(output_data, 0, initial_file_offset) + + +@pytest.mark.parametrize("num_elements_to_read", [None, 10, 9999]) +@pytest.mark.parametrize("num_elements_to_skip", [0, 10, 100, 1000, 9999]) +def test_read_seq(tmp_path, xp, num_elements_to_read, num_elements_to_skip): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + + if num_elements_to_read is None: + initial_size = None + actual_num_elements_to_read = int( + os.path.getsize(filename) / test_data.itemsize + ) + else: + initial_size = num_elements_to_read * test_data.itemsize + actual_num_elements_to_read = num_elements_to_read + + initial_file_offset = num_elements_to_skip * test_data.itemsize + expected_data = test_data[ + 
num_elements_to_skip : (num_elements_to_skip + actual_num_elements_to_read) + ] + actual_data = xp.zeros_like(expected_data) + + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + read_size = mmap_handle.read(actual_data, initial_size, initial_file_offset) + + assert read_size == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + +@pytest.mark.parametrize("num_elements_to_read", [None, 10, 9999]) +@pytest.mark.parametrize("num_elements_to_skip", [0, 10, 100, 1000, 9999]) +@pytest.mark.parametrize("task_size", [1024, 12345]) +def test_read_parallel( + tmp_path, xp, num_elements_to_read, num_elements_to_skip, task_size +): + filename = tmp_path / "read-only-test-file" + test_data = xp.arange(1024 * 1024) + test_data.tofile(filename) + + if num_elements_to_read is None: + initial_size = None + actual_num_elements_to_read = int( + os.path.getsize(filename) / test_data.itemsize + ) + else: + initial_size = num_elements_to_read * test_data.itemsize + actual_num_elements_to_read = num_elements_to_read + + initial_file_offset = num_elements_to_skip * test_data.itemsize + expected_data = test_data[ + num_elements_to_skip : (num_elements_to_skip + actual_num_elements_to_read) + ] + actual_data = xp.zeros_like(expected_data) + + with kvikio.defaults.set("task_size", task_size): + mmap_handle = kvikio.Mmap(filename, "r", initial_size, initial_file_offset) + fut = mmap_handle.pread( + actual_data, initial_size, initial_file_offset, task_size + ) + + assert fut.get() == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + +def test_read_with_default_arguments(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + expected_data = xp.arange(1024 * 1024) + expected_data.tofile(filename) + actual_data = xp.zeros_like(expected_data) + + # Workaround for a CI failure where defaults.task_size() is somehow 0 + # instead of 4 MiB when KVIKIO_TASK_SIZE is unset + with kvikio.defaults.set("task_size", 4 * 1024 * 1024): + mmap_handle = kvikio.Mmap(filename, "r") + + read_size = mmap_handle.read(actual_data) + assert read_size == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + fut = mmap_handle.pread(actual_data) + assert fut.get() == expected_data.nbytes + xp.testing.assert_array_equal(actual_data, expected_data) + + +def test_closed_handle(tmp_path, xp): + filename = tmp_path / "read-only-test-file" + expected_data = xp.arange(1024 * 1024) + expected_data.tofile(filename) + actual_data = xp.zeros_like(expected_data) + + mmap_handle = kvikio.Mmap(filename, "r") + mmap_handle.close() + + assert mmap_handle.closed() + assert mmap_handle.file_size() == 0 + + with pytest.raises(RuntimeError, match=r".*Cannot read from a closed MmapHandle.*"): + mmap_handle.read(actual_data) + + with pytest.raises(RuntimeError, match=r".*Cannot read from a closed MmapHandle.*"): + mmap_handle.pread(actual_data) diff --git a/python/kvikio/tests/test_nvcomp.py b/python/kvikio/tests/test_nvcomp.py deleted file mode 100644 index 356c5e77cd..0000000000 --- a/python/kvikio/tests/test_nvcomp.py +++ /dev/null @@ -1,444 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
- -import pytest - -np = pytest.importorskip("numpy") -cupy = pytest.importorskip("cupy") -kvikio = pytest.importorskip("kvikio") -libnvcomp = pytest.importorskip("kvikio.nvcomp") - - -# TODO: don't hardcode the following expected values -LEN = { - "ANS": 11144, - "Bitcomp": 3208, - "Cascaded": 600, - "Gdeflate": 760, - "LZ4": 393, - "Snappy": 3548, -} - - -def assert_compression_size(actual, desired, rtol=0.1): - """Compression ratios might change slightly between library versions - - We mark a failure as "xfail" - """ - try: - np.testing.assert_allclose(actual, desired, rtol=rtol) - except AssertionError: - pytest.xfail("mismatch in compression ratios is acceptable") - raise - - -def managers(): - return [ - libnvcomp.ANSManager, - libnvcomp.BitcompManager, - libnvcomp.CascadedManager, - libnvcomp.GdeflateManager, - libnvcomp.LZ4Manager, - libnvcomp.SnappyManager, - ] - - -def dtypes(): - return [ - "uint8", - "uint16", - "uint32", - "int8", - "int16", - "int32", - ] - - -@pytest.mark.parametrize("manager, dtype", zip(managers(), dtypes())) -def test_round_trip_dtypes(manager, dtype): - length = 10000 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = manager(data_type=dtype) - compressed = compressor_instance.compress(data) - decompressed = compressor_instance.decompress(compressed) - assert (data == decompressed).all() - - -# -# ANS Options test -# -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "chunk_size": 1 << 16, - }, - { - "chunk_size": 1 << 16, - }, - ], -) -def test_ans_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.ANSManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["ANS"]) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "data_type": np.uint8, - "algo": 0, - }, - {"data_type": np.uint8}, - { - "algo": 0, - }, - ], -) -def test_bitcomp_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.BitcompManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["Bitcomp"]) - - -@pytest.mark.parametrize( - "inputs, expected", - zip( - [ - {"algo": 0}, - {"algo": 1}, - {"algo": 2}, - ], - [LEN["Bitcomp"], LEN["Bitcomp"], LEN["Bitcomp"]], - ), -) -def test_bitcomp_algorithms(inputs, expected): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.BitcompManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), expected) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "options": { - "chunk_size": 1 << 12, - "type": np.uint32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - }, - }, - { - "options": { - "chunk_size": 1 << 12, - "type": np.uint32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - }, - "chunk_size": 1 << 16, - }, - { - "options": { - "chunk_size": 1 << 12, - "type": np.uint32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - }, - "data_type": np.uint8, - }, - { - "options": { - "chunk_size": 1 << 12, - "type": np.uint32, - "num_RLEs": 2, - "num_deltas": 1, - "use_bp": True, - }, - }, - ], -) -def 
test_cascaded_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.CascadedManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["Cascaded"]) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "chunk_size": 1 << 16, - "algo": 0, - }, - { - "chunk_size": 1 << 16, - }, - { - "algo": 0, - }, - ], -) -def test_gdeflate_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.GdeflateManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["Gdeflate"]) - - -@pytest.mark.parametrize( - "inputs, expected", - zip( - [ - {"algo": 0}, - ], - [LEN["Gdeflate"]], - ), -) -def test_gdeflate_algorithms(inputs, expected): - size = 10000 - dtype = np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.GdeflateManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), expected) - - -@pytest.mark.xfail(raises=ValueError) -@pytest.mark.parametrize( - "inputs, expected", - zip([{"algo": 1}, {"algo": 2}], [LEN["Gdeflate"], LEN["Gdeflate"]]), -) -def test_gdeflate_algorithms_not_implemented(inputs, expected): - size = 10000 - dtype = np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.GdeflateManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), expected) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "chunk_size": 1 << 16, - "data_type": np.uint8, - }, - { - "chunk_size": 1 << 16, - }, - { - "data_type": np.uint8, - }, - ], -) -def test_lz4_inputs(inputs): - size = 10000 - dtype = inputs.get("data_type") if inputs.get("data_type") else np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.LZ4Manager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["LZ4"]) - - -@pytest.mark.parametrize( - "inputs", - [ - {}, - { - "chunk_size": 1 << 16, - }, - { - "chunk_size": 1 << 16, - }, - {}, - ], -) -def test_snappy_inputs(inputs): - size = 10000 - dtype = np.int8 - data = cupy.array(np.arange(0, size // dtype(0).itemsize, dtype=dtype)) - compressor = libnvcomp.SnappyManager(**inputs) - final = compressor.compress(data) - assert_compression_size(len(final), LEN["Snappy"]) - - -@pytest.mark.parametrize( - "compressor_size", - zip( - managers(), - [ - { # ANS - "max_compressed_buffer_size": 89373, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - { # Bitcomp - "max_compressed_buffer_size": 16432, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - { # Cascaded - "max_compressed_buffer_size": 12460, - "num_chunks": 3, - "uncompressed_buffer_size": 10000, - }, - { # Gdeflate - "max_compressed_buffer_size": 131160, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - { # LZ4 - "max_compressed_buffer_size": 65888, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - { # Snappy - "max_compressed_buffer_size": 76575, - "num_chunks": 1, - "uncompressed_buffer_size": 10000, - }, - ], - ), -) -def test_get_compression_config_with_default_options(compressor_size): - compressor = compressor_size[0] - expected = compressor_size[1] - 
length = 10000 - dtype = cupy.uint8 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = compressor() - result = compressor_instance.configure_compression(len(data)) - assert_compression_size( - result.pop("max_compressed_buffer_size"), - expected.pop("max_compressed_buffer_size"), - ) - assert result == expected - - -@pytest.mark.parametrize( - "manager,expected", - zip( - managers(), - [ - { # ANS - "num_chunks": 1, - "decomp_data_size": 10000, - }, - { # Bitcomp - "num_chunks": 1, - "decomp_data_size": 10000, - }, - { # Cascaded - "num_chunks": 3, - "decomp_data_size": 10000, - }, - { # Gdeflate - "num_chunks": 1, - "decomp_data_size": 10000, - }, - { # LZ4 - "num_chunks": 1, - "decomp_data_size": 10000, - }, - { # Snappy - "num_chunks": 1, - "decomp_data_size": 10000, - }, - ], - ), -) -def test_get_decompression_config_with_default_options(manager, expected): - length = 10000 - dtype = cupy.uint8 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = manager() - compressed = compressor_instance.compress(data) - result = compressor_instance.configure_decompression_with_compressed_buffer( - compressed - ) - assert_compression_size( - result.pop("decomp_data_size"), expected.pop("decomp_data_size") - ) - assert result == expected - - -@pytest.mark.parametrize( - "manager, expected", - zip(managers(), list(LEN.values())), -) -def test_get_compressed_output_size(manager, expected): - length = 10000 - dtype = cupy.uint8 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = manager() - compressed = compressor_instance.compress(data) - buffer_size = compressor_instance.get_compressed_output_size(compressed) - assert_compression_size(buffer_size, expected) - - -@pytest.mark.parametrize("manager", managers()) -def test_managed_manager(manager): - length = 10000 - dtype = cupy.uint8 - data = cupy.array( - np.arange( - 0, - length // cupy.dtype(dtype).type(0).itemsize, - dtype=dtype, - ) - ) - compressor_instance = manager() - compressed = compressor_instance.compress(data) - manager = libnvcomp.ManagedDecompressionManager(compressed) - decompressed = manager.decompress(compressed) - assert len(decompressed) == 10000 diff --git a/python/kvikio/tests/test_nvcomp_codec.py b/python/kvikio/tests/test_nvcomp_codec.py deleted file mode 100644 index 29e50ad64b..0000000000 --- a/python/kvikio/tests/test_nvcomp_codec.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. 
- -import itertools as it -import json - -import cupy as cp -import numcodecs -import numpy as np -import packaging -import packaging.version -import pytest -import zarr -from numpy.testing import assert_equal - -from kvikio.nvcomp_codec import NvCompBatchCodec - -NVCOMP_CODEC_ID = "nvcomp_batch" - -LZ4_ALGO = "LZ4" -GDEFLATE_ALGO = "Gdeflate" -SNAPPY_ALGO = "snappy" -ZSTD_ALGO = "zstd" -DEFLATE_ALGO = "deflate" - -SUPPORTED_CODECS = [LZ4_ALGO, GDEFLATE_ALGO, SNAPPY_ALGO, ZSTD_ALGO, DEFLATE_ALGO] - - -def skip_if_zarr_v3(): - return pytest.mark.skipif( - packaging.version.parse(zarr.__version__) >= packaging.version.Version("3.0.0"), - reason="zarr 3.x not supported.", - ) - - -def _get_codec(algo: str, **kwargs): - codec_args = {"id": NVCOMP_CODEC_ID, "algorithm": algo, "options": kwargs} - return numcodecs.registry.get_codec(codec_args) - - -@pytest.fixture(params=[(32,), (8, 16), (16, 16)]) -def shape(request): - return request.param - - -# Separate fixture for combinations of shapes and chunks, since -# chunks array must have the same rank as data array. -@pytest.fixture( - params=it.chain( - it.product([(64,)], [(64,), (100,)]), - it.product([(16, 8), (16, 16)], [(8, 16), (16, 16), (40, 12)]), - ) -) -def shape_chunks(request): - return request.param - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -def test_codec_registry(algo: str): - codec = _get_codec(algo) - assert isinstance(codec, numcodecs.abc.Codec) - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -def test_basic(algo: str, shape): - codec = NvCompBatchCodec(algo) - - # Create data. - dtype = np.float32 - data = np.ones(shape, dtype=dtype) - # Do roundtrip. - comp_data = codec.encode(data) - # Decompress and cast to original data type/shape. - decomp_data = codec.decode(comp_data).view(dtype).reshape(shape) - - assert_equal(decomp_data, data) - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -@skip_if_zarr_v3() -def test_basic_zarr(algo: str, shape_chunks): - shape, chunks = shape_chunks - - codec = NvCompBatchCodec(algo) - - data = np.ones(shape, dtype=np.float32) - - # This will do the compression. - z = zarr.array(data, chunks=chunks, compressor=codec) - - # Test the decompression. 
- assert_equal(z[:], data[:]) - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -@pytest.mark.parametrize("chunk_sizes", [(100, 100), (100, 150)]) -@pytest.mark.parametrize("out", [None, "cpu", "gpu"]) -def test_batch_comp_decomp(algo: str, chunk_sizes, out: str): - codec = _get_codec(algo) - - np.random.seed(1) - - dtype = np.float32 - chunks = [np.random.randn(s).astype(dtype) for s in chunk_sizes] - out_buf = None - if out == "cpu": - out_buf = [np.empty_like(c) for c in chunks] - elif out == "gpu": - out_buf = [cp.empty_like(c) for c in chunks] - - comp_chunks = codec.encode_batch([c.tobytes() for c in chunks]) - assert len(comp_chunks) == 2 - - decomp_chunks = codec.decode_batch(comp_chunks, out=out_buf) - assert len(decomp_chunks) == 2 - - for i, dc in enumerate(decomp_chunks): - dc = dc.view(dtype=dtype) - if isinstance(dc, cp.ndarray): - dc = dc.get() - assert_equal(dc, chunks[i], f"{i=}") - - if out_buf is not None: - ob = out_buf[i] - if isinstance(ob, cp.ndarray): - ob = ob.get() - assert_equal(ob, chunks[i], f"{i=}") - - -@pytest.mark.parametrize("algo", SUPPORTED_CODECS) -@skip_if_zarr_v3() -def test_comp_decomp(algo: str, shape_chunks): - shape, chunks = shape_chunks - - codec = _get_codec(algo) - - np.random.seed(1) - - data = np.random.randn(*shape).astype(np.float32) - - z1 = zarr.array(data, chunks=chunks, compressor=codec) - - zarr_store = zarr.MemoryStore() - zarr.save_array(zarr_store, z1, compressor=codec) - # Check the store. - meta = json.loads(zarr_store[".zarray"]) - assert meta["compressor"]["id"] == NVCOMP_CODEC_ID - assert meta["compressor"]["algorithm"] == algo.lower() - - # Read back/decompress. - z2 = zarr.open_array(zarr_store) - - assert_equal(z1[:], z2[:]) - - -@pytest.mark.parametrize( - "algo, options", - [ - ("lz4", {"data_type": 4}), # NVCOMP_TYPE_INT data type. - ("gdeflate", {"algo": 1}), # low-throughput, high compression ratio algo - ], -) -@skip_if_zarr_v3() -def test_codec_options(algo, options): - codec = NvCompBatchCodec(algo, options) - - shape = (16, 16) - chunks = (8, 8) - - data = np.ones(shape, dtype=np.float32) - - z = zarr.array(data, chunks=chunks, compressor=codec) - - assert_equal(z[:], data[:]) - - -@skip_if_zarr_v3() -def test_codec_invalid_options(): - # There are currently only 3 supported algos in Gdeflate - codec = NvCompBatchCodec(GDEFLATE_ALGO, options={"algo": 10}) - - data = np.ones((16, 16), dtype=np.float32) - - with pytest.raises(RuntimeError): - zarr.array(data, compressor=codec) - - -@pytest.mark.parametrize( - "cpu_algo, gpu_algo", - [ - ("lz4", LZ4_ALGO), - ("zstd", ZSTD_ALGO), - ], -) -@skip_if_zarr_v3() -def test_cpu_comp_gpu_decomp(cpu_algo, gpu_algo): - cpu_codec = numcodecs.registry.get_codec({"id": cpu_algo}) - gpu_codec = _get_codec(gpu_algo) - - shape = (16, 16) - chunks = (8, 8) - - data = np.ones(shape, dtype=np.float32) - - z1 = zarr.array(data, chunks=chunks) - store = {} - zarr.save_array(store, z1, compressor=cpu_codec) - - meta = json.loads(store[".zarray"]) - assert meta["compressor"]["id"] == cpu_algo - - meta["compressor"] = {"id": NVCOMP_CODEC_ID, "algorithm": gpu_algo} - store[".zarray"] = json.dumps(meta).encode() - - z2 = zarr.open_array(store, compressor=gpu_codec) - - assert_equal(z1[:], z2[:]) - - -@skip_if_zarr_v3() -def test_lz4_codec_header(shape_chunks): - shape, chunks = shape_chunks - - # Test LZ4 nvCOMP codecs with and without the header. 
- codec_h = _get_codec(LZ4_ALGO, has_header=True) - codec_no_h = _get_codec(LZ4_ALGO, has_header=False) - - np.random.seed(1) - - data = np.random.randn(*shape).astype(np.float32) - - z_h = zarr.array(data, chunks=chunks, compressor=codec_h) - z_no_h = zarr.array(data, chunks=chunks, compressor=codec_no_h) - - # Result must be the same regardless of the header presence. - assert_equal(z_h[:], z_no_h[:]) - - -def test_empty_batch(): - codec = _get_codec(LZ4_ALGO) - - assert len(codec.encode_batch([])) == 0 - assert len(codec.decode_batch([])) == 0 diff --git a/python/kvikio/tests/test_s3_io.py b/python/kvikio/tests/test_s3_io.py index 58a73184a8..510940d284 100644 --- a/python/kvikio/tests/test_s3_io.py +++ b/python/kvikio/tests/test_s3_io.py @@ -2,11 +2,11 @@ # See file LICENSE for terms. import multiprocessing as mp -import socket import time from contextlib import contextmanager import pytest +import utils import kvikio import kvikio.defaults @@ -26,18 +26,13 @@ @pytest.fixture(scope="session") -def endpoint_ip(): - return "127.0.0.1" +def endpoint_ip() -> str: + return utils.localhost() @pytest.fixture(scope="session") def endpoint_port(): - # Return a free port per worker session. - sock = socket.socket() - sock.bind(("127.0.0.1", 0)) - port = sock.getsockname()[1] - sock.close() - return port + return utils.find_free_port() def start_s3_server(ip_address, port): @@ -161,3 +156,64 @@ def test_read_with_file_offset(s3_base, xp, start, end): b = xp.zeros(shape=(end - start,), dtype=xp.int64) assert f.read(b, file_offset=start * a.itemsize) == b.nbytes xp.testing.assert_array_equal(a[start:end], b) + + +@pytest.mark.parametrize("scheme", ["S3"]) +@pytest.mark.parametrize( + "remote_endpoint_type", + [kvikio.RemoteEndpointType.AUTO, kvikio.RemoteEndpointType.S3], +) +@pytest.mark.parametrize("allow_list", [None, [kvikio.RemoteEndpointType.S3]]) +@pytest.mark.parametrize("nbytes", [None, 1]) +def test_open_valid(s3_base, scheme, remote_endpoint_type, allow_list, nbytes): + bucket_name = "bucket_name" + object_name = "object_name" + data = b"file content" + with s3_context( + s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(data)} + ) as server_address: + if scheme == "S3": + url = f"{scheme}://{bucket_name}/{object_name}" + else: + url = f"{server_address}/{bucket_name}/{object_name}" + + if nbytes is None: + expected_file_size = len(data) + else: + expected_file_size = nbytes + + with kvikio.RemoteFile.open(url, remote_endpoint_type, allow_list, nbytes) as f: + assert f.nbytes() == expected_file_size + assert f.remote_endpoint_type() == kvikio.RemoteEndpointType.S3 + + +def test_open_invalid(s3_base): + bucket_name = "bucket_name" + object_name = "object_name" + data = b"file content" + with s3_context( + s3_base=s3_base, bucket=bucket_name, files={object_name: bytes(data)} + ) as server_address: + # Missing scheme + url = f"://{bucket_name}/{object_name}" + with pytest.raises(RuntimeError, match="Bad scheme"): + kvikio.RemoteFile.open(url) + + # Unsupported type + url = f"unsupported://{bucket_name}/{object_name}" + with pytest.raises(RuntimeError, match="Unsupported endpoint URL"): + kvikio.RemoteFile.open(url) + + # Specified URL not in the allowlist + url = f"{server_address}/{bucket_name}/{object_name}" + with pytest.raises(RuntimeError, match="not in the allowlist"): + kvikio.RemoteFile.open( + url, kvikio.RemoteEndpointType.S3, [kvikio.RemoteEndpointType.WEBHDFS] + ) + + # Invalid URLs + url = f"s3://{bucket_name}" + with pytest.raises(RuntimeError, 
match="Unsupported endpoint URL"): + kvikio.RemoteFile.open(url) + with pytest.raises(RuntimeError, match="Invalid URL"): + kvikio.RemoteFile.open(url, kvikio.RemoteEndpointType.S3) diff --git a/python/kvikio/tests/test_zarr.py b/python/kvikio/tests/test_zarr.py deleted file mode 100644 index a793e2568e..0000000000 --- a/python/kvikio/tests/test_zarr.py +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. -# See file LICENSE for terms. - - -import math - -import numpy -import pytest -from packaging.version import parse - -cupy = pytest.importorskip("cupy") -zarr = pytest.importorskip("zarr") -kvikio_zarr = pytest.importorskip("kvikio.zarr") -kvikio_nvcomp_codec = pytest.importorskip("kvikio.nvcomp_codec") -numcodecs = pytest.importorskip("numcodecs") - -if not kvikio_zarr.supported: - pytest.skip( - f"requires Zarr >={kvikio_zarr.MINIMUM_ZARR_VERSION}", - allow_module_level=True, - ) - -if parse(zarr.__version__) >= parse("3.0.0"): - pytest.skip( - "requires Zarr<3", - allow_module_level=True, - ) - - -@pytest.fixture -def store(tmp_path): - """Fixture that creates a GDS Store""" - return kvikio_zarr.GDSStore(tmp_path / "test-file.zarr") - - -def test_direct_store_access(store, xp): - """Test accessing the GDS Store directly""" - - a = xp.arange(5, dtype="u1") - store["a"] = a - b = store["a"] - - # Notice, unless using getitems(), GDSStore always returns bytes - assert isinstance(b, bytes) - assert (xp.frombuffer(b, dtype="u1") == a).all() - - -@pytest.mark.parametrize("xp_write", ["numpy", "cupy"]) -@pytest.mark.parametrize("xp_read_a", ["numpy", "cupy"]) -@pytest.mark.parametrize("xp_read_b", ["numpy", "cupy"]) -def test_direct_store_access_getitems(store, xp_write, xp_read_a, xp_read_b): - """Test accessing the GDS Store directly using getitems()""" - - xp_read_a = pytest.importorskip(xp_read_a) - xp_read_b = pytest.importorskip(xp_read_b) - xp_write = pytest.importorskip(xp_write) - a = xp_write.arange(5, dtype="u1") - b = a * 2 - store["a"] = a - store["b"] = b - - res = store.getitems( - keys=["a", "b"], - contexts={ - "a": {"meta_array": xp_read_a.empty(())}, - "b": {"meta_array": xp_read_b.empty(())}, - }, - ) - assert isinstance(res["a"], xp_read_a.ndarray) - assert isinstance(res["b"], xp_read_b.ndarray) - cupy.testing.assert_array_equal(res["a"], a) - cupy.testing.assert_array_equal(res["b"], b) - - -def test_array(store, xp): - """Test Zarr array""" - - a = xp.arange(100) - z = zarr.array(a, chunks=10, compressor=None, store=store, meta_array=xp.empty(())) - assert isinstance(z.meta_array, type(a)) - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(a, type(z[:])) - xp.testing.assert_array_equal(a, z[:]) - - -def test_group(store, xp): - """Test Zarr group""" - - g = zarr.open_group(store, meta_array=xp.empty(())) - g.ones("data", shape=(10, 11), dtype=int, compressor=None) - a = g["data"] - assert a.shape == (10, 11) - assert a.dtype == int - assert isinstance(a, zarr.Array) - assert isinstance(a.meta_array, xp.ndarray) - assert isinstance(a[:], xp.ndarray) - assert (a[:] == 1).all() - - -def test_open_array(store, xp): - """Test Zarr's open_array()""" - - a = xp.arange(10) - z = zarr.open_array( - store, - shape=a.shape, - dtype=a.dtype, - chunks=(10,), - compressor=None, - meta_array=xp.empty(()), - ) - z[:] = a - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(a, type(z[:])) - xp.testing.assert_array_equal(a, z[:]) - - -@pytest.mark.parametrize("inline_array", [True, False]) 
-def test_dask_read(store, xp, inline_array): - """Test Zarr read in Dask""" - - da = pytest.importorskip("dask.array") - a = xp.arange(100) - z = zarr.array(a, chunks=10, compressor=None, store=store, meta_array=xp.empty(())) - d = da.from_zarr(z, inline_array=inline_array) - d += 1 - xp.testing.assert_array_equal(a + 1, d.compute()) - - -def test_dask_write(store, xp): - """Test Zarr write in Dask""" - - da = pytest.importorskip("dask.array") - - # Write dask array to disk using Zarr - a = xp.arange(100) - d = da.from_array(a, chunks=10) - da.to_zarr(d, store, compressor=None, meta_array=xp.empty(())) - - # Validate the written Zarr array - z = zarr.open_array(store) - xp.testing.assert_array_equal(a, z[:]) - - -@pytest.mark.parametrize("xp_read", ["numpy", "cupy"]) -@pytest.mark.parametrize("xp_write", ["numpy", "cupy"]) -@pytest.mark.parametrize("compressor", kvikio_zarr.nvcomp_compressors) -def test_compressor(store, xp_write, xp_read, compressor): - xp_read = pytest.importorskip(xp_read) - xp_write = pytest.importorskip(xp_write) - - shape = (10, 1) - chunks = (10, 1) - a = xp_write.arange(math.prod(shape)).reshape(shape) - z = zarr.creation.create( - shape=shape, - chunks=chunks, - compressor=compressor(), - store=store, - meta_array=xp_read.empty(()), - ) - z[:] = a - b = z[:] - assert isinstance(b, xp_read.ndarray) - cupy.testing.assert_array_equal(b, a) - - -@pytest.mark.parametrize("algo", ["lz4", "zstd"]) -def test_decompressor_config_overwrite(tmp_path, xp, algo): - cpu_codec = numcodecs.registry.get_codec({"id": algo}) - gpu_codec = kvikio_nvcomp_codec.NvCompBatchCodec(algo) - - # Write using Zarr's default file store and the `cpu_codec` compressor - z = zarr.open_array(tmp_path, mode="w", shape=(10,), compressor=cpu_codec) - z[:] = range(10) - assert z.compressor == cpu_codec - - # Open file using GDSStore and use `gpu_codec` as decompressor. - z = zarr.open_array( - kvikio_zarr.GDSStore( - tmp_path, - decompressor_config_overwrite=gpu_codec.get_config(), - ), - mode="r", - meta_array=xp.empty(()), - ) - assert z.compressor == gpu_codec - assert isinstance(z[:], xp.ndarray) - xp.testing.assert_array_equal(z[:], range(10)) - - -@pytest.mark.parametrize("algo", ["lz4"]) -def test_compressor_config_overwrite(tmp_path, xp, algo): - cpu_codec = numcodecs.registry.get_codec({"id": algo}) - gpu_codec = kvikio_nvcomp_codec.NvCompBatchCodec(algo) - - # Write file using GDSStore and the `gpu_codec` compressor. In order - # to make the file compatible with Zarr's builtin CPU decompressor, - # we set `cpu_codec` as the compressor in the meta file on disk. 
- z = zarr.open_array( - kvikio_zarr.GDSStore( - tmp_path, - compressor_config_overwrite=cpu_codec.get_config(), - decompressor_config_overwrite=gpu_codec.get_config(), - ), - mode="w", - shape=10, - compressor=gpu_codec, - meta_array=xp.empty(()), - ) - assert z.compressor == gpu_codec - z[:] = xp.arange(10) - - # We can now open the file using Zarr's builtin CPU decompressor - z = zarr.open_array(tmp_path, mode="r") - assert isinstance(z[:], numpy.ndarray) - numpy.testing.assert_array_equal(z[:], range(10)) - - -@pytest.mark.parametrize("write_mode", ["w", "w-", "a"]) -@pytest.mark.parametrize("read_mode", ["r", "r+", "a"]) -def test_open_cupy_array(tmp_path, write_mode, read_mode): - a = cupy.arange(10) - z = kvikio_zarr.open_cupy_array( - tmp_path, - mode=write_mode, - shape=a.shape, - dtype=a.dtype, - chunks=(2,), - compressor=kvikio_zarr.CompatCompressor.lz4(), - ) - z[:] = a - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(z[:], type(a)) - assert z.compressor == kvikio_nvcomp_codec.NvCompBatchCodec("lz4") - cupy.testing.assert_array_equal(a, z[:]) - - z = kvikio_zarr.open_cupy_array( - tmp_path, - mode=read_mode, - ) - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(z[:], type(a)) - assert z.compressor == kvikio_nvcomp_codec.NvCompBatchCodec("lz4") - cupy.testing.assert_array_equal(a, z[:]) - - z = zarr.open_array(tmp_path, mode=read_mode) - assert a.shape == z.shape - assert a.dtype == z.dtype - assert isinstance(z[:], numpy.ndarray) - assert z.compressor == kvikio_zarr.CompatCompressor.lz4().cpu - numpy.testing.assert_array_equal(a.get(), z[:]) - - -@pytest.mark.parametrize("compressor", [None, kvikio_zarr.CompatCompressor.lz4().cpu]) -def test_open_cupy_array_written_by_zarr(tmp_path, compressor): - data = numpy.arange(100) - z = zarr.open_array( - tmp_path, - shape=data.shape, - mode="w", - compressor=compressor, - ) - z[:] = data - - z = kvikio_zarr.open_cupy_array(tmp_path, mode="r") - assert isinstance(z[:], cupy.ndarray) - cupy.testing.assert_array_equal(z[:], data) - - -@pytest.mark.parametrize("mode", ["r", "r+", "a"]) -def test_open_cupy_array_incompatible_compressor(tmp_path, mode): - zarr.create((10,), store=tmp_path, compressor=numcodecs.Blosc()) - - with pytest.raises(ValueError, match="non-CUDA compatible compressor"): - kvikio_zarr.open_cupy_array(tmp_path, mode=mode) - - -def test_open_cupy_array_unknown_mode(tmp_path): - a = cupy.arange(10) - with pytest.raises(ValueError, match="Unknown mode: x"): - kvikio_zarr.open_cupy_array( - tmp_path, - mode="x", - shape=a.shape, - dtype=a.dtype, - chunks=(2,), - ) diff --git a/python/kvikio/tests/test_zarr_missing.py b/python/kvikio/tests/test_zarr_missing.py new file mode 100644 index 0000000000..ef25b41dab --- /dev/null +++ b/python/kvikio/tests/test_zarr_missing.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. 
+ +import sys + +import pytest + + +def test_zarr_missing_raises(monkeypatch): + modules = list(sys.modules) + for module in modules: + pkg = module.split(".")[0] + if pkg == "kvikio": + # remove from the import cache + monkeypatch.delitem(sys.modules, module, raising=False) + elif pkg == "zarr": + # force an ImportError + monkeypatch.setitem(sys.modules, module, None) + + with pytest.raises(ImportError): + import kvikio.zarr # noqa: F401 + + +def test_zarr_2_installed_raises(monkeypatch): + modules = list(sys.modules) + zarr = pytest.importorskip("zarr") + monkeypatch.setattr(zarr, "__version__", "2.0.0") + + for module in modules: + pkg = module.split(".")[0] + if pkg == "kvikio": + # remove from the import cache + monkeypatch.delitem(sys.modules, module, raising=False) + + with pytest.raises(ImportError): + import kvikio.zarr # noqa: F401 diff --git a/python/kvikio/tests/test_zarr_v3.py b/python/kvikio/tests/test_zarr_v3.py index e44de36d22..59331d5513 100644 --- a/python/kvikio/tests/test_zarr_v3.py +++ b/python/kvikio/tests/test_zarr_v3.py @@ -6,16 +6,17 @@ import cupy as cp import pytest -import kvikio.zarr - pytest.importorskip("zarr", minversion="3.0.0") +# these must follow the pytest.importorskip import zarr.core.buffer # noqa: E402 import zarr.storage # noqa: E402 from zarr.core.buffer.gpu import Buffer # noqa: E402 from zarr.testing.store import StoreTests # noqa: E402 +import kvikio.zarr # noqa: E402 + @pytest.mark.asyncio async def test_basic(tmp_path: pathlib.Path) -> None: diff --git a/python/kvikio/tests/utils.py b/python/kvikio/tests/utils.py new file mode 100644 index 0000000000..58d801efcf --- /dev/null +++ b/python/kvikio/tests/utils.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# See file LICENSE for terms. + +import socket + + +def localhost() -> str: + return "127.0.0.1" + + +def find_free_port(host: str = localhost()) -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind((host, 0)) + _, port = s.getsockname() + return port diff --git a/python/libkvikio/CMakeLists.txt b/python/libkvikio/CMakeLists.txt index ecde2dc288..fe0019bf58 100644 --- a/python/libkvikio/CMakeLists.txt +++ b/python/libkvikio/CMakeLists.txt @@ -39,28 +39,7 @@ unset(kvikio_FOUND) set(KvikIO_BUILD_BENCHMARKS OFF) set(KvikIO_BUILD_EXAMPLES OFF) set(KvikIO_BUILD_TESTS OFF) -if(USE_NVCOMP_RUNTIME_WHEEL) - set(KvikIO_EXPORT_NVCOMP OFF) -else() - # vendor nvcomp but not the entire kvikio-export set because that's huge - include(cmake/thirdparty/get_nvcomp.cmake) - include(cmake/Modules/WheelHelpers.cmake) - install_aliased_imported_targets( - TARGETS nvcomp::nvcomp DESTINATION ${SKBUILD_PLATLIB_DIR}/libkvikio/lib64/ - ) -endif() set(CUDA_STATIC_RUNTIME ON) add_subdirectory(../../cpp kvikio-cpp) - -if(USE_NVCOMP_RUNTIME_WHEEL) - set(rpaths "$ORIGIN/../../nvidia/nvcomp") - foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) - set_property( - TARGET ${tgt} - PROPERTY INSTALL_RPATH ${rpaths} - APPEND - ) - endforeach() -endif() diff --git a/python/libkvikio/cmake/Modules/WheelHelpers.cmake b/python/libkvikio/cmake/Modules/WheelHelpers.cmake deleted file mode 100644 index abdde95298..0000000000 --- a/python/libkvikio/cmake/Modules/WheelHelpers.cmake +++ /dev/null @@ -1,59 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= -include_guard(GLOBAL) - -# Making libraries available inside wheels by installing the associated targets. -function(install_aliased_imported_targets) - list(APPEND CMAKE_MESSAGE_CONTEXT "install_aliased_imported_targets") - - set(options "") - set(one_value "DESTINATION") - set(multi_value "TARGETS") - cmake_parse_arguments(_ "${options}" "${one_value}" "${multi_value}" ${ARGN}) - - message(VERBOSE "Installing targets '${__TARGETS}' into lib_dir '${__DESTINATION}'") - - foreach(target IN LISTS __TARGETS) - - if(NOT TARGET ${target}) - message(VERBOSE "No target named ${target}") - continue() - endif() - - get_target_property(alias_target ${target} ALIASED_TARGET) - if(alias_target) - set(target ${alias_target}) - endif() - - get_target_property(is_imported ${target} IMPORTED) - if(NOT is_imported) - # If the target isn't imported, install it into the wheel - install(TARGETS ${target} DESTINATION ${__DESTINATION}) - message(VERBOSE "install(TARGETS ${target} DESTINATION ${__DESTINATION})") - else() - # If the target is imported, make sure it's global - get_target_property(type ${target} TYPE) - if(${type} STREQUAL "UNKNOWN_LIBRARY") - install(FILES $ DESTINATION ${__DESTINATION}) - message(VERBOSE "install(FILES $ DESTINATION ${__DESTINATION})") - else() - install(IMPORTED_RUNTIME_ARTIFACTS ${target} DESTINATION ${__DESTINATION}) - message( - VERBOSE - "install(IMPORTED_RUNTIME_ARTIFACTS $ DESTINATION ${__DESTINATION})" - ) - endif() - endif() - endforeach() -endfunction() diff --git a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake b/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake deleted file mode 100644 index 0901c1e349..0000000000 --- a/python/libkvikio/cmake/thirdparty/get_nvcomp.cmake +++ /dev/null @@ -1,35 +0,0 @@ -# ============================================================================= -# Copyright (c) 2021-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(KVIKIO_USE_PROPRIETARY_BINARY ON) - -# This function finds nvcomp and sets any additional necessary environment variables. 
-function(find_and_configure_nvcomp) - - include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - set(export_args) - if(KvikIO_EXPORT_NVCOMP) - # We're vendoring nvcomp and we only want `libnvcomp.so.4` - set(export_args BUILD_EXPORT_SET nvcomp) - endif() - - rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${KVIKIO_USE_PROPRIETARY_BINARY}) - - # Per-thread default stream - if(TARGET nvcomp AND PER_THREAD_DEFAULT_STREAM) - target_compile_definitions(nvcomp PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM) - endif() -endfunction() - -find_and_configure_nvcomp() diff --git a/python/libkvikio/libkvikio/load.py b/python/libkvikio/libkvikio/load.py index c790d2dd00..2fd25e642c 100644 --- a/python/libkvikio/libkvikio/load.py +++ b/python/libkvikio/libkvikio/load.py @@ -44,9 +44,6 @@ def _load_wheel_installation(soname: str): def load_library(): - # TODO: remove this nvcomp load when `nvcomp` is re-de-vendored - # https://github.com/rapidsai/build-planning/issues/171 - _load_library("libnvcomp.so.4") return _load_library("libkvikio.so") diff --git a/python/libkvikio/pyproject.toml b/python/libkvikio/pyproject.toml index bbbd6f2e74..3239d8c651 100644 --- a/python/libkvikio/pyproject.toml +++ b/python/libkvikio/pyproject.toml @@ -4,7 +4,7 @@ [build-system] build-backend = "rapids_build_backend.build" requires = [ - "rapids-build-backend>=0.3.0,<0.4.0.dev0", + "rapids-build-backend>=0.4.0,<0.5.0.dev0", "scikit-build-core[pyproject]>=0.10.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -67,6 +67,4 @@ select = [ ] # PyPI limit is 100 MiB, fail CI before we get too close to that -# TODO: drop this to 75M after we re-de-vendor nvcomp -# https://github.com/rapidsai/build-planning/issues/171 -max_allowed_size_compressed = '90M' +max_allowed_size_compressed = '75M'
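For reference, the snippet below is a minimal usage sketch (not part of the patch) of the mmap read path exercised by the new test_mmap.py above. It assumes only the kvikio.Mmap Python bindings shown in the diff (read() returning the byte count, pread() returning a future); the "data.bin" path is a hypothetical example file.

import numpy as np
import kvikio

# Create a small input file (hypothetical path used only for illustration).
expected = np.arange(1024 * 1024)
expected.tofile("data.bin")

# Map the whole file read-only and read it synchronously into a host buffer.
mmap_handle = kvikio.Mmap("data.bin", "r")
out = np.zeros_like(expected)
assert mmap_handle.read(out) == expected.nbytes

# Parallel read: pread() returns a future whose get() yields the byte count.
fut = mmap_handle.pread(out)
assert fut.get() == expected.nbytes

# Close the handle; further reads raise, as covered by test_closed_handle.
mmap_handle.close()
assert mmap_handle.closed()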