diff --git a/examples/image_codecs.py b/examples/image_codecs.py new file mode 100644 index 0000000000..2020e49e3b --- /dev/null +++ b/examples/image_codecs.py @@ -0,0 +1,51 @@ +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "zarr @ git+https://github.com/d-v-b/zarr-python.git@a2bc6555", +# "imagecodecs==2025.3.30", +# "pytest" +# ] +# /// + +# Once this work is merged upstream, the zarr dependency above can instead be: +# "zarr @ git+https://github.com/zarr-developers/zarr-python.git@main", +from typing import Literal + +import numcodecs +import numpy as np +import pytest +from imagecodecs.numcodecs import Jpeg + +import zarr + +numcodecs.register_codec(Jpeg) +jpg_codec = Jpeg() + + +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test(zarr_format: Literal[2, 3]) -> None: + store = {} + if zarr_format == 2: + z_w = zarr.create_array( + store=store, + data=np.zeros((100, 100, 3), dtype=np.uint8), + compressors=jpg_codec, + zarr_format=zarr_format, + ) + else: + z_w = zarr.create_array( + store=store, + data=np.zeros((100, 100, 3), dtype=np.uint8), + serializer=jpg_codec, + zarr_format=zarr_format, + ) + z_w[:] = 2 + z_r = zarr.open_array(store=store, zarr_format=zarr_format) + assert np.all(z_r[:] == 2) + if zarr_format == 2: + print(z_r.metadata.to_dict()["compressor"]) + else: + print(z_r.metadata.to_dict()["codecs"]) + + +if __name__ == "__main__": + # Pass "-c" and its value as separate arguments so they parse correctly; pointing + # the config at this file keeps pytest from picking up the repository's settings. + pytest.main([__file__, "-c", __file__, "-s"]) diff --git a/pyproject.toml b/pyproject.toml index e36d636b0c..0cfafcfcd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,8 @@ test = [ "pytest-xdist", "packaging", "tomlkit", - "uv" + "uv", + "imagecodecs" ] remote_tests = [ 'zarr[remote]', @@ -383,6 +384,7 @@ module = [ "tests.test_indexing", "tests.test_properties", "tests.test_sync", + "tests.test_v2", "tests.test_regression.scripts.*" ] ignore_errors = true diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index f8a5447a70..c1140ee4be 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -1,11 +1,21 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Generic, TypeVar +from collections.abc import Mapping +from typing import ( + TYPE_CHECKING, + Generic, + Literal, + TypedDict, + TypeVar, + overload, +) + +from typing_extensions import ReadOnly from zarr.abc.metadata import Metadata from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import ChunkCoords, concurrent_map +from zarr.core.common import ChunkCoords, NamedConfig, ZarrFormat, concurrent_map from zarr.core.config import config if TYPE_CHECKING: @@ -34,6 +44,21 @@ CodecInput = TypeVar("CodecInput", bound=NDBuffer | Buffer) CodecOutput = TypeVar("CodecOutput", bound=NDBuffer | Buffer) +TName = TypeVar("TName", bound=str, covariant=True) + + +class CodecJSON_V2(TypedDict, Generic[TName]): + id: ReadOnly[TName] + + +CodecConfig_V3 = NamedConfig[str, Mapping[str, object]] + +CodecJSON_V3 = str | CodecConfig_V3 + +# The widest type we will accept for the JSON representation of a codec. +# This covers both v2 and v3. +CodecJSON = str | Mapping[str, object] + class BaseCodec(Metadata, Generic[CodecInput, CodecOutput]): """Generic base class for codecs. @@ -157,6 +182,34 @@ async def encode( """ return await _batching_helper(self._encode_single, chunks_and_specs) + @overload + def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, Mapping[str, object]]: ...
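+ # Subclasses implement ``_from_json_v2`` / ``_from_json_v3`` and ``to_json``; the + # base ``from_json`` below dispatches on ``zarr_format``. Sketch of the intended + # usage, for a hypothetical registered codec ``SomeCodec``: + # codec = SomeCodec.from_json({"id": "some"}, zarr_format=2) + # codec.to_json(zarr_format=3) # -> {"name": "some", "configuration": {...}}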
+ + def to_json( + self, zarr_format: ZarrFormat + ) -> CodecJSON_V2[str] | NamedConfig[str, Mapping[str, object]]: + raise NotImplementedError + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + raise NotImplementedError + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + raise NotImplementedError + + @classmethod + def from_json(cls, data: CodecJSON, zarr_format: ZarrFormat) -> Self: + if zarr_format == 2: + return cls._from_json_v2(data) + elif zarr_format == 3: + return cls._from_json_v3(data) + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover + class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]): """Base class for array-to-array codecs.""" @@ -447,3 +500,7 @@ async def wrap(chunk: CodecInput | None, chunk_spec: ArraySpec) -> CodecOutput | return await func(chunk, chunk_spec) return wrap + + +# Raised when the JSON representation of a codec is invalid +class CodecValidationError(ValueError): ... diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 08853f27f1..d347bed420 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -19,7 +19,7 @@ @dataclass(frozen=True) -class V2Codec(ArrayBytesCodec): +class _V2Codec(ArrayBytesCodec): filters: tuple[numcodecs.abc.Codec, ...] | None compressor: numcodecs.abc.Codec | None diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py index f89f127852..c429ed248f 100644 --- a/src/zarr/codecs/blosc.py +++ b/src/zarr/codecs/blosc.py @@ -1,18 +1,22 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass, replace -from enum import Enum from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict, TypeGuard, overload import numcodecs from numcodecs.blosc import Blosc from packaging.version import Version -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) from zarr.core.dtype.common import HasItemSize from zarr.registry import register_codec @@ -22,39 +26,64 @@ from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer +BloscShuffle = Literal["noshuffle", "shuffle", "bitshuffle"] +BLOSC_SHUFFLE: Final = ("noshuffle", "shuffle", "bitshuffle") -class BloscShuffle(Enum): - """ - Enum for shuffle filter used by blosc. - """ +BloscCname = Literal["lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib"] +BLOSC_CNAME: Final = ("lz4", "lz4hc", "blosclz", "zstd", "snappy", "zlib") - noshuffle = "noshuffle" - shuffle = "shuffle" - bitshuffle = "bitshuffle" - @classmethod - def from_int(cls, num: int) -> BloscShuffle: - blosc_shuffle_int_to_str = { - 0: "noshuffle", - 1: "shuffle", - 2: "bitshuffle", - } - if num not in blosc_shuffle_int_to_str: - raise ValueError(f"Value must be between 0 and 2. Got {num}.") - return BloscShuffle[blosc_shuffle_int_to_str[num]] +class BloscConfigV2(TypedDict): + cname: BloscCname + clevel: int + shuffle: int + blocksize: int + typesize: NotRequired[int] -class BloscCname(Enum): +class BloscConfigV3(TypedDict): + cname: BloscCname + clevel: int + shuffle: BloscShuffle + blocksize: int + typesize: int + + +class BloscJSON_V2(CodecJSON_V2[Literal["blosc"]], BloscConfigV2): """ - Enum for compression library used by blosc. + The JSON form of the Blosc codec in Zarr V2. """ - lz4 = "lz4" - lz4hc = "lz4hc" - blosclz = "blosclz" - zstd = "zstd" - snappy = "snappy" - zlib = "zlib" + +class BloscJSON_V3(NamedRequiredConfig[Literal["blosc"], BloscConfigV3]): + """ + The JSON form of the Blosc codec in Zarr V3. + """ + + +def check_json_v2(data: CodecJSON) -> TypeGuard[BloscJSON_V2]: + # typesize is optional in the v2 form, so accept the key set with or without it + return ( + isinstance(data, Mapping) + and set(data.keys()) + in ( + {"id", "clevel", "cname", "shuffle", "blocksize"}, + {"id", "clevel", "cname", "shuffle", "blocksize", "typesize"}, + ) + and data["id"] == "blosc" + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[BloscJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "blosc" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) + == {"cname", "clevel", "shuffle", "blocksize", "typesize"} + ) + + +def parse_cname(value: object) -> BloscCname: + if value not in BLOSC_CNAME: + raise ValueError(f"Value must be one of {BLOSC_CNAME}. Got {value} instead.") + return value # See https://zarr.readthedocs.io/en/stable/user-guide/performance.html#configuring-blosc @@ -85,6 +114,12 @@ def parse_blocksize(data: JSON) -> int: raise TypeError(f"Value should be an int. Got {type(data)} instead.") +def parse_shuffle(data: object) -> BloscShuffle: + if data in BLOSC_SHUFFLE: + return data # type: ignore[return-value] + raise TypeError(f"Value must be one of {BLOSC_SHUFFLE}. Got {data} instead.") + + @dataclass(frozen=True) class BloscCodec(BytesBytesCodec): """blosc codec""" @@ -92,24 +127,24 @@ class BloscCodec(BytesBytesCodec): is_fixed_size = False typesize: int | None - cname: BloscCname = BloscCname.zstd - clevel: int = 5 - shuffle: BloscShuffle | None = BloscShuffle.noshuffle - blocksize: int = 0 + cname: BloscCname + clevel: int + shuffle: BloscShuffle | None + blocksize: int def __init__( self, *, typesize: int | None = None, - cname: BloscCname | str = BloscCname.zstd, + cname: BloscCname = "zstd", clevel: int = 5, - shuffle: BloscShuffle | str | None = None, + shuffle: BloscShuffle | None = None, blocksize: int = 0, ) -> None: typesize_parsed = parse_typesize(typesize) if typesize is not None else None - cname_parsed = parse_enum(cname, BloscCname) + cname_parsed = parse_cname(cname) clevel_parsed = parse_clevel(clevel) - shuffle_parsed = parse_enum(shuffle, BloscShuffle) if shuffle is not None else None + shuffle_parsed = parse_shuffle(shuffle) if shuffle is not None else None blocksize_parsed = parse_blocksize(blocksize) object.__setattr__(self, "typesize", typesize_parsed) @@ -120,24 +155,74 @@ def __init__( @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "blosc") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - if self.typesize is None: - raise ValueError("`typesize` needs to be set for serialization.") - if self.shuffle is None: - raise ValueError("`shuffle` needs to be set for serialization.") - return { - "name": "blosc", - "configuration": { - "typesize": self.typesize, - "cname": self.cname.value, + return self.to_json(zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls( + cname=data["cname"], + clevel=data["clevel"], + shuffle=BLOSC_SHUFFLE[data["shuffle"]], + blocksize=data["blocksize"], + typesize=data.get("typesize", None), + ) + msg = ( + "Invalid Zarr V2 JSON representation of the blosc codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'cname', 'clevel', 'shuffle', 'blocksize') and, optionally, 'typesize'" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + typesize=data["configuration"]["typesize"], + cname=data["configuration"]["cname"], + clevel=data["configuration"]["clevel"], + shuffle=data["configuration"]["shuffle"], + blocksize=data["configuration"]["blocksize"], + ) + msg = ( + "Invalid Zarr V3 JSON representation of the blosc codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration'), " + "where the 'configuration' key is a Mapping with keys ('cname', 'clevel', 'shuffle', 'blocksize', 'typesize')" + ) + raise CodecValidationError(msg) + + @overload + def to_json(self, zarr_format: Literal[2]) -> BloscJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> BloscJSON_V3: ...
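+ # For example (a sketch; values follow the implementation below): + # BloscCodec(typesize=4, cname="zstd", clevel=5, shuffle="shuffle").to_json(zarr_format=2) + # -> {"id": "blosc", "clevel": 5, "cname": "zstd", "shuffle": 1, "blocksize": 0} + # BloscCodec(typesize=4, cname="zstd", clevel=5, shuffle="shuffle").to_json(zarr_format=3) + # -> {"name": "blosc", "configuration": {"clevel": 5, "cname": "zstd", "shuffle": "shuffle", "typesize": 4, "blocksize": 0}} + # Note that the v2 form stores ``shuffle`` as an index into BLOSC_SHUFFLE, while the v3 form stores the string.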
+ + def to_json(self, zarr_format: ZarrFormat) -> BloscJSON_V2 | BloscJSON_V3: + if self.typesize is None or self.shuffle is None: + raise ValueError("typesize and shuffle need to be set for serialization.") + if zarr_format == 2: + return { + "id": "blosc", "clevel": self.clevel, + "cname": self.cname, + "shuffle": BLOSC_SHUFFLE.index(self.shuffle), "blocksize": self.blocksize, - }, - } + } + elif zarr_format == 3: + return { + "name": "blosc", + "configuration": { + "clevel": self.clevel, + "cname": self.cname, + "shuffle": self.shuffle, + "typesize": self.typesize, + "blocksize": self.blocksize, + }, + } + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: item_size = 1 @@ -147,10 +232,7 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if new_codec.typesize is None: new_codec = replace(new_codec, typesize=item_size) if new_codec.shuffle is None: - new_codec = replace( - new_codec, - shuffle=(BloscShuffle.bitshuffle if item_size == 1 else BloscShuffle.shuffle), - ) + new_codec = replace(new_codec, shuffle="bitshuffle" if item_size == 1 else "shuffle") return new_codec @@ -158,15 +240,10 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: def _blosc_codec(self) -> Blosc: if self.shuffle is None: raise ValueError("`shuffle` needs to be set for decoding and encoding.") - map_shuffle_str_to_int = { - BloscShuffle.noshuffle: 0, - BloscShuffle.shuffle: 1, - BloscShuffle.bitshuffle: 2, - } config_dict = { - "cname": self.cname.name, + "cname": self.cname, "clevel": self.clevel, - "shuffle": map_shuffle_str_to_int[self.shuffle], + "shuffle": BLOSC_SHUFFLE.index(self.shuffle), "blocksize": self.blocksize, } # See https://github.com/zarr-developers/numcodecs/pull/713 diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 7576119c82..ba4fcaf9fb 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -1,15 +1,16 @@ from __future__ import annotations import sys +from collections.abc import Mapping from dataclasses import dataclass, replace from enum import Enum -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Final, Literal, NotRequired, TypedDict, TypeGuard, overload import numpy as np -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer import Buffer, NDArrayLike, NDBuffer -from zarr.core.common import JSON, parse_enum, parse_named_configuration +from zarr.core.common import JSON, NamedConfig, ZarrFormat from zarr.core.dtype.common import HasEndianness from zarr.registry import register_codec @@ -28,7 +29,47 @@ class Endian(Enum): little = "little" -default_system_endian = Endian(sys.byteorder) +# TODO: unify with the endianness defined in core.dtype.common +EndiannessStr = Literal["little", "big"] +ENDIANNESS_STR: Final = "little", "big" + +default_system_endian = sys.byteorder + + +class BytesConfig(TypedDict): + endian: NotRequired[EndiannessStr] + + +class BytesJSON_V2(CodecJSON_V2[Literal["bytes"]], BytesConfig): ... + + +BytesJSON_V3 = NamedConfig[Literal["bytes"], BytesConfig] | Literal["bytes"] + + +def parse_endianness(data: object) -> EndiannessStr: + if data in ENDIANNESS_STR: + return data # type: ignore [return-value] + raise ValueError(f"Invalid endianness: {data!r}. 
Expected one of {ENDIANNESS_STR}") + + +def check_json_v2(data: CodecJSON) -> TypeGuard[BytesJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) in ({"id", "endian"}, {"id"}) + and data["id"] == "bytes" + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[BytesJSON_V3]: + return data == "bytes" or ( + ( + isinstance(data, Mapping) + and set(data.keys()) in ({"name"}, {"name", "configuration"}) + and data["name"] == "bytes" + ) + and isinstance(data.get("configuration", {}), Mapping) + and set(data.get("configuration", {}).keys()) in ({"endian"}, set()) + ) @dataclass(frozen=True) @@ -37,26 +78,59 @@ class BytesCodec(ArrayBytesCodec): is_fixed_size = True - endian: Endian | None + endian: EndiannessStr | None - def __init__(self, *, endian: Endian | str | None = default_system_endian) -> None: - endian_parsed = None if endian is None else parse_enum(endian, Endian) + def __init__(self, *, endian: EndiannessStr | str | None = default_system_endian) -> None: + endian_parsed = None if endian is None else parse_endianness(endian) object.__setattr__(self, "endian", endian_parsed) @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration( - data, "bytes", require_configuration=False - ) - configuration_parsed = configuration_parsed or {} - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - if self.endian is None: + return self.to_json(zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls(endian=data.get("endian", None)) + raise ValueError(f"Invalid JSON: {data}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + # Three different representations of the exact same codec... + if data in ("bytes", {"name": "bytes"}, {"name": "bytes", "configuration": {}}): + return cls() + else: + return cls(endian=data["configuration"].get("endian", None)) + raise ValueError(f"Invalid JSON: {data}") + + @overload + def to_json(self, zarr_format: Literal[2]) -> BytesJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> BytesJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> BytesJSON_V2 | BytesJSON_V3: + if zarr_format == 2: + if self.endian is not None: + return { + "id": "bytes", + "endian": self.endian, + } + return {"id": "bytes"} + elif zarr_format == 3: + if self.endian is not None: + return { + "name": "bytes", + "configuration": {"endian": self.endian}, + } return {"name": "bytes"} - else: - return {"name": "bytes", "configuration": {"endian": self.endian.value}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: if not isinstance(array_spec.dtype, HasEndianness): @@ -75,9 +149,9 @@ async def _decode_single( ) -> NDBuffer: assert isinstance(chunk_bytes, Buffer) # TODO: remove endianness enum in favor of literal union - endian_str = self.endian.value if self.endian is not None else None - if isinstance(chunk_spec.dtype, HasEndianness): - dtype = replace(chunk_spec.dtype, endianness=endian_str).to_native_dtype() # type: ignore[call-arg] + endian = self.endian + if isinstance(chunk_spec.dtype, HasEndianness) and endian is not None: + dtype = replace(chunk_spec.dtype, endianness=endian).to_native_dtype() # type: ignore[call-arg] else: dtype = chunk_spec.dtype.to_native_dtype() as_array_like = chunk_bytes.as_array_like() @@ -109,7 +183,7 @@ async def _encode_single( ): # type-ignore is a numpy bug # see https://github.com/numpy/numpy/issues/26473 - new_dtype = chunk_array.dtype.newbyteorder(self.endian.name) # type: ignore[arg-type] + new_dtype = chunk_array.dtype.newbyteorder(self.endian) # type: ignore[arg-type] chunk_array = chunk_array.astype(new_dtype) nd_array = chunk_array.as_ndarray_like() diff --git a/src/zarr/codecs/crc32c_.py b/src/zarr/codecs/crc32c_.py index c2e30f689a..e0c4e7bc3f 100644 --- a/src/zarr/codecs/crc32c_.py +++ b/src/zarr/codecs/crc32c_.py @@ -1,14 +1,15 @@ from __future__ import annotations +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, cast, overload import numpy as np import typing_extensions from crc32c import crc32c -from zarr.abc.codec import BytesBytesCodec -from zarr.core.common import JSON, parse_named_configuration +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError +from zarr.core.common import JSON, NamedConfig, ZarrFormat from zarr.registry import register_codec if TYPE_CHECKING: @@ -18,6 +19,28 @@ from zarr.core.buffer import Buffer +class Crc32cConfig(TypedDict): ... + + +class Crc32cJSON_V2(CodecJSON_V2[Literal["crc32c"]]): ... + + +class Crc32cJSON_V3(NamedConfig[Literal["crc32c"], Crc32cConfig]): ... + + +def check_json_v2(data: CodecJSON) -> TypeGuard[Crc32cJSON_V2]: + return isinstance(data, Mapping) and set(data.keys()) == {"id"} and data["id"] == "crc32c" + + +def check_json_v3(data: CodecJSON) -> TypeGuard[Crc32cJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) in ({"name", "configuration"}, {"name"}) + and data["name"] == "crc32c" + and data.get("configuration") in ({}, None) + ) + + @dataclass(frozen=True) class Crc32cCodec(BytesBytesCodec): """crc32c codec""" @@ -26,12 +49,49 @@ class Crc32cCodec(BytesBytesCodec): @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) - parse_named_configuration(data, "crc32c", require_configuration=False) - return cls() + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls() + msg = ( + "Invalid Zarr V2 JSON representation of the crc32c codec. " + f"Got {data!r}, expected a Mapping with keys ('id')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls() + msg = ( + "Invalid Zarr V3 JSON representation of the crc32c codec. " + f"Got {data!r}, expected a Mapping with keys ('name')" + ) + raise CodecValidationError(msg) + def to_dict(self) -> dict[str, JSON]: + return self.to_json(zarr_format=3) - return {"name": "crc32c"} + @overload + def to_json(self, zarr_format: Literal[2]) -> Crc32cJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> Crc32cJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> Crc32cJSON_V2 | Crc32cJSON_V3: + if zarr_format == 2: + return {"id": "crc32c"} + elif zarr_format == 3: + return {"name": "crc32c"} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover + async def _decode_single( self, chunk_bytes: Buffer, diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py index 9e6515a4d1..e2733ddcdc 100644 --- a/src/zarr/codecs/gzip.py +++ b/src/zarr/codecs/gzip.py @@ -1,14 +1,19 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, overload from numcodecs.gzip import GZip -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_named_configuration +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) from zarr.registry import register_codec if TYPE_CHECKING: @@ -28,6 +33,22 @@ def parse_gzip_level(data: JSON) -> int: return data +class GZipConfig(TypedDict): + level: int + + +class GZipJSON_V2(CodecJSON_V2[Literal["gzip"]], GZipConfig): + """ + The JSON form of the GZip codec in Zarr V2. + """ + + +class GZipJSON_V3(NamedRequiredConfig[Literal["gzip"], GZipConfig]): + """ + The JSON form of the GZip codec in Zarr V3. + """ + + @dataclass(frozen=True) class GzipCodec(BytesBytesCodec): """gzip codec""" @@ -43,11 +64,56 @@ def __init__(self, *, level: int = 5) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "gzip") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: - return {"name": "gzip", "configuration": {"level": self.level}} + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> GZipJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> GZipJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> GZipJSON_V2 | GZipJSON_V3: + if zarr_format == 2: + return {"id": "gzip", "level": self.level} + elif zarr_format == 3: + return {"name": "gzip", "configuration": {"level": self.level}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." 
+ ) # pragma: no cover + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[GZipJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "level"} + and data["id"] == "gzip" + and isinstance(data["level"], int) + ) + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[GZipJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "gzip" + and isinstance(data["configuration"], dict) + and "level" in data["configuration"] + and isinstance(data["configuration"]["level"], int) + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls(level=data["level"]) + raise ValueError(f"Invalid GZip JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls(level=data["configuration"]["level"]) + raise ValueError(f"Invalid GZip JSON data for Zarr format 3: {data!r}") async def _decode_single( self, diff --git a/src/zarr/codecs/numcodec.py b/src/zarr/codecs/numcodec.py new file mode 100644 index 0000000000..93eedd979c --- /dev/null +++ b/src/zarr/codecs/numcodec.py @@ -0,0 +1,200 @@ +""" +Utilities for interfacing with the numcodecs library. +""" + +from __future__ import annotations + +import asyncio +from dataclasses import dataclass +from typing import TYPE_CHECKING, Literal, Self, TypeGuard, overload + +import numpy as np +from typing_extensions import Protocol, runtime_checkable + +from zarr.abc.codec import ( + ArrayArrayCodec, + ArrayBytesCodec, + BaseCodec, + BytesBytesCodec, + CodecJSON, + CodecJSON_V2, +) +from zarr.core.array_spec import ArraySpec +from zarr.core.buffer.core import Buffer, BufferPrototype, NDArrayLike, NDBuffer +from zarr.core.buffer.cpu import as_numpy_array_wrapper + +if TYPE_CHECKING: + from zarr.core.array_spec import ArraySpec + from zarr.core.common import BaseConfig, NamedConfig, ZarrFormat + +BufferOrNDArray = Buffer | np.ndarray[tuple[int, ...], np.dtype[np.generic]] | NDArrayLike + + +def get_numcodec_class(name: str) -> type[Numcodec]: + """Obtain a numcodecs codec class by name. + + Parameters + ---------- + name : str + The name of the codec to get + + Returns + ------- + cls : type[Numcodec] + The codec class registered under ``name``. + + Examples + -------- + + >>> get_numcodec_class('zlib') + <class 'numcodecs.zlib.Zlib'> + + """ + import numcodecs.registry as numcodecs_registry + + cls = numcodecs_registry.codec_registry.get(name) + if cls is None and name in numcodecs_registry.entries: + cls = numcodecs_registry.entries[name].load() + numcodecs_registry.register_codec(cls, codec_id=name) + if cls is not None: + return cls + raise KeyError(name) + + +@runtime_checkable +class Numcodec(Protocol): + """ + A protocol that models the ``numcodecs.abc.Codec`` interface. + """ + + codec_id: str + + def encode(self, buf: BufferOrNDArray) -> BufferOrNDArray: ... + + def decode( + self, buf: BufferOrNDArray, out: BufferOrNDArray | None = None + ) -> BufferOrNDArray: ... + + def get_config(self) -> CodecJSON_V2[str]: ... + + @classmethod + def from_config(cls, config: CodecJSON_V2[str]) -> Self: ... + + +def is_numcodec_cls(obj: object) -> TypeGuard[type[Numcodec]]: + """ + Check if the given object implements the Numcodec protocol. Because the @runtime_checkable + decorator does not allow issubclass checks for protocols with non-method members (i.e., attributes), + we need to manually check for the presence of the required attributes and methods. + """ + return ( + isinstance(obj, type) + and hasattr(obj, "codec_id") + and isinstance(obj.codec_id, str) + and hasattr(obj, "encode") + and callable(obj.encode) + and hasattr(obj, "decode") + and callable(obj.decode) + and hasattr(obj, "get_config") + and callable(obj.get_config) + and hasattr(obj, "from_config") + and callable(obj.from_config) + ) + + +@dataclass(frozen=True, kw_only=True) +class NumcodecsWrapper(BaseCodec[Buffer | NDBuffer, Buffer | NDBuffer]): + codec: Numcodec + + @overload + def to_json(self, zarr_format: Literal[2]) -> CodecJSON_V2[str]: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> NamedConfig[str, BaseConfig]: ... + + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON_V2[str] | NamedConfig[str, BaseConfig]: + if zarr_format == 2: + return self.codec.get_config() + elif zarr_format == 3: + config = self.codec.get_config() + config_no_id = {k: v for k, v in config.items() if k != "id"} + return {"name": config["id"], "configuration": config_no_id} + raise ValueError(f"Unsupported zarr format: {zarr_format}") # pragma: no cover + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + raise NotImplementedError( + "This class does not support creating instances from JSON data for Zarr format 2." + ) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + raise NotImplementedError( + "This class does not support creating instances from JSON data for Zarr format 3." + ) + + def compute_encoded_size(self, input_byte_length: int, chunk_spec: ArraySpec) -> int: + raise NotImplementedError + + def to_array_array(self) -> NumcodecsArrayArrayCodec: + """ + Use the ``codec`` attribute to create a NumcodecsArrayArrayCodec. + """ + return NumcodecsArrayArrayCodec(codec=self.codec) + + def to_bytes_bytes(self) -> NumcodecsBytesBytesCodec: + """ + Use the ``codec`` attribute to create a NumcodecsBytesBytesCodec. + """ + return NumcodecsBytesBytesCodec(codec=self.codec) + + def to_array_bytes(self) -> NumcodecsArrayBytesCodec: + """ + Use the ``codec`` attribute to create a NumcodecsArrayBytesCodec. 
+ """ + return NumcodecsArrayBytesCodec(codec=self.codec) + + +class NumcodecsBytesBytesCodec(NumcodecsWrapper, BytesBytesCodec): + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread( + as_numpy_array_wrapper, + self.codec.decode, + chunk_data, + chunk_spec.prototype, + ) + + def _encode(self, chunk_bytes: Buffer, prototype: BufferPrototype) -> Buffer: + encoded = self.codec.encode(chunk_bytes.as_array_like()) + if isinstance(encoded, np.ndarray): # Required for checksum codecs + return prototype.buffer.from_bytes(encoded.tobytes()) + return prototype.buffer.from_bytes(encoded) + + async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread(self._encode, chunk_data, chunk_spec.prototype) + + +@dataclass(kw_only=True, frozen=True) +class NumcodecsArrayArrayCodec(NumcodecsWrapper, ArrayArrayCodec): + async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self.codec.decode, chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) # type: ignore[union-attr] + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self.codec.encode, chunk_ndarray) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out) # type: ignore[arg-type] + + +@dataclass(kw_only=True, frozen=True) +class NumcodecsArrayBytesCodec(NumcodecsWrapper, ArrayBytesCodec): + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer: + chunk_bytes = chunk_data.to_bytes() + out = await asyncio.to_thread(self.codec.decode, chunk_bytes) + return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer: + chunk_ndarray = chunk_data.as_ndarray_like() + out = await asyncio.to_thread(self.codec.encode, chunk_ndarray) + return chunk_spec.prototype.buffer.from_bytes(out) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 888d258649..287c81f0af 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -1,11 +1,22 @@ from __future__ import annotations -from collections.abc import Iterable, Mapping, MutableMapping +from collections.abc import Iterable, Mapping, MutableMapping, Sequence from dataclasses import dataclass, field, replace from enum import Enum from functools import lru_cache from operator import itemgetter -from typing import TYPE_CHECKING, Any, NamedTuple, cast +from typing import ( + TYPE_CHECKING, + Any, + Literal, + NamedTuple, + NotRequired, + Self, + TypedDict, + TypeGuard, + cast, + overload, +) import numpy as np import numpy.typing as npt @@ -15,7 +26,11 @@ ArrayBytesCodecPartialDecodeMixin, ArrayBytesCodecPartialEncodeMixin, Codec, + CodecJSON, + CodecJSON_V2, + CodecJSON_V3, CodecPipeline, + CodecValidationError, ) from zarr.abc.store import ( ByteGetter, @@ -38,8 +53,8 @@ from zarr.core.common import ( ChunkCoords, ChunkCoordsLike, + NamedRequiredConfig, parse_enum, - parse_named_configuration, parse_shapelike, product, ) @@ -65,6 +80,28 @@ ShardMapping = Mapping[ChunkCoords, Buffer] ShardMutableMapping = MutableMapping[ChunkCoords, Buffer] +IndexLocation = Literal["start", "end"] + + +class ShardingConfigV2(TypedDict): + codecs: 
tuple[CodecJSON_V2[str], ...] + chunk_shape: tuple[int, ...] + index_codecs: tuple[CodecJSON_V2[str], ...] + index_location: NotRequired[Literal["start", "end"]] + + +class ShardingConfigV3(TypedDict): + codecs: tuple[CodecJSON_V3, ...] + chunk_shape: tuple[int, ...] + index_codecs: tuple[CodecJSON_V3, ...] + index_location: NotRequired[Literal["start", "end"]] + + +class ShardingJSON_V2(CodecJSON_V2[Literal["sharding_indexed"]], ShardingConfigV2): ... + + +class ShardingJSON_V3(NamedRequiredConfig[Literal["sharding_indexed"], ShardingConfigV3]): ... + class ShardingCodecIndexLocation(Enum): """ @@ -79,6 +116,37 @@ def parse_index_location(data: object) -> ShardingCodecIndexLocation: return parse_enum(data, ShardingCodecIndexLocation) +def check_json_v2(data: CodecJSON) -> TypeGuard[ShardingJSON_V2]: + # index_location is optional in the v2 form, so accept the key set with or without it + return ( + isinstance(data, Mapping) + and set(data.keys()) + in ( + {"id", "codecs", "chunk_shape", "index_codecs"}, + {"id", "codecs", "chunk_shape", "index_codecs", "index_location"}, + ) + and data["id"] == "sharding_indexed" + and isinstance(data["chunk_shape"], Sequence) + and not isinstance(data["chunk_shape"], str) + and isinstance(data["codecs"], Sequence) + and not isinstance(data["codecs"], str) + and isinstance(data["index_codecs"], Sequence) + and not isinstance(data["index_codecs"], str) + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[ShardingJSON_V3]: + # TODO: Automate this with a function that does runtime type checking on typeddicts. + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "sharding_indexed" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) + == {"codecs", "chunk_shape", "index_codecs", "index_location"} + and isinstance(data["configuration"]["chunk_shape"], Sequence) + and not isinstance(data["configuration"]["chunk_shape"], str) + and isinstance(data["configuration"]["codecs"], Sequence) + and not isinstance(data["configuration"]["codecs"], str) + and isinstance(data["configuration"]["index_codecs"], Sequence) + and not isinstance(data["configuration"]["index_codecs"], str) + and data["configuration"]["index_location"] in ("start", "end") + ) + + @dataclass(frozen=True) class _ShardingByteGetter(ByteGetter): shard_dict: ShardMapping @@ -385,23 +453,76 @@ def __setstate__(self, state: dict[str, Any]) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "sharding_indexed") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + return cls( + codecs=data["codecs"], + index_codecs=data["index_codecs"], + # index_location is optional in the v2 form; fall back to the default "end" + index_location=data.get("index_location", "end"), + chunk_shape=data["chunk_shape"], + ) + msg = ( + "Invalid Zarr V2 JSON representation of the sharding codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'codecs', 'index_codecs', 'chunk_shape') and, optionally, 'index_location'" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + codecs=data["configuration"]["codecs"], + index_codecs=data["configuration"]["index_codecs"], + index_location=data["configuration"]["index_location"], + chunk_shape=data["configuration"]["chunk_shape"], + ) + msg = ( + "Invalid Zarr V3 JSON representation of the sharding codec. 
" + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('codecs', 'index_codecs', 'index_location', 'chunk_shape')" + ) + raise CodecValidationError(msg) @property def codec_pipeline(self) -> CodecPipeline: return get_pipeline_class().from_codecs(self.codecs) def to_dict(self) -> dict[str, JSON]: - return { - "name": "sharding_indexed", - "configuration": { + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> ShardingJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> ShardingJSON_V3: ... + + def to_json(self, zarr_format: int) -> ShardingJSON_V2 | ShardingJSON_V3: + if zarr_format == 2: + return { + "id": "sharding_indexed", + "codecs": tuple(s.to_json(zarr_format=zarr_format) for s in self.codecs), + "index_codecs": tuple( + s.to_json(zarr_format=zarr_format) for s in self.index_codecs + ), "chunk_shape": self.chunk_shape, - "codecs": tuple(s.to_dict() for s in self.codecs), - "index_codecs": tuple(s.to_dict() for s in self.index_codecs), "index_location": self.index_location.value, - }, - } + } + elif zarr_format == 3: + return { + "name": "sharding_indexed", + "configuration": { + "chunk_shape": self.chunk_shape, + "codecs": tuple(s.to_json(zarr_format=zarr_format) for s in self.codecs), + "index_codecs": tuple( + s.to_json(zarr_format=zarr_format) for s in self.index_codecs + ), + "index_location": self.index_location.value, + }, + } + raise ValueError(f"Unsupported Zarr format {zarr_format}. Expected 2 or 3.") def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: shard_spec = self._get_chunk_spec(array_spec) diff --git a/src/zarr/codecs/transpose.py b/src/zarr/codecs/transpose.py index c87804685c..75ed4448b4 100644 --- a/src/zarr/codecs/transpose.py +++ b/src/zarr/codecs/transpose.py @@ -1,14 +1,19 @@ from __future__ import annotations -from collections.abc import Iterable +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, replace -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, Self, TypedDict, TypeGuard, cast, overload import numpy as np -from zarr.abc.codec import ArrayArrayCodec +from zarr.abc.codec import ArrayArrayCodec, CodecJSON, CodecJSON_V2, CodecValidationError from zarr.core.array_spec import ArraySpec -from zarr.core.common import JSON, ChunkCoordsLike, parse_named_configuration +from zarr.core.common import ( + JSON, + ChunkCoordsLike, + NamedRequiredConfig, + ZarrFormat, +) from zarr.registry import register_codec if TYPE_CHECKING: @@ -27,6 +32,42 @@ def parse_transpose_order(data: JSON | Iterable[int]) -> tuple[int, ...]: return tuple(cast("Iterable[int]", data)) +class TransposeConfig(TypedDict): + order: tuple[int, ...] + + +class TransposeJSON_V2(CodecJSON_V2[Literal["transpose"]], TransposeConfig): + """ + The JSON form of the Transpose codec in Zarr V2. + """ + + +class TransposeJSON_V3(NamedRequiredConfig[Literal["transpose"], TransposeConfig]): + """ + The JSON form of the Transpose codec in Zarr V3. 
+ """ + + +def check_json_v2(data: CodecJSON) -> TypeGuard[TransposeJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"id", "configuration"} + and data["id"] == "transpose" + and isinstance(data["order"], Sequence) + and not isinstance(data["order"], str) + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[TransposeJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "transpose" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) == {"order"} + ) + + @dataclass(frozen=True) class TransposeCodec(ArrayArrayCodec): """Transpose codec""" @@ -42,12 +83,47 @@ def __init__(self, *, order: ChunkCoordsLike) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "transpose") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: str | Mapping[str, object]) -> Self: + if check_json_v2(data): + return cls(order=data["order"]) # type: ignore[arg-type] + msg = ( + "Invalid Zarr V2 JSON representation of the transpose codec. " + f"Got {data!r}, expected a Mapping with keys ('id', 'order')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: str | Mapping[str, object]) -> Self: + if check_json_v3(data): + return cls(order=data["configuration"]["order"]) + msg = ( + "Invalid Zarr V3 JSON representation of the transpose codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration')" + "Where the 'configuration' key is a Mapping with keys ('order')" + ) + raise CodecValidationError(msg) def to_dict(self) -> dict[str, JSON]: return {"name": "transpose", "configuration": {"order": tuple(self.order)}} + @overload + def to_json(self, zarr_format: Literal[2]) -> TransposeJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> TransposeJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> CodecJSON: + if zarr_format == 2: + return {"id": "transpose", "order": self.order} + elif zarr_format == 3: + return {"name": "transpose", "configuration": {"order": self.order}} + raise ValueError( + f"Unsupported Zarr format {zarr_format}. Expected 2 or 3." + ) # pragma: no cover + def validate( self, shape: tuple[int, ...], diff --git a/src/zarr/codecs/vlen_utf8.py b/src/zarr/codecs/vlen_utf8.py index 28c64be1c0..3b014d403f 100644 --- a/src/zarr/codecs/vlen_utf8.py +++ b/src/zarr/codecs/vlen_utf8.py @@ -1,14 +1,14 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, TypedDict, TypeGuard, overload import numpy as np from numcodecs.vlen import VLenBytes, VLenUTF8 -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, CodecJSON, CodecJSON_V2 from zarr.core.buffer import Buffer, NDBuffer -from zarr.core.common import JSON, parse_named_configuration +from zarr.core.common import JSON, NamedConfig, ZarrFormat, parse_named_configuration from zarr.registry import register_codec if TYPE_CHECKING: @@ -22,12 +22,31 @@ _vlen_bytes_codec = VLenBytes() +class VlenUF8Config(TypedDict): ... + + +class VLenUTF8JSON_V2(CodecJSON_V2[Literal["vlen-utf8"]]): ... + + +class VLenUTF8JSON_V3(NamedConfig[Literal["vlen-utf8"], VlenUF8Config]): ... + + +class VLenBytesConfig(TypedDict): ... 
+ + +class VLenBytesJSON_V2(CodecJSON_V2[Literal["vlen-bytes"]]): ... + + VLenBytesJSON_V3 = NamedConfig[Literal["vlen-bytes"], VLenBytesConfig] | Literal["vlen-bytes"] + + @dataclass(frozen=True) class VLenUTF8Codec(ArrayBytesCodec): """Variable-length UTF8 codec""" @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: + return cls.from_json(data, zarr_format=3) - _, configuration_parsed = parse_named_configuration( - data, "vlen-utf8", require_configuration=False - ) @@ -37,6 +56,40 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: def to_dict(self) -> dict[str, JSON]: return {"name": "vlen-utf8", "configuration": {}} + @overload + def to_json(self, zarr_format: Literal[2]) -> VLenUTF8JSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> VLenUTF8JSON_V3: ... + def to_json(self, zarr_format: ZarrFormat) -> VLenUTF8JSON_V2 | VLenUTF8JSON_V3: + if zarr_format == 2: + return {"id": "vlen-utf8"} + else: + return {"name": "vlen-utf8"} + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[VLenUTF8JSON_V2]: + return data == {"id": "vlen-utf8"} + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[VLenUTF8JSON_V3]: + return data in ( + {"name": "vlen-utf8"}, + {"name": "vlen-utf8", "configuration": {}}, + "vlen-utf8", + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls() + raise ValueError(f"Invalid VLenUTF8 JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls() + raise ValueError(f"Invalid VLenUTF8 JSON data for Zarr format 3: {data!r}") + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self @@ -74,15 +127,45 @@ def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) - class VLenBytesCodec(ArrayBytesCodec): @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration( - data, "vlen-bytes", require_configuration=False - ) - configuration_parsed = configuration_parsed or {} - return cls(**configuration_parsed) + return cls.from_json(data, zarr_format=3) def to_dict(self) -> dict[str, JSON]: return {"name": "vlen-bytes", "configuration": {}} + @overload + def to_json(self, zarr_format: Literal[2]) -> VLenBytesJSON_V2: ... + @overload + def to_json(self, zarr_format: Literal[3]) -> VLenBytesJSON_V3: ... 
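+ # Round-trip sketch (illustrative): VLenBytesCodec().to_json(zarr_format=2) returns + # {"id": "vlen-bytes"}, to_json(zarr_format=3) returns {"name": "vlen-bytes"}, and + # VLenBytesCodec.from_json("vlen-bytes", zarr_format=3) reconstructs the codec.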
+ def to_json(self, zarr_format: ZarrFormat) -> VLenBytesJSON_V2 | VLenBytesJSON_V3: + if zarr_format == 2: + return {"id": "vlen-bytes"} + else: + return {"name": "vlen-bytes"} + + @classmethod + def _check_json_v2(cls, data: CodecJSON) -> TypeGuard[VLenBytesJSON_V2]: + return data == {"id": "vlen-bytes"} + + @classmethod + def _check_json_v3(cls, data: CodecJSON) -> TypeGuard[VLenBytesJSON_V3]: + return data in ( + {"name": "vlen-bytes"}, + {"name": "vlen-bytes", "configuration": {}}, + "vlen-bytes", + ) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if cls._check_json_v2(data): + return cls() + raise ValueError(f"Invalid VLenBytes JSON data for Zarr format 2: {data!r}") + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if cls._check_json_v3(data): + return cls() + raise ValueError(f"Invalid VLenBytes JSON data for Zarr format 3: {data!r}") + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: return self diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py index ead41e7b5f..0418909f3b 100644 --- a/src/zarr/codecs/zstd.py +++ b/src/zarr/codecs/zstd.py @@ -1,17 +1,22 @@ from __future__ import annotations import asyncio +from collections.abc import Mapping from dataclasses import dataclass from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, Self, TypedDict, TypeGuard, overload import numcodecs from numcodecs.zstd import Zstd from packaging.version import Version -from zarr.abc.codec import BytesBytesCodec +from zarr.abc.codec import BytesBytesCodec, CodecJSON, CodecJSON_V2, CodecValidationError from zarr.core.buffer.cpu import as_numpy_array_wrapper -from zarr.core.common import JSON, parse_named_configuration +from zarr.core.common import ( + JSON, + NamedRequiredConfig, + ZarrFormat, +) from zarr.registry import register_codec if TYPE_CHECKING: @@ -21,6 +26,41 @@ from zarr.core.buffer import Buffer +class ZstdConfig_V2(TypedDict): + level: int + + +class ZstdConfig_V3(TypedDict): + level: int + checksum: bool + + +class ZstdJSON_V2(CodecJSON_V2[Literal["zstd"]], ZstdConfig_V2): + """ + The JSON form of the ZStandard codec in Zarr v2. + """ + + +class ZstdJSON_V3(NamedRequiredConfig[Literal["zstd"], ZstdConfig_V3]): + """ + The JSON form of the ZStandard codec in Zarr v3. + """ + + +def check_json_v2(data: CodecJSON) -> TypeGuard[ZstdJSON_V2]: + return ( + isinstance(data, Mapping) + and set(data.keys()).issuperset({"id", "level"}) + and data["id"] == "zstd" + ) + + +def check_json_v3(data: CodecJSON) -> TypeGuard[ZstdJSON_V3]: + return ( + isinstance(data, Mapping) + and set(data.keys()) == {"name", "configuration"} + and data["name"] == "zstd" + and isinstance(data["configuration"], Mapping) + and set(data["configuration"].keys()) == {"level", "checksum"} + ) + + def parse_zstd_level(data: JSON) -> int: if isinstance(data, int): if data >= 23: @@ -61,11 +101,52 @@ def __init__(self, *, level: int = 0, checksum: bool = False) -> None: @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: - _, configuration_parsed = parse_named_configuration(data, "zstd") - return cls(**configuration_parsed) # type: ignore[arg-type] + return cls.from_json(data, zarr_format=3) + + @classmethod + def _from_json_v2(cls, data: CodecJSON) -> Self: + if check_json_v2(data): + if "checksum" in data: + return cls(level=data["level"], checksum=data["checksum"]) + else: + return cls(level=data["level"]) + + msg = ( + "Invalid Zarr V2 JSON representation of the zstd codec. 
" + f"Got {data!r}, expected a Mapping with keys ('id', 'level')" + ) + raise CodecValidationError(msg) + + @classmethod + def _from_json_v3(cls, data: CodecJSON) -> Self: + if check_json_v3(data): + return cls( + level=data["configuration"]["level"], checksum=data["configuration"]["checksum"] + ) + msg = ( + "Invalid Zarr V3 JSON representation of the zstd codec. " + f"Got {data!r}, expected a Mapping with keys ('name', 'configuration') " + "Where the 'configuration' key is a Mapping with keys ('level', 'checksum')" + ) + raise CodecValidationError(msg) def to_dict(self) -> dict[str, JSON]: - return {"name": "zstd", "configuration": {"level": self.level, "checksum": self.checksum}} + return self.to_json(zarr_format=3) + + @overload + def to_json(self, zarr_format: Literal[2]) -> ZstdJSON_V2: ... + + @overload + def to_json(self, zarr_format: Literal[3]) -> ZstdJSON_V3: ... + + def to_json(self, zarr_format: ZarrFormat) -> ZstdJSON_V2 | ZstdJSON_V3: + if zarr_format == 2: + return {"id": "zstd", "level": self.level} + else: + return { + "name": "zstd", + "configuration": {"level": self.level, "checksum": self.checksum}, + } @cached_property def _zstd_codec(self) -> Zstd: diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 260e94bc88..e6a11d04ae 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3,7 +3,7 @@ import json import warnings from asyncio import gather -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from dataclasses import dataclass, field, replace from itertools import starmap from logging import getLogger @@ -19,16 +19,15 @@ ) from warnings import warn -import numcodecs -import numcodecs.abc import numpy as np from typing_extensions import deprecated import zarr from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete -from zarr.codecs._v2 import V2Codec from zarr.codecs.bytes import BytesCodec +from zarr.codecs.numcodec import Numcodec +from zarr.codecs.transpose import TransposeCodec from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.codecs.zstd import ZstdCodec from zarr.core._info import ArrayInfo @@ -66,7 +65,6 @@ parse_shapelike, product, ) -from zarr.core.config import categorize_data_type from zarr.core.config import config as zarr_config from zarr.core.dtype import ( VariableLengthBytes, @@ -112,7 +110,7 @@ T_ArrayMetadata, ) from zarr.core.metadata.v2 import ( - CompressorLikev2, + CompressorLike_V2, get_object_codec_id, parse_compressor, parse_filters, @@ -130,7 +128,7 @@ from zarr.storage._utils import _relativize_path if TYPE_CHECKING: - from collections.abc import Iterator, Sequence + from collections.abc import Iterator from typing import Self import numpy.typing as npt @@ -204,8 +202,26 @@ def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None if isinstance(metadata, ArrayV3Metadata): return get_pipeline_class().from_codecs(metadata.codecs) elif isinstance(metadata, ArrayV2Metadata): - v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor) - return get_pipeline_class().from_codecs([v2_codec]) + _codecs: tuple[Codec, ...] 
= () + if metadata.filters is not None: + _codecs += metadata.filters + if metadata.compressor is not None: + _codecs += (metadata.compressor,) + if not any(isinstance(codec, ArrayBytesCodec) for codec in _codecs) and not isinstance( + metadata.dtype, HasObjectCodec + ): + # The role filled by the ArrayBytesCodec was implicit in zarr v2. So a valid zarr v2-style + # chain of filters + compressor might not contain a codec identifiable as an array-bytes codec. + # In such a case, we will insert a bytes codec that applies no endian transformation. + # We skip this insertion if the data type is an instance of HasObjectCodec, because + # in zarr v2 these data types required a special codec that functioned like an array bytes codec. + _codecs = (BytesCodec(endian=None),) + _codecs + if metadata.order == "F": + # Zarr V2 supports declaring the order of an array in metadata. Using the zarr v3 codec + # framework, we express C or F ordered arrays by adding a transpose codec to the front + # of the list of codecs. + _codecs = (TransposeCodec(order=tuple(reversed(range(metadata.ndim)))),) + _codecs + return get_pipeline_class().from_codecs(_codecs) raise TypeError # pragma: no cover @@ -342,7 +358,7 @@ async def create( dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, filters: list[dict[str, JSON]] | None = None, - compressor: CompressorLikev2 | Literal["auto"] = "auto", + compressor: CompressorLike_V2 | Literal["auto"] = "auto", # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -607,7 +623,7 @@ async def _create( chunks: ShapeLike | None = None, dimension_separator: Literal[".", "/"] | None = None, order: MemoryOrder | None = None, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, + filters: Iterable[dict[str, JSON] | Numcodec] | None = None, compressor: CompressorLike = "auto", # runtime overwrite: bool = False, @@ -818,8 +834,8 @@ def _create_metadata_v2( order: MemoryOrder, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, - compressor: CompressorLikev2 = None, + filters: Iterable[CompressorLike_V2] | None = None, + compressor: CompressorLike_V2 | None = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV2Metadata: if dimension_separator is None: @@ -856,8 +872,8 @@ async def _create_v2( config: ArrayConfig, dimension_separator: Literal[".", "/"] | None = None, fill_value: Any | None = DEFAULT_FILL_VALUE, - filters: Iterable[dict[str, JSON] | numcodecs.abc.Codec] | None = None, - compressor: CompressorLike = "auto", + filters: Iterable[CompressorLike_V2] | None = None, + compressor: CompressorLike_V2 | Literal["auto"] = "auto", attributes: dict[str, JSON] | None = None, overwrite: bool = False, ) -> AsyncArray[ArrayV2Metadata]: @@ -869,14 +885,9 @@ async def _create_v2( else: await ensure_no_existing_node(store_path, zarr_format=2) - compressor_parsed: CompressorLikev2 + compressor_parsed: CompressorLike_V2 if compressor == "auto": compressor_parsed = default_compressor_v2(dtype) - elif isinstance(compressor, BytesBytesCodec): - raise ValueError( - "Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. " - "Use a numcodecs codec directly instead." - ) else: compressor_parsed = compressor @@ -1033,7 +1044,7 @@ def size(self) -> int: return np.prod(self.metadata.shape).item() @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] 
| tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Codec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -1062,7 +1073,7 @@ def serializer(self) -> ArrayBytesCodec | None: @property @deprecated("Use AsyncArray.compressors instead.") - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -1075,7 +1086,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: raise TypeError("`compressor` is not available for Zarr format 3 arrays.") @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. @@ -1518,7 +1529,7 @@ async def _set_selection( if isinstance(array_like, np._typing._SupportsArrayFunc): # TODO: need to handle array types that don't support __array_function__ # like PyTorch and JAX - array_like_ = cast("np._typing._SupportsArrayFunc", array_like) + array_like_ = cast(np._typing._SupportsArrayFunc, array_like) value = np.asanyarray(value, dtype=self.dtype, like=array_like_) else: if not hasattr(value, "shape"): @@ -1532,8 +1543,7 @@ async def _set_selection( value = value.astype(dtype=self.dtype, order="A") else: value = np.array(value, dtype=self.dtype, order="A") - value = cast("NDArrayLike", value) - + value = cast(NDArrayLike, value) # We accept any ndarray like object from the user and convert it # to a NDBuffer (or subclass). From this point onwards, we only pass # Buffer and NDBuffer between components. @@ -2227,7 +2237,7 @@ def fill_value(self) -> Any: return self.metadata.fill_value @property - def filters(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[ArrayArrayCodec, ...]: + def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]: """ Filters that are applied to each chunk of the array, in order, before serializing that chunk to bytes. @@ -2243,7 +2253,7 @@ def serializer(self) -> None | ArrayBytesCodec: @property @deprecated("Use Array.compressors instead.") - def compressor(self) -> numcodecs.abc.Codec | None: + def compressor(self) -> Numcodec | None: """ Compressor that is applied to each chunk of the array. @@ -2254,7 +2264,7 @@ def compressor(self) -> numcodecs.abc.Codec | None: return self._async_array.compressor @property - def compressors(self) -> tuple[numcodecs.abc.Codec, ...] | tuple[BytesBytesCodec, ...]: + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: """ Compressors that are applied to each chunk of the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. 
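+ + For illustration (a sketch, assuming a Zarr v3 array created with default + settings, where the default compressor is zstd): + + >>> arr = zarr.create_array({}, shape=(4,), dtype="int32") # doctest: +SKIP + >>> arr.compressors # doctest: +SKIP + (ZstdCodec(level=0, checksum=False),)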
@@ -3898,23 +3908,21 @@ def _build_parents( FiltersLike: TypeAlias = ( - Iterable[dict[str, JSON] | ArrayArrayCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | ArrayArrayCodec | Numcodec] | ArrayArrayCodec - | Iterable[numcodecs.abc.Codec] - | numcodecs.abc.Codec + | Iterable[Numcodec] + | Numcodec | Literal["auto"] | None ) # Union of acceptable types for users to pass in for both v2 and v3 compressors -CompressorLike: TypeAlias = ( - dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec | Literal["auto"] | None -) +CompressorLike: TypeAlias = dict[str, JSON] | BytesBytesCodec | Numcodec | Literal["auto"] | None CompressorsLike: TypeAlias = ( - Iterable[dict[str, JSON] | BytesBytesCodec | numcodecs.abc.Codec] + Iterable[dict[str, JSON] | BytesBytesCodec | Numcodec] | dict[str, JSON] | BytesBytesCodec - | numcodecs.abc.Codec + | Numcodec | Literal["auto"] | None ) @@ -4712,26 +4720,6 @@ def _parse_chunk_key_encoding( return result -def _get_default_chunk_encoding_v3( - dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: - """ - Get the default ArrayArrayCodecs, ArrayBytesCodec, and BytesBytesCodec for a given dtype. - """ - - dtype_category = categorize_data_type(dtype) - - filters = zarr_config.get("array.v3_default_filters").get(dtype_category) - compressors = zarr_config.get("array.v3_default_compressors").get(dtype_category) - serializer = zarr_config.get("array.v3_default_serializer").get(dtype_category) - - return ( - tuple(_parse_array_array_codec(f) for f in filters), - _parse_array_bytes_codec(serializer), - tuple(_parse_bytes_bytes_codec(c) for c in compressors), - ) - - def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]: """ Given a data type, return the default filters for that data type. @@ -4775,7 +4763,7 @@ def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec: return serializer -def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | None: +def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[Codec] | None: """ Given a data type, return the default filters for that data type. @@ -4784,28 +4772,22 @@ def default_filters_v2(dtype: ZDType[Any, Any]) -> tuple[numcodecs.abc.Codec] | """ if isinstance(dtype, HasObjectCodec): if dtype.object_codec_id == "vlen-bytes": - from numcodecs import VLenBytes - - return (VLenBytes(),) + return (VLenBytesCodec(),) elif dtype.object_codec_id == "vlen-utf8": - from numcodecs import VLenUTF8 - - return (VLenUTF8(),) + return (VLenUTF8Codec(),) else: msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." raise ValueError(msg) return None -def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec: +def default_compressor_v2(dtype: ZDType[Any, Any]) -> BytesBytesCodec: """ Given a data type, return the default compressors for that data type. - This is just the numcodecs ``Zstd`` codec. + This is just the ``Zstd`` codec. """ - from numcodecs import Zstd - - return Zstd(level=0, checksum=False) + return ZstdCodec() def _parse_chunk_encoding_v2( @@ -4813,23 +4795,20 @@ def _parse_chunk_encoding_v2( compressor: CompressorsLike, filters: FiltersLike, dtype: ZDType[TBaseDType, TBaseScalar], -) -> tuple[tuple[numcodecs.abc.Codec, ...] | None, numcodecs.abc.Codec | None]: +) -> tuple[tuple[Codec, ...] | None, Codec | None]: """ Generate chunk encoding classes for Zarr format 2 arrays with optional defaults. 
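For example (an illustrative sketch, assuming the defaults defined below and zarr's ``Float32`` dtype wrapper): ``_parse_chunk_encoding_v2(compressor="auto", filters=None, dtype=Float32())`` resolves to ``(None, ZstdCodec())`` (filters, then compressor), while ``compressor=None`` and ``filters=None`` resolve to ``(None, None)``.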
""" - _filters: tuple[numcodecs.abc.Codec, ...] | None - _compressor: numcodecs.abc.Codec | None + _filters: tuple[Codec, ...] | None + _compressor: Codec | None if compressor is None or compressor == (): _compressor = None elif compressor == "auto": _compressor = default_compressor_v2(dtype) - elif isinstance(compressor, tuple | list) and len(compressor) == 1: + elif isinstance(compressor, Sequence) and len(compressor) == 1: _compressor = parse_compressor(compressor[0]) else: - if isinstance(compressor, Iterable) and not isinstance(compressor, dict): - msg = f"For Zarr format 2 arrays, the `compressor` must be a single codec. Got an iterable with type {type(compressor)} instead." - raise TypeError(msg) _compressor = parse_compressor(compressor) if filters is None: @@ -4837,14 +4816,6 @@ def _parse_chunk_encoding_v2( elif filters == "auto": _filters = default_filters_v2(dtype) else: - if isinstance(filters, Iterable): - for idx, f in enumerate(filters): - if not isinstance(f, numcodecs.abc.Codec): - msg = ( - "For Zarr format 2 arrays, all elements of `filters` must be numcodecs codecs. " - f"Element at index {idx} has type {type(f)}, which is not a numcodecs codec." - ) - raise TypeError(msg) _filters = parse_filters(filters) if isinstance(dtype, HasObjectCodec): # check the filters and the compressor for the object codec required for this data type @@ -4852,12 +4823,12 @@ def _parse_chunk_encoding_v2( if _compressor is None: object_codec_id = None else: - object_codec_id = get_object_codec_id((_compressor.get_config(),)) + object_codec_id = get_object_codec_id((_compressor.to_json(zarr_format=2),)) else: object_codec_id = get_object_codec_id( ( - *[f.get_config() for f in _filters], - _compressor.get_config() if _compressor is not None else None, + *[f.to_json(zarr_format=2) for f in _filters], + _compressor.to_json(zarr_format=2) if _compressor is not None else None, ) ) if object_codec_id is None: @@ -4897,7 +4868,9 @@ def _parse_chunk_encoding_v3( maybe_array_array = (filters,) else: maybe_array_array = cast("Iterable[Codec | dict[str, JSON]]", filters) - out_array_array = tuple(_parse_array_array_codec(c) for c in maybe_array_array) + out_array_array = tuple( + _parse_array_array_codec(c, zarr_format=3) for c in maybe_array_array + ) if serializer == "auto": out_array_bytes = default_serializer_v3(dtype) @@ -4905,26 +4878,34 @@ def _parse_chunk_encoding_v3( # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endiannesss. - out_array_bytes = _parse_array_bytes_codec(serializer) + out_array_bytes = _parse_array_bytes_codec(serializer, zarr_format=3) if compressors is None: out_bytes_bytes: tuple[BytesBytesCodec, ...] 
= () elif compressors == "auto": out_bytes_bytes = default_compressors_v3(dtype) else: - maybe_bytes_bytes: Iterable[Codec | dict[str, JSON]] - if isinstance(compressors, dict | Codec): + maybe_bytes_bytes: Iterable[Codec | dict[str, JSON] | Numcodec] + if isinstance(compressors, dict | Codec | Numcodec): maybe_bytes_bytes = (compressors,) else: - maybe_bytes_bytes = cast("Iterable[Codec | dict[str, JSON]]", compressors) + maybe_bytes_bytes = compressors # type: ignore[assignment] + + out_bytes_bytes = tuple( + _parse_bytes_bytes_codec(c, zarr_format=3) for c in maybe_bytes_bytes + ) + + # specialize codecs as needed given the dtype - out_bytes_bytes = tuple(_parse_bytes_bytes_codec(c) for c in maybe_bytes_bytes) + # TODO: refactor so that the config only contains the name of the codec, and we use the dtype + # to create the codec instance, instead of storing a dict representation of a full codec. # TODO: ensure that the serializer is compatible with the ndarray produced by the # array-array codecs. For example, if a sequence of array-array codecs produces an # array with a single-byte data type, then the serializer should not specify endianness. - - # TODO: add checks to ensure that the right serializer is used for vlen data types + if isinstance(out_array_bytes, BytesCodec) and not isinstance(dtype, HasEndianness): + # The default endianness in the BytesCodec might not be None, so we need to replace it + out_array_bytes = replace(out_array_bytes, endian=None) return out_array_array, out_array_bytes, out_bytes_bytes @@ -4945,8 +4926,6 @@ def _parse_deprecated_compressor( compressors = () else: compressors = (compressor,) - elif zarr_format == 2 and compressor == compressors == "auto": - compressors = ({"id": "blosc"},) return compressors diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 23c27e40c6..e4141d0087 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -87,7 +87,6 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: @classmethod def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs) - return cls( array_array_codecs=array_array_codecs, array_bytes_codec=array_bytes_codec, @@ -219,7 +218,6 @@ async def encode_batch( zip(chunk_array_batch, chunk_specs, strict=False) ) chunk_specs = resolve_batched(aa_codec, chunk_specs) - chunk_bytes_batch = await self.array_bytes_codec.encode( zip(chunk_array_batch, chunk_specs, strict=False) ) @@ -491,18 +489,172 @@ async def write( def codecs_from_list( codecs: Iterable[Codec], ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: + from zarr.codecs.numcodec import NumcodecsWrapper from zarr.codecs.sharding import ShardingCodec array_array: tuple[ArrayArrayCodec, ...] = () - array_bytes_maybe: ArrayBytesCodec | None = None + array_bytes_maybe: ArrayBytesCodec bytes_bytes: tuple[BytesBytesCodec, ...]
= () - if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(tuple(codecs)) > 1: + # Handle two cases: + # either all of the codecs are NumcodecsWrapper instances, in which case we cast the last one + # to array-bytes and the rest to array-array; + # or one of the codecs is an array-bytes codec, in which case we convert any preceding + # NumcodecsWrapper instances to array-array, and any following NumcodecsWrapper instances to bytes-bytes. + + codecs_tup = tuple(codecs) + array_array_idcs: tuple[tuple[int, ArrayArrayCodec], ...] = () + array_bytes_idcs: tuple[tuple[int, ArrayBytesCodec], ...] = () + bytes_bytes_idcs: tuple[tuple[int, BytesBytesCodec], ...] = () + numcodec_wrapper_idcs: tuple[tuple[int, NumcodecsWrapper], ...] = () + + for idx, codec in enumerate(codecs_tup): + match codec: + case ArrayArrayCodec(): + array_array_idcs += ((idx, codec),) + case ArrayBytesCodec(): + array_bytes_idcs += ((idx, codec),) + case BytesBytesCodec(): + bytes_bytes_idcs += ((idx, codec),) + case NumcodecsWrapper(): # type: ignore[union-attr] + numcodec_wrapper_idcs += ((idx, codec),) + + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs_tup) > 1: warn( "Combining a `sharding_indexed` codec disables partial reads and " "writes, which may lead to inefficient performance.", stacklevel=3, ) + if len(array_bytes_idcs) == 0: + # There is no array-bytes codec. Unless we can find a numcodec wrapper to act as an + # array-bytes codec, this is an error. + if len(numcodec_wrapper_idcs) == 0: + msg = ( + f"The codecs {codecs_tup} do not include an ArrayBytesCodec or a codec castable to an " + "ArrayBytesCodec, such as a NumcodecsWrapper. This is an invalid sequence of codecs." + ) + raise ValueError(msg) + elif len(numcodec_wrapper_idcs) == len(codecs_tup): + # All the codecs are numcodecs wrappers. This means we have no information about which + # codec is array-array, array-bytes, and bytes-bytes, so we just cast the numcodecs wrappers + # into a sequence of array-array codecs terminated by a single array-bytes codec. + # This choice is almost arbitrary. + # It would be equally valid to convert the first codec to an array-bytes, and the remaining + # codecs to bytes-bytes, or to pick a random codec and convert it to array-bytes, then + # converting all the preceding codecs to array-array, and the following codecs to bytes-bytes. + # But we know from experience that Zarr V2-style chunk encoding pipelines typically + # start with array-array transformations, so casting all but one of the unknown codecs + # to array-array is a safe choice. + array_bytes_maybe = codecs_tup[-1].to_array_bytes() + array_array = tuple(c.to_array_array() for c in codecs_tup[:-1]) + else: + # There are no array-bytes codecs, there is at least one numcodec wrapper, but there are + # also some array-array and/or bytes-bytes codecs. + if len(array_array_idcs) > 0: + # There is at least one array-array codec. We will use it as a reference point for + # casting any numcodecs wrappers. + last_array_array_idx = array_array_idcs[-1][0] + + if last_array_array_idx == len(codecs_tup) - 1: + # The last codec is an ArrayArrayCodec, but there is no ArrayBytesCodec. This + # cannot be fixed by converting numcodecs wrappers, so we raise an exception. + raise ValueError( + "The last codec is an ArrayArrayCodec, but there is no ArrayBytesCodec." + ) + + for idx, aac in enumerate(codecs_tup[: (last_array_array_idx + 1)]): + # Iterate over the codecs leading up to the last array-array codec.
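+ # Everything before that point must be an ArrayArrayCodec or a NumcodecsWrapper; anything else cannot belong to the array-array stage, so it is rejected with a TypeError below.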
+ if isinstance(aac, ArrayArrayCodec): + # Any array-array codec gets added to the list of array-array codecs + array_array += (aac,) + elif isinstance(aac, NumcodecsWrapper): + # Any numcodecs wrapper gets converted to an array-array codec + array_array += (aac.to_array_array(),) + else: + # Any other kind of codec is invalid and we raise an exception. + msg = f"Invalid codec {aac} at index {idx}. Expected an ArrayArrayCodec" + raise TypeError(msg) + + if isinstance(codecs_tup[last_array_array_idx + 1], NumcodecsWrapper): + # The codec following the last array-array codec is a numcodecs wrapper. + # We will cast it to an array-bytes codec. + array_bytes_maybe = codecs_tup[last_array_array_idx + 1].to_array_bytes() + else: + # The codec following the last array-array codec was a bytes-bytes codec, or + # something else entirely. This is invalid and we raise an exception. + msg = ( + f"Invalid codec {codecs_tup[last_array_array_idx + 1]} at index " + f"{last_array_array_idx + 1}. " + "Expected a NumcodecsWrapper or an ArrayBytesCodec, got " + f"{type(codecs_tup[last_array_array_idx + 1])}" + ) + raise TypeError(msg) + + start = last_array_array_idx + 2 + for idx, rem in enumerate(codecs_tup[start:]): + # We have already checked the codec after the last array-array codec, so we start + # iterating over the codecs after that. + if isinstance(rem, BytesBytesCodec): + bytes_bytes += (rem,) + elif isinstance(rem, NumcodecsWrapper): + bytes_bytes += (rem.to_bytes_bytes(),) + else: + msg = f"Invalid codec {rem} at index {start + idx}. Expected a BytesBytesCodec" + raise TypeError(msg) + else: + # There are no array-array codecs, just numcodecs wrappers and bytes-bytes codecs. + first_bytes_bytes_idx = bytes_bytes_idcs[0][0] + if first_bytes_bytes_idx == 0: + raise ValueError( + "The first codec is a BytesBytesCodec, but there is no ArrayBytesCodec." + ) + else: + # Iterate over all codecs. Cast all numcodecs wrappers to array-array codecs, until + # the codec immediately prior to the first bytes-bytes codec, which we cast to + # an array-bytes codec. All codecs after that point are cast to bytes-bytes codecs. + for idx, bb_codec in enumerate(codecs_tup): + if idx < first_bytes_bytes_idx - 1: + # This must be a numcodecs wrapper. Cast it to array-array. + array_array += (bb_codec.to_array_array(),) + elif idx == first_bytes_bytes_idx - 1: + array_bytes_maybe = bb_codec.to_array_bytes() + else: + if isinstance(bb_codec, BytesBytesCodec): + bytes_bytes += (bb_codec,) + elif isinstance(bb_codec, NumcodecsWrapper): + bytes_bytes += (bb_codec.to_bytes_bytes(),) + else: + msg = f"Invalid codec {bb_codec} at index {idx}. Expected a NumcodecsWrapper" + raise TypeError(msg) + + elif len(array_bytes_idcs) == 1: + bb_idx, ab_codec = array_bytes_idcs[0] + array_bytes_maybe = ab_codec + + end = bb_idx + + for idx, aa_codec in enumerate(codecs_tup[:end]): + if isinstance(aa_codec, ArrayArrayCodec): + array_array += (aa_codec,) + elif isinstance(aa_codec, NumcodecsWrapper): + array_array += (aa_codec.to_array_array(),) + else: + msg = f"Invalid codec {aa_codec} at index {idx}. Expected an ArrayArrayCodec" + raise TypeError(msg) + start = bb_idx + 1 + if bb_idx < len(codecs_tup) - 1: + for idx, bb_codec in enumerate(codecs_tup[start:]): + if isinstance(bb_codec, NumcodecsWrapper): + bytes_bytes += (bb_codec.to_bytes_bytes(),) + elif isinstance(bb_codec, BytesBytesCodec): + bytes_bytes += (bb_codec,) + else: + msg = f"Invalid codec {bb_codec} at index {start + idx}.
Expected a BytesBytesCodec" + raise TypeError(msg) + else: + raise ValueError("More than one ArrayBytes codec found, that is a big error!") + + return array_array, array_bytes_maybe, bytes_bytes for prev_codec, cur_codec in pairwise((None, *codecs)): if isinstance(cur_codec, ArrayArrayCodec): @@ -540,7 +692,7 @@ def codecs_from_list( f"Got {type(prev_codec)} instead." ) bytes_bytes += (cur_codec,) - else: + elif isinstance(cur_codec, NumcodecsWrapper): raise TypeError if array_bytes_maybe is None: diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 33590c83a5..bae955858f 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -14,6 +14,8 @@ Final, Generic, Literal, + NotRequired, + Required, TypedDict, TypeVar, cast, @@ -46,23 +48,45 @@ ANY_ACCESS_MODE: Final = "r", "r+", "a", "w", "w-" DimensionNames = Iterable[str | None] | None -TName = TypeVar("TName", bound=str) -TConfig = TypeVar("TConfig", bound=Mapping[str, object]) +BaseConfig = Mapping[str, object] +TName_co = TypeVar("TName_co", bound=str, covariant=True) +TConfig_co = TypeVar("TConfig_co", bound=BaseConfig, covariant=True) -class NamedConfig(TypedDict, Generic[TName, TConfig]): + +class NamedConfig(TypedDict, Generic[TName_co, TConfig_co]): + """ + A typed dictionary representing an object with a name and configuration, where the configuration + is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. + + This class is generic with two type parameters: the type of the name (``TName``) and the type of + the configuration (``TConfig``). + + The configuration key is not required. + """ + + name: ReadOnly[TName_co] + """The name of the object.""" + + configuration: NotRequired[ReadOnly[TConfig_co]] + """The configuration of the object.""" + + +class NamedRequiredConfig(TypedDict, Generic[TName_co, TConfig_co]): """ A typed dictionary representing an object with a name and configuration, where the configuration is a mapping of string keys to values, e.g. another typed dictionary or a JSON object. This class is generic with two type parameters: the type of the name (``TName``) and the type of the configuration (``TConfig``). + + The configuration key is required. 
""" - name: ReadOnly[TName] + name: ReadOnly[TName_co] """The name of the object.""" - configuration: ReadOnly[TConfig] + configuration: Required[ReadOnly[TConfig_co]] """The configuration of the object.""" diff --git a/src/zarr/core/metadata/v2.py b/src/zarr/core/metadata/v2.py index 17af3538a9..832e35f150 100644 --- a/src/zarr/core/metadata/v2.py +++ b/src/zarr/core/metadata/v2.py @@ -1,16 +1,18 @@ from __future__ import annotations import warnings -from collections.abc import Iterable, Sequence +from collections.abc import Iterable, Mapping, Sequence from functools import cached_property from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict, cast -import numcodecs.abc - +from zarr.abc.codec import ArrayArrayCodec, Codec from zarr.abc.metadata import Metadata +from zarr.codecs.numcodec import Numcodec, NumcodecsWrapper +from zarr.core.buffer.core import default_buffer_prototype from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.dtype import get_data_type_from_json -from zarr.core.dtype.common import OBJECT_CODEC_IDS, DTypeSpec_V2 +from zarr.core.dtype.common import OBJECT_CODEC_IDS +from zarr.registry import get_codec if TYPE_CHECKING: from typing import Literal, Self @@ -19,18 +21,17 @@ from zarr.core.buffer import Buffer, BufferPrototype from zarr.core.common import ChunkCoords + from zarr.core.dtype.common import DTypeSpec_V2 from zarr.core.dtype.wrapper import ( TBaseDType, TBaseScalar, TDType_co, TScalar_co, - ZDType, ) import json from dataclasses import dataclass, field, fields, replace -import numcodecs import numpy as np from zarr.core.array_spec import ArrayConfig, ArraySpec @@ -43,6 +44,9 @@ parse_shapelike, ) from zarr.core.config import config, parse_indexing_order +from zarr.core.dtype.wrapper import ( + ZDType, +) from zarr.core.metadata.common import parse_attributes @@ -56,7 +60,7 @@ class ArrayV2MetadataDict(TypedDict): # Union of acceptable types for v2 compressors -CompressorLikev2: TypeAlias = dict[str, JSON] | numcodecs.abc.Codec | None +CompressorLike_V2: TypeAlias = Mapping[str, JSON] | Numcodec | Codec @dataclass(frozen=True, kw_only=True) @@ -66,9 +70,9 @@ class ArrayV2Metadata(Metadata): dtype: ZDType[TBaseDType, TBaseScalar] fill_value: int | float | str | bytes | None = None order: MemoryOrder = "C" - filters: tuple[numcodecs.abc.Codec, ...] | None = None + filters: tuple[Codec, ...] | None = None dimension_separator: Literal[".", "/"] = "." 
- compressor: numcodecs.abc.Codec | None + compressor: Codec attributes: dict[str, JSON] = field(default_factory=dict) zarr_format: Literal[2] = field(init=False, default=2) @@ -81,8 +85,8 @@ def __init__( fill_value: Any, order: MemoryOrder, dimension_separator: Literal[".", "/"] = ".", - compressor: CompressorLikev2 = None, - filters: Iterable[numcodecs.abc.Codec | dict[str, JSON]] | None = None, + compressor: CompressorLike_V2 | None = None, + filters: Iterable[CompressorLike_V2] | None = None, attributes: dict[str, JSON] | None = None, ) -> None: """ @@ -90,6 +94,9 @@ def __init__( """ shape_parsed = parse_shapelike(shape) chunks_parsed = parse_shapelike(chunks) + # TODO: remove this + if not isinstance(dtype, ZDType): + raise TypeError compressor_parsed = parse_compressor(compressor) order_parsed = parse_indexing_order(order) dimension_separator_parsed = parse_separator(dimension_separator) @@ -99,6 +106,18 @@ def __init__( fill_value_parsed = dtype.cast_scalar(fill_value) else: fill_value_parsed = fill_value + + array_spec = ArraySpec( + shape=shape_parsed, + dtype=dtype, + fill_value=fill_value_parsed, + config=ArrayConfig.from_dict({}), # TODO: config is not needed here. + prototype=default_buffer_prototype(), # TODO: prototype is not needed here. + ) + if compressor_parsed is not None: + compressor_parsed = compressor_parsed.evolve_from_array_spec(array_spec) + if filters_parsed is not None: + filters_parsed = tuple(fp.evolve_from_array_spec(array_spec) for fp in filters_parsed) attributes_parsed = parse_attributes(attributes) object.__setattr__(self, "shape", shape_parsed) @@ -158,6 +177,7 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: object_codec_id = get_object_codec_id((_compressor,)) # we add a layer of indirection here around the dtype attribute of the array metadata # because we also need to know the object codec id, if any, to resolve the data type + dtype_spec: DTypeSpec_V2 = { "name": data["dtype"], "object_codec_id": object_codec_id, @@ -197,34 +217,24 @@ def from_dict(cls, data: dict[str, Any]) -> ArrayV2Metadata: def to_dict(self) -> dict[str, JSON]: zarray_dict = super().to_dict() - if isinstance(zarray_dict["compressor"], numcodecs.abc.Codec): - codec_config = zarray_dict["compressor"].get_config() - # Hotfix for https://github.com/zarr-developers/zarr-python/issues/2647 - if codec_config["id"] == "zstd" and not codec_config.get("checksum", False): - codec_config.pop("checksum") - zarray_dict["compressor"] = codec_config - + if self.compressor is not None: + zarray_dict["compressor"] = self.compressor.to_json(zarr_format=2) + else: + zarray_dict["compressor"] = None + new_filters = [] if zarray_dict["filters"] is not None: - raw_filters = zarray_dict["filters"] - # TODO: remove this when we can stratically type the output JSON data structure - # entirely - if not isinstance(raw_filters, list | tuple): - raise TypeError("Invalid type for filters. 
Expected a list or tuple.") - new_filters = [] - for f in raw_filters: - if isinstance(f, numcodecs.abc.Codec): - new_filters.append(f.get_config()) - else: - new_filters.append(f) - zarray_dict["filters"] = new_filters + new_filters.extend([f.to_json(zarr_format=2) for f in self.filters]) + else: + new_filters = None + zarray_dict["filters"] = new_filters # serialize the fill value after dtype-specific JSON encoding if self.fill_value is not None: fill_value = self.dtype.to_json_scalar(self.fill_value, zarr_format=2) zarray_dict["fill_value"] = fill_value - # pull the "name" attribute out of the dtype spec returned by self.dtype.to_json - zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2)["name"] + # serialize the dtype after fill value-specific JSON encoding + zarray_dict["dtype"] = self.dtype.to_json(zarr_format=2)["name"] # type: ignore[assignment] return zarray_dict @@ -262,20 +272,23 @@ def parse_zarr_format(data: object) -> Literal[2]: raise ValueError(f"Invalid value. Expected 2. Got {data}.") -def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: +def parse_filters(data: object) -> tuple[ArrayArrayCodec, ...] | None: """ Parse a potential tuple of filters """ - out: list[numcodecs.abc.Codec] = [] + out: list[Codec | NumcodecsWrapper] = [] if data is None: return data if isinstance(data, Iterable): for idx, val in enumerate(data): - if isinstance(val, numcodecs.abc.Codec): + if isinstance(val, (Codec, NumcodecsWrapper)): out.append(val) + elif isinstance(val, Numcodec): + out.append(NumcodecsWrapper(codec=val)) elif isinstance(val, dict): - out.append(numcodecs.get_codec(val)) + codec = get_codec(val, zarr_format=2) + out.append(codec) else: msg = f"Invalid filter at index {idx}. Expected a numcodecs.abc.Codec or a dict representation of numcodecs.abc.Codec. Got {type(val)} instead." raise TypeError(msg) @@ -285,20 +298,29 @@ def parse_filters(data: object) -> tuple[numcodecs.abc.Codec, ...] | None: else: return tuple(out) # take a single codec instance and wrap it in a tuple - if isinstance(data, numcodecs.abc.Codec): + if isinstance(data, Numcodec): + return (NumcodecsWrapper(codec=data),) + elif isinstance(data, Codec): return (data,) msg = f"Invalid filters. Expected None, an iterable of numcodecs.abc.Codec or dict representations of numcodecs.abc.Codec. Got {type(data)} instead." raise TypeError(msg) -def parse_compressor(data: object) -> numcodecs.abc.Codec | None: +def parse_compressor(data: object) -> Codec | NumcodecsWrapper | None: """ Parse a potential compressor. """ - if data is None or isinstance(data, numcodecs.abc.Codec): + # TODO: only validate the compressor in one place. currently we do it twice, once in init_array + # and again when constructing metadata + if data is None or isinstance(data, Codec | NumcodecsWrapper): return data + if isinstance(data, Numcodec): + try: + return get_codec(data.get_config(), zarr_format=2) + except KeyError: + return NumcodecsWrapper(codec=data) if isinstance(data, dict): - return numcodecs.get_codec(data) + return get_codec(data, zarr_format=2) msg = f"Invalid compressor. Expected None, a numcodecs.abc.Codec, or a dict representation of a numcodecs.abc.Codec. Got {type(data)} instead." 
raise ValueError(msg) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 6f79fb4b09..d9fdfaf3e3 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -3,7 +3,9 @@ from typing import TYPE_CHECKING, TypedDict from zarr.abc.metadata import Metadata +from zarr.codecs.numcodec import NumcodecsWrapper from zarr.core.buffer.core import default_buffer_prototype +from zarr.core.codec_pipeline import codecs_from_list from zarr.core.dtype import VariableLengthUTF8, ZDType, get_data_type_from_json from zarr.core.dtype.common import check_dtype_spec_v3 @@ -30,13 +32,12 @@ ZARR_JSON, ChunkCoords, DimensionNames, - parse_named_configuration, parse_shapelike, ) from zarr.core.config import config from zarr.core.metadata.common import parse_attributes from zarr.errors import MetadataValidationError, NodeTypeValidationError -from zarr.registry import get_codec_class +from zarr.registry import get_codec def parse_zarr_format(data: object) -> Literal[3]: @@ -63,8 +64,7 @@ def parse_codecs(data: object) -> tuple[Codec, ...]: ): # Can't use Codec here because of mypy limitation out += (c,) else: - name_parsed, _ = parse_named_configuration(c, require_configuration=False) - out += (get_codec_class(name_parsed).from_dict(c),) + out += (get_codec(c, zarr_format=3),) return out @@ -84,7 +84,10 @@ def validate_codecs(codecs: tuple[Codec, ...], dtype: ZDType[TBaseDType, TBaseSc """Check that the codecs are valid for the given dtype""" from zarr.codecs.sharding import ShardingCodec - abc = validate_array_bytes_codec(codecs) + array_array_codecs, array_bytes_codec, bytes_bytes_codecs = codecs_from_list(codecs) + _codecs = (*array_array_codecs, array_bytes_codec, *bytes_bytes_codecs) + + abc = validate_array_bytes_codec(_codecs) # Recursively resolve array-bytes codecs within sharding codecs while isinstance(abc, ShardingCodec): @@ -335,6 +338,13 @@ def to_dict(self) -> dict[str, JSON]: if out_dict["dimension_names"] is None: out_dict.pop("dimension_names") + out_dict["codecs"] = () + for codec in self.codecs: + if isinstance(codec, NumcodecsWrapper): + out_dict["codecs"] += (codec.to_json(zarr_format=3),) + else: + out_dict["codecs"] += (codec.to_dict(),) + # TODO: replace the `to_dict` / `from_dict` on the `Metadata`` class with # to_json, from_json, and have ZDType inherit from `Metadata` # until then, we have this hack here, which relies on the fact that to_dict will pass through diff --git a/src/zarr/registry.py b/src/zarr/registry.py index 189d42abed..996733972c 100644 --- a/src/zarr/registry.py +++ b/src/zarr/registry.py @@ -16,10 +16,12 @@ ArrayBytesCodec, BytesBytesCodec, Codec, + CodecJSON, CodecPipeline, ) + from zarr.codecs.numcodec import Numcodec from zarr.core.buffer import Buffer, NDBuffer - from zarr.core.common import JSON + from zarr.core.common import JSON, ZarrFormat __all__ = [ "Registry", @@ -141,15 +143,18 @@ def register_buffer(cls: type[Buffer], qualname: str | None = None) -> None: __buffer_registry.register(cls, qualname) -def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: +def _get_codec_class( + key: str, registry: dict[str, Registry[Codec]], *, reload_config: bool = False +) -> type[Codec]: if reload_config: _reload_config() - if key in __codec_registries: + if key in registry: # logger.debug("Auto loading codec '%s' from entrypoint", codec_id) - __codec_registries[key].lazy_load() + registry[key].lazy_load() + + codec_classes = registry[key] - codec_classes = __codec_registries[key] if not codec_classes: raise 
KeyError(key) @@ -169,6 +174,47 @@ def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: raise KeyError(key) +def get_codec(request: CodecJSON, *, zarr_format: ZarrFormat) -> Codec: + """ + Get an instance of a codec from a name and a configuration + """ + # avoid circular import + from zarr.codecs.numcodec import NumcodecsWrapper, get_numcodec_class + + codec_name: str + if zarr_format == 2: + if isinstance(request, str): + raise TypeError( + f"Invalid request type {type(request)} for zarr format 2. Expected dict, got {request!r}" + ) + else: + codec_name = request["id"] + codec_config = {k: v for k, v in request.items() if k != "id"} + elif zarr_format == 3: + if isinstance(request, str): + codec_name = request + codec_config = {} + else: + codec_name = request["name"] + codec_config = request.get("configuration", {}) + else: + raise ValueError( + f"Invalid zarr format. Must be 2 or 3, got {zarr_format!r}" + ) # pragma: no cover + + try: + codec_cls = get_codec_class(codec_name) + return codec_cls.from_json(request, zarr_format=zarr_format) + except KeyError: + # if we can't find the codec in the zarr python registry, try the numcodecs registry + codec = get_numcodec_class(codec_name)(**codec_config) + return NumcodecsWrapper(codec=codec) + + +def get_codec_class(key: str, reload_config: bool = False) -> type[Codec]: + return _get_codec_class(key, __codec_registries, reload_config=reload_config) + + def _resolve_codec(data: dict[str, JSON]) -> Codec: """ Get a codec instance from a dict representation of that codec. @@ -177,19 +223,28 @@ def _resolve_codec(data: dict[str, JSON]) -> Codec: return get_codec_class(data["name"]).from_dict(data) # type: ignore[arg-type] -def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: +def _parse_bytes_bytes_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> BytesBytesCodec: """ Normalize the input to a ``BytesBytesCodec`` instance. If the input is already a ``BytesBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``BytesBytesCodec`` instance via the ``_resolve_codec`` function. """ + # avoid circular import, AKA a sign that this function is in the wrong place from zarr.abc.codec import BytesBytesCodec + from zarr.codecs.numcodec import Numcodec, NumcodecsBytesBytesCodec, NumcodecsWrapper + result: BytesBytesCodec if isinstance(data, dict): - result = _resolve_codec(data) + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_bytes_bytes() if not isinstance(result, BytesBytesCodec): msg = f"Expected a dict representation of a BytesBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif isinstance(data, Numcodec): + return NumcodecsBytesBytesCodec(codec=data) else: if not isinstance(data, BytesBytesCodec): raise TypeError(f"Expected a BytesBytesCodec. Got {type(data)} instead.") @@ -197,19 +252,26 @@ def _parse_bytes_bytes_codec(data: dict[str, JSON] | Codec) -> BytesBytesCodec: return result -def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: +def _parse_array_bytes_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> ArrayBytesCodec: """ Normalize the input to a ``ArrayBytesCodec`` instance. If the input is already a ``ArrayBytesCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayBytesCodec`` instance via the ``_resolve_codec`` function. 
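For example (illustrative): a dict like ``{"name": "bytes", "configuration": {"endian": "little"}}`` resolves to a ``BytesCodec`` instance, while a raw numcodecs codec is wrapped in a ``NumcodecsArrayBytesCodec`` (see below).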
""" from zarr.abc.codec import ArrayBytesCodec + from zarr.codecs.numcodec import Numcodec, NumcodecsArrayBytesCodec, NumcodecsWrapper if isinstance(data, dict): - result = _resolve_codec(data) + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_array_bytes() if not isinstance(result, ArrayBytesCodec): msg = f"Expected a dict representation of a ArrayBytesCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif isinstance(data, Numcodec): + return NumcodecsArrayBytesCodec(codec=data) else: if not isinstance(data, ArrayBytesCodec): raise TypeError(f"Expected a ArrayBytesCodec. Got {type(data)} instead.") @@ -217,19 +279,26 @@ def _parse_array_bytes_codec(data: dict[str, JSON] | Codec) -> ArrayBytesCodec: return result -def _parse_array_array_codec(data: dict[str, JSON] | Codec) -> ArrayArrayCodec: +def _parse_array_array_codec( + data: dict[str, JSON] | Codec | Numcodec, *, zarr_format: ZarrFormat +) -> ArrayArrayCodec: """ Normalize the input to a ``ArrayArrayCodec`` instance. If the input is already a ``ArrayArrayCodec``, it is returned as is. If the input is a dict, it is converted to a ``ArrayArrayCodec`` instance via the ``_resolve_codec`` function. """ from zarr.abc.codec import ArrayArrayCodec + from zarr.codecs.numcodec import Numcodec, NumcodecsArrayArrayCodec, NumcodecsWrapper if isinstance(data, dict): - result = _resolve_codec(data) - if not isinstance(result, ArrayArrayCodec): + result = get_codec(data, zarr_format=zarr_format) + if isinstance(result, NumcodecsWrapper): + result = result.to_array_array() + elif not isinstance(result, ArrayArrayCodec): msg = f"Expected a dict representation of a ArrayArrayCodec; got a dict representation of a {type(result)} instead." raise TypeError(msg) + elif isinstance(data, Numcodec): + return NumcodecsArrayArrayCodec(codec=data) else: if not isinstance(data, ArrayArrayCodec): raise TypeError(f"Expected a ArrayArrayCodec. Got {type(data)} instead.") diff --git a/tests/test_api.py b/tests/test_api.py index 01fb40f050..75db29a19e 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1299,14 +1299,10 @@ def test_v2_without_compressor() -> None: def test_v2_with_v3_compressor() -> None: - # Check trying to create a v2 array with a v3 compressor fails - with pytest.raises( - ValueError, - match="Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. 
Use a numcodecs codec directly instead.", - ): - zarr.create( - store={}, shape=(1), dtype="uint8", zarr_format=2, compressor=zarr.codecs.BloscCodec() - ) + # Check trying to create a v2 array with a v3 compressor succeeds + zarr.create( + store={}, shape=(1), dtype="uint8", zarr_format=2, compressor=zarr.codecs.BloscCodec() + ) def add_empty_file(path: Path) -> Path: diff --git a/tests/test_array.py b/tests/test_array.py index f672006f9a..227e0b2a71 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -35,8 +35,6 @@ _parse_chunk_encoding_v3, chunks_initialized, create_array, - default_filters_v2, - default_serializer_v3, ) from zarr.core.buffer import NDArrayLike, NDArrayLikeOrScalar, default_buffer_prototype from zarr.core.chunk_grids import _auto_partition @@ -50,14 +48,12 @@ Structured, TimeDelta64, UInt8, - VariableLengthBytes, VariableLengthUTF8, ZDType, parse_dtype, ) from zarr.core.dtype.common import ENDIANNESS_STR, EndiannessStr from zarr.core.dtype.npy.common import NUMPY_ENDIANNESS_STR, endianness_from_numpy_str -from zarr.core.dtype.npy.string import UTF8Base from zarr.core.group import AsyncGroup from zarr.core.indexing import BasicIndexer from zarr.core.metadata.v2 import ArrayV2Metadata @@ -472,7 +468,7 @@ def test_info_v2(self, chunks: tuple[int, int], shards: tuple[int, int] | None) _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressors=(numcodecs.Zstd(),), + _compressors=(ZstdCodec(),), ) assert result == expected @@ -552,7 +548,7 @@ async def test_info_v2_async( _read_only=False, _store_type="MemoryStore", _count_bytes=512, - _compressors=(numcodecs.Zstd(),), + _compressors=(ZstdCodec(),), ) assert result == expected @@ -1011,28 +1007,6 @@ def test_default_fill_value(dtype: ZDType[Any, Any], store: Store) -> None: else: assert a.fill_value == dtype.default_scalar() - @staticmethod - # @pytest.mark.parametrize("zarr_format", [2, 3]) - @pytest.mark.parametrize("dtype", zdtype_examples) - @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") - def test_default_fill_value_None( - dtype: ZDType[Any, Any], store: Store, zarr_format: ZarrFormat - ) -> None: - """ - Test that the fill value of an array is set to the default value for an explicit None argument for - Zarr Format 3, and to null for Zarr Format 2 - """ - a = zarr.create_array( - store, shape=(5,), chunks=(5,), dtype=dtype, fill_value=None, zarr_format=zarr_format - ) - if zarr_format == 3: - if isinstance(dtype, DateTime64 | TimeDelta64) and np.isnat(a.fill_value): - assert np.isnat(dtype.default_scalar()) - else: - assert a.fill_value == dtype.default_scalar() - elif zarr_format == 2: - assert a.fill_value is None - @staticmethod @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") @pytest.mark.parametrize("dtype", zdtype_examples) @@ -1105,8 +1079,8 @@ def test_dtype_roundtrip( (ZstdCodec(level=3),), (ZstdCodec(level=3), GzipCodec(level=0)), ZstdCodec(level=3), - {"name": "zstd", "configuration": {"level": 3}}, - ({"name": "zstd", "configuration": {"level": 3}},), + {"name": "zstd", "configuration": {"level": 3, "checksum": False}}, + ({"name": "zstd", "configuration": {"level": 3, "checksum": False}},), ], ) @pytest.mark.parametrize( @@ -1280,7 +1254,7 @@ async def test_invalid_v3_arguments( zarr.create(store=store, dtype="uint8", shape=(10,), zarr_format=3, **kwargs) @staticmethod - @pytest.mark.parametrize("dtype", ["uint8", "float32", "str", "U10", "S10", ">M8[10s]"]) + @pytest.mark.parametrize("dtype", 
["uint8", "float32"]) @pytest.mark.parametrize( "compressors", [ @@ -1323,7 +1297,6 @@ async def test_v2_chunk_encoding( @staticmethod @pytest.mark.parametrize("dtype", [UInt8(), Float32(), VariableLengthUTF8()]) - @pytest.mark.filterwarnings("ignore::zarr.core.dtype.common.UnstableSpecificationWarning") async def test_default_filters_compressors( store: MemoryStore, dtype: UInt8 | Float32 | VariableLengthUTF8, zarr_format: ZarrFormat ) -> None: @@ -1662,12 +1635,12 @@ def test_roundtrip_numcodecs() -> None: store = MemoryStore() compressors = [ - {"name": "numcodecs.shuffle", "configuration": {"elementsize": 2}}, - {"name": "numcodecs.zlib", "configuration": {"level": 4}}, + {"name": "shuffle", "configuration": {"elementsize": 2}}, + {"name": "zlib", "configuration": {"level": 4}}, ] filters = [ { - "name": "numcodecs.fixedscaleoffset", + "name": "fixedscaleoffset", "configuration": { "scale": 100.0, "offset": 0.0, @@ -1764,74 +1737,4 @@ async def test_sharding_coordinate_selection() -> None: shards=(2, 4, 4), ) arr[:] = np.arange(2 * 3 * 4).reshape((2, 3, 4)) - result = arr[1, [0, 1]] # type: ignore[index] - assert isinstance(result, NDArrayLike) - assert (result == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() - - -@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) -def test_array_repr(store: Store) -> None: - shape = (2, 3, 4) - dtype = "uint8" - arr = zarr.create_array(store, shape=shape, dtype=dtype) - assert str(arr) == f"" - - -class UnknownObjectDtype(UTF8Base[np.dtypes.ObjectDType]): - object_codec_id = "unknown" # type: ignore[assignment] - - def to_native_dtype(self) -> np.dtypes.ObjectDType: - """ - Create a NumPy object dtype from this VariableLengthUTF8 ZDType. - - Returns - ------- - np.dtypes.ObjectDType - The NumPy object dtype. - """ - return np.dtype("o") # type: ignore[return-value] - - -@pytest.mark.parametrize( - "dtype", [VariableLengthUTF8(), VariableLengthBytes(), UnknownObjectDtype()] -) -def test_chunk_encoding_no_object_codec_errors(dtype: ZDType[Any, Any]) -> None: - """ - Test that a valuerror is raised when checking the chunk encoding for a v2 array with a - data type that requires an object codec, but where no object codec is specified - """ - if isinstance(dtype, VariableLengthUTF8): - codec_name = "the numcodecs.VLenUTF8 codec" - elif isinstance(dtype, VariableLengthBytes): - codec_name = "the numcodecs.VLenBytes codec" - else: - codec_name = f"an unknown object codec with id {dtype.object_codec_id!r}" # type: ignore[attr-defined] - msg = ( - f"Data type {dtype} requires {codec_name}, " - "but no such codec was specified in the filters or compressor parameters for " - "this array. " - ) - with pytest.raises(ValueError, match=re.escape(msg)): - _parse_chunk_encoding_v2(filters=None, compressor=None, dtype=dtype) - - -def test_unknown_object_codec_default_serializer_v3() -> None: - """ - Test that we get a valueerrror when trying to create the default serializer for a data type - that requires an unknown object codec - """ - dtype = UnknownObjectDtype() - msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." 
- with pytest.raises(ValueError, match=re.escape(msg)): - default_serializer_v3(dtype) - - -def test_unknown_object_codec_default_filters_v2() -> None: - """ - Test that we get a valueerrror when trying to create the default serializer for a data type - that requires an unknown object codec - """ - dtype = UnknownObjectDtype() - msg = f"Data type {dtype} requires an unknown object codec: {dtype.object_codec_id!r}." - with pytest.raises(ValueError, match=re.escape(msg)): - default_filters_v2(dtype) + assert (arr[1, [0, 1]] == np.array([[12, 13, 14, 15], [16, 17, 18, 19]])).all() # type: ignore[index] diff --git a/tests/test_config.py b/tests/test_config.py index da5b2cc488..d4b1b0496f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -180,7 +180,7 @@ async def _encode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Bu chunks=(10,), zarr_format=3, dtype="i4", - compressors=[{"name": "blosc", "configuration": {}}], + compressors=[BloscCodec(typesize=1, shuffle="bitshuffle").to_json(zarr_format=3)], ) arr[:] = range(100) _mock.call.assert_called() diff --git a/tests/test_examples.py b/tests/test_examples.py index c97766364b..8b5705c317 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -71,8 +71,11 @@ def test_scripts_can_run(script_path: Path, tmp_path: Path) -> None: # We resave the script after inserting the absolute path to the local Zarr project directory, # and then test its behavior. # This allows the example to be useful to users who don't have Zarr installed, but also testable. + # --refresh ensures that uv doesn't use a cached build of our local package resave_script(script_path, dest_path) - result = subprocess.run(["uv", "run", str(dest_path)], capture_output=True, text=True) + result = subprocess.run( + ["uv", "run", "--refresh", str(dest_path)], capture_output=True, text=True + ) assert result.returncode == 0, ( f"Script at {script_path} failed to run. 
Output: {result.stdout} Error: {result.stderr}" ) diff --git a/tests/test_group.py b/tests/test_group.py index 7705fa205a..8ed661f8a5 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -11,7 +11,6 @@ import numpy as np import pytest -from numcodecs import Blosc import zarr import zarr.api.asynchronous @@ -21,6 +20,7 @@ from zarr.abc.store import Store from zarr.core import sync_group from zarr.core._info import GroupInfo +from zarr.core.array import default_compressor_v2, default_compressors_v3, default_serializer_v3 from zarr.core.buffer import default_buffer_prototype from zarr.core.config import config as zarr_config from zarr.core.dtype.common import unpack_dtype_json @@ -523,7 +523,7 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "chunks": (1,), "order": "C", "filters": None, - "compressor": Blosc(), + "compressor": default_compressor_v2(dtype).to_json(zarr_format=zarr_format), "zarr_format": zarr_format, }, "subgroup": { @@ -550,8 +550,11 @@ def test_group_child_iterators(store: Store, zarr_format: ZarrFormat, consolidat "name": "default", }, "codecs": ( - {"configuration": {"endian": "little"}, "name": "bytes"}, - {"configuration": {}, "name": "zstd"}, + default_serializer_v3(dtype).to_json(zarr_format=zarr_format), + *[ + c.to_json(zarr_format=zarr_format) + for c in default_compressors_v3(dtype) + ], ), "data_type": unpack_dtype_json(dtype.to_json(zarr_format=zarr_format)), "fill_value": fill_value, diff --git a/tests/test_gzip.py b/tests/test_gzip.py new file mode 100644 index 0000000000..ae7b68be5d --- /dev/null +++ b/tests/test_gzip.py @@ -0,0 +1,27 @@ +import json +from typing import Literal + +import pytest + +import zarr +from zarr.codecs import GzipCodec + + +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_gzip_compression(zarr_format: Literal[2, 3]) -> None: + store = {} + zarr.create_array( + store=store, + dtype="int", + shape=(1,), + chunks=(10,), + zarr_format=zarr_format, + compressors=GzipCodec(), + ) + + if zarr_format == 2: + print(json.dumps(json.loads(store[".zarray"].to_bytes()), indent=2)) + else: + print(json.dumps(json.loads(store["zarr.json"].to_bytes()), indent=2)) + + zarr.open_array(store=store, zarr_format=zarr_format) diff --git a/tests/test_image_codecs.py b/tests/test_image_codecs.py new file mode 100644 index 0000000000..3372ac9bdf --- /dev/null +++ b/tests/test_image_codecs.py @@ -0,0 +1,17 @@ +import numcodecs +import numpy as np +from imagecodecs.numcodecs import Jpeg + +import zarr + +numcodecs.register_codec(Jpeg) +jpg_codec = Jpeg() +store = {} + +z_w = zarr.create_array( + store=store, data=np.zeros((100, 100, 3), dtype=np.uint8), serializer=jpg_codec, zarr_format=3 +) + +z_r = zarr.open_array(store=store, zarr_format=3) + +print(z_r.metadata.to_dict()["codecs"]) diff --git a/tests/test_info.py b/tests/test_info.py index 28c8803c83..08f2318dc2 100644 --- a/tests/test_info.py +++ b/tests/test_info.py @@ -74,7 +74,7 @@ def test_array_info(zarr_format: ZarrFormat) -> None: Read-only : True Store type : MemoryStore Filters : () - Serializer : BytesCodec(endian=<Endian.little: 'little'>) + Serializer : BytesCodec(endian='little') Compressors : ()""") @@ -117,7 +117,7 @@ def test_array_info_complete( Read-only : True Store type : MemoryStore Filters : () - Serializer : BytesCodec(endian=<Endian.little: 'little'>) + Serializer : BytesCodec(endian='little') Compressors : () No. bytes : {count_bytes} ({count_bytes_formatted}) No.
bytes stored : {count_bytes_stored} ({count_bytes_stored_formatted}) diff --git a/tests/test_metadata/test_consolidated.py b/tests/test_metadata/test_consolidated.py index 19eba4fb86..fb80c7155a 100644 --- a/tests/test_metadata/test_consolidated.py +++ b/tests/test_metadata/test_consolidated.py @@ -5,7 +5,6 @@ import numpy as np import pytest -from numcodecs import Blosc import zarr.api.asynchronous import zarr.api.synchronous @@ -17,6 +16,7 @@ open, open_consolidated, ) +from zarr.core.array import default_compressor_v2 from zarr.core.buffer import cpu, default_buffer_prototype from zarr.core.dtype import parse_dtype from zarr.core.group import ConsolidatedMetadata, GroupMetadata @@ -552,7 +552,7 @@ async def test_consolidated_metadata_v2(self): attributes={"key": "a"}, chunks=(1,), fill_value=0, - compressor=Blosc(), + compressor=default_compressor_v2(dtype), order="C", ), "g1": GroupMetadata(