diff --git a/src/model_signing/_serialization/incremental.py b/src/model_signing/_serialization/incremental.py
new file mode 100644
index 00000000..afdd65d5
--- /dev/null
+++ b/src/model_signing/_serialization/incremental.py
@@ -0,0 +1,302 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Incremental model serializer for selective file re-hashing.
+
+This module provides a serializer that can reuse digests from an existing
+manifest, only re-hashing files that have changed. This is useful for large
+models where only a small subset of files change between signings.
+"""
+
+from collections.abc import Callable, Iterable
+import concurrent.futures
+import itertools
+import logging
+import os
+import pathlib
+from typing import Optional
+
+from typing_extensions import override
+
+from model_signing import manifest
+from model_signing._hashing import io
+from model_signing._serialization import serialization
+
+
+class IncrementalSerializer(serialization.Serializer):
+    """Model serializer that only re-hashes changed files.
+
+    This serializer compares the current model state against an existing
+    manifest (from a previous signature) and only re-hashes files that:
+    - Are new (not present in the existing manifest)
+    - Are explicitly marked as changed via the files_to_hash parameter
+
+    Files that exist in both the current model and the existing manifest,
+    and are not listed in files_to_hash, have their digests reused from the
+    existing manifest without re-hashing. Note that changes are not detected
+    automatically: callers must list every modified file in files_to_hash,
+    otherwise stale digests are reused. If the existing manifest was
+    produced by shard-based serialization, all files are re-hashed, since
+    shard digests cannot be reused for whole files.
+
+    This provides significant performance improvements for large models where
+    only a small number of files change between signings (e.g., updating
+    documentation in a 200GB model).
+    """
+
+    def __init__(
+        self,
+        file_hasher_factory: Callable[[pathlib.Path], io.FileHasher],
+        existing_manifest: manifest.Manifest,
+        *,
+        max_workers: Optional[int] = None,
+        allow_symlinks: bool = False,
+        ignore_paths: Iterable[pathlib.Path] = frozenset(),
+    ):
+        """Initializes an incremental serializer.
+
+        Args:
+            file_hasher_factory: A callable to build the hash engine used to
+                hash individual files.
+            existing_manifest: The manifest from a previous signature. Digests
+                from this manifest will be reused for unchanged files.
+            max_workers: Maximum number of workers to use in parallel. Default
+                is to defer to the `concurrent.futures` library.
+            allow_symlinks: Controls whether symbolic links are included. If a
+                symlink is present but the flag is `False` (default),
+                serialization raises an error.
+            ignore_paths: The paths of files to ignore.
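+
+        Example (illustrative; assumes `old_manifest` was extracted and
+        verified from a previous signature):
+
+            from model_signing._hashing import io, memory
+
+            def hasher_factory(path: pathlib.Path) -> io.FileHasher:
+                return io.SimpleFileHasher(path, memory.SHA256())
+
+            serializer = IncrementalSerializer(hasher_factory, old_manifest)
+            new_manifest = serializer.serialize(model_path)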
+ """ + self._hasher_factory = file_hasher_factory + self._existing_manifest = existing_manifest + self._max_workers = max_workers + self._allow_symlinks = allow_symlinks + self._ignore_paths = ignore_paths + + # Check if existing manifest used shard-based serialization + # If so, we need to rehash all files (can't reuse shard digests) + self._was_sharded = ( + existing_manifest.serialization_type.get("method") == "shards" + ) + + # Build lookup dictionary: file path -> _File (for files we can reuse) + # Only populate if the existing manifest was file-based + self._existing_items = {} + if not self._was_sharded: + for item in existing_manifest._item_to_digest: + if isinstance(item, manifest._File): + self._existing_items[item.path] = item + + # Precompute serialization description + hasher = file_hasher_factory(pathlib.Path()) + self._serialization_description = manifest._FileSerialization( + hasher.digest_name, self._allow_symlinks, self._ignore_paths + ) + self._is_blake3 = hasher.digest_name == "blake3" + + def set_allow_symlinks(self, allow_symlinks: bool) -> None: + """Set whether following symlinks is allowed.""" + # Check if this differs from the existing manifest + if isinstance( + self._existing_manifest._serialization_type, + manifest._FileSerialization, + ): + existing_allow_symlinks = ( + self._existing_manifest._serialization_type._allow_symlinks + ) + if allow_symlinks != existing_allow_symlinks: + logging.warning( + f"allow_symlinks={allow_symlinks} differs from existing " + f"manifest (allow_symlinks={existing_allow_symlinks}). " + f"This may result in inconsistent manifests." + ) + else: + # Shard-based serialization - warn if trying to enable symlinks + if allow_symlinks: + logging.warning( + f"allow_symlinks={allow_symlinks} differs from existing " + f"manifest (shard-based, allow_symlinks=False). " + f"This may result in inconsistent manifests." + ) + + self._allow_symlinks = allow_symlinks + hasher = self._hasher_factory(pathlib.Path()) + self._serialization_description = manifest._FileSerialization( + hasher.digest_name, self._allow_symlinks, self._ignore_paths + ) + + def _should_rehash_file( + self, + posix_path: pathlib.PurePosixPath, + relative_path: pathlib.Path, + rehash_paths: set[pathlib.Path], + ) -> bool: + """Determines if a file needs to be re-hashed. + + Args: + posix_path: The POSIX path of the file relative to model root. + relative_path: The relative path of the file. + rehash_paths: Set of paths explicitly marked for re-hashing. + + Returns: + True if the file needs re-hashing, False if digest can be reused. + """ + if self._was_sharded: + # Previous manifest used shard-based serialization + # Must rehash all files (can't reuse shard digests) + return True + + if posix_path not in self._existing_items: + # New file not in old manifest - must hash it + return True + + if rehash_paths and relative_path in rehash_paths: + # File was explicitly marked as changed - must re-hash it + return True + + if not rehash_paths: + # No explicit files_to_hash provided, so we're in "scan mode" + # Reuse digest for existing files (assume unchanged) + return False + + # File exists in old manifest and wasn't marked as changed + # Reuse old digest + return False + + @override + def serialize( + self, + model_path: pathlib.Path, + *, + ignore_paths: Iterable[pathlib.Path] = frozenset(), + files_to_hash: Optional[Iterable[pathlib.Path]] = None, + ) -> manifest.Manifest: + """Serializes the model, only re-hashing changed/new files. 
+
+        Args:
+            model_path: The path to the model.
+            ignore_paths: The paths to ignore during serialization. If a
+                provided path is a directory, all children of the directory
+                are ignored.
+            files_to_hash: Optional list of files that may have changed and
+                should be re-hashed. If None, all files in the model directory
+                are scanned, and only NEW files (not in existing manifest) are
+                hashed. Existing files have their digests reused.
+
+                To detect changed files, use git diff or similar:
+                    changed_files = subprocess.check_output(
+                        ['git', 'diff', '--name-only', 'HEAD']
+                    ).decode().splitlines()
+                    files_to_hash = [model_path / f for f in changed_files]
+
+        Returns:
+            The model's serialized manifest with a mix of reused and
+            newly-computed digests.
+
+        Raises:
+            ValueError: The model contains a symbolic link, but the serializer
+                was not initialized with `allow_symlinks=True`.
+        """
+        # Build a set of files to rehash (files that potentially changed)
+        rehash_paths = set()
+        if files_to_hash is not None:
+            # User provided explicit list of changed files
+            for path in files_to_hash:
+                if path.is_file():
+                    rehash_paths.add(path.relative_to(model_path))
+
+        # Scan directory to find all current files in the model
+        all_current_files = []
+        for path in itertools.chain((model_path,), model_path.glob("**/*")):
+            if serialization.should_ignore(path, ignore_paths):
+                continue
+            serialization.check_file_or_directory(
+                path, allow_symlinks=self._allow_symlinks
+            )
+            if path.is_file():
+                all_current_files.append(path)
+
+        # Build the new manifest
+        files_to_rehash = []
+        manifest_items = []
+
+        for path in all_current_files:
+            relative_path = path.relative_to(model_path)
+            posix_path = pathlib.PurePosixPath(relative_path)
+
+            # Determine if this file needs re-hashing
+            needs_rehash = self._should_rehash_file(
+                posix_path, relative_path, rehash_paths
+            )
+            if needs_rehash:
+                files_to_rehash.append(path)
+            else:
+                # Reuse existing digest
+                old_item_key = self._existing_items[posix_path]
+                old_digest = self._existing_manifest._item_to_digest[
+                    old_item_key
+                ]
+                manifest_items.append(
+                    manifest.FileManifestItem(
+                        path=relative_path, digest=old_digest
+                    )
+                )
+
+        # Hash all files that need re-hashing in parallel
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=1 if self._is_blake3 else self._max_workers
+        ) as tpe:
+            futures = [
+                tpe.submit(self._compute_hash, model_path, path)
+                for path in files_to_rehash
+            ]
+            for future in concurrent.futures.as_completed(futures):
+                manifest_items.append(future.result())
+
+        # Handle ignore_paths for serialization description
+        if ignore_paths:
+            rel_ignore_paths = []
+            for p in ignore_paths:
+                rp = os.path.relpath(p, model_path)
+                if not rp.startswith("../"):
+                    rel_ignore_paths.append(pathlib.Path(rp))
+
+            hasher = self._hasher_factory(pathlib.Path())
+            self._serialization_description = manifest._FileSerialization(
+                hasher.digest_name,
+                self._allow_symlinks,
+                frozenset(list(self._ignore_paths) + rel_ignore_paths),
+            )
+
+        model_name = model_path.name
+        if not model_name or model_name == "..":
+            model_name = os.path.basename(model_path.resolve())
+
+        return manifest.Manifest(
+            model_name, manifest_items, self._serialization_description
+        )
+
+    def _compute_hash(
+        self, model_path: pathlib.Path, path: pathlib.Path
+    ) -> manifest.FileManifestItem:
+        """Produces the manifest item of the file given by `path`.
+
+        Args:
+            model_path: The path to the model.
+            path: Path to the file in the model, that is currently transformed
+                to a manifest item.
+
+        Returns:
+            The itemized manifest.
+        """
+        relative_path = path.relative_to(model_path)
+        digest = self._hasher_factory(path).compute()
+        return manifest.FileManifestItem(path=relative_path, digest=digest)
diff --git a/src/model_signing/_signing/signing.py b/src/model_signing/_signing/signing.py
index 13681ac5..90dd00a3 100644
--- a/src/model_signing/_signing/signing.py
+++ b/src/model_signing/_signing/signing.py
@@ -166,6 +166,45 @@ def dsse_payload_to_manifest_compat(
     return manifest.Manifest(model_name, items, serialization)
 
 
+def manifest_from_signature(
+    signature_path: pathlib.Path,
+    *,
+    identity: str,
+    oidc_issuer: str,
+    use_staging: bool = False,
+) -> manifest.Manifest:
+    """Extracts and verifies a manifest from an existing signature file.
+
+    This function reads a signature file (Sigstore bundle), verifies the
+    cryptographic signature, and returns the manifest. This is essential when
+    reusing hashes from an old signature (e.g., in incremental signing) to
+    ensure the old signature hasn't been tampered with.
+
+    Args:
+        signature_path: Path to the signature file to read and verify.
+        identity: The expected identity that signed the model (e.g., email).
+        oidc_issuer: The expected OpenID Connect issuer that provided the
+            certificate used for the signature.
+        use_staging: Use staging configurations instead of production. This
+            should only be set to True when testing. Default is False.
+
+    Returns:
+        A Manifest object representing the signed model.
+
+    Raises:
+        ValueError: If signature verification fails or the signature file
+            cannot be parsed.
+        FileNotFoundError: If the signature file doesn't exist.
+    """
+    from model_signing._signing import sign_sigstore  # noqa: PLC0415
+
+    signature = sign_sigstore.Signature.read(signature_path)
+    verifier = sign_sigstore.Verifier(
+        identity=identity, oidc_issuer=oidc_issuer, use_staging=use_staging
+    )
+    return verifier.verify(signature)
+
+
 class Payload:
     """In-toto payload used to represent a model for signing.
 
diff --git a/src/model_signing/hashing.py b/src/model_signing/hashing.py
index cb2c453f..605a082c 100644
--- a/src/model_signing/hashing.py
+++ b/src/model_signing/hashing.py
@@ -61,6 +61,7 @@ from model_signing._hashing import memory
 from model_signing._serialization import file
 from model_signing._serialization import file_shard
+from model_signing._serialization import incremental
 
 
 if sys.version_info >= (3, 11):
@@ -375,6 +376,121 @@ def use_shard_serialization(
         )
         return self
 
+    def use_incremental_serialization(
+        self,
+        existing_manifest: manifest.Manifest,
+        *,
+        hashing_algorithm: Optional[
+            Literal["sha256", "blake2", "blake3"]
+        ] = None,
+        chunk_size: int = 1048576,
+        max_workers: Optional[int] = None,
+        allow_symlinks: Optional[bool] = None,
+        ignore_paths: Optional[Iterable[pathlib.Path]] = None,
+    ) -> Self:
+        """Configures incremental serialization for selective file re-hashing.
+
+        This serialization method compares the current model state against an
+        existing manifest (from a previous signature) and only re-hashes files
+        that changed. This provides significant performance improvements for
+        large models where only a small subset of files change.
+
+        The serialization method in this configuration is changed to one
+        where:
+        - Files that exist in the existing manifest have their digests reused
+        - New files (not in existing manifest) are hashed
+        - Modified files (specified via files_to_hash in hash()) are re-hashed
+        - Deleted files are automatically omitted from the new manifest
+
+        By default, serialization parameters (hash algorithm, allow_symlinks,
+        ignore_paths) are automatically extracted from the existing manifest.
+        You can override any of these by passing explicit values.
+
+        Usage example:
+            from model_signing._signing import signing
+
+            # Extract and verify manifest from previous signature
+            old_manifest = signing.manifest_from_signature(
+                pathlib.Path("model.sig.old"),
+                identity="user@example.com",
+                oidc_issuer="https://github.com/login/oauth",
+            )
+
+            # Configure incremental hashing (auto-detects parameters)
+            config = hashing.Config().use_incremental_serialization(
+                old_manifest
+            )
+
+            # Get changed files (e.g., from git)
+            changed_files = [model_path / "README.md"]
+
+            # Hash only changed files
+            new_manifest = config.hash(model_path, files_to_hash=changed_files)
+
+        Args:
+            existing_manifest: The manifest from a previous signature. Digests
+                from this manifest will be reused for unchanged files.
+            hashing_algorithm: The hashing algorithm to use for new/changed
+                files. If None (default), uses the algorithm from
+                existing_manifest.
+            chunk_size: The amount of file to read at once. Default is 1MB. A
+                special value of 0 signals to attempt to read everything in a
+                single call. Ignored for BLAKE3.
+            max_workers: Maximum number of workers to use in parallel. Default
+                is to defer to the `concurrent.futures` library to select the
+                best value for the current machine, or the number of logical
+                cores when doing BLAKE3 hashing.
+            allow_symlinks: Controls whether symbolic links are included. If
+                None (default), uses the value from existing_manifest.
+            ignore_paths: Paths of files to ignore. If None (default), uses
+                the paths from existing_manifest.
+
+        Returns:
+            The new hashing configuration with incremental serialization.
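+
+        Any parameter can also be overridden explicitly; for example, to
+        force a specific worker count (illustrative):
+
+            config = hashing.Config().use_incremental_serialization(
+                old_manifest, max_workers=4
+            )
+
+        Note that overriding `hashing_algorithm` only affects newly hashed
+        files; digests reused from the existing manifest keep their original
+        algorithm.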
+ """ + # Extract parameters from existing manifest if not explicitly provided + serialization_type = existing_manifest._serialization_type + + # Extract hash algorithm + if hashing_algorithm is None: + if isinstance(serialization_type, manifest._FileSerialization): + hash_type = serialization_type._hash_type + # Map digest names to API parameter names + # (blake2b -> blake2, others remain the same) + if hash_type == "blake2b": + hashing_algorithm = "blake2" + else: + hashing_algorithm = hash_type + else: + # Shard-based manifest - use default sha256 + hashing_algorithm = "sha256" + + # Extract allow_symlinks + if allow_symlinks is None: + if isinstance(serialization_type, manifest._FileSerialization): + allow_symlinks = serialization_type._allow_symlinks + else: + allow_symlinks = False + + # Extract ignore_paths + if ignore_paths is None: + if isinstance(serialization_type, manifest._FileSerialization): + ignore_paths = [ + pathlib.Path(p) for p in serialization_type._ignore_paths + ] + else: + ignore_paths = frozenset() + + self._serializer = incremental.IncrementalSerializer( + self._build_file_hasher_factory( + hashing_algorithm, chunk_size, max_workers + ), + existing_manifest, + max_workers=max_workers, + allow_symlinks=allow_symlinks, + ignore_paths=ignore_paths, + ) + return self + def set_ignored_paths( self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True ) -> Self: diff --git a/src/model_signing/signing.py b/src/model_signing/signing.py index 5c45a8eb..deeeaf7c 100644 --- a/src/model_signing/signing.py +++ b/src/model_signing/signing.py @@ -75,6 +75,73 @@ def sign(model_path: hashing.PathLike, signature_path: hashing.PathLike): Config().sign(model_path, signature_path) +def sign_incremental( + model_path: hashing.PathLike, + old_signature_path: hashing.PathLike, + new_signature_path: hashing.PathLike, + *, + identity: str, + oidc_issuer: str, + use_staging: bool = False, + files_to_hash: Optional[Iterable[hashing.PathLike]] = None, +): + """Signs a model incrementally, only re-hashing changed files. + + This function provides a convenient way to sign large models where only + a small subset of files have changed. Instead of re-hashing the entire + model (which can take hours for multi-hundred GB models), it reuses + digests from the previous signature for unchanged files and only hashes + new or modified files. + + The old signature is cryptographically verified before its hashes are + reused, ensuring the integrity of the incremental signing process. + + In this default configuration we sign using Sigstore. + + Usage example: + # User modified README.md in a 500GB model + sign_incremental( + model_path="huge-model/", + old_signature_path="model.sig.old", + new_signature_path="model.sig.new", + identity="user@example.com", + oidc_issuer="https://github.com/login/oauth", + files_to_hash=["huge-model/README.md"] + ) + + Args: + model_path: The path to the model to sign. + old_signature_path: The path to the previous signature. The manifest + from this signature will be verified and extracted for incremental + hashing. + new_signature_path: The path where the new signature will be written. + identity: The expected identity that signed the old signature + (e.g., email address). + oidc_issuer: The expected OpenID Connect issuer that provided the + certificate for the old signature. + use_staging: Use staging configurations for verification instead of + production. Should only be True when testing. Default is False. 
+        files_to_hash: Optional list of files that changed and need to be
+            re-hashed. If None, only new files (not in old signature) will
+            be hashed. Existing files will have their digests reused.
+            To detect changed files, use git diff or similar tools.
+
+    Raises:
+        FileNotFoundError: If old_signature_path doesn't exist.
+        ValueError: If old_signature_path cannot be parsed or verification
+            fails.
+    """
+    Config().sign_incremental(
+        model_path,
+        old_signature_path,
+        new_signature_path,
+        identity=identity,
+        oidc_issuer=oidc_issuer,
+        use_staging=use_staging,
+        files_to_hash=files_to_hash,
+    )
+
+
 class Config:
     """Configuration to use when signing models.
 
@@ -109,6 +176,72 @@ def sign(
         signature = self._signer.sign(payload)
         signature.write(pathlib.Path(signature_path))
 
+    def sign_incremental(
+        self,
+        model_path: hashing.PathLike,
+        old_signature_path: hashing.PathLike,
+        new_signature_path: hashing.PathLike,
+        *,
+        identity: str,
+        oidc_issuer: str,
+        use_staging: bool = False,
+        files_to_hash: Optional[Iterable[hashing.PathLike]] = None,
+    ):
+        """Signs a model incrementally using the current configuration.
+
+        This method extracts and verifies the manifest from an existing
+        signature, then configures incremental hashing to reuse digests for
+        unchanged files. Only new or modified files are re-hashed, providing
+        significant performance improvements for large models.
+
+        Args:
+            model_path: The path to the model to sign.
+            old_signature_path: The path to the previous signature.
+            new_signature_path: The path where the new signature will be
+                written.
+            identity: The expected identity that signed the old signature
+                (e.g., email address).
+            oidc_issuer: The expected OpenID Connect issuer that provided
+                the certificate for the old signature.
+            use_staging: Use staging configurations for verification instead
+                of production. Should only be True when testing. Default is
+                False.
+            files_to_hash: Optional list of files that changed and need to
+                be re-hashed. If None, only new files will be hashed.
+
+        Raises:
+            FileNotFoundError: If old_signature_path doesn't exist.
+            ValueError: If old_signature_path cannot be parsed or verification
+                fails.
+        """
+        # Extract and verify manifest from old signature
+        old_manifest = signing.manifest_from_signature(
+            pathlib.Path(old_signature_path),
+            identity=identity,
+            oidc_issuer=oidc_issuer,
+            use_staging=use_staging,
+        )
+
+        # Configure incremental hashing
+        self._hashing_config.use_incremental_serialization(old_manifest)
+
+        # Convert files_to_hash to pathlib.Path objects if provided
+        paths_to_hash = None
+        if files_to_hash is not None:
+            paths_to_hash = [pathlib.Path(f) for f in files_to_hash]
+
+        # Hash the model incrementally
+        new_manifest = self._hashing_config.hash(
+            model_path, files_to_hash=paths_to_hash
+        )
+
+        # Sign the new manifest
+        if not self._signer:
+            self.use_sigstore_signer()
+        payload = signing.Payload(new_manifest)
+        signature = self._signer.sign(payload)
+        signature.write(pathlib.Path(new_signature_path))
+
     def set_hashing_config(self, hashing_config: hashing.Config) -> Self:
         """Sets the new configuration for hashing models.
 
diff --git a/tests/_serialization/incremental_test.py b/tests/_serialization/incremental_test.py
new file mode 100644
index 00000000..199efb7e
--- /dev/null
+++ b/tests/_serialization/incremental_test.py
@@ -0,0 +1,212 @@
+# Copyright 2024 The Sigstore Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for incremental serialization."""
+
+import pathlib
+
+import pytest
+
+from model_signing import manifest
+from model_signing._hashing import hashing
+from model_signing._hashing import io
+from model_signing._hashing import memory
+from model_signing._serialization import file
+from model_signing._serialization import incremental
+from tests import test_support
+
+
+class TestIncrementalSerializer:
+    @pytest.fixture
+    def hasher_factory(self):
+        """Provides a hasher factory for tests."""
+
+        def factory(path: pathlib.Path) -> io.FileHasher:
+            return io.SimpleFileHasher(path, memory.SHA256())
+
+        return factory
+
+    @pytest.fixture
+    def file_serializer(self, hasher_factory):
+        """Provides a file serializer for tests."""
+        return file.Serializer(hasher_factory)
+
+    @pytest.fixture
+    def sharded_manifest(self):
+        """Provides a shard-based manifest for tests."""
+        shard_items = [
+            manifest.ShardedFileManifestItem(
+                path=pathlib.PurePosixPath("file.txt"),
+                start=0,
+                end=100,
+                digest=hashing.Digest("sha256", b"fake_shard_digest"),
+            )
+        ]
+        return manifest.Manifest(
+            "model",
+            shard_items,
+            manifest._ShardSerialization("sha256", shard_size=100),
+        )
+
+    def test_no_changes_reuses_all_digests(
+        self, sample_model_folder, hasher_factory, file_serializer
+    ):
+        # Create initial manifest
+        existing_manifest = file_serializer.serialize(sample_model_folder)
+
+        # Create incremental serializer
+        inc_serializer = incremental.IncrementalSerializer(
+            hasher_factory, existing_manifest
+        )
+
+        # Serialize incrementally (no changes)
+        new_manifest = inc_serializer.serialize(sample_model_folder)
+
+        # Manifests should be equal (all digests reused)
+        assert new_manifest == existing_manifest
+
+    def test_new_file_gets_hashed(
+        self, sample_model_folder, hasher_factory, file_serializer
+    ):
+        # Create initial manifest
+        existing_manifest = file_serializer.serialize(sample_model_folder)
+        old_digests = set(
+            test_support.extract_digests_from_manifest(existing_manifest)
+        )
+
+        # Add a new file
+        altered_dir = test_support.get_first_directory(sample_model_folder)
+        new_file = altered_dir / "new_file.txt"
+        new_file.write_bytes(test_support.KNOWN_MODEL_TEXT)
+
+        # Serialize incrementally
+        inc_serializer = incremental.IncrementalSerializer(
+            hasher_factory, existing_manifest
+        )
+        new_manifest = inc_serializer.serialize(sample_model_folder)
+
+        # Should have one more digest
+        new_digests = set(
+            test_support.extract_digests_from_manifest(new_manifest)
+        )
+        assert len(new_digests) == len(old_digests) + 1
+        assert old_digests.issubset(new_digests)
+
+    def test_deleted_file_not_in_manifest(
+        self, sample_model_folder, hasher_factory, file_serializer
+    ):
+        # Create initial manifest
+        existing_manifest = file_serializer.serialize(sample_model_folder)
+        old_digests = set(
+            test_support.extract_digests_from_manifest(existing_manifest)
+        )
+
+        # Delete a file
+        file_to_delete = test_support.get_first_file(sample_model_folder)
+        file_to_delete.unlink()
+
+        # Serialize incrementally
+        inc_serializer = incremental.IncrementalSerializer(
+            hasher_factory, existing_manifest
+        )
+        new_manifest = inc_serializer.serialize(sample_model_folder)
+
+        # Should have one less digest
+        new_digests = set(
+            test_support.extract_digests_from_manifest(new_manifest)
+        )
+        assert len(new_digests) == len(old_digests) - 1
+        assert new_digests.issubset(old_digests)
+
+    def test_modified_file_with_files_to_hash(
+        self, sample_model_folder, hasher_factory, file_serializer
+    ):
+        # Create initial manifest
+        existing_manifest = file_serializer.serialize(sample_model_folder)
+        old_digests = set(
+            test_support.extract_digests_from_manifest(existing_manifest)
+        )
+
+        # Modify a file
+        file_to_change = test_support.get_first_file(sample_model_folder)
+        file_to_change.write_bytes(test_support.ANOTHER_MODEL_TEXT)
+
+        # Serialize incrementally, specifying the changed file
+        inc_serializer = incremental.IncrementalSerializer(
+            hasher_factory, existing_manifest
+        )
+        new_manifest = inc_serializer.serialize(
+            sample_model_folder, files_to_hash=[file_to_change]
+        )
+
+        # Should have same number of digests but one changed
+        new_digests = set(
+            test_support.extract_digests_from_manifest(new_manifest)
+        )
+        assert len(new_digests) == len(old_digests)
+        assert new_digests != old_digests
+
+    def test_manifest_unchanged_when_model_moved(
+        self, sample_model_folder, hasher_factory, file_serializer
+    ):
+        # Create initial manifest
+        existing_manifest = file_serializer.serialize(sample_model_folder)
+
+        # Move the model
+        new_name = sample_model_folder.with_name("moved_model")
+        new_model = sample_model_folder.rename(new_name)
+
+        # Serialize incrementally from new location
+        inc_serializer = incremental.IncrementalSerializer(
+            hasher_factory, existing_manifest
+        )
+        new_manifest = inc_serializer.serialize(new_model)
+
+        # Manifests should be equal
+        assert new_manifest == existing_manifest
+
+    def test_empty_existing_manifest_hashes_all(
+        self, sample_model_folder, hasher_factory, file_serializer
+    ):
+        # Create empty manifest
+        empty_manifest = manifest.Manifest(
+            "empty", [], manifest._FileSerialization("sha256")
+        )
+
+        # Serialize incrementally with empty existing manifest
+        inc_serializer = incremental.IncrementalSerializer(
+            hasher_factory, empty_manifest
+        )
+        new_manifest = inc_serializer.serialize(sample_model_folder)
+
+        # Should hash all files (same as regular file serialization)
+        expected_manifest = file_serializer.serialize(sample_model_folder)
+        assert new_manifest == expected_manifest
+
+    def test_sharded_manifest_rehashes_all(
+        self,
+        sample_model_folder,
+        hasher_factory,
+        file_serializer,
+        sharded_manifest,
+    ):
+        # Serialize incrementally using the shard-based manifest
+        inc_serializer = incremental.IncrementalSerializer(
+            hasher_factory, sharded_manifest
+        )
+        new_manifest = inc_serializer.serialize(sample_model_folder)
+
+        # Should rehash everything (file-based, not shard-based)
+        expected_manifest = file_serializer.serialize(sample_model_folder)
+        assert new_manifest == expected_manifest
diff --git a/tests/hashing_config_test.py b/tests/hashing_config_test.py
index 3322298e..981df74a 100644
--- a/tests/hashing_config_test.py
+++ b/tests/hashing_config_test.py
@@ -107,3 +107,29 @@ def test_blake3_file_serialization_with_max_workers(tmp_path):
     # All manifests should be equal
     assert manifest1 == manifest2
     assert manifest1 == manifest3
+
+
+def test_incremental_serialization_extracts_parameters(tmp_path):
+    """Test parameter extraction in use_incremental_serialization."""
+    model = tmp_path / "model"
+    model.mkdir()
+    (model / "file1.txt").write_text("content1")
+    (model / "file2.txt").write_text("content2")
"file2.txt").write_text("content2") + + # Create initial manifest with blake2 algorithm + cfg1 = hashing.Config().use_file_serialization(hashing_algorithm="blake2") + initial_manifest = cfg1.hash(model) + + # Verify initial manifest has blake2b + assert initial_manifest.serialization_type["hash_type"] == "blake2b" + + # Now use incremental serialization WITHOUT specifying hashing_algorithm + # It should auto-extract blake2 from initial_manifest + cfg2 = hashing.Config().use_incremental_serialization(initial_manifest) + new_manifest = cfg2.hash(model) + + # Verify new manifest uses same hash algorithm (blake2b) + assert new_manifest.serialization_type["hash_type"] == "blake2b" + + # Manifests should be equal (same files, same digests, same parameters) + assert initial_manifest == new_manifest