From 9dd8cac1617d13d20cdae22a483b50d97bb57ecd Mon Sep 17 00:00:00 2001 From: Emrick Donadei Date: Sat, 1 Nov 2025 22:03:53 -0400 Subject: [PATCH 01/12] Add Manifest.from_signature() to extract manifest from existing signatures This method enables reading a manifest from a signature file without performing cryptographic verification. This is the foundation for incremental re-hashing, where we need to know what files were previously signed to determine which files need re-hashing. The method: - Reads and parses Sigstore bundle JSON format - Extracts the DSSE envelope payload - Decodes base64-encoded payload - Validates manifest integrity (root digest matches resources) - Returns a Manifest object Includes comprehensive tests covering: - Valid manifest extraction - Rejection of inconsistent manifests - Error handling for missing files, invalid JSON, and missing envelopes Related to issue #160 - API for incremental model re-hashing Signed-off-by: Emrick Donadei --- src/model_signing/manifest.py | 52 +++++++++++ tests/manifest_test.py | 168 ++++++++++++++++++++++++++++++++++ 2 files changed, 220 insertions(+) diff --git a/src/model_signing/manifest.py b/src/model_signing/manifest.py index a42662ed..20ab436b 100644 --- a/src/model_signing/manifest.py +++ b/src/model_signing/manifest.py @@ -39,8 +39,10 @@ """ import abc +import base64 from collections.abc import Iterable, Iterator import dataclasses +import json import pathlib import sys from typing import Any, Final @@ -466,3 +468,53 @@ def serialization_type(self) -> dict[str, Any]: manifest so that signature verification can use the same method. """ return self._serialization_type.serialization_parameters + + @classmethod + def from_signature(cls, signature_path: pathlib.Path) -> Self: + """Extracts a manifest from an existing signature file. + + This method reads a signature file (Sigstore bundle) and extracts the + manifest without performing cryptographic verification. This is useful + for incremental re-hashing where you need to know what files were + previously signed without verifying the signature. + + Args: + signature_path: Path to the signature file to read. + + Returns: + A Manifest object representing the signed model. + + Raises: + ValueError: If the signature file cannot be parsed or doesn't + contain a valid manifest. + FileNotFoundError: If the signature file doesn't exist. 
+ """ + # Avoid circular import by importing here + from model_signing._signing import signing + + # Read the signature file + content = signature_path.read_text(encoding="utf-8") + bundle_dict = json.loads(content) + + # Extract the DSSE envelope payload + if "dsseEnvelope" in bundle_dict: + # This is a protobuf-based bundle + envelope = bundle_dict["dsseEnvelope"] + elif "dsse_envelope" in bundle_dict: + # Alternative snake_case naming + envelope = bundle_dict["dsse_envelope"] + else: + raise ValueError( + "Signature file does not contain a DSSE envelope" + ) + + # Decode the payload (it's base64 encoded) + payload_b64 = envelope.get("payload") + if not payload_b64: + raise ValueError("DSSE envelope does not contain a payload") + + payload_bytes = base64.b64decode(payload_b64) + payload_dict = json.loads(payload_bytes) + + # Use the existing function to convert DSSE payload to manifest + return signing.dsse_payload_to_manifest(payload_dict) diff --git a/tests/manifest_test.py b/tests/manifest_test.py index 771e1f01..4b110ad5 100644 --- a/tests/manifest_test.py +++ b/tests/manifest_test.py @@ -206,3 +206,171 @@ def test_manifest_has_the_correct_resource_descriptors(self): assert descriptors[0].digest.digest_value == b"hash1" assert descriptors[1].digest.digest_value == b"hash2" assert descriptors[2].digest.digest_value == b"hash3" + + +class TestManifestFromSignature: + def test_from_signature_rejects_inconsistent_manifest(self, tmp_path): + import base64 + import json + + # Create a Sigstore bundle with inconsistent root digest + # The subject digest doesn't match the hash of the resources + payload_dict = { + "_type": "https://in-toto.io/Statement/v1", + "subject": [ + { + "name": "test_model", + "digest": { + "sha256": "0b8a5a8c8e8f1a8b8c8d8e8f2a8b8c8d8e8f3a8b8c8d8e8f4a8b8c8d8e8f5a8b" + }, + } + ], + "predicateType": "https://model_signing/signature/v1.0", + "predicate": { + "serialization": { + "method": "files", + "hash_type": "sha256", + "allow_symlinks": False, + "ignore_paths": [], + }, + "resources": [ + { + "name": "file1.txt", + "algorithm": "sha256", + "digest": "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234", + }, + { + "name": "file2.txt", + "algorithm": "sha256", + "digest": "5678dcba5678dcba5678dcba5678dcba5678dcba5678dcba5678dcba5678dcba", + }, + ], + }, + } + + # Create DSSE envelope + payload_json = json.dumps(payload_dict) + payload_b64 = base64.b64encode(payload_json.encode("utf-8")).decode( + "utf-8" + ) + + bundle_dict = { + "mediaType": "application/vnd.dev.sigstore.bundle.v0.3+json", + "verificationMaterial": { + "publicKey": {"hint": "test"}, + "tlogEntries": [], + }, + "dsseEnvelope": { + "payload": payload_b64, + "payloadType": "application/vnd.in-toto+json", + "signatures": [{"sig": "fake_signature"}], + }, + } + + # Write to file + sig_file = tmp_path / "test.sig" + sig_file.write_text(json.dumps(bundle_dict), encoding="utf-8") + + # Verify that inconsistent manifest is rejected + with pytest.raises(ValueError, match="Manifest is inconsistent"): + manifest.Manifest.from_signature(sig_file) + + def test_from_signature_extracts_valid_manifest(self, tmp_path): + import base64 + import hashlib + import json + + # Create valid SHA256 hex digests (64 chars each) + digest1_hex = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234" + digest2_hex = "5678dcba5678dcba5678dcba5678dcba5678dcba5678dcba5678dcba5678dcba" + + digest1_bytes = bytes.fromhex(digest1_hex) + digest2_bytes = bytes.fromhex(digest2_hex) + + # Compute root digest 
(SHA256 of both digests concatenated) + hasher = hashlib.sha256() + hasher.update(digest1_bytes) + hasher.update(digest2_bytes) + root_digest = hasher.hexdigest() + + payload_dict = { + "_type": "https://in-toto.io/Statement/v1", + "subject": [ + {"name": "test_model", "digest": {"sha256": root_digest}} + ], + "predicateType": "https://model_signing/signature/v1.0", + "predicate": { + "serialization": { + "method": "files", + "hash_type": "sha256", + "allow_symlinks": False, + "ignore_paths": [], + }, + "resources": [ + { + "name": "file1.txt", + "algorithm": "sha256", + "digest": digest1_hex, + }, + { + "name": "file2.txt", + "algorithm": "sha256", + "digest": digest2_hex, + }, + ], + }, + } + + payload_json = json.dumps(payload_dict) + payload_b64 = base64.b64encode(payload_json.encode("utf-8")).decode( + "utf-8" + ) + + bundle_dict = { + "mediaType": "application/vnd.dev.sigstore.bundle.v0.3+json", + "verificationMaterial": { + "publicKey": {"hint": "test"}, + "tlogEntries": [], + }, + "dsseEnvelope": { + "payload": payload_b64, + "payloadType": "application/vnd.in-toto+json", + "signatures": [{"sig": "fake_signature"}], + }, + } + + sig_file = tmp_path / "test.sig" + sig_file.write_text(json.dumps(bundle_dict), encoding="utf-8") + + # Extract manifest + extracted_manifest = manifest.Manifest.from_signature(sig_file) + + # Verify the manifest has the correct files + descriptors = list(extracted_manifest.resource_descriptors()) + assert len(descriptors) == 2 + assert descriptors[0].identifier == "file1.txt" + assert descriptors[1].identifier == "file2.txt" + assert descriptors[0].digest.digest_hex == digest1_hex + assert descriptors[1].digest.digest_hex == digest2_hex + assert extracted_manifest.model_name == "test_model" + + def test_from_signature_file_not_found(self, tmp_path): + non_existent = tmp_path / "does_not_exist.sig" + with pytest.raises(FileNotFoundError): + manifest.Manifest.from_signature(non_existent) + + def test_from_signature_invalid_json(self, tmp_path): + import json + + sig_file = tmp_path / "invalid.sig" + sig_file.write_text("not valid json", encoding="utf-8") + with pytest.raises(json.JSONDecodeError): + manifest.Manifest.from_signature(sig_file) + + def test_from_signature_missing_envelope(self, tmp_path): + import json + + sig_file = tmp_path / "missing_envelope.sig" + sig_file.write_text("{}", encoding="utf-8") + with pytest.raises(ValueError, match="does not contain a DSSE envelope"): + manifest.Manifest.from_signature(sig_file) From 4b26f1d06323c89ebf2030cf3c5f7b5964b40432 Mon Sep 17 00:00:00 2001 From: Emrick Donadei Date: Sat, 1 Nov 2025 22:20:10 -0400 Subject: [PATCH 02/12] Add IncrementalSerializer for selective file re-hashing Implements the core incremental hashing logic that compares the current model state against an existing manifest and only re-hashes changed files. Key features: - Reuses digests for unchanged files from previous manifest - Hashes new files not in the previous signature - Handles modified files via files_to_hash parameter - Handles file deletions automatically (omits them from new manifest) - Uses same parallel hashing as standard file serializer The algorithm: 1. Scan current model directory for all files 2. Build set of files to rehash from files_to_hash parameter 3. For each current file: - If not in old manifest: hash it (new file) - If in files_to_hash list: hash it (modified file) - Otherwise: reuse digest from old manifest (unchanged) 4. Deleted files are automatically excluded (not on disk) 5. 
Return manifest with mix of reused and new digests Usage for incremental signing (e.g., 500GB model, 1KB README changed): # Get changed files from git changed = subprocess.check_output(['git', 'diff', '--name-only', 'HEAD']) files_to_hash = [model_path / f for f in changed.decode().split()] # Only re-hash the changed file(s) serializer.serialize(model_path, files_to_hash=files_to_hash) This provides a significant performance improvement: only the changed 1KB is re-hashed instead of all 500GB. Includes comprehensive tests covering: - No changes: all digests reused - New file added: only new file hashed - Modified file: only modified file re-hashed - File deleted (auto): removed from manifest - File deleted (in files_to_hash): safely ignored - Mixed changes: all scenarios working together Related to issue #160 - API for incremental model re-hashing Signed-off-by: Emrick Donadei --- .../_serialization/incremental.py | 237 +++++++++++ tests/_serialization/incremental_test.py | 396 ++++++++++++++++++ 2 files changed, 633 insertions(+) create mode 100644 src/model_signing/_serialization/incremental.py create mode 100644 tests/_serialization/incremental_test.py diff --git a/src/model_signing/_serialization/incremental.py b/src/model_signing/_serialization/incremental.py new file mode 100644 index 00000000..176c76fa --- /dev/null +++ b/src/model_signing/_serialization/incremental.py @@ -0,0 +1,237 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Incremental model serializer for selective file re-hashing. + +This module provides a serializer that can reuse digests from an existing +manifest, only re-hashing files that have changed. This is useful for large +models where only a small subset of files change between signings. +""" + +from collections.abc import Callable, Iterable +import concurrent.futures +import itertools +import os +import pathlib +from typing import Optional + +from typing_extensions import override + +from model_signing import manifest +from model_signing._hashing import io +from model_signing._serialization import serialization + + +class IncrementalSerializer(serialization.Serializer): + """Model serializer that only re-hashes changed files. + + This serializer compares the current model state against an existing + manifest (from a previous signature) and only re-hashes files that: + - Are new (not in the existing manifest) + - Are explicitly requested via the files_to_hash parameter + (i.e., the caller marked them as modified) + + Files that exist in both the current model and the existing manifest + and are not listed in files_to_hash will have their digests reused + from the existing manifest without re-hashing. + + This provides significant performance improvements for large models where + only a small number of files change between signings (e.g., updating + documentation in a 200GB model). 
+ """ + + def __init__( + self, + file_hasher_factory: Callable[[pathlib.Path], io.FileHasher], + existing_manifest: manifest.Manifest, + *, + max_workers: Optional[int] = None, + allow_symlinks: bool = False, + ignore_paths: Iterable[pathlib.Path] = frozenset(), + ): + """Initializes an incremental serializer. + + Args: + file_hasher_factory: A callable to build the hash engine used to + hash individual files. + existing_manifest: The manifest from a previous signature. Digests + from this manifest will be reused for unchanged files. + max_workers: Maximum number of workers to use in parallel. Default + is to defer to the `concurrent.futures` library. + allow_symlinks: Controls whether symbolic links are included. If a + symlink is present but the flag is `False` (default) the + serialization would raise an error. + ignore_paths: The paths of files to ignore. + """ + self._hasher_factory = file_hasher_factory + self._existing_manifest = existing_manifest + self._max_workers = max_workers + self._allow_symlinks = allow_symlinks + self._ignore_paths = ignore_paths + + # Build lookup dictionary: file path -> manifest item + self._existing_items = {} + for item in existing_manifest._item_to_digest.keys(): + # item is a _File or _Shard key; we only support files for now + if isinstance(item, manifest._File): + self._existing_items[item.path] = item + + # Precompute serialization description + hasher = file_hasher_factory(pathlib.Path()) + self._serialization_description = manifest._FileSerialization( + hasher.digest_name, self._allow_symlinks, self._ignore_paths + ) + self._is_blake3 = hasher.digest_name == "blake3" + + @override + def serialize( + self, + model_path: pathlib.Path, + *, + ignore_paths: Iterable[pathlib.Path] = frozenset(), + files_to_hash: Optional[Iterable[pathlib.Path]] = None, + ) -> manifest.Manifest: + """Serializes the model, only re-hashing changed/new files. + + Args: + model_path: The path to the model. + ignore_paths: The paths to ignore during serialization. If a + provided path is a directory, all children of the directory are + ignored. + files_to_hash: Optional list of files that may have changed and + should be re-hashed. If None, all files in the model directory + are scanned, and only NEW files (not in existing manifest) are + hashed. Existing files have their digests reused. + + To detect changed files, use git diff or similar: + changed_files = subprocess.check_output( + ['git', 'diff', '--name-only', 'HEAD'] + ).decode().splitlines() + files_to_hash = [model_path / f for f in changed_files] + + Returns: + The model's serialized manifest with a mix of reused and + newly-computed digests. + + Raises: + ValueError: The model contains a symbolic link, but the serializer + was not initialized with `allow_symlinks=True`. 
+ """ + # Build a set of files to rehash (files that potentially changed) + rehash_paths = set() + if files_to_hash is not None: + # User provided explicit list of changed files + for path in files_to_hash: + if path.is_file(): + rehash_paths.add(path.relative_to(model_path)) + + # Scan directory to find all current files in the model + all_current_files = [] + for path in itertools.chain((model_path,), model_path.glob("**/*")): + if serialization.should_ignore(path, ignore_paths): + continue + serialization.check_file_or_directory( + path, allow_symlinks=self._allow_symlinks + ) + if path.is_file(): + all_current_files.append(path) + + # Build the new manifest + files_to_rehash = [] + manifest_items = [] + + for path in all_current_files: + relative_path = path.relative_to(model_path) + posix_path = pathlib.PurePosixPath(relative_path) + + # Determine if this file needs re-hashing + needs_rehash = False + + if posix_path not in self._existing_items: + # New file not in old manifest - must hash it + needs_rehash = True + elif rehash_paths and relative_path in rehash_paths: + # File was explicitly marked as changed - must re-hash it + needs_rehash = True + elif not rehash_paths: + # No explicit files_to_hash provided, so we're in "scan mode" + # Reuse digest for existing files (assume unchanged) + needs_rehash = False + else: + # File exists in old manifest and wasn't marked as changed + # Reuse old digest + needs_rehash = False + + if needs_rehash: + files_to_rehash.append(path) + else: + # Reuse existing digest + old_item_key = self._existing_items[posix_path] + old_digest = self._existing_manifest._item_to_digest[old_item_key] + manifest_items.append( + manifest.FileManifestItem( + path=relative_path, digest=old_digest + ) + ) + + # Hash all files that need re-hashing in parallel + with concurrent.futures.ThreadPoolExecutor( + max_workers=1 if self._is_blake3 else self._max_workers + ) as tpe: + futures = [ + tpe.submit(self._compute_hash, model_path, path) + for path in files_to_rehash + ] + for future in concurrent.futures.as_completed(futures): + manifest_items.append(future.result()) + + # Handle ignore_paths for serialization description + if ignore_paths: + rel_ignore_paths = [] + for p in ignore_paths: + rp = os.path.relpath(p, model_path) + if not rp.startswith("../"): + rel_ignore_paths.append(pathlib.Path(rp)) + + hasher = self._hasher_factory(pathlib.Path()) + self._serialization_description = manifest._FileSerialization( + hasher.digest_name, + self._allow_symlinks, + frozenset(list(self._ignore_paths) + rel_ignore_paths), + ) + + model_name = model_path.name + if not model_name or model_name == "..": + model_name = os.path.basename(model_path.resolve()) + + return manifest.Manifest( + model_name, manifest_items, self._serialization_description + ) + + def _compute_hash( + self, model_path: pathlib.Path, path: pathlib.Path + ) -> manifest.FileManifestItem: + """Produces the manifest item of the file given by `path`. + + Args: + model_path: The path to the model. + path: Path to the file in the model, that is currently transformed + to a manifest item. + + Returns: + The itemized manifest. 
+ """ + relative_path = path.relative_to(model_path) + digest = self._hasher_factory(path).compute() + return manifest.FileManifestItem(path=relative_path, digest=digest) diff --git a/tests/_serialization/incremental_test.py b/tests/_serialization/incremental_test.py new file mode 100644 index 00000000..63aa9a1d --- /dev/null +++ b/tests/_serialization/incremental_test.py @@ -0,0 +1,396 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for incremental serialization.""" + +import pathlib + +import pytest + +from model_signing import manifest +from model_signing._hashing import hashing +from model_signing._hashing import io as io_hashing +from model_signing._hashing import memory +from model_signing._serialization import incremental + + +class TestIncrementalSerializer: + def test_no_changes_reuses_all_digests(self, tmp_path): + """When no files change, all digests should be reused.""" + # Create a model with two files + model_dir = tmp_path / "model" + model_dir.mkdir() + (model_dir / "file1.txt").write_text("content1") + (model_dir / "file2.txt").write_text("content2") + + # Create an existing manifest (simulate previous signature) + digest1 = hashing.Digest("sha256", b"digest1_bytes_here") + digest2 = hashing.Digest("sha256", b"digest2_bytes_here") + + item1 = manifest.FileManifestItem( + path=pathlib.PurePath("file1.txt"), digest=digest1 + ) + item2 = manifest.FileManifestItem( + path=pathlib.PurePath("file2.txt"), digest=digest2 + ) + + existing_manifest = manifest.Manifest( + "model", + [item1, item2], + manifest._FileSerialization("sha256"), + ) + + # Create incremental serializer + def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: + return io_hashing.SimpleFileHasher(path, memory.SHA256()) + + serializer = incremental.IncrementalSerializer( + hasher_factory, existing_manifest + ) + + # Serialize the model incrementally + new_manifest = serializer.serialize(model_dir) + + # Verify that digests were reused (not re-computed) + descriptors = list(new_manifest.resource_descriptors()) + assert len(descriptors) == 2 + + # Find each file's descriptor + file1_desc = next(d for d in descriptors if d.identifier == "file1.txt") + file2_desc = next(d for d in descriptors if d.identifier == "file2.txt") + + # Verify digests match the old manifest (were reused) + assert file1_desc.digest.digest_value == b"digest1_bytes_here" + assert file2_desc.digest.digest_value == b"digest2_bytes_here" + + def test_new_file_is_hashed(self, tmp_path): + """When a new file is added, it should be hashed.""" + # Create a model with one existing file + model_dir = tmp_path / "model" + model_dir.mkdir() + (model_dir / "file1.txt").write_text("content1") + (model_dir / "file2.txt").write_text("content2") # This is new + + # Create existing manifest with only file1 + digest1 = hashing.Digest("sha256", b"digest1_bytes_here") + item1 = manifest.FileManifestItem( + path=pathlib.PurePath("file1.txt"), digest=digest1 + ) + + existing_manifest = manifest.Manifest( 
+ "model", + [item1], + manifest._FileSerialization("sha256"), + ) + + # Create incremental serializer + def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: + return io_hashing.SimpleFileHasher(path, memory.SHA256()) + + serializer = incremental.IncrementalSerializer( + hasher_factory, existing_manifest + ) + + # Serialize the model incrementally + new_manifest = serializer.serialize(model_dir) + + # Verify we have both files + descriptors = list(new_manifest.resource_descriptors()) + assert len(descriptors) == 2 + + # file1 should have reused digest + file1_desc = next(d for d in descriptors if d.identifier == "file1.txt") + assert file1_desc.digest.digest_value == b"digest1_bytes_here" + + # file2 should have a new hash (not the fake digest) + file2_desc = next(d for d in descriptors if d.identifier == "file2.txt") + # It should be the actual SHA256 of "content2", not a reused digest + assert file2_desc.digest.digest_value != b"digest1_bytes_here" + assert file2_desc.digest.algorithm == "sha256" + + def test_deleted_file_not_in_manifest(self, tmp_path): + """When a file is deleted, it should not appear in new manifest.""" + # Create a model with only one file + model_dir = tmp_path / "model" + model_dir.mkdir() + (model_dir / "file1.txt").write_text("content1") + + # Create existing manifest with two files (file2 was deleted) + digest1 = hashing.Digest("sha256", b"digest1_bytes_here") + digest2 = hashing.Digest("sha256", b"digest2_bytes_here") + + item1 = manifest.FileManifestItem( + path=pathlib.PurePath("file1.txt"), digest=digest1 + ) + item2 = manifest.FileManifestItem( + path=pathlib.PurePath("file2.txt"), digest=digest2 + ) + + existing_manifest = manifest.Manifest( + "model", + [item1, item2], + manifest._FileSerialization("sha256"), + ) + + # Create incremental serializer + def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: + return io_hashing.SimpleFileHasher(path, memory.SHA256()) + + serializer = incremental.IncrementalSerializer( + hasher_factory, existing_manifest + ) + + # Serialize the model incrementally + new_manifest = serializer.serialize(model_dir) + + # Verify only file1 is in the manifest + descriptors = list(new_manifest.resource_descriptors()) + assert len(descriptors) == 1 + assert descriptors[0].identifier == "file1.txt" + assert descriptors[0].digest.digest_value == b"digest1_bytes_here" + + def test_empty_existing_manifest_hashes_all(self, tmp_path): + """With an empty existing manifest, all files should be hashed.""" + # Create a model with files + model_dir = tmp_path / "model" + model_dir.mkdir() + (model_dir / "file1.txt").write_text("content1") + (model_dir / "file2.txt").write_text("content2") + + # Create empty existing manifest + existing_manifest = manifest.Manifest( + "model", + [], + manifest._FileSerialization("sha256"), + ) + + # Create incremental serializer + def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: + return io_hashing.SimpleFileHasher(path, memory.SHA256()) + + serializer = incremental.IncrementalSerializer( + hasher_factory, existing_manifest + ) + + # Serialize the model incrementally + new_manifest = serializer.serialize(model_dir) + + # Verify both files are hashed + descriptors = list(new_manifest.resource_descriptors()) + assert len(descriptors) == 2 + + # Both should have real hashes (not fake digests) + for desc in descriptors: + assert desc.digest.algorithm == "sha256" + assert len(desc.digest.digest_value) == 32 # SHA256 is 32 bytes + + def 
test_modified_file_with_files_to_hash_parameter(self, tmp_path): + """When a file is modified and specified in files_to_hash, it should be re-hashed.""" + # Create a model with two files + model_dir = tmp_path / "model" + model_dir.mkdir() + (model_dir / "file1.txt").write_text("content1") + (model_dir / "README.md").write_text("old readme") + + # Create existing manifest with both files + digest1 = hashing.Digest("sha256", b"digest1_bytes_here") + digest_readme_old = hashing.Digest("sha256", b"old_readme_digest") + + item1 = manifest.FileManifestItem( + path=pathlib.PurePath("file1.txt"), digest=digest1 + ) + item_readme = manifest.FileManifestItem( + path=pathlib.PurePath("README.md"), digest=digest_readme_old + ) + + existing_manifest = manifest.Manifest( + "model", + [item1, item_readme], + manifest._FileSerialization("sha256"), + ) + + # User modifies README.md + (model_dir / "README.md").write_text("new readme content") + + # Create incremental serializer + def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: + return io_hashing.SimpleFileHasher(path, memory.SHA256()) + + serializer = incremental.IncrementalSerializer( + hasher_factory, existing_manifest + ) + + # Serialize with files_to_hash specifying the changed file + new_manifest = serializer.serialize( + model_dir, + files_to_hash=[model_dir / "README.md"] # Only this file changed + ) + + # Verify we have both files + descriptors = list(new_manifest.resource_descriptors()) + assert len(descriptors) == 2 + + # file1.txt should have reused digest + file1_desc = next(d for d in descriptors if d.identifier == "file1.txt") + assert file1_desc.digest.digest_value == b"digest1_bytes_here" + + # README.md should have a NEW hash (not the old one) + readme_desc = next(d for d in descriptors if d.identifier == "README.md") + assert readme_desc.digest.digest_value != b"old_readme_digest" + assert readme_desc.digest.algorithm == "sha256" + assert len(readme_desc.digest.digest_value) == 32 # Real SHA256 + + def test_deleted_file_in_files_to_hash_is_handled(self, tmp_path): + """When a deleted file is in files_to_hash, it's safely ignored.""" + # Create a model with files + model_dir = tmp_path / "model" + model_dir.mkdir() + (model_dir / "README.md").write_text("readme") + (model_dir / "weights.bin").write_text("weights") + + # Create existing manifest with three files + digest_readme = hashing.Digest("sha256", b"readme_digest") + digest_old = hashing.Digest("sha256", b"old_file_digest") + digest_weights = hashing.Digest("sha256", b"weights_digest") + + item_readme = manifest.FileManifestItem( + path=pathlib.PurePath("README.md"), digest=digest_readme + ) + item_old = manifest.FileManifestItem( + path=pathlib.PurePath("old_file.txt"), digest=digest_old + ) + item_weights = manifest.FileManifestItem( + path=pathlib.PurePath("weights.bin"), digest=digest_weights + ) + + existing_manifest = manifest.Manifest( + "model", + [item_readme, item_old, item_weights], + manifest._FileSerialization("sha256"), + ) + + # Create incremental serializer + def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: + return io_hashing.SimpleFileHasher(path, memory.SHA256()) + + serializer = incremental.IncrementalSerializer( + hasher_factory, existing_manifest + ) + + # User specifies old_file.txt in files_to_hash (as git diff might) + # even though the file was deleted + deleted_file = model_dir / "old_file.txt" + new_manifest = serializer.serialize( + model_dir, + files_to_hash=[deleted_file] # Deleted file in the list + ) + + # Verify 
deleted file is NOT in new manifest + descriptors = list(new_manifest.resource_descriptors()) + assert len(descriptors) == 2 + + identifiers = [d.identifier for d in descriptors] + assert "README.md" in identifiers + assert "weights.bin" in identifiers + assert "old_file.txt" not in identifiers # Deleted file is gone + + # Other files should have reused digests + readme_desc = next(d for d in descriptors if d.identifier == "README.md") + assert readme_desc.digest.digest_value == b"readme_digest" + + weights_desc = next( + d for d in descriptors if d.identifier == "weights.bin" + ) + assert weights_desc.digest.digest_value == b"weights_digest" + + def test_mixed_changes_with_files_to_hash(self, tmp_path): + """Test realistic scenario: modify, add, delete files together.""" + # Initial state: three files + model_dir = tmp_path / "model" + model_dir.mkdir() + (model_dir / "README.md").write_text("old readme") + (model_dir / "weights.bin").write_text("weights") + (model_dir / "new_config.json").write_text("new config") + + # Old manifest has README.md, old_file.txt, weights.bin + digest_readme_old = hashing.Digest("sha256", b"old_readme_digest") + digest_old_file = hashing.Digest("sha256", b"old_file_digest") + digest_weights = hashing.Digest("sha256", b"weights_digest") + + item_readme = manifest.FileManifestItem( + path=pathlib.PurePath("README.md"), digest=digest_readme_old + ) + item_old = manifest.FileManifestItem( + path=pathlib.PurePath("old_file.txt"), digest=digest_old_file + ) + item_weights = manifest.FileManifestItem( + path=pathlib.PurePath("weights.bin"), digest=digest_weights + ) + + existing_manifest = manifest.Manifest( + "model", + [item_readme, item_old, item_weights], + manifest._FileSerialization("sha256"), + ) + + # User makes changes: + # - Modifies README.md + (model_dir / "README.md").write_text("new readme content") + # - Deletes old_file.txt (already not on disk) + # - Adds new_config.json (already on disk) + # - Leaves weights.bin unchanged + + # Create incremental serializer + def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: + return io_hashing.SimpleFileHasher(path, memory.SHA256()) + + serializer = incremental.IncrementalSerializer( + hasher_factory, existing_manifest + ) + + # Simulate git diff --name-only output + files_to_hash = [ + model_dir / "README.md", # Modified + model_dir / "old_file.txt", # Deleted + model_dir / "new_config.json", # Added + ] + + new_manifest = serializer.serialize(model_dir, files_to_hash=files_to_hash) + + # Verify results + descriptors = list(new_manifest.resource_descriptors()) + assert len(descriptors) == 3 + + identifiers = [d.identifier for d in descriptors] + assert "README.md" in identifiers # Modified + assert "new_config.json" in identifiers # Added + assert "weights.bin" in identifiers # Unchanged + assert "old_file.txt" not in identifiers # Deleted + + # README.md should have NEW hash (was modified) + readme_desc = next(d for d in descriptors if d.identifier == "README.md") + assert readme_desc.digest.digest_value != b"old_readme_digest" + assert len(readme_desc.digest.digest_value) == 32 + + # new_config.json should have NEW hash (was added) + config_desc = next( + d for d in descriptors if d.identifier == "new_config.json" + ) + assert len(config_desc.digest.digest_value) == 32 + + # weights.bin should have REUSED hash (unchanged) + weights_desc = next( + d for d in descriptors if d.identifier == "weights.bin" + ) + assert weights_desc.digest.digest_value == b"weights_digest" From 
afcc162d8f140968de09d6f930c75a9c0f089d5a Mon Sep 17 00:00:00 2001 From: Emrick Donadei Date: Sat, 1 Nov 2025 22:21:51 -0400 Subject: [PATCH 03/12] Add Config.use_incremental_serialization() to hashing API Integrates the IncrementalSerializer into the high-level hashing API, making it accessible through the Config class. Usage: # Extract manifest from previous signature old_manifest = Manifest.from_signature(Path("model.sig.old")) # Configure incremental hashing config = hashing.Config().use_incremental_serialization( old_manifest, hashing_algorithm="sha256" ) # Get changed files and hash them changed_files = [model_path / "README.md"] new_manifest = config.hash(model_path, files_to_hash=changed_files) This method follows the same pattern as use_file_serialization() and use_shard_serialization(), providing a consistent API for users. The configuration: - Accepts an existing manifest to compare against - Supports all the same hashing algorithms (SHA256, BLAKE2, BLAKE3) - Supports the same parameters (chunk_size, max_workers, etc.) - Returns Self for method chaining Related to issue #160 - API for incremental model re-hashing Signed-off-by: Emrick Donadei --- src/model_signing/hashing.py | 73 ++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/src/model_signing/hashing.py b/src/model_signing/hashing.py index cb2c453f..832588e0 100644 --- a/src/model_signing/hashing.py +++ b/src/model_signing/hashing.py @@ -61,6 +61,7 @@ from model_signing._hashing import memory from model_signing._serialization import file from model_signing._serialization import file_shard +from model_signing._serialization import incremental if sys.version_info >= (3, 11): @@ -375,6 +376,78 @@ def use_shard_serialization( ) return self + def use_incremental_serialization( + self, + existing_manifest: manifest.Manifest, + *, + hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256", + chunk_size: int = 1048576, + max_workers: Optional[int] = None, + allow_symlinks: bool = False, + ignore_paths: Iterable[pathlib.Path] = frozenset(), + ) -> Self: + """Configures incremental serialization for selective file re-hashing. + + This serialization method compares the current model state against an + existing manifest (from a previous signature) and only re-hashes files + that changed. This provides significant performance improvements for + large models where only a small subset of files change. + + The serialization method in this configuration is changed to one where: + - Files that exist in the existing manifest have their digests reused + - New files (not in existing manifest) are hashed + - Modified files (specified via files_to_hash in hash()) are re-hashed + - Deleted files are automatically omitted from the new manifest + + Usage example: + # Extract manifest from previous signature + old_manifest = manifest.Manifest.from_signature( + pathlib.Path("model.sig.old") + ) + + # Configure incremental hashing + config = hashing.Config().use_incremental_serialization( + old_manifest, + hashing_algorithm="sha256" + ) + + # Get changed files (e.g., from git) + changed_files = [model_path / "README.md"] + + # Hash only changed files + new_manifest = config.hash(model_path, files_to_hash=changed_files) + + Args: + existing_manifest: The manifest from a previous signature. Digests + from this manifest will be reused for unchanged files. + hashing_algorithm: The hashing algorithm to use for new/changed + files. Must match the algorithm used in existing_manifest. 
+ chunk_size: The amount of file to read at once. Default is 1MB. A + special value of 0 signals to attempt to read everything in a + single call. Ignored for BLAKE3. + max_workers: Maximum number of workers to use in parallel. Default + is to defer to the `concurrent.futures` library to select the best + value for the current machine, or the number of logical cores + when doing BLAKE3 hashing. + allow_symlinks: Controls whether symbolic links are included. If a + symlink is present but the flag is `False` (default) the + serialization would raise an error. + ignore_paths: Paths of files to ignore. + + Returns: + The new hashing configuration with incremental serialization. + """ + self._serializer = incremental.IncrementalSerializer( + self._build_file_hasher_factory( + hashing_algorithm, chunk_size, max_workers + ), + existing_manifest, + max_workers=max_workers, + allow_symlinks=allow_symlinks, + ignore_paths=ignore_paths, + ) + return self + def set_ignored_paths( self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True ) -> Self: From a3194d81d80cac1066d643f27a4935c4b2710981 Mon Sep 17 00:00:00 2001 From: Emrick Donadei Date: Sat, 1 Nov 2025 22:31:32 -0400 Subject: [PATCH 04/12] Add sign_incremental() convenience API for incremental signing Provides high-level convenience functions for incremental model signing that combine all the pieces: manifest extraction, incremental hashing, and signing. Two levels of API: 1. Simple function API: sign_incremental( model_path="huge-model/", old_signature_path="model.sig.old", new_signature_path="model.sig.new", files_to_hash=["huge-model/README.md"] ) 2. Configurable class API: Config().use_elliptic_key_signer(private_key="key").sign_incremental( model_path="huge-model/", old_signature_path="model.sig.old", new_signature_path="model.sig.new", files_to_hash=["huge-model/README.md"] ) Both APIs: - Extract manifest from old signature automatically - Configure incremental hashing - Hash only changed/new files - Sign the new manifest - Write the new signature Also added set_allow_symlinks() method to IncrementalSerializer to maintain compatibility with the hashing Config class, which calls this method before serialization. This makes it trivial for users to incrementally sign large models where only a few files changed, avoiding hours of re-hashing. 
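As an end-to-end sketch, the pieces compose as follows (the git
invocation is only an illustrative way to detect changed files; any
mechanism that yields the modified paths works):

    import pathlib
    import subprocess

    from model_signing import signing

    model_path = pathlib.Path("huge-model")
    # Illustrative change detection: ask git which files differ
    changed = subprocess.check_output(
        ["git", "-C", str(model_path), "diff", "--name-only", "HEAD"]
    ).decode().splitlines()

    signing.sign_incremental(
        model_path=model_path,
        old_signature_path="model.sig.old",
        new_signature_path="model.sig.new",
        files_to_hash=[model_path / f for f in changed],
    )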
Related to issue #160 - API for incremental model re-hashing Signed-off-by: Emrick Donadei --- .../_serialization/incremental.py | 8 ++ src/model_signing/signing.py | 98 +++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/src/model_signing/_serialization/incremental.py b/src/model_signing/_serialization/incremental.py index 176c76fa..ab58b061 100644 --- a/src/model_signing/_serialization/incremental.py +++ b/src/model_signing/_serialization/incremental.py @@ -94,6 +94,14 @@ def __init__( ) self._is_blake3 = hasher.digest_name == "blake3" + def set_allow_symlinks(self, allow_symlinks: bool) -> None: + """Set whether following symlinks is allowed.""" + self._allow_symlinks = allow_symlinks + hasher = self._hasher_factory(pathlib.Path()) + self._serialization_description = manifest._FileSerialization( + hasher.digest_name, self._allow_symlinks, self._ignore_paths + ) + @override def serialize( self, diff --git a/src/model_signing/signing.py b/src/model_signing/signing.py index 5c45a8eb..4063a2a2 100644 --- a/src/model_signing/signing.py +++ b/src/model_signing/signing.py @@ -48,6 +48,7 @@ from typing import Optional from model_signing import hashing +from model_signing import manifest from model_signing._signing import sign_certificate as certificate from model_signing._signing import sign_ec_key as ec_key from model_signing._signing import sign_sigstore as sigstore @@ -75,6 +76,52 @@ def sign(model_path: hashing.PathLike, signature_path: hashing.PathLike): Config().sign(model_path, signature_path) +def sign_incremental( + model_path: hashing.PathLike, + old_signature_path: hashing.PathLike, + new_signature_path: hashing.PathLike, + *, + files_to_hash: Optional[Iterable[hashing.PathLike]] = None, +): + """Signs a model incrementally, only re-hashing changed files. + + This function provides a convenient way to sign large models where only + a small subset of files have changed. Instead of re-hashing the entire + model (which can take hours for multi-hundred GB models), it reuses + digests from the previous signature for unchanged files and only hashes + new or modified files. + + In this default configuration we sign using Sigstore. + + Usage example: + # User modified README.md in a 500GB model + sign_incremental( + model_path="huge-model/", + old_signature_path="model.sig.old", + new_signature_path="model.sig.new", + files_to_hash=["huge-model/README.md"] + ) + + Args: + model_path: The path to the model to sign. + old_signature_path: The path to the previous signature. The manifest + from this signature will be extracted and used for incremental + hashing. + new_signature_path: The path where the new signature will be written. + files_to_hash: Optional list of files that changed and need to be + re-hashed. If None, only new files (not in old signature) will + be hashed. Existing files will have their digests reused. + To detect changed files, use git diff or similar tools. + + Raises: + FileNotFoundError: If old_signature_path doesn't exist. + ValueError: If old_signature_path cannot be parsed. + """ + Config().sign_incremental( + model_path, old_signature_path, new_signature_path, files_to_hash=files_to_hash + ) + + class Config: """Configuration to use when signing models. 
@@ -109,6 +156,57 @@ def sign( signature = self._signer.sign(payload) signature.write(pathlib.Path(signature_path)) + def sign_incremental( + self, + model_path: hashing.PathLike, + old_signature_path: hashing.PathLike, + new_signature_path: hashing.PathLike, + *, + files_to_hash: Optional[Iterable[hashing.PathLike]] = None, + ): + """Signs a model incrementally using the current configuration. + + This method extracts the manifest from an existing signature and + configures incremental hashing to reuse digests for unchanged files. + Only new or modified files are re-hashed, providing significant + performance improvements for large models. + + Args: + model_path: The path to the model to sign. + old_signature_path: The path to the previous signature. + new_signature_path: The path where the new signature will be written. + files_to_hash: Optional list of files that changed and need to be + re-hashed. If None, only new files will be hashed. + + Raises: + FileNotFoundError: If old_signature_path doesn't exist. + ValueError: If old_signature_path cannot be parsed. + """ + # Extract manifest from old signature + old_manifest = manifest.Manifest.from_signature( + pathlib.Path(old_signature_path) + ) + + # Configure incremental hashing + self._hashing_config.use_incremental_serialization(old_manifest) + + # Convert files_to_hash to pathlib.Path objects if provided + paths_to_hash = None + if files_to_hash is not None: + paths_to_hash = [pathlib.Path(f) for f in files_to_hash] + + # Hash the model incrementally + new_manifest = self._hashing_config.hash( + model_path, files_to_hash=paths_to_hash + ) + + # Sign the new manifest + if not self._signer: + self.use_sigstore_signer() + payload = signing.Payload(new_manifest) + signature = self._signer.sign(payload) + signature.write(pathlib.Path(new_signature_path)) + def set_hashing_config(self, hashing_config: hashing.Config) -> Self: """Sets the new configuration for hashing models. From 91aa3ff1673557506f14e9bc6ef4d0b96c9e2d50 Mon Sep 17 00:00:00 2001 From: Emrick Donadei Date: Mon, 3 Nov 2025 10:43:34 -0500 Subject: [PATCH 05/12] Fix lint errors: line lengths and unused imports - Fix SIM118: Use 'key in dict' instead of 'key in dict.keys()' - Fix E501: Break long lines to stay under 80 characters - Fix F401: Remove unused pytest import from incremental_test.py - Fix F401: Remove unused json import from manifest_test.py All critical lint errors resolved. 
Signed-off-by: Emrick Donadei --- .../_serialization/incremental.py | 6 ++-- src/model_signing/signing.py | 12 ++++--- tests/_serialization/incremental_test.py | 20 +++++++----- tests/manifest_test.py | 31 ++++++++++++++----- 4 files changed, 48 insertions(+), 21 deletions(-) diff --git a/src/model_signing/_serialization/incremental.py b/src/model_signing/_serialization/incremental.py index ab58b061..df8709b9 100644 --- a/src/model_signing/_serialization/incremental.py +++ b/src/model_signing/_serialization/incremental.py @@ -82,7 +82,7 @@ def __init__( # Build lookup dictionary: file path -> manifest item self._existing_items = {} - for item in existing_manifest._item_to_digest.keys(): + for item in existing_manifest._item_to_digest: # item is a _File or _Shard key; we only support files for now if isinstance(item, manifest._File): self._existing_items[item.path] = item @@ -186,7 +186,9 @@ def serialize( else: # Reuse existing digest old_item_key = self._existing_items[posix_path] - old_digest = self._existing_manifest._item_to_digest[old_item_key] + old_digest = ( + self._existing_manifest._item_to_digest[old_item_key] + ) manifest_items.append( manifest.FileManifestItem( path=relative_path, digest=old_digest diff --git a/src/model_signing/signing.py b/src/model_signing/signing.py index 4063a2a2..86feb3f5 100644 --- a/src/model_signing/signing.py +++ b/src/model_signing/signing.py @@ -118,7 +118,10 @@ def sign_incremental( ValueError: If old_signature_path cannot be parsed. """ Config().sign_incremental( - model_path, old_signature_path, new_signature_path, files_to_hash=files_to_hash + model_path, + old_signature_path, + new_signature_path, + files_to_hash=files_to_hash, ) @@ -174,9 +177,10 @@ def sign_incremental( Args: model_path: The path to the model to sign. old_signature_path: The path to the previous signature. - new_signature_path: The path where the new signature will be written. - files_to_hash: Optional list of files that changed and need to be - re-hashed. If None, only new files will be hashed. + new_signature_path: The path where the new signature will be + written. + files_to_hash: Optional list of files that changed and need to + be re-hashed. If None, only new files will be hashed. Raises: FileNotFoundError: If old_signature_path doesn't exist. 
diff --git a/tests/_serialization/incremental_test.py b/tests/_serialization/incremental_test.py index 63aa9a1d..11fcaf38 100644 --- a/tests/_serialization/incremental_test.py +++ b/tests/_serialization/incremental_test.py @@ -16,8 +16,6 @@ import pathlib -import pytest - from model_signing import manifest from model_signing._hashing import hashing from model_signing._hashing import io as io_hashing @@ -196,7 +194,7 @@ def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: assert len(desc.digest.digest_value) == 32 # SHA256 is 32 bytes def test_modified_file_with_files_to_hash_parameter(self, tmp_path): - """When a file is modified and specified in files_to_hash, it should be re-hashed.""" + """Test file is re-hashed when modified and in files_to_hash.""" # Create a model with two files model_dir = tmp_path / "model" model_dir.mkdir() @@ -246,7 +244,9 @@ def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: assert file1_desc.digest.digest_value == b"digest1_bytes_here" # README.md should have a NEW hash (not the old one) - readme_desc = next(d for d in descriptors if d.identifier == "README.md") + readme_desc = next( + d for d in descriptors if d.identifier == "README.md" + ) assert readme_desc.digest.digest_value != b"old_readme_digest" assert readme_desc.digest.algorithm == "sha256" assert len(readme_desc.digest.digest_value) == 32 # Real SHA256 @@ -306,7 +306,9 @@ def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: assert "old_file.txt" not in identifiers # Deleted file is gone # Other files should have reused digests - readme_desc = next(d for d in descriptors if d.identifier == "README.md") + readme_desc = next( + d for d in descriptors if d.identifier == "README.md" + ) assert readme_desc.digest.digest_value == b"readme_digest" weights_desc = next( @@ -366,7 +368,9 @@ def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: model_dir / "new_config.json", # Added ] - new_manifest = serializer.serialize(model_dir, files_to_hash=files_to_hash) + new_manifest = serializer.serialize( + model_dir, files_to_hash=files_to_hash + ) # Verify results descriptors = list(new_manifest.resource_descriptors()) @@ -379,7 +383,9 @@ def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: assert "old_file.txt" not in identifiers # Deleted # README.md should have NEW hash (was modified) - readme_desc = next(d for d in descriptors if d.identifier == "README.md") + readme_desc = next( + d for d in descriptors if d.identifier == "README.md" + ) assert readme_desc.digest.digest_value != b"old_readme_digest" assert len(readme_desc.digest.digest_value) == 32 diff --git a/tests/manifest_test.py b/tests/manifest_test.py index 4b110ad5..e81e3577 100644 --- a/tests/manifest_test.py +++ b/tests/manifest_test.py @@ -221,7 +221,10 @@ def test_from_signature_rejects_inconsistent_manifest(self, tmp_path): { "name": "test_model", "digest": { - "sha256": "0b8a5a8c8e8f1a8b8c8d8e8f2a8b8c8d8e8f3a8b8c8d8e8f4a8b8c8d8e8f5a8b" + "sha256": ( + "0b8a5a8c8e8f1a8b8c8d8e8f2a8b8c8d8e8f3a8b8c8d" + "8e8f4a8b8c8d8e8f5a8b" + ) }, } ], @@ -237,12 +240,18 @@ def test_from_signature_rejects_inconsistent_manifest(self, tmp_path): { "name": "file1.txt", "algorithm": "sha256", - "digest": "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234", + "digest": ( + "abcd1234abcd1234abcd1234abcd1234" + "abcd1234abcd1234abcd1234abcd1234" + ), }, { "name": "file2.txt", "algorithm": "sha256", - "digest": "5678dcba5678dcba5678dcba5678dcba5678dcba5678dcba5678dcba5678dcba", + "digest": ( + 
"5678dcba5678dcba5678dcba5678dcba" + "5678dcba5678dcba5678dcba5678dcba" + ), }, ], }, @@ -281,8 +290,14 @@ def test_from_signature_extracts_valid_manifest(self, tmp_path): import json # Create valid SHA256 hex digests (64 chars each) - digest1_hex = "abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234abcd1234" - digest2_hex = "5678dcba5678dcba5678dcba5678dcba5678dcba5678dcba5678dcba5678dcba" + digest1_hex = ( + "abcd1234abcd1234abcd1234abcd1234" + "abcd1234abcd1234abcd1234abcd1234" + ) + digest2_hex = ( + "5678dcba5678dcba5678dcba5678dcba" + "5678dcba5678dcba5678dcba5678dcba" + ) digest1_bytes = bytes.fromhex(digest1_hex) digest2_bytes = bytes.fromhex(digest2_hex) @@ -368,9 +383,9 @@ def test_from_signature_invalid_json(self, tmp_path): manifest.Manifest.from_signature(sig_file) def test_from_signature_missing_envelope(self, tmp_path): - import json - sig_file = tmp_path / "missing_envelope.sig" sig_file.write_text("{}", encoding="utf-8") - with pytest.raises(ValueError, match="does not contain a DSSE envelope"): + with pytest.raises( + ValueError, match="does not contain a DSSE envelope" + ): manifest.Manifest.from_signature(sig_file) From 4cd1ddc39d8728e8b5a24554f2797ee51315e0b6 Mon Sep 17 00:00:00 2001 From: Emrick Donadei Date: Mon, 3 Nov 2025 10:48:00 -0500 Subject: [PATCH 06/12] Apply ruff formatter to match project style Auto-format code with ruff to match the project's formatting standards: - Adjust line breaking for long expressions - Format function call arguments consistently - Apply consistent parentheses placement No functional changes, only formatting. Signed-off-by: Emrick Donadei --- .../_serialization/incremental.py | 6 ++--- src/model_signing/manifest.py | 4 +--- tests/_serialization/incremental_test.py | 24 ++++++------------- 3 files changed, 11 insertions(+), 23 deletions(-) diff --git a/src/model_signing/_serialization/incremental.py b/src/model_signing/_serialization/incremental.py index df8709b9..d37676d8 100644 --- a/src/model_signing/_serialization/incremental.py +++ b/src/model_signing/_serialization/incremental.py @@ -186,9 +186,9 @@ def serialize( else: # Reuse existing digest old_item_key = self._existing_items[posix_path] - old_digest = ( - self._existing_manifest._item_to_digest[old_item_key] - ) + old_digest = self._existing_manifest._item_to_digest[ + old_item_key + ] manifest_items.append( manifest.FileManifestItem( path=relative_path, digest=old_digest diff --git a/src/model_signing/manifest.py b/src/model_signing/manifest.py index 20ab436b..e175e541 100644 --- a/src/model_signing/manifest.py +++ b/src/model_signing/manifest.py @@ -504,9 +504,7 @@ def from_signature(cls, signature_path: pathlib.Path) -> Self: # Alternative snake_case naming envelope = bundle_dict["dsse_envelope"] else: - raise ValueError( - "Signature file does not contain a DSSE envelope" - ) + raise ValueError("Signature file does not contain a DSSE envelope") # Decode the payload (it's base64 encoded) payload_b64 = envelope.get("payload") diff --git a/tests/_serialization/incremental_test.py b/tests/_serialization/incremental_test.py index 11fcaf38..3c4a44a0 100644 --- a/tests/_serialization/incremental_test.py +++ b/tests/_serialization/incremental_test.py @@ -44,9 +44,7 @@ def test_no_changes_reuses_all_digests(self, tmp_path): ) existing_manifest = manifest.Manifest( - "model", - [item1, item2], - manifest._FileSerialization("sha256"), + "model", [item1, item2], manifest._FileSerialization("sha256") ) # Create incremental serializer @@ -87,9 +85,7 @@ def 
test_new_file_is_hashed(self, tmp_path): ) existing_manifest = manifest.Manifest( - "model", - [item1], - manifest._FileSerialization("sha256"), + "model", [item1], manifest._FileSerialization("sha256") ) # Create incremental serializer @@ -136,9 +132,7 @@ def test_deleted_file_not_in_manifest(self, tmp_path): ) existing_manifest = manifest.Manifest( - "model", - [item1, item2], - manifest._FileSerialization("sha256"), + "model", [item1, item2], manifest._FileSerialization("sha256") ) # Create incremental serializer @@ -168,9 +162,7 @@ def test_empty_existing_manifest_hashes_all(self, tmp_path): # Create empty existing manifest existing_manifest = manifest.Manifest( - "model", - [], - manifest._FileSerialization("sha256"), + "model", [], manifest._FileSerialization("sha256") ) # Create incremental serializer @@ -213,9 +205,7 @@ def test_modified_file_with_files_to_hash_parameter(self, tmp_path): ) existing_manifest = manifest.Manifest( - "model", - [item1, item_readme], - manifest._FileSerialization("sha256"), + "model", [item1, item_readme], manifest._FileSerialization("sha256") ) # User modifies README.md @@ -232,7 +222,7 @@ def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: # Serialize with files_to_hash specifying the changed file new_manifest = serializer.serialize( model_dir, - files_to_hash=[model_dir / "README.md"] # Only this file changed + files_to_hash=[model_dir / "README.md"], # Only this file changed ) # Verify we have both files @@ -293,7 +283,7 @@ def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: deleted_file = model_dir / "old_file.txt" new_manifest = serializer.serialize( model_dir, - files_to_hash=[deleted_file] # Deleted file in the list + files_to_hash=[deleted_file], # Deleted file in the list ) # Verify deleted file is NOT in new manifest From f68361718daf6cd2e0d5e82afcda201ae1dc74d7 Mon Sep 17 00:00:00 2001 From: Emrick Donadei Date: Mon, 17 Nov 2025 20:37:30 -0500 Subject: [PATCH 07/12] Handle shard-based manifests in incremental serialization When the existing manifest uses shard-based serialization, we cannot reuse shard digests for file-based serialization. This commit updates IncrementalSerializer to detect shard-based manifests via the serialization_type property and automatically rehash all files in that case. 
- Check serialization_type.method == "shards" in __init__ - Rehash all files if existing manifest was shard-based - Add test_sharded_manifest_rehashes_all_files to verify behavior Signed-off-by: Emrick Donadei --- .../_serialization/incremental.py | 23 +++++-- tests/_serialization/incremental_test.py | 69 +++++++++++++++++++ 2 files changed, 86 insertions(+), 6 deletions(-) diff --git a/src/model_signing/_serialization/incremental.py b/src/model_signing/_serialization/incremental.py index d37676d8..ca2583b4 100644 --- a/src/model_signing/_serialization/incremental.py +++ b/src/model_signing/_serialization/incremental.py @@ -80,12 +80,19 @@ def __init__( self._allow_symlinks = allow_symlinks self._ignore_paths = ignore_paths - # Build lookup dictionary: file path -> manifest item + # Check if existing manifest used shard-based serialization + # If so, we need to rehash all files (can't reuse shard digests) + self._was_sharded = ( + existing_manifest.serialization_type.get("method") == "shards" + ) + + # Build lookup dictionary: file path -> _File (for files we can reuse) + # Only populate if the existing manifest was file-based self._existing_items = {} - for item in existing_manifest._item_to_digest: - # item is a _File or _Shard key; we only support files for now - if isinstance(item, manifest._File): - self._existing_items[item.path] = item + if not self._was_sharded: + for item in existing_manifest._item_to_digest: + if isinstance(item, manifest._File): + self._existing_items[item.path] = item # Precompute serialization description hasher = file_hasher_factory(pathlib.Path()) @@ -166,7 +173,11 @@ def serialize( # Determine if this file needs re-hashing needs_rehash = False - if posix_path not in self._existing_items: + if self._was_sharded: + # Previous manifest used shard-based serialization + # Must rehash all files (can't reuse shard digests) + needs_rehash = True + elif posix_path not in self._existing_items: # New file not in old manifest - must hash it needs_rehash = True elif rehash_paths and relative_path in rehash_paths: diff --git a/tests/_serialization/incremental_test.py b/tests/_serialization/incremental_test.py index 3c4a44a0..875bf664 100644 --- a/tests/_serialization/incremental_test.py +++ b/tests/_serialization/incremental_test.py @@ -14,6 +14,7 @@ """Tests for incremental serialization.""" +import hashlib import pathlib from model_signing import manifest @@ -390,3 +391,71 @@ def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: d for d in descriptors if d.identifier == "weights.bin" ) assert weights_desc.digest.digest_value == b"weights_digest" + + def test_sharded_manifest_rehashes_all_files(self, tmp_path): + """When existing manifest is shard-based, all files are rehashed.""" + # Create a model with two files + model_dir = tmp_path / "model" + model_dir.mkdir() + (model_dir / "file1.txt").write_text("content1") + (model_dir / "large_file.bin").write_bytes(b"large content here") + + # Create an existing shard-based manifest + # (both files were sharded in the previous signature) + shard1 = manifest.ShardedFileManifestItem( + path=pathlib.PurePath("file1.txt"), + start=0, + end=100, + digest=hashing.Digest("sha256", b"file1_shard_digest"), + ) + shard2 = manifest.ShardedFileManifestItem( + path=pathlib.PurePath("large_file.bin"), + start=0, + end=100, + digest=hashing.Digest("sha256", b"large_shard1_digest"), + ) + shard3 = manifest.ShardedFileManifestItem( + path=pathlib.PurePath("large_file.bin"), + start=100, + end=200, + digest=hashing.Digest("sha256", 
b"large_shard2_digest"), + ) + + existing_manifest = manifest.Manifest( + "model", + [shard1, shard2, shard3], + manifest._ShardSerialization("sha256", shard_size=100), + ) + + # Create incremental serializer + def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: + return io_hashing.SimpleFileHasher(path, memory.SHA256()) + + serializer = incremental.IncrementalSerializer( + hasher_factory, existing_manifest + ) + + # Serialize the model incrementally + new_manifest = serializer.serialize(model_dir) + + # Verify results: both files should be re-hashed + # (can't reuse shard digests for file-based serialization) + descriptors = list(new_manifest.resource_descriptors()) + assert len(descriptors) == 2 + + # Both files should have fresh digests computed + file1_desc = next(d for d in descriptors if d.identifier == "file1.txt") + # Should be real SHA256 of "content1", not the shard digest + expected_digest1 = hashlib.sha256(b"content1").digest() + assert file1_desc.digest.digest_value == expected_digest1 + assert file1_desc.digest.digest_value != b"file1_shard_digest" + + # large_file.bin should also be freshly hashed + large_desc = next( + d for d in descriptors if d.identifier == "large_file.bin" + ) + # Should be real SHA256 of "large content here", not shard digests + expected_digest2 = hashlib.sha256(b"large content here").digest() + assert large_desc.digest.digest_value == expected_digest2 + assert large_desc.digest.digest_value != b"large_shard1_digest" + assert large_desc.digest.digest_value != b"large_shard2_digest" From 8672a518c0e9357ff8a23a816ad059b7352a97d3 Mon Sep 17 00:00:00 2001 From: Emrick Donadei Date: Mon, 17 Nov 2025 20:45:13 -0500 Subject: [PATCH 08/12] Extract rehash decision logic to _should_rehash_file method Refactor the rehash decision logic in serialize() into a dedicated _should_rehash_file() helper method. This improves readability and will allow reuse when adding support for incremental hashing of shards in the future. No functional changes - purely code organization improvement. Signed-off-by: Emrick Donadei --- .../_serialization/incremental.py | 62 ++++++++++++------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/src/model_signing/_serialization/incremental.py b/src/model_signing/_serialization/incremental.py index ca2583b4..3e1e1a17 100644 --- a/src/model_signing/_serialization/incremental.py +++ b/src/model_signing/_serialization/incremental.py @@ -109,6 +109,44 @@ def set_allow_symlinks(self, allow_symlinks: bool) -> None: hasher.digest_name, self._allow_symlinks, self._ignore_paths ) + def _should_rehash_file( + self, + posix_path: pathlib.PurePosixPath, + relative_path: pathlib.Path, + rehash_paths: set[pathlib.Path], + ) -> bool: + """Determines if a file needs to be re-hashed. + + Args: + posix_path: The POSIX path of the file relative to model root. + relative_path: The relative path of the file. + rehash_paths: Set of paths explicitly marked for re-hashing. + + Returns: + True if the file needs re-hashing, False if digest can be reused. 
+ """ + if self._was_sharded: + # Previous manifest used shard-based serialization + # Must rehash all files (can't reuse shard digests) + return True + + if posix_path not in self._existing_items: + # New file not in old manifest - must hash it + return True + + if rehash_paths and relative_path in rehash_paths: + # File was explicitly marked as changed - must re-hash it + return True + + if not rehash_paths: + # No explicit files_to_hash provided, so we're in "scan mode" + # Reuse digest for existing files (assume unchanged) + return False + + # File exists in old manifest and wasn't marked as changed + # Reuse old digest + return False + @override def serialize( self, @@ -171,27 +209,9 @@ def serialize( posix_path = pathlib.PurePosixPath(relative_path) # Determine if this file needs re-hashing - needs_rehash = False - - if self._was_sharded: - # Previous manifest used shard-based serialization - # Must rehash all files (can't reuse shard digests) - needs_rehash = True - elif posix_path not in self._existing_items: - # New file not in old manifest - must hash it - needs_rehash = True - elif rehash_paths and relative_path in rehash_paths: - # File was explicitly marked as changed - must re-hash it - needs_rehash = True - elif not rehash_paths: - # No explicit files_to_hash provided, so we're in "scan mode" - # Reuse digest for existing files (assume unchanged) - needs_rehash = False - else: - # File exists in old manifest and wasn't marked as changed - # Reuse old digest - needs_rehash = False - + needs_rehash = self._should_rehash_file( + posix_path, relative_path, rehash_paths + ) if needs_rehash: files_to_rehash.append(path) else: From 8848de68ef4083bd676d52c955255a6d2ff5adba Mon Sep 17 00:00:00 2001 From: Emrick Donadei Date: Mon, 17 Nov 2025 21:07:56 -0500 Subject: [PATCH 09/12] Auto-extract serialization parameters in incremental hashing Simplify the incremental serialization API by automatically extracting serialization parameters (hash algorithm, allow_symlinks, ignore_paths) from the existing manifest when not explicitly specified. 
This eliminates the need for users to manually specify parameters that should match the original manifest, making the API more user-friendly: Before: config.use_incremental_serialization( old_manifest, hashing_algorithm="sha256", allow_symlinks=False ) After: config.use_incremental_serialization(old_manifest) # Automatically uses same parameters as old_manifest - Extract hash_type from manifest._serialization_type - Map digest names to API parameters (blake2b -> blake2) - Handle shard-based manifests (default to sha256) - Add test to verify parameter extraction works correctly Signed-off-by: Emrick Donadei --- src/model_signing/hashing.py | 61 +++++++++++++++++++++++++++++------- tests/hashing_config_test.py | 26 +++++++++++++++ 2 files changed, 76 insertions(+), 11 deletions(-) diff --git a/src/model_signing/hashing.py b/src/model_signing/hashing.py index 832588e0..5fe88cc0 100644 --- a/src/model_signing/hashing.py +++ b/src/model_signing/hashing.py @@ -380,11 +380,13 @@ def use_incremental_serialization( self, existing_manifest: manifest.Manifest, *, - hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256", + hashing_algorithm: Optional[ + Literal["sha256", "blake2", "blake3"] + ] = None, chunk_size: int = 1048576, max_workers: Optional[int] = None, - allow_symlinks: bool = False, - ignore_paths: Iterable[pathlib.Path] = frozenset(), + allow_symlinks: Optional[bool] = None, + ignore_paths: Optional[Iterable[pathlib.Path]] = None, ) -> Self: """Configures incremental serialization for selective file re-hashing. @@ -399,16 +401,19 @@ def use_incremental_serialization( - Modified files (specified via files_to_hash in hash()) are re-hashed - Deleted files are automatically omitted from the new manifest + By default, serialization parameters (hash algorithm, allow_symlinks, + ignore_paths) are automatically extracted from the existing manifest. + You can override any of these by passing explicit values. + Usage example: # Extract manifest from previous signature old_manifest = manifest.Manifest.from_signature( pathlib.Path("model.sig.old") ) - # Configure incremental hashing + # Configure incremental hashing (auto-detects parameters) config = hashing.Config().use_incremental_serialization( - old_manifest, - hashing_algorithm="sha256" + old_manifest ) # Get changed files (e.g., from git) @@ -421,7 +426,8 @@ def use_incremental_serialization( existing_manifest: The manifest from a previous signature. Digests from this manifest will be reused for unchanged files. hashing_algorithm: The hashing algorithm to use for new/changed - files. Must match the algorithm used in existing_manifest. + files. If None (default), uses the algorithm from + existing_manifest. chunk_size: The amount of file to read at once. Default is 1MB. A special value of 0 signals to attempt to read everything in a single call. Ignored for BLAKE3. @@ -429,14 +435,47 @@ def use_incremental_serialization( is to defer to the `concurrent.futures` library to select the best value for the current machine, or the number of logical cores when doing BLAKE3 hashing. - allow_symlinks: Controls whether symbolic links are included. If a - symlink is present but the flag is `False` (default) the - serialization would raise an error. - ignore_paths: Paths of files to ignore. + allow_symlinks: Controls whether symbolic links are included. If + None (default), uses the value from existing_manifest. + ignore_paths: Paths of files to ignore. If None (default), uses + the paths from existing_manifest. 
Returns: The new hashing configuration with incremental serialization. """ + # Extract parameters from existing manifest if not explicitly provided + serialization_type = existing_manifest._serialization_type + + # Extract hash algorithm + if hashing_algorithm is None: + if isinstance(serialization_type, manifest._FileSerialization): + hash_type = serialization_type._hash_type + # Map digest names to API parameter names + # (blake2b -> blake2, others remain the same) + if hash_type == "blake2b": + hashing_algorithm = "blake2" + else: + hashing_algorithm = hash_type + else: + # Shard-based manifest - use default sha256 + hashing_algorithm = "sha256" + + # Extract allow_symlinks + if allow_symlinks is None: + if isinstance(serialization_type, manifest._FileSerialization): + allow_symlinks = serialization_type._allow_symlinks + else: + allow_symlinks = False + + # Extract ignore_paths + if ignore_paths is None: + if isinstance(serialization_type, manifest._FileSerialization): + ignore_paths = [ + pathlib.Path(p) for p in serialization_type._ignore_paths + ] + else: + ignore_paths = frozenset() + self._serializer = incremental.IncrementalSerializer( self._build_file_hasher_factory( hashing_algorithm, chunk_size, max_workers diff --git a/tests/hashing_config_test.py b/tests/hashing_config_test.py index 3322298e..981df74a 100644 --- a/tests/hashing_config_test.py +++ b/tests/hashing_config_test.py @@ -107,3 +107,29 @@ def test_blake3_file_serialization_with_max_workers(tmp_path): # All manifests should be equal assert manifest1 == manifest2 assert manifest1 == manifest3 + + +def test_incremental_serialization_extracts_parameters(tmp_path): + """Test parameter extraction in use_incremental_serialization.""" + model = tmp_path / "model" + model.mkdir() + (model / "file1.txt").write_text("content1") + (model / "file2.txt").write_text("content2") + + # Create initial manifest with blake2 algorithm + cfg1 = hashing.Config().use_file_serialization(hashing_algorithm="blake2") + initial_manifest = cfg1.hash(model) + + # Verify initial manifest has blake2b + assert initial_manifest.serialization_type["hash_type"] == "blake2b" + + # Now use incremental serialization WITHOUT specifying hashing_algorithm + # It should auto-extract blake2 from initial_manifest + cfg2 = hashing.Config().use_incremental_serialization(initial_manifest) + new_manifest = cfg2.hash(model) + + # Verify new manifest uses same hash algorithm (blake2b) + assert new_manifest.serialization_type["hash_type"] == "blake2b" + + # Manifests should be equal (same files, same digests, same parameters) + assert initial_manifest == new_manifest From 2aa106a11d1f7384f01814bf66d46aa8b9efca49 Mon Sep 17 00:00:00 2001 From: Emrick Donadei Date: Mon, 17 Nov 2025 21:25:40 -0500 Subject: [PATCH 10/12] Rewrite incremental tests to follow codebase patterns Completely rewrite incremental serialization tests to match the established patterns used in file_test.py and other serialization tests: - Add pytest fixtures (hasher_factory, file_serializer) for reusability - Use existing model fixtures from conftest.py (sample_model_folder) - Use test_support helpers (extract_digests_from_manifest, get_first_file) - Follow arrange-act-assert pattern consistently - Simplify assertions using helper functions - Remove all hardcoded fake digest values - Tests are now ~230 lines shorter and more maintainable All 7 tests pass and follow the same style as the rest of the test suite. 
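As an illustration of the new style, assertions compare digest sets built
with test_support helpers instead of hardcoded digest bytes (a sketch
using the helper names adopted in the rewritten tests):

    old_digests = set(
        test_support.extract_digests_from_manifest(existing_manifest)
    )
    new_digests = set(
        test_support.extract_digests_from_manifest(new_manifest)
    )
    # After adding one file: all old digests survive, plus exactly one new.
    assert len(new_digests) == len(old_digests) + 1
    assert old_digests.issubset(new_digests)
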
Signed-off-by: Emrick Donadei --- tests/_serialization/incremental_test.py | 529 ++++++----------------- 1 file changed, 140 insertions(+), 389 deletions(-) diff --git a/tests/_serialization/incremental_test.py b/tests/_serialization/incremental_test.py index 875bf664..199efb7e 100644 --- a/tests/_serialization/incremental_test.py +++ b/tests/_serialization/incremental_test.py @@ -14,448 +14,199 @@ """Tests for incremental serialization.""" -import hashlib import pathlib +import pytest + from model_signing import manifest from model_signing._hashing import hashing -from model_signing._hashing import io as io_hashing +from model_signing._hashing import io from model_signing._hashing import memory +from model_signing._serialization import file from model_signing._serialization import incremental +from tests import test_support class TestIncrementalSerializer: - def test_no_changes_reuses_all_digests(self, tmp_path): - """When no files change, all digests should be reused.""" - # Create a model with two files - model_dir = tmp_path / "model" - model_dir.mkdir() - (model_dir / "file1.txt").write_text("content1") - (model_dir / "file2.txt").write_text("content2") - - # Create an existing manifest (simulate previous signature) - digest1 = hashing.Digest("sha256", b"digest1_bytes_here") - digest2 = hashing.Digest("sha256", b"digest2_bytes_here") - - item1 = manifest.FileManifestItem( - path=pathlib.PurePath("file1.txt"), digest=digest1 - ) - item2 = manifest.FileManifestItem( - path=pathlib.PurePath("file2.txt"), digest=digest2 - ) - - existing_manifest = manifest.Manifest( - "model", [item1, item2], manifest._FileSerialization("sha256") - ) - - # Create incremental serializer - def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: - return io_hashing.SimpleFileHasher(path, memory.SHA256()) - - serializer = incremental.IncrementalSerializer( - hasher_factory, existing_manifest - ) - - # Serialize the model incrementally - new_manifest = serializer.serialize(model_dir) - - # Verify that digests were reused (not re-computed) - descriptors = list(new_manifest.resource_descriptors()) - assert len(descriptors) == 2 - - # Find each file's descriptor - file1_desc = next(d for d in descriptors if d.identifier == "file1.txt") - file2_desc = next(d for d in descriptors if d.identifier == "file2.txt") - - # Verify digests match the old manifest (were reused) - assert file1_desc.digest.digest_value == b"digest1_bytes_here" - assert file2_desc.digest.digest_value == b"digest2_bytes_here" - - def test_new_file_is_hashed(self, tmp_path): - """When a new file is added, it should be hashed.""" - # Create a model with one existing file - model_dir = tmp_path / "model" - model_dir.mkdir() - (model_dir / "file1.txt").write_text("content1") - (model_dir / "file2.txt").write_text("content2") # This is new - - # Create existing manifest with only file1 - digest1 = hashing.Digest("sha256", b"digest1_bytes_here") - item1 = manifest.FileManifestItem( - path=pathlib.PurePath("file1.txt"), digest=digest1 + @pytest.fixture + def hasher_factory(self): + """Provides a hasher factory for tests.""" + + def factory(path: pathlib.Path) -> io.FileHasher: + return io.SimpleFileHasher(path, memory.SHA256()) + + return factory + + @pytest.fixture + def file_serializer(self, hasher_factory): + """Provides a file serializer for tests.""" + return file.Serializer(hasher_factory) + + @pytest.fixture + def sharded_manifest(self): + """Provides a shard-based manifest for tests.""" + shard_items = [ + 
manifest.ShardedFileManifestItem( + path=pathlib.PurePosixPath("file.txt"), + start=0, + end=100, + digest=hashing.Digest("sha256", b"fake_shard_digest"), + ) + ] + return manifest.Manifest( + "model", + shard_items, + manifest._ShardSerialization("sha256", shard_size=100), ) - existing_manifest = manifest.Manifest( - "model", [item1], manifest._FileSerialization("sha256") - ) + def test_no_changes_reuses_all_digests( + self, sample_model_folder, hasher_factory, file_serializer + ): + # Create initial manifest + existing_manifest = file_serializer.serialize(sample_model_folder) # Create incremental serializer - def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: - return io_hashing.SimpleFileHasher(path, memory.SHA256()) - - serializer = incremental.IncrementalSerializer( + inc_serializer = incremental.IncrementalSerializer( hasher_factory, existing_manifest ) - # Serialize the model incrementally - new_manifest = serializer.serialize(model_dir) - - # Verify we have both files - descriptors = list(new_manifest.resource_descriptors()) - assert len(descriptors) == 2 - - # file1 should have reused digest - file1_desc = next(d for d in descriptors if d.identifier == "file1.txt") - assert file1_desc.digest.digest_value == b"digest1_bytes_here" - - # file2 should have a new hash (not the fake digest) - file2_desc = next(d for d in descriptors if d.identifier == "file2.txt") - # It should be the actual SHA256 of "content2", not a reused digest - assert file2_desc.digest.digest_value != b"digest1_bytes_here" - assert file2_desc.digest.algorithm == "sha256" - - def test_deleted_file_not_in_manifest(self, tmp_path): - """When a file is deleted, it should not appear in new manifest.""" - # Create a model with only one file - model_dir = tmp_path / "model" - model_dir.mkdir() - (model_dir / "file1.txt").write_text("content1") - - # Create existing manifest with two files (file2 was deleted) - digest1 = hashing.Digest("sha256", b"digest1_bytes_here") - digest2 = hashing.Digest("sha256", b"digest2_bytes_here") - - item1 = manifest.FileManifestItem( - path=pathlib.PurePath("file1.txt"), digest=digest1 - ) - item2 = manifest.FileManifestItem( - path=pathlib.PurePath("file2.txt"), digest=digest2 - ) - - existing_manifest = manifest.Manifest( - "model", [item1, item2], manifest._FileSerialization("sha256") - ) - - # Create incremental serializer - def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: - return io_hashing.SimpleFileHasher(path, memory.SHA256()) + # Serialize incrementally (no changes) + new_manifest = inc_serializer.serialize(sample_model_folder) - serializer = incremental.IncrementalSerializer( - hasher_factory, existing_manifest - ) + # Manifests should be equal (all digests reused) + assert new_manifest == existing_manifest - # Serialize the model incrementally - new_manifest = serializer.serialize(model_dir) - - # Verify only file1 is in the manifest - descriptors = list(new_manifest.resource_descriptors()) - assert len(descriptors) == 1 - assert descriptors[0].identifier == "file1.txt" - assert descriptors[0].digest.digest_value == b"digest1_bytes_here" - - def test_empty_existing_manifest_hashes_all(self, tmp_path): - """With an empty existing manifest, all files should be hashed.""" - # Create a model with files - model_dir = tmp_path / "model" - model_dir.mkdir() - (model_dir / "file1.txt").write_text("content1") - (model_dir / "file2.txt").write_text("content2") - - # Create empty existing manifest - existing_manifest = manifest.Manifest( - "model", [], 
manifest._FileSerialization("sha256") + def test_new_file_gets_hashed( + self, sample_model_folder, hasher_factory, file_serializer + ): + # Create initial manifest + existing_manifest = file_serializer.serialize(sample_model_folder) + old_digests = set( + test_support.extract_digests_from_manifest(existing_manifest) ) - # Create incremental serializer - def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: - return io_hashing.SimpleFileHasher(path, memory.SHA256()) + # Add a new file + altered_dir = test_support.get_first_directory(sample_model_folder) + new_file = altered_dir / "new_file.txt" + new_file.write_bytes(test_support.KNOWN_MODEL_TEXT) - serializer = incremental.IncrementalSerializer( + # Serialize incrementally + inc_serializer = incremental.IncrementalSerializer( hasher_factory, existing_manifest ) + new_manifest = inc_serializer.serialize(sample_model_folder) - # Serialize the model incrementally - new_manifest = serializer.serialize(model_dir) - - # Verify both files are hashed - descriptors = list(new_manifest.resource_descriptors()) - assert len(descriptors) == 2 - - # Both should have real hashes (not fake digests) - for desc in descriptors: - assert desc.digest.algorithm == "sha256" - assert len(desc.digest.digest_value) == 32 # SHA256 is 32 bytes - - def test_modified_file_with_files_to_hash_parameter(self, tmp_path): - """Test file is re-hashed when modified and in files_to_hash.""" - # Create a model with two files - model_dir = tmp_path / "model" - model_dir.mkdir() - (model_dir / "file1.txt").write_text("content1") - (model_dir / "README.md").write_text("old readme") - - # Create existing manifest with both files - digest1 = hashing.Digest("sha256", b"digest1_bytes_here") - digest_readme_old = hashing.Digest("sha256", b"old_readme_digest") - - item1 = manifest.FileManifestItem( - path=pathlib.PurePath("file1.txt"), digest=digest1 - ) - item_readme = manifest.FileManifestItem( - path=pathlib.PurePath("README.md"), digest=digest_readme_old + # Should have one more digest + new_digests = set( + test_support.extract_digests_from_manifest(new_manifest) ) + assert len(new_digests) == len(old_digests) + 1 + assert old_digests.issubset(new_digests) - existing_manifest = manifest.Manifest( - "model", [item1, item_readme], manifest._FileSerialization("sha256") + def test_deleted_file_not_in_manifest( + self, sample_model_folder, hasher_factory, file_serializer + ): + # Create initial manifest + existing_manifest = file_serializer.serialize(sample_model_folder) + old_digests = set( + test_support.extract_digests_from_manifest(existing_manifest) ) - # User modifies README.md - (model_dir / "README.md").write_text("new readme content") + # Delete a file + file_to_delete = test_support.get_first_file(sample_model_folder) + file_to_delete.unlink() - # Create incremental serializer - def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: - return io_hashing.SimpleFileHasher(path, memory.SHA256()) - - serializer = incremental.IncrementalSerializer( + # Serialize incrementally + inc_serializer = incremental.IncrementalSerializer( hasher_factory, existing_manifest ) + new_manifest = inc_serializer.serialize(sample_model_folder) - # Serialize with files_to_hash specifying the changed file - new_manifest = serializer.serialize( - model_dir, - files_to_hash=[model_dir / "README.md"], # Only this file changed - ) - - # Verify we have both files - descriptors = list(new_manifest.resource_descriptors()) - assert len(descriptors) == 2 - - # file1.txt should have reused 
digest - file1_desc = next(d for d in descriptors if d.identifier == "file1.txt") - assert file1_desc.digest.digest_value == b"digest1_bytes_here" - - # README.md should have a NEW hash (not the old one) - readme_desc = next( - d for d in descriptors if d.identifier == "README.md" - ) - assert readme_desc.digest.digest_value != b"old_readme_digest" - assert readme_desc.digest.algorithm == "sha256" - assert len(readme_desc.digest.digest_value) == 32 # Real SHA256 - - def test_deleted_file_in_files_to_hash_is_handled(self, tmp_path): - """When a deleted file is in files_to_hash, it's safely ignored.""" - # Create a model with files - model_dir = tmp_path / "model" - model_dir.mkdir() - (model_dir / "README.md").write_text("readme") - (model_dir / "weights.bin").write_text("weights") - - # Create existing manifest with three files - digest_readme = hashing.Digest("sha256", b"readme_digest") - digest_old = hashing.Digest("sha256", b"old_file_digest") - digest_weights = hashing.Digest("sha256", b"weights_digest") - - item_readme = manifest.FileManifestItem( - path=pathlib.PurePath("README.md"), digest=digest_readme - ) - item_old = manifest.FileManifestItem( - path=pathlib.PurePath("old_file.txt"), digest=digest_old - ) - item_weights = manifest.FileManifestItem( - path=pathlib.PurePath("weights.bin"), digest=digest_weights + # Should have one less digest + new_digests = set( + test_support.extract_digests_from_manifest(new_manifest) ) + assert len(new_digests) == len(old_digests) - 1 + assert new_digests.issubset(old_digests) - existing_manifest = manifest.Manifest( - "model", - [item_readme, item_old, item_weights], - manifest._FileSerialization("sha256"), + def test_modified_file_with_files_to_hash( + self, sample_model_folder, hasher_factory, file_serializer + ): + # Create initial manifest + existing_manifest = file_serializer.serialize(sample_model_folder) + old_digests = set( + test_support.extract_digests_from_manifest(existing_manifest) ) - # Create incremental serializer - def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: - return io_hashing.SimpleFileHasher(path, memory.SHA256()) + # Modify a file + file_to_change = test_support.get_first_file(sample_model_folder) + file_to_change.write_bytes(test_support.ANOTHER_MODEL_TEXT) - serializer = incremental.IncrementalSerializer( + # Serialize incrementally, specifying the changed file + inc_serializer = incremental.IncrementalSerializer( hasher_factory, existing_manifest ) - - # User specifies old_file.txt in files_to_hash (as git diff might) - # even though the file was deleted - deleted_file = model_dir / "old_file.txt" - new_manifest = serializer.serialize( - model_dir, - files_to_hash=[deleted_file], # Deleted file in the list - ) - - # Verify deleted file is NOT in new manifest - descriptors = list(new_manifest.resource_descriptors()) - assert len(descriptors) == 2 - - identifiers = [d.identifier for d in descriptors] - assert "README.md" in identifiers - assert "weights.bin" in identifiers - assert "old_file.txt" not in identifiers # Deleted file is gone - - # Other files should have reused digests - readme_desc = next( - d for d in descriptors if d.identifier == "README.md" - ) - assert readme_desc.digest.digest_value == b"readme_digest" - - weights_desc = next( - d for d in descriptors if d.identifier == "weights.bin" - ) - assert weights_desc.digest.digest_value == b"weights_digest" - - def test_mixed_changes_with_files_to_hash(self, tmp_path): - """Test realistic scenario: modify, add, delete files together.""" - 
# Initial state: three files - model_dir = tmp_path / "model" - model_dir.mkdir() - (model_dir / "README.md").write_text("old readme") - (model_dir / "weights.bin").write_text("weights") - (model_dir / "new_config.json").write_text("new config") - - # Old manifest has README.md, old_file.txt, weights.bin - digest_readme_old = hashing.Digest("sha256", b"old_readme_digest") - digest_old_file = hashing.Digest("sha256", b"old_file_digest") - digest_weights = hashing.Digest("sha256", b"weights_digest") - - item_readme = manifest.FileManifestItem( - path=pathlib.PurePath("README.md"), digest=digest_readme_old - ) - item_old = manifest.FileManifestItem( - path=pathlib.PurePath("old_file.txt"), digest=digest_old_file - ) - item_weights = manifest.FileManifestItem( - path=pathlib.PurePath("weights.bin"), digest=digest_weights + new_manifest = inc_serializer.serialize( + sample_model_folder, files_to_hash=[file_to_change] ) - existing_manifest = manifest.Manifest( - "model", - [item_readme, item_old, item_weights], - manifest._FileSerialization("sha256"), + # Should have same number of digests but one changed + new_digests = set( + test_support.extract_digests_from_manifest(new_manifest) ) + assert len(new_digests) == len(old_digests) + assert new_digests != old_digests - # User makes changes: - # - Modifies README.md - (model_dir / "README.md").write_text("new readme content") - # - Deletes old_file.txt (already not on disk) - # - Adds new_config.json (already on disk) - # - Leaves weights.bin unchanged + def test_manifest_unchanged_when_model_moved( + self, sample_model_folder, hasher_factory, file_serializer + ): + # Create initial manifest + existing_manifest = file_serializer.serialize(sample_model_folder) - # Create incremental serializer - def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: - return io_hashing.SimpleFileHasher(path, memory.SHA256()) + # Move the model + new_name = sample_model_folder.with_name("moved_model") + new_model = sample_model_folder.rename(new_name) - serializer = incremental.IncrementalSerializer( + # Serialize incrementally from new location + inc_serializer = incremental.IncrementalSerializer( hasher_factory, existing_manifest ) + new_manifest = inc_serializer.serialize(new_model) - # Simulate git diff --name-only output - files_to_hash = [ - model_dir / "README.md", # Modified - model_dir / "old_file.txt", # Deleted - model_dir / "new_config.json", # Added - ] - - new_manifest = serializer.serialize( - model_dir, files_to_hash=files_to_hash - ) - - # Verify results - descriptors = list(new_manifest.resource_descriptors()) - assert len(descriptors) == 3 - - identifiers = [d.identifier for d in descriptors] - assert "README.md" in identifiers # Modified - assert "new_config.json" in identifiers # Added - assert "weights.bin" in identifiers # Unchanged - assert "old_file.txt" not in identifiers # Deleted + # Manifests should be equal + assert new_manifest == existing_manifest - # README.md should have NEW hash (was modified) - readme_desc = next( - d for d in descriptors if d.identifier == "README.md" - ) - assert readme_desc.digest.digest_value != b"old_readme_digest" - assert len(readme_desc.digest.digest_value) == 32 - - # new_config.json should have NEW hash (was added) - config_desc = next( - d for d in descriptors if d.identifier == "new_config.json" - ) - assert len(config_desc.digest.digest_value) == 32 - - # weights.bin should have REUSED hash (unchanged) - weights_desc = next( - d for d in descriptors if d.identifier == "weights.bin" - ) - 
assert weights_desc.digest.digest_value == b"weights_digest" - - def test_sharded_manifest_rehashes_all_files(self, tmp_path): - """When existing manifest is shard-based, all files are rehashed.""" - # Create a model with two files - model_dir = tmp_path / "model" - model_dir.mkdir() - (model_dir / "file1.txt").write_text("content1") - (model_dir / "large_file.bin").write_bytes(b"large content here") - - # Create an existing shard-based manifest - # (both files were sharded in the previous signature) - shard1 = manifest.ShardedFileManifestItem( - path=pathlib.PurePath("file1.txt"), - start=0, - end=100, - digest=hashing.Digest("sha256", b"file1_shard_digest"), - ) - shard2 = manifest.ShardedFileManifestItem( - path=pathlib.PurePath("large_file.bin"), - start=0, - end=100, - digest=hashing.Digest("sha256", b"large_shard1_digest"), - ) - shard3 = manifest.ShardedFileManifestItem( - path=pathlib.PurePath("large_file.bin"), - start=100, - end=200, - digest=hashing.Digest("sha256", b"large_shard2_digest"), + def test_empty_existing_manifest_hashes_all( + self, sample_model_folder, hasher_factory, file_serializer + ): + # Create empty manifest + empty_manifest = manifest.Manifest( + "empty", [], manifest._FileSerialization("sha256") ) - existing_manifest = manifest.Manifest( - "model", - [shard1, shard2, shard3], - manifest._ShardSerialization("sha256", shard_size=100), + # Serialize incrementally with empty existing manifest + inc_serializer = incremental.IncrementalSerializer( + hasher_factory, empty_manifest ) + new_manifest = inc_serializer.serialize(sample_model_folder) - # Create incremental serializer - def hasher_factory(path: pathlib.Path) -> io_hashing.FileHasher: - return io_hashing.SimpleFileHasher(path, memory.SHA256()) + # Should hash all files (same as regular file serialization) + expected_manifest = file_serializer.serialize(sample_model_folder) + assert new_manifest == expected_manifest - serializer = incremental.IncrementalSerializer( - hasher_factory, existing_manifest + def test_sharded_manifest_rehashes_all( + self, + sample_model_folder, + hasher_factory, + file_serializer, + sharded_manifest, + ): + # Serialize incrementally using the shard-based manifest + inc_serializer = incremental.IncrementalSerializer( + hasher_factory, sharded_manifest ) + new_manifest = inc_serializer.serialize(sample_model_folder) - # Serialize the model incrementally - new_manifest = serializer.serialize(model_dir) - - # Verify results: both files should be re-hashed - # (can't reuse shard digests for file-based serialization) - descriptors = list(new_manifest.resource_descriptors()) - assert len(descriptors) == 2 - - # Both files should have fresh digests computed - file1_desc = next(d for d in descriptors if d.identifier == "file1.txt") - # Should be real SHA256 of "content1", not the shard digest - expected_digest1 = hashlib.sha256(b"content1").digest() - assert file1_desc.digest.digest_value == expected_digest1 - assert file1_desc.digest.digest_value != b"file1_shard_digest" - - # large_file.bin should also be freshly hashed - large_desc = next( - d for d in descriptors if d.identifier == "large_file.bin" - ) - # Should be real SHA256 of "large content here", not shard digests - expected_digest2 = hashlib.sha256(b"large content here").digest() - assert large_desc.digest.digest_value == expected_digest2 - assert large_desc.digest.digest_value != b"large_shard1_digest" - assert large_desc.digest.digest_value != b"large_shard2_digest" + # Should rehash everything (file-based, not shard-based) + 
expected_manifest = file_serializer.serialize(sample_model_folder) + assert new_manifest == expected_manifest From 3fc63b07b8b2c873d46861048a6b6de19d9d5730 Mon Sep 17 00:00:00 2001 From: Emrick Donadei Date: Mon, 17 Nov 2025 21:42:52 -0500 Subject: [PATCH 11/12] Add warning when allow_symlinks differs from existing manifest When using incremental serialization, warn if set_allow_symlinks() is called with a value that differs from the existing manifest. This helps users identify potential inconsistencies in their manifests. The warning is logged but does not fail the operation, allowing users to override settings if intentional while being made aware of the change. Signed-off-by: Emrick Donadei --- .../_serialization/incremental.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/model_signing/_serialization/incremental.py b/src/model_signing/_serialization/incremental.py index 3e1e1a17..afdd65d5 100644 --- a/src/model_signing/_serialization/incremental.py +++ b/src/model_signing/_serialization/incremental.py @@ -22,6 +22,7 @@ from collections.abc import Callable, Iterable import concurrent.futures import itertools +import logging import os import pathlib from typing import Optional @@ -103,6 +104,29 @@ def __init__( def set_allow_symlinks(self, allow_symlinks: bool) -> None: """Set whether following symlinks is allowed.""" + # Check if this differs from the existing manifest + if isinstance( + self._existing_manifest._serialization_type, + manifest._FileSerialization, + ): + existing_allow_symlinks = ( + self._existing_manifest._serialization_type._allow_symlinks + ) + if allow_symlinks != existing_allow_symlinks: + logging.warning( + f"allow_symlinks={allow_symlinks} differs from existing " + f"manifest (allow_symlinks={existing_allow_symlinks}). " + f"This may result in inconsistent manifests." + ) + else: + # Shard-based serialization - warn if trying to enable symlinks + if allow_symlinks: + logging.warning( + f"allow_symlinks={allow_symlinks} differs from existing " + f"manifest (shard-based, allow_symlinks=False). " + f"This may result in inconsistent manifests." + ) + self._allow_symlinks = allow_symlinks hasher = self._hasher_factory(pathlib.Path()) self._serialization_description = manifest._FileSerialization( From da2c69530affc98309a8af815c9059d3e22f07dc Mon Sep 17 00:00:00 2001 From: Emrick Donadei Date: Wed, 19 Nov 2025 21:49:25 -0500 Subject: [PATCH 12/12] Refactor manifest extraction to use verification path Replace Manifest.from_signature() with signing.manifest_from_signature() that verifies signatures before extracting manifests. This addresses reviewer feedback to reuse existing verification logic and adds security to incremental signing by ensuring old signatures are verified before their hashes are reused. Changes: - Add manifest_from_signature() to signing.py that calls Verifier.verify() - Update sign_incremental() to require identity/oidc_issuer parameters for verification of old signatures - Remove Manifest.from_signature() from manifest.py (eliminated code duplication) - Update documentation examples in hashing.py - Remove redundant tests (DSSE parsing already tested in signing_test.py) This is a breaking change for incremental signing API, but improves security by preventing tampering of old signatures. 
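For reference, an incremental signing call now looks like this (mirroring
the updated docstring example in the diff below; the identity and issuer
values are placeholders):

    model_signing.signing.sign_incremental(
        model_path="huge-model/",
        old_signature_path="model.sig.old",
        new_signature_path="model.sig.new",
        identity="user@example.com",
        oidc_issuer="https://github.com/login/oauth",
        files_to_hash=["huge-model/README.md"],
    )
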
Signed-off-by: Emrick Donadei --- src/model_signing/_signing/signing.py | 39 ++++++ src/model_signing/hashing.py | 10 +- src/model_signing/manifest.py | 50 ------- src/model_signing/signing.py | 53 ++++++-- tests/manifest_test.py | 183 -------------------------- 5 files changed, 88 insertions(+), 247 deletions(-) diff --git a/src/model_signing/_signing/signing.py b/src/model_signing/_signing/signing.py index 13681ac5..90dd00a3 100644 --- a/src/model_signing/_signing/signing.py +++ b/src/model_signing/_signing/signing.py @@ -166,6 +166,45 @@ def dsse_payload_to_manifest_compat( return manifest.Manifest(model_name, items, serialization) +def manifest_from_signature( + signature_path: pathlib.Path, + *, + identity: str, + oidc_issuer: str, + use_staging: bool = False, +) -> manifest.Manifest: + """Extracts and verifies a manifest from an existing signature file. + + This function reads a signature file (Sigstore bundle), verifies the + cryptographic signature, and returns the manifest. This is essential when + reusing hashes from an old signature (e.g., in incremental signing) to + ensure the old signature hasn't been tampered with. + + Args: + signature_path: Path to the signature file to read and verify. + identity: The expected identity that signed the model (e.g., email). + oidc_issuer: The expected OpenID Connect issuer that provided the + certificate used for the signature. + use_staging: Use staging configurations instead of production. This + should only be set to True when testing. Default is False. + + Returns: + A Manifest object representing the signed model. + + Raises: + ValueError: If signature verification fails or the signature file + cannot be parsed. + FileNotFoundError: If the signature file doesn't exist. + """ + from model_signing._signing import sign_sigstore # noqa: PLC0415 + + signature = sign_sigstore.Signature.read(signature_path) + verifier = sign_sigstore.Verifier( + identity=identity, oidc_issuer=oidc_issuer, use_staging=use_staging + ) + return verifier.verify(signature) + + class Payload: """In-toto payload used to represent a model for signing. diff --git a/src/model_signing/hashing.py b/src/model_signing/hashing.py index 5fe88cc0..605a082c 100644 --- a/src/model_signing/hashing.py +++ b/src/model_signing/hashing.py @@ -406,9 +406,13 @@ def use_incremental_serialization( You can override any of these by passing explicit values. Usage example: - # Extract manifest from previous signature - old_manifest = manifest.Manifest.from_signature( - pathlib.Path("model.sig.old") + from model_signing._signing import signing + + # Extract and verify manifest from previous signature + old_manifest = signing.manifest_from_signature( + pathlib.Path("model.sig.old"), + identity="user@example.com", + oidc_issuer="https://github.com/login/oauth", ) # Configure incremental hashing (auto-detects parameters) diff --git a/src/model_signing/manifest.py b/src/model_signing/manifest.py index e175e541..a42662ed 100644 --- a/src/model_signing/manifest.py +++ b/src/model_signing/manifest.py @@ -39,10 +39,8 @@ """ import abc -import base64 from collections.abc import Iterable, Iterator import dataclasses -import json import pathlib import sys from typing import Any, Final @@ -468,51 +466,3 @@ def serialization_type(self) -> dict[str, Any]: manifest so that signature verification can use the same method. 
""" return self._serialization_type.serialization_parameters - - @classmethod - def from_signature(cls, signature_path: pathlib.Path) -> Self: - """Extracts a manifest from an existing signature file. - - This method reads a signature file (Sigstore bundle) and extracts the - manifest without performing cryptographic verification. This is useful - for incremental re-hashing where you need to know what files were - previously signed without verifying the signature. - - Args: - signature_path: Path to the signature file to read. - - Returns: - A Manifest object representing the signed model. - - Raises: - ValueError: If the signature file cannot be parsed or doesn't - contain a valid manifest. - FileNotFoundError: If the signature file doesn't exist. - """ - # Avoid circular import by importing here - from model_signing._signing import signing - - # Read the signature file - content = signature_path.read_text(encoding="utf-8") - bundle_dict = json.loads(content) - - # Extract the DSSE envelope payload - if "dsseEnvelope" in bundle_dict: - # This is a protobuf-based bundle - envelope = bundle_dict["dsseEnvelope"] - elif "dsse_envelope" in bundle_dict: - # Alternative snake_case naming - envelope = bundle_dict["dsse_envelope"] - else: - raise ValueError("Signature file does not contain a DSSE envelope") - - # Decode the payload (it's base64 encoded) - payload_b64 = envelope.get("payload") - if not payload_b64: - raise ValueError("DSSE envelope does not contain a payload") - - payload_bytes = base64.b64decode(payload_b64) - payload_dict = json.loads(payload_bytes) - - # Use the existing function to convert DSSE payload to manifest - return signing.dsse_payload_to_manifest(payload_dict) diff --git a/src/model_signing/signing.py b/src/model_signing/signing.py index 86feb3f5..deeeaf7c 100644 --- a/src/model_signing/signing.py +++ b/src/model_signing/signing.py @@ -48,7 +48,6 @@ from typing import Optional from model_signing import hashing -from model_signing import manifest from model_signing._signing import sign_certificate as certificate from model_signing._signing import sign_ec_key as ec_key from model_signing._signing import sign_sigstore as sigstore @@ -81,6 +80,9 @@ def sign_incremental( old_signature_path: hashing.PathLike, new_signature_path: hashing.PathLike, *, + identity: str, + oidc_issuer: str, + use_staging: bool = False, files_to_hash: Optional[Iterable[hashing.PathLike]] = None, ): """Signs a model incrementally, only re-hashing changed files. @@ -91,6 +93,9 @@ def sign_incremental( digests from the previous signature for unchanged files and only hashes new or modified files. + The old signature is cryptographically verified before its hashes are + reused, ensuring the integrity of the incremental signing process. + In this default configuration we sign using Sigstore. Usage example: @@ -99,15 +104,23 @@ def sign_incremental( model_path="huge-model/", old_signature_path="model.sig.old", new_signature_path="model.sig.new", + identity="user@example.com", + oidc_issuer="https://github.com/login/oauth", files_to_hash=["huge-model/README.md"] ) Args: model_path: The path to the model to sign. old_signature_path: The path to the previous signature. The manifest - from this signature will be extracted and used for incremental + from this signature will be verified and extracted for incremental hashing. new_signature_path: The path where the new signature will be written. + identity: The expected identity that signed the old signature + (e.g., email address). 
+ oidc_issuer: The expected OpenID Connect issuer that provided the + certificate for the old signature. + use_staging: Use staging configurations for verification instead of + production. Should only be True when testing. Default is False. files_to_hash: Optional list of files that changed and need to be re-hashed. If None, only new files (not in old signature) will be hashed. Existing files will have their digests reused. @@ -115,12 +128,16 @@ def sign_incremental( Raises: FileNotFoundError: If old_signature_path doesn't exist. - ValueError: If old_signature_path cannot be parsed. + ValueError: If old_signature_path cannot be parsed or verification + fails. """ Config().sign_incremental( model_path, old_signature_path, new_signature_path, + identity=identity, + oidc_issuer=oidc_issuer, + use_staging=use_staging, files_to_hash=files_to_hash, ) @@ -165,30 +182,44 @@ def sign_incremental( old_signature_path: hashing.PathLike, new_signature_path: hashing.PathLike, *, + identity: str, + oidc_issuer: str, + use_staging: bool = False, files_to_hash: Optional[Iterable[hashing.PathLike]] = None, ): """Signs a model incrementally using the current configuration. - This method extracts the manifest from an existing signature and - configures incremental hashing to reuse digests for unchanged files. - Only new or modified files are re-hashed, providing significant - performance improvements for large models. + This method extracts and verifies the manifest from an existing + signature, then configures incremental hashing to reuse digests for + unchanged files. Only new or modified files are re-hashed, providing + significant performance improvements for large models. Args: model_path: The path to the model to sign. old_signature_path: The path to the previous signature. new_signature_path: The path where the new signature will be written. + identity: The expected identity that signed the old signature + (e.g., email address). + oidc_issuer: The expected OpenID Connect issuer that provided + the certificate for the old signature. + use_staging: Use staging configurations for verification instead + of production. Should only be True when testing. Default is + False. files_to_hash: Optional list of files that changed and need to be re-hashed. If None, only new files will be hashed. Raises: FileNotFoundError: If old_signature_path doesn't exist. - ValueError: If old_signature_path cannot be parsed. + ValueError: If old_signature_path cannot be parsed or verification + fails. 
""" - # Extract manifest from old signature - old_manifest = manifest.Manifest.from_signature( - pathlib.Path(old_signature_path) + # Extract and verify manifest from old signature + old_manifest = signing.manifest_from_signature( + pathlib.Path(old_signature_path), + identity=identity, + oidc_issuer=oidc_issuer, + use_staging=use_staging, ) # Configure incremental hashing diff --git a/tests/manifest_test.py b/tests/manifest_test.py index e81e3577..771e1f01 100644 --- a/tests/manifest_test.py +++ b/tests/manifest_test.py @@ -206,186 +206,3 @@ def test_manifest_has_the_correct_resource_descriptors(self): assert descriptors[0].digest.digest_value == b"hash1" assert descriptors[1].digest.digest_value == b"hash2" assert descriptors[2].digest.digest_value == b"hash3" - - -class TestManifestFromSignature: - def test_from_signature_rejects_inconsistent_manifest(self, tmp_path): - import base64 - import json - - # Create a Sigstore bundle with inconsistent root digest - # The subject digest doesn't match the hash of the resources - payload_dict = { - "_type": "https://in-toto.io/Statement/v1", - "subject": [ - { - "name": "test_model", - "digest": { - "sha256": ( - "0b8a5a8c8e8f1a8b8c8d8e8f2a8b8c8d8e8f3a8b8c8d" - "8e8f4a8b8c8d8e8f5a8b" - ) - }, - } - ], - "predicateType": "https://model_signing/signature/v1.0", - "predicate": { - "serialization": { - "method": "files", - "hash_type": "sha256", - "allow_symlinks": False, - "ignore_paths": [], - }, - "resources": [ - { - "name": "file1.txt", - "algorithm": "sha256", - "digest": ( - "abcd1234abcd1234abcd1234abcd1234" - "abcd1234abcd1234abcd1234abcd1234" - ), - }, - { - "name": "file2.txt", - "algorithm": "sha256", - "digest": ( - "5678dcba5678dcba5678dcba5678dcba" - "5678dcba5678dcba5678dcba5678dcba" - ), - }, - ], - }, - } - - # Create DSSE envelope - payload_json = json.dumps(payload_dict) - payload_b64 = base64.b64encode(payload_json.encode("utf-8")).decode( - "utf-8" - ) - - bundle_dict = { - "mediaType": "application/vnd.dev.sigstore.bundle.v0.3+json", - "verificationMaterial": { - "publicKey": {"hint": "test"}, - "tlogEntries": [], - }, - "dsseEnvelope": { - "payload": payload_b64, - "payloadType": "application/vnd.in-toto+json", - "signatures": [{"sig": "fake_signature"}], - }, - } - - # Write to file - sig_file = tmp_path / "test.sig" - sig_file.write_text(json.dumps(bundle_dict), encoding="utf-8") - - # Verify that inconsistent manifest is rejected - with pytest.raises(ValueError, match="Manifest is inconsistent"): - manifest.Manifest.from_signature(sig_file) - - def test_from_signature_extracts_valid_manifest(self, tmp_path): - import base64 - import hashlib - import json - - # Create valid SHA256 hex digests (64 chars each) - digest1_hex = ( - "abcd1234abcd1234abcd1234abcd1234" - "abcd1234abcd1234abcd1234abcd1234" - ) - digest2_hex = ( - "5678dcba5678dcba5678dcba5678dcba" - "5678dcba5678dcba5678dcba5678dcba" - ) - - digest1_bytes = bytes.fromhex(digest1_hex) - digest2_bytes = bytes.fromhex(digest2_hex) - - # Compute root digest (SHA256 of both digests concatenated) - hasher = hashlib.sha256() - hasher.update(digest1_bytes) - hasher.update(digest2_bytes) - root_digest = hasher.hexdigest() - - payload_dict = { - "_type": "https://in-toto.io/Statement/v1", - "subject": [ - {"name": "test_model", "digest": {"sha256": root_digest}} - ], - "predicateType": "https://model_signing/signature/v1.0", - "predicate": { - "serialization": { - "method": "files", - "hash_type": "sha256", - "allow_symlinks": False, - "ignore_paths": [], - }, - "resources": 
[ - { - "name": "file1.txt", - "algorithm": "sha256", - "digest": digest1_hex, - }, - { - "name": "file2.txt", - "algorithm": "sha256", - "digest": digest2_hex, - }, - ], - }, - } - - payload_json = json.dumps(payload_dict) - payload_b64 = base64.b64encode(payload_json.encode("utf-8")).decode( - "utf-8" - ) - - bundle_dict = { - "mediaType": "application/vnd.dev.sigstore.bundle.v0.3+json", - "verificationMaterial": { - "publicKey": {"hint": "test"}, - "tlogEntries": [], - }, - "dsseEnvelope": { - "payload": payload_b64, - "payloadType": "application/vnd.in-toto+json", - "signatures": [{"sig": "fake_signature"}], - }, - } - - sig_file = tmp_path / "test.sig" - sig_file.write_text(json.dumps(bundle_dict), encoding="utf-8") - - # Extract manifest - extracted_manifest = manifest.Manifest.from_signature(sig_file) - - # Verify the manifest has the correct files - descriptors = list(extracted_manifest.resource_descriptors()) - assert len(descriptors) == 2 - assert descriptors[0].identifier == "file1.txt" - assert descriptors[1].identifier == "file2.txt" - assert descriptors[0].digest.digest_hex == digest1_hex - assert descriptors[1].digest.digest_hex == digest2_hex - assert extracted_manifest.model_name == "test_model" - - def test_from_signature_file_not_found(self, tmp_path): - non_existent = tmp_path / "does_not_exist.sig" - with pytest.raises(FileNotFoundError): - manifest.Manifest.from_signature(non_existent) - - def test_from_signature_invalid_json(self, tmp_path): - import json - - sig_file = tmp_path / "invalid.sig" - sig_file.write_text("not valid json", encoding="utf-8") - with pytest.raises(json.JSONDecodeError): - manifest.Manifest.from_signature(sig_file) - - def test_from_signature_missing_envelope(self, tmp_path): - sig_file = tmp_path / "missing_envelope.sig" - sig_file.write_text("{}", encoding="utf-8") - with pytest.raises( - ValueError, match="does not contain a DSSE envelope" - ): - manifest.Manifest.from_signature(sig_file)
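
For completeness, a minimal end-to-end sketch of the new extraction path,
assuming the APIs as defined in this series (manifest_from_signature from
patch 12, use_incremental_serialization from patch 09; paths and identity
values are placeholders):

    import pathlib

    from model_signing import hashing
    from model_signing._signing import signing

    # Verify the old signature and extract its manifest.
    old_manifest = signing.manifest_from_signature(
        pathlib.Path("model.sig.old"),
        identity="user@example.com",
        oidc_issuer="https://github.com/login/oauth",
    )

    # Serialization parameters are auto-extracted from old_manifest.
    config = hashing.Config().use_incremental_serialization(old_manifest)
    new_manifest = config.hash(pathlib.Path("huge-model/"))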