From a3aa0e3508addfacab4d41da0c2c2ff8049e5b9e Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Thu, 28 Aug 2025 17:04:12 +0200 Subject: [PATCH 01/16] WIP --- .../storage_clients/_base/_storage_client.py | 3 -- .../_file_system/_storage_client.py | 32 ++++++++++++------- src/crawlee/storages/_base.py | 2 -- src/crawlee/storages/_dataset.py | 5 +-- src/crawlee/storages/_key_value_store.py | 5 +-- src/crawlee/storages/_request_queue.py | 5 +-- .../storages/_storage_instance_manager.py | 14 ++------ 7 files changed, 26 insertions(+), 40 deletions(-) diff --git a/src/crawlee/storage_clients/_base/_storage_client.py b/src/crawlee/storage_clients/_base/_storage_client.py index 9be648a0e5..ebfc9a0130 100644 --- a/src/crawlee/storage_clients/_base/_storage_client.py +++ b/src/crawlee/storage_clients/_base/_storage_client.py @@ -34,7 +34,6 @@ async def create_dataset_client( *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, ) -> DatasetClient: """Create a dataset client.""" @@ -44,7 +43,6 @@ async def create_kvs_client( *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, ) -> KeyValueStoreClient: """Create a key-value store client.""" @@ -54,7 +52,6 @@ async def create_rq_client( *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, ) -> RequestQueueClient: """Create a request queue client.""" diff --git a/src/crawlee/storage_clients/_file_system/_storage_client.py b/src/crawlee/storage_clients/_file_system/_storage_client.py index 86903ea4e7..2b66650210 100644 --- a/src/crawlee/storage_clients/_file_system/_storage_client.py +++ b/src/crawlee/storage_clients/_file_system/_storage_client.py @@ -29,17 +29,29 @@ class FileSystemStorageClient(StorageClient): Use it only when running a single crawler process at a time. """ + def __init__(self, configuration: Configuration | None = None) -> None: + """Initialize the file system storage client. 
+ + Args: + configuration: Optional configuration instance to use with the storage client. + If not provided, the global configuration will be used. + purge_on_start: If true, all storages (datasets, key-value stores, request queues) + will be purged (deleted) when they are opened for the first time. + Use with caution as this will delete all existing data in the storages. + """ + self._configuration = configuration or Configuration.get_global_configuration() + + + @override async def create_dataset_client( self, *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, ) -> FileSystemDatasetClient: - configuration = configuration or Configuration.get_global_configuration() - client = await FileSystemDatasetClient.open(id=id, name=name, configuration=configuration) - await self._purge_if_needed(client, configuration) + client = await FileSystemDatasetClient.open(id=id, name=name, configuration=self._configuration) + await self._purge_if_needed(client, self._configuration) return client @override @@ -48,11 +60,9 @@ async def create_kvs_client( *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, ) -> FileSystemKeyValueStoreClient: - configuration = configuration or Configuration.get_global_configuration() - client = await FileSystemKeyValueStoreClient.open(id=id, name=name, configuration=configuration) - await self._purge_if_needed(client, configuration) + client = await FileSystemKeyValueStoreClient.open(id=id, name=name, configuration=self._configuration) + await self._purge_if_needed(client, self._configuration) return client @override @@ -61,9 +71,7 @@ async def create_rq_client( *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, ) -> FileSystemRequestQueueClient: - configuration = configuration or Configuration.get_global_configuration() - client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=configuration) 
- await self._purge_if_needed(client, configuration) + client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=self._configuration) + await self._purge_if_needed(client, self._configuration) return client diff --git a/src/crawlee/storages/_base.py b/src/crawlee/storages/_base.py index c63f1dda37..f832d76ae8 100644 --- a/src/crawlee/storages/_base.py +++ b/src/crawlee/storages/_base.py @@ -36,7 +36,6 @@ async def open( *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> Storage: """Open a storage, either restore existing or create a new one. @@ -44,7 +43,6 @@ async def open( Args: id: The storage ID. name: The storage name. - configuration: Configuration object used during the storage creation or restoration process. storage_client: Underlying storage client to use. If not provided, the default global storage client from the service locator will be used. """ diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index 36fbea8a7a..4c972736dc 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -100,18 +100,15 @@ async def open( *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> Dataset: - configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client return await service_locator.storage_instance_manager.open_storage_instance( cls, id=id, name=name, - configuration=configuration, - client_opener=storage_client.create_dataset_client, + storage_client=storage_client, ) @override diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index 5297925a37..7c75d04133 100644 --- a/src/crawlee/storages/_key_value_store.py +++ 
b/src/crawlee/storages/_key_value_store.py @@ -112,18 +112,15 @@ async def open( *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> KeyValueStore: - configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client return await service_locator.storage_instance_manager.open_storage_instance( cls, id=id, name=name, - configuration=configuration, - client_opener=storage_client.create_kvs_client, + storage_client=storage_client, ) @override diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index 068b5135f0..9c0e876f0c 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -118,18 +118,15 @@ async def open( *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> RequestQueue: - configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client return await service_locator.storage_instance_manager.open_storage_instance( cls, id=id, name=name, - configuration=configuration, - client_opener=storage_client.create_rq_client, + storage_client=storage_client, ) @override diff --git a/src/crawlee/storages/_storage_instance_manager.py b/src/crawlee/storages/_storage_instance_manager.py index 130a2eec63..82ea840fc9 100644 --- a/src/crawlee/storages/_storage_instance_manager.py +++ b/src/crawlee/storages/_storage_instance_manager.py @@ -8,16 +8,10 @@ from ._base import Storage if TYPE_CHECKING: - from crawlee.configuration import Configuration + from ..storage_clients import StorageClient T = TypeVar('T', bound='Storage') -StorageClientType = DatasetClient 
| KeyValueStoreClient | RequestQueueClient -"""Type alias for the storage client types.""" - -ClientOpener = Callable[..., Awaitable[StorageClientType]] -"""Type alias for the client opener function.""" - class StorageInstanceManager: """Manager for caching and managing storage instances. @@ -42,8 +36,7 @@ async def open_storage_instance( *, id: str | None, name: str | None, - configuration: Configuration, - client_opener: ClientOpener, + storage_client: StorageClient, ) -> T: """Open a storage instance with caching support. @@ -51,8 +44,7 @@ async def open_storage_instance( cls: The storage class to instantiate. id: Storage ID. name: Storage name. - configuration: Configuration object. - client_opener: Function to create the storage client. + storage_client: Storage client instance. Returns: The storage instance. From be54e460c81205d823352e8aa3b4bba151e3da27 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 29 Aug 2025 14:11:58 +0200 Subject: [PATCH 02/16] Draft for discussion. Rework of service_locator implicit setting of services, storages and storage creation. 
--- .../service_storage_configuration.py | 14 +- pyproject.toml | 2 +- src/crawlee/_autoscaling/snapshotter.py | 2 +- src/crawlee/_service_locator.py | 43 +++--- src/crawlee/crawlers/_basic/_basic_crawler.py | 29 ++-- .../pyproject.toml | 4 +- .../requirements.txt | 4 +- .../_file_system/_storage_client.py | 5 - .../_memory/_storage_client.py | 21 +-- src/crawlee/storages/_base.py | 1 - src/crawlee/storages/_dataset.py | 6 - src/crawlee/storages/_key_value_store.py | 1 - src/crawlee/storages/_request_queue.py | 1 - .../storages/_storage_instance_manager.py | 103 +++++++++----- tests/unit/_autoscaling/test_snapshotter.py | 2 +- tests/unit/conftest.py | 14 -- .../crawlers/_basic/test_basic_crawler.py | 127 +++++++++++++++++- .../_file_system/test_fs_dataset_client.py | 18 +-- .../_file_system/test_fs_kvs_client.py | 14 +- .../_file_system/test_fs_rq_client.py | 12 +- .../_memory/test_memory_dataset_client.py | 6 +- .../_memory/test_memory_kvs_client.py | 6 +- .../_memory/test_memory_rq_client.py | 6 +- tests/unit/storages/test_dataset.py | 36 ++--- tests/unit/storages/test_key_value_store.py | 39 ++---- tests/unit/storages/test_request_queue.py | 33 ++--- .../storages/test_storage_instance_manager.py | 83 ++++++++++++ tests/unit/test_service_locator.py | 22 ++- uv.lock | 4 +- 29 files changed, 409 insertions(+), 249 deletions(-) create mode 100644 tests/unit/storages/test_storage_instance_manager.py diff --git a/docs/guides/code_examples/service_locator/service_storage_configuration.py b/docs/guides/code_examples/service_locator/service_storage_configuration.py index 4c7370b77b..ae42474c29 100644 --- a/docs/guides/code_examples/service_locator/service_storage_configuration.py +++ b/docs/guides/code_examples/service_locator/service_storage_configuration.py @@ -1,7 +1,9 @@ import asyncio from datetime import timedelta +from crawlee import service_locator from crawlee.configuration import Configuration +from crawlee.storage_clients import MemoryStorageClient from 
crawlee.storages import Dataset @@ -11,10 +13,16 @@ async def main() -> None: headless=False, persist_state_interval=timedelta(seconds=30), ) + # Set the custom configuration as the global default configuration. + service_locator.set_configuration(configuration) - # Pass the configuration to the dataset (or other storage) when opening it. - dataset = await Dataset.open( - configuration=configuration, + # Use the global defaults when creating the dataset (or other storage). + dataset_1 = await Dataset.open() + + # Or set explicitly specific storage client if + # you do not want to rely on global defaults. + dataset_2 = await Dataset.open( + storage_client=MemoryStorageClient(configuration=configuration) ) diff --git a/pyproject.toml b/pyproject.toml index 050ac52211..9ce99bb21c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "crawlee" -version = "0.6.13" +version = "1.0.0rc1" description = "Crawlee for Python" authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }] license = { file = "LICENSE" } diff --git a/src/crawlee/_autoscaling/snapshotter.py b/src/crawlee/_autoscaling/snapshotter.py index 90d44db65e..55af9da1dd 100644 --- a/src/crawlee/_autoscaling/snapshotter.py +++ b/src/crawlee/_autoscaling/snapshotter.py @@ -113,7 +113,7 @@ def from_config(cls, config: Configuration | None = None) -> Snapshotter: Args: config: The `Configuration` instance. Uses the global (default) one if not provided. """ - config = service_locator.get_configuration() + config = config or service_locator.get_configuration() # Compute the maximum memory size based on the provided configuration. If `memory_mbytes` is provided, # it uses that value. 
Otherwise, it calculates the `max_memory_size` as a proportion of the system's diff --git a/src/crawlee/_service_locator.py b/src/crawlee/_service_locator.py index 3418e4d4e3..a3430dfc69 100644 --- a/src/crawlee/_service_locator.py +++ b/src/crawlee/_service_locator.py @@ -11,6 +11,10 @@ if TYPE_CHECKING: from crawlee.storages._storage_instance_manager import StorageInstanceManager +from logging import getLogger + +logger = getLogger(__name__) + @docs_group('Configuration') class ServiceLocator: @@ -19,23 +23,23 @@ class ServiceLocator: All services are initialized to its default value lazily. """ - def __init__(self) -> None: - self._configuration: Configuration | None = None - self._event_manager: EventManager | None = None - self._storage_client: StorageClient | None = None + def __init__( + self, + configuration: Configuration | None = None, + event_manager: EventManager | None = None, + storage_client: StorageClient | None = None, + ) -> None: + self._configuration = configuration + self._event_manager = event_manager + self._storage_client = storage_client self._storage_instance_manager: StorageInstanceManager | None = None - # Flags to check if the services were already set. - self._configuration_was_retrieved = False - self._event_manager_was_retrieved = False - self._storage_client_was_retrieved = False - def get_configuration(self) -> Configuration: """Get the configuration.""" if self._configuration is None: + logger.warning('No configuration set, implicitly creating and using default Configuration.') self._configuration = Configuration() - self._configuration_was_retrieved = True return self._configuration def set_configuration(self, configuration: Configuration) -> None: @@ -47,7 +51,10 @@ def set_configuration(self, configuration: Configuration) -> None: Raises: ServiceConflictError: If the configuration has already been retrieved before. 
""" - if self._configuration_was_retrieved: + if self._configuration is configuration: + # Same instance, no need to anything + return + if self._configuration: raise ServiceConflictError(Configuration, configuration, self._configuration) self._configuration = configuration @@ -55,13 +62,13 @@ def set_configuration(self, configuration: Configuration) -> None: def get_event_manager(self) -> EventManager: """Get the event manager.""" if self._event_manager is None: + logger.warning('No event manager set, implicitly creating and using default LocalEventManager.') self._event_manager = ( LocalEventManager().from_config(config=self._configuration) if self._configuration else LocalEventManager.from_config() ) - self._event_manager_was_retrieved = True return self._event_manager def set_event_manager(self, event_manager: EventManager) -> None: @@ -73,7 +80,10 @@ def set_event_manager(self, event_manager: EventManager) -> None: Raises: ServiceConflictError: If the event manager has already been retrieved before. """ - if self._event_manager_was_retrieved: + if self._event_manager is event_manager: + # Same instance, no need to anything + return + if self._event_manager: raise ServiceConflictError(EventManager, event_manager, self._event_manager) self._event_manager = event_manager @@ -81,9 +91,9 @@ def set_event_manager(self, event_manager: EventManager) -> None: def get_storage_client(self) -> StorageClient: """Get the storage client.""" if self._storage_client is None: + logger.warning('No storage client set, implicitly creating and using default FileSystemStorageClient.') self._storage_client = FileSystemStorageClient() - self._storage_client_was_retrieved = True return self._storage_client def set_storage_client(self, storage_client: StorageClient) -> None: @@ -95,7 +105,10 @@ def set_storage_client(self, storage_client: StorageClient) -> None: Raises: ServiceConflictError: If the storage client has already been retrieved before. 
""" - if self._storage_client_was_retrieved: + if self._storage_client is storage_client: + # Same instance, no need to anything + return + if self._storage_client: raise ServiceConflictError(StorageClient, storage_client, self._storage_client) self._storage_client = storage_client diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index a087ffebc8..545848d16d 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -27,6 +27,7 @@ from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level from crawlee._request import Request, RequestOptions, RequestState +from crawlee._service_locator import ServiceLocator from crawlee._types import ( BasicCrawlingContext, EnqueueLinksKwargs, @@ -346,14 +347,18 @@ def __init__( _logger: A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. 
""" - if configuration: - service_locator.set_configuration(configuration) - if storage_client: - service_locator.set_storage_client(storage_client) - if event_manager: - service_locator.set_event_manager(event_manager) + if not configuration: + configuration = service_locator.get_configuration() + if not storage_client: + storage_client = service_locator.get_storage_client() + if not event_manager: + event_manager = service_locator.get_event_manager() + + self._service_locator = ServiceLocator( + configuration=configuration, storage_client=storage_client, event_manager=event_manager + ) - config = service_locator.get_configuration() + config = self._service_locator.get_configuration() # Core components self._request_manager = request_manager @@ -548,7 +553,7 @@ async def _get_proxy_info(self, request: Request, session: Session | None) -> Pr async def get_request_manager(self) -> RequestManager: """Return the configured request manager. If none is configured, open and return the default request queue.""" if not self._request_manager: - self._request_manager = await RequestQueue.open() + self._request_manager = await RequestQueue.open(storage_client=self._service_locator.get_storage_client()) return self._request_manager @@ -559,7 +564,7 @@ async def get_dataset( name: str | None = None, ) -> Dataset: """Return the `Dataset` with the given ID or name. If none is provided, return the default one.""" - return await Dataset.open(id=id, name=name) + return await Dataset.open(id=id, name=name, storage_client=self._service_locator.get_storage_client()) async def get_key_value_store( self, @@ -568,7 +573,7 @@ async def get_key_value_store( name: str | None = None, ) -> KeyValueStore: """Return the `KeyValueStore` with the given ID or name. 
If none is provided, return the default KVS.""" - return await KeyValueStore.open(id=id, name=name) + return await KeyValueStore.open(id=id, name=name, storage_client=self._service_locator.get_storage_client()) def error_handler( self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] @@ -684,7 +689,7 @@ def sigint_handler() -> None: return final_statistics async def _run_crawler(self) -> None: - event_manager = service_locator.get_event_manager() + event_manager = self._service_locator.get_event_manager() self._crawler_state_rec_task.start() @@ -1520,7 +1525,7 @@ def _log_status_message(self, message: str, level: LogLevel = 'DEBUG') -> None: async def _crawler_state_task(self) -> None: """Emit a persist state event with the given migration status.""" - event_manager = service_locator.get_event_manager() + event_manager = self._service_locator.get_event_manager() current_state = self.statistics.state diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml index 5c2146104d..14705f3095 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml @@ -19,12 +19,12 @@ authors = [ readme = "README.md" requires-python = ">=3.10,<4.0" dependencies = [ - "crawlee[{{ extras|join(',') }}]", + "crawlee[{{ extras|join(',') }}]==1.0.0rc1", # % if cookiecutter.crawler_type == 'playwright-camoufox' "camoufox[geoip]~=0.4.5", # % endif # % if cookiecutter.enable_apify_integration - "apify", + "apify==3.0.0rc1", # % endif ] diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt index 1eebd53bcf..b969739df5 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +++ 
b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt
@@ -5,9 +5,9 @@ camoufox[geoip]~=0.4.5
 # % set extras = [cookiecutter.crawler_type]
 # % endif
 # % if cookiecutter.enable_apify_integration
-apify
+apify==3.0.0rc1
 # % endif
 # % if cookiecutter.http_client == 'curl-impersonate'
 # % do extras.append('curl-impersonate')
 # % endif
-crawlee[{{ extras | join(',') }}]
+crawlee[{{ extras | join(',') }}]==1.0.0rc1
diff --git a/src/crawlee/storage_clients/_file_system/_storage_client.py b/src/crawlee/storage_clients/_file_system/_storage_client.py
index 2b66650210..2aba740607 100644
--- a/src/crawlee/storage_clients/_file_system/_storage_client.py
+++ b/src/crawlee/storage_clients/_file_system/_storage_client.py
@@ -35,14 +35,9 @@ def __init__(self, configuration: Configuration | None = None) -> None:
         Args:
             configuration: Optional configuration instance to use with the storage client.
                 If not provided, the global configuration will be used.
-            purge_on_start: If true, all storages (datasets, key-value stores, request queues)
-                will be purged (deleted) when they are opened for the first time.
-                Use with caution as this will delete all existing data in the storages.
         """
         self._configuration = configuration or Configuration.get_global_configuration()
 
-
     @override
     async def create_dataset_client(
         self,
diff --git a/src/crawlee/storage_clients/_memory/_storage_client.py b/src/crawlee/storage_clients/_memory/_storage_client.py
index f4ac73e489..fece22233f 100644
--- a/src/crawlee/storage_clients/_memory/_storage_client.py
+++ b/src/crawlee/storage_clients/_memory/_storage_client.py
@@ -27,17 +27,24 @@ class MemoryStorageClient(StorageClient):
     operations where persistence is not required.
     """
 
+    def __init__(self, configuration: Configuration | None = None) -> None:
+        """Initialize the memory storage client.
+
+        Args:
+            configuration: Optional configuration instance to use with the storage client.
+ If not provided, the global configuration will be used. + """ + self._configuration = configuration or Configuration.get_global_configuration() + @override async def create_dataset_client( self, *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, ) -> MemoryDatasetClient: - configuration = configuration or Configuration.get_global_configuration() client = await MemoryDatasetClient.open(id=id, name=name) - await self._purge_if_needed(client, configuration) + await self._purge_if_needed(client, self._configuration) return client @override @@ -46,11 +53,9 @@ async def create_kvs_client( *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, ) -> MemoryKeyValueStoreClient: - configuration = configuration or Configuration.get_global_configuration() client = await MemoryKeyValueStoreClient.open(id=id, name=name) - await self._purge_if_needed(client, configuration) + await self._purge_if_needed(client, self._configuration) return client @override @@ -59,9 +64,7 @@ async def create_rq_client( *, id: str | None = None, name: str | None = None, - configuration: Configuration | None = None, ) -> MemoryRequestQueueClient: - configuration = configuration or Configuration.get_global_configuration() client = await MemoryRequestQueueClient.open(id=id, name=name) - await self._purge_if_needed(client, configuration) + await self._purge_if_needed(client, self._configuration) return client diff --git a/src/crawlee/storages/_base.py b/src/crawlee/storages/_base.py index f832d76ae8..aa312787ce 100644 --- a/src/crawlee/storages/_base.py +++ b/src/crawlee/storages/_base.py @@ -6,7 +6,6 @@ from crawlee._utils.docs import docs_group if TYPE_CHECKING: - from crawlee.configuration import Configuration from crawlee.storage_clients._base import StorageClient from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata diff --git a/src/crawlee/storages/_dataset.py 
b/src/crawlee/storages/_dataset.py index 4c972736dc..df6b165987 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -20,7 +20,6 @@ from typing_extensions import Unpack from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs - from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata @@ -291,7 +290,6 @@ async def export_to( to_kvs_id: str | None = None, to_kvs_name: str | None = None, to_kvs_storage_client: StorageClient | None = None, - to_kvs_configuration: Configuration | None = None, **kwargs: Unpack[ExportDataJsonKwargs], ) -> None: ... @@ -303,7 +301,6 @@ async def export_to( to_kvs_id: str | None = None, to_kvs_name: str | None = None, to_kvs_storage_client: StorageClient | None = None, - to_kvs_configuration: Configuration | None = None, **kwargs: Unpack[ExportDataCsvKwargs], ) -> None: ... @@ -314,7 +311,6 @@ async def export_to( to_kvs_id: str | None = None, to_kvs_name: str | None = None, to_kvs_storage_client: StorageClient | None = None, - to_kvs_configuration: Configuration | None = None, **kwargs: Any, ) -> None: """Export the entire dataset into a specified file stored under a key in a key-value store. @@ -332,13 +328,11 @@ async def export_to( to_kvs_name: Name of the key-value store to save the exported file. Specify only one of ID or name. to_kvs_storage_client: Storage client to use for the key-value store. - to_kvs_configuration: Configuration for the key-value store. kwargs: Additional parameters for the export operation, specific to the chosen content type. 
""" kvs = await KeyValueStore.open( id=to_kvs_id, name=to_kvs_name, - configuration=to_kvs_configuration, storage_client=to_kvs_storage_client, ) dst = StringIO() diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index 7c75d04133..e46ade9c0d 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -19,7 +19,6 @@ if TYPE_CHECKING: from collections.abc import AsyncIterator - from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecordMetadata diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index 9c0e876f0c..1d60d19895 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -18,7 +18,6 @@ from collections.abc import Sequence from crawlee import Request - from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import ProcessedRequest, RequestQueueMetadata diff --git a/src/crawlee/storages/_storage_instance_manager.py b/src/crawlee/storages/_storage_instance_manager.py index 82ea840fc9..14cae6223b 100644 --- a/src/crawlee/storages/_storage_instance_manager.py +++ b/src/crawlee/storages/_storage_instance_manager.py @@ -1,17 +1,45 @@ from __future__ import annotations -from collections.abc import Awaitable, Callable -from typing import TYPE_CHECKING, TypeVar, cast +from collections import defaultdict +from collections.abc import Callable, Coroutine +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, TypeVar, cast + +from mypy_extensions import DefaultNamedArg from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, 
RequestQueueClient -from ._base import Storage +from . import Dataset, KeyValueStore, RequestQueue if TYPE_CHECKING: - from ..storage_clients import StorageClient + from crawlee.storage_clients import StorageClient + + from ._base import Storage T = TypeVar('T', bound='Storage') +ClientOpener = Callable[ + [DefaultNamedArg(str | None, 'id'), DefaultNamedArg(str | None, 'name')], + Coroutine[Any, Any, DatasetClient | KeyValueStoreClient | RequestQueueClient], +] +"""Type alias for the client opener function.""" + + +@dataclass +class _StorageClientCache: + """Cache for specific storage client.""" + + by_id: defaultdict[type[Storage], defaultdict[str, Storage]] = field( + default_factory=lambda: defaultdict(lambda: defaultdict()) + ) + """Cache for storage instances by ID, separated by storage type.""" + by_name: defaultdict[type[Storage], defaultdict[str, Storage]] = field( + default_factory=lambda: defaultdict(lambda: defaultdict()) + ) + """Cache for storage instances by name, separated by storage type.""" + default_instances: defaultdict[type[Storage], Storage] = field(default_factory=lambda: defaultdict()) + """Cache for default instances of each storage type.""" + class StorageInstanceManager: """Manager for caching and managing storage instances. 
@@ -21,14 +49,7 @@ class StorageInstanceManager: """ def __init__(self) -> None: - self._cache_by_id = dict[type[Storage], dict[str, Storage]]() - """Cache for storage instances by ID, separated by storage type.""" - - self._cache_by_name = dict[type[Storage], dict[str, Storage]]() - """Cache for storage instances by name, separated by storage type.""" - - self._default_instances = dict[type[Storage], Storage]() - """Cache for default instances of each storage type.""" + self._cache_by_storage_client: dict[StorageClient, _StorageClientCache] = defaultdict(_StorageClientCache) async def open_storage_instance( self, @@ -56,42 +77,49 @@ async def open_storage_instance( raise ValueError('Only one of "id" or "name" can be specified, not both.') # Check for default instance - if id is None and name is None and cls in self._default_instances: - return cast('T', self._default_instances[cls]) + if id is None and name is None and cls in self._cache_by_storage_client[storage_client].default_instances: + return cast('T', self._cache_by_storage_client[storage_client].default_instances[cls]) # Check cache if id is not None: - type_cache_by_id = self._cache_by_id.get(cls, {}) + type_cache_by_id = self._cache_by_storage_client[storage_client].by_id[cls] if id in type_cache_by_id: cached_instance = type_cache_by_id[id] if isinstance(cached_instance, cls): return cached_instance if name is not None: - type_cache_by_name = self._cache_by_name.get(cls, {}) + type_cache_by_name = self._cache_by_storage_client[storage_client].by_name[cls] if name in type_cache_by_name: cached_instance = type_cache_by_name[name] if isinstance(cached_instance, cls): return cached_instance + client_opener: ClientOpener # Create new instance - client = await client_opener(id=id, name=name, configuration=configuration) + if cls is Dataset: + client_opener = storage_client.create_dataset_client + elif cls is KeyValueStore: + client_opener = storage_client.create_kvs_client + elif cls is RequestQueue: + 
client_opener = storage_client.create_rq_client + else: + raise ValueError(f'Unsupported storage class: {cls.__name__}') + + client = await client_opener(id=id, name=name) metadata = await client.get_metadata() instance = cls(client, metadata.id, metadata.name) # type: ignore[call-arg] instance_name = getattr(instance, 'name', None) # Cache the instance - type_cache_by_id = self._cache_by_id.setdefault(cls, {}) - type_cache_by_name = self._cache_by_name.setdefault(cls, {}) - - type_cache_by_id[instance.id] = instance + self._cache_by_storage_client[storage_client].by_id[cls][instance.id] = instance if instance_name is not None: - type_cache_by_name[instance_name] = instance + self._cache_by_storage_client[storage_client].by_name[cls][instance_name] = instance # Set as default if no id/name specified if id is None and name is None: - self._default_instances[cls] = instance + self._cache_by_storage_client[storage_client].default_instances[cls] = instance return instance @@ -104,22 +132,23 @@ def remove_from_cache(self, storage_instance: Storage) -> None: storage_type = type(storage_instance) # Remove from ID cache - type_cache_by_id = self._cache_by_id.get(storage_type, {}) - if storage_instance.id in type_cache_by_id: - del type_cache_by_id[storage_instance.id] - - # Remove from name cache - if storage_instance.name is not None: - type_cache_by_name = self._cache_by_name.get(storage_type, {}) - if storage_instance.name in type_cache_by_name: + for client_cache in self._cache_by_storage_client.values(): + type_cache_by_id = client_cache.by_id[storage_type] + if storage_instance.id in type_cache_by_id: + del type_cache_by_id[storage_instance.id] + + # Remove from name cache + type_cache_by_name = client_cache.by_name[storage_type] + if storage_instance.name in type_cache_by_name and storage_instance.name: del type_cache_by_name[storage_instance.name] - # Remove from default instances - if storage_type in self._default_instances and 
self._default_instances[storage_type] is storage_instance: - del self._default_instances[storage_type] + # Remove from default instances + if ( + storage_type in client_cache.default_instances + and client_cache.default_instances[storage_type] is storage_instance + ): + del client_cache.default_instances[storage_type] def clear_cache(self) -> None: """Clear all cached storage instances.""" - self._cache_by_id.clear() - self._cache_by_name.clear() - self._default_instances.clear() + self._cache_by_storage_client = defaultdict(_StorageClientCache) diff --git a/tests/unit/_autoscaling/test_snapshotter.py b/tests/unit/_autoscaling/test_snapshotter.py index e923eac421..3f13216f63 100644 --- a/tests/unit/_autoscaling/test_snapshotter.py +++ b/tests/unit/_autoscaling/test_snapshotter.py @@ -225,7 +225,7 @@ def test_snapshot_pruning_keeps_recent_records_unaffected(snapshotter: Snapshott def test_memory_load_evaluation_logs_warning_on_high_usage(caplog: pytest.LogCaptureFixture) -> None: - config = Configuration(memory_mbytes=ByteSize.from_gb(8).bytes) + config = Configuration(memory_mbytes=ByteSize.from_gb(8).to_mb()) snapshotter = Snapshotter.from_config(config) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index e57f190bc3..a1be6b4a71 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -51,28 +51,14 @@ def _prepare_test_env() -> None: # Set the environment variable for the local storage directory to the temporary path. monkeypatch.setenv('CRAWLEE_STORAGE_DIR', str(tmp_path)) - # Reset the flags in the service locator to indicate that no services are explicitly set. This ensures - # a clean state, as services might have been set during a previous test and not reset properly. - service_locator._configuration_was_retrieved = False - service_locator._storage_client_was_retrieved = False - service_locator._event_manager_was_retrieved = False - # Reset the services in the service locator. 
service_locator._configuration = None service_locator._event_manager = None service_locator._storage_client = None service_locator._storage_instance_manager = None - # Reset the retrieval flags - service_locator._configuration_was_retrieved = False - service_locator._event_manager_was_retrieved = False - service_locator._storage_client_was_retrieved = False - # Verify that the test environment was set up correctly. assert os.environ.get('CRAWLEE_STORAGE_DIR') == str(tmp_path) - assert service_locator._configuration_was_retrieved is False - assert service_locator._storage_client_was_retrieved is False - assert service_locator._event_manager_was_retrieved is False return _prepare_test_env diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index eff582e603..7a4372f1ab 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -10,6 +10,7 @@ from collections import Counter from dataclasses import dataclass from datetime import timedelta +from itertools import product from typing import TYPE_CHECKING, Any, Literal, cast from unittest.mock import AsyncMock, Mock, call, patch @@ -27,7 +28,7 @@ from crawlee.request_loaders import RequestList, RequestManagerTandem from crawlee.sessions import Session, SessionPool from crawlee.statistics import FinalStatistics -from crawlee.storage_clients import MemoryStorageClient +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient from crawlee.storages import Dataset, KeyValueStore, RequestQueue if TYPE_CHECKING: @@ -1040,7 +1041,7 @@ async def handler(context: BasicCrawlingContext) -> None: assert stats.requests_finished == 2 -async def test_sets_services() -> None: +async def test_services_no_side_effet_on_crawler_init() -> None: custom_configuration = Configuration() custom_event_manager = LocalEventManager.from_config(custom_configuration) custom_storage_client = MemoryStorageClient() 
@@ -1051,9 +1052,125 @@ async def test_sets_services() -> None: storage_client=custom_storage_client, ) - assert service_locator.get_configuration() is custom_configuration - assert service_locator.get_event_manager() is custom_event_manager - assert service_locator.get_storage_client() is custom_storage_client + assert service_locator.get_configuration() is not custom_configuration + assert service_locator.get_event_manager() is not custom_event_manager + assert service_locator.get_storage_client() is not custom_storage_client + + +async def test_crawler_uses_default_services() -> None: + custom_configuration = Configuration() + service_locator.set_configuration(custom_configuration) + + custom_event_manager = LocalEventManager.from_config(custom_configuration) + service_locator.set_event_manager(custom_event_manager) + + custom_storage_client = MemoryStorageClient() + service_locator.set_storage_client(custom_storage_client) + + basic_crawler = BasicCrawler() + + assert basic_crawler._service_locator.get_configuration() is custom_configuration + assert basic_crawler._service_locator.get_event_manager() is custom_event_manager + assert basic_crawler._service_locator.get_storage_client() is custom_storage_client + + +async def test_services_crawlers_can_use_different_services() -> None: + custom_configuration_1 = Configuration() + custom_event_manager_1 = LocalEventManager.from_config(custom_configuration_1) + custom_storage_client_1 = MemoryStorageClient() + + custom_configuration_2 = Configuration() + custom_event_manager_2 = LocalEventManager.from_config(custom_configuration_2) + custom_storage_client_2 = MemoryStorageClient() + + _ = BasicCrawler( + configuration=custom_configuration_1, + event_manager=custom_event_manager_1, + storage_client=custom_storage_client_1, + ) + + _ = BasicCrawler( + configuration=custom_configuration_2, + event_manager=custom_event_manager_2, + storage_client=custom_storage_client_2, + ) + + +async def 
test_crawler_uses_default_storages(tmp_path: Path) -> None: + configuration = Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + purge_on_start=True, + ) + service_locator.set_configuration(configuration) + + dataset = await Dataset.open() + kvs = await KeyValueStore.open() + rq = await RequestQueue.open() + + crawler = BasicCrawler() + + assert dataset is await crawler.get_dataset() + assert kvs is await crawler.get_key_value_store() + assert rq is await crawler.get_request_manager() + + +async def test_crawler_can_use_other_storages(tmp_path: Path) -> None: + configuration = Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + purge_on_start=True, + ) + service_locator.set_configuration(configuration) + + dataset = await Dataset.open() + kvs = await KeyValueStore.open() + rq = await RequestQueue.open() + + crawler = BasicCrawler(storage_client=MemoryStorageClient()) + + assert dataset is not await crawler.get_dataset() + assert kvs is not await crawler.get_key_value_store() + assert rq is not await crawler.get_request_manager() + + +async def test_crawler_can_use_other_storages_of_same_type(tmp_path: Path) -> None: + """Test that crawler can use non-global storage of the same type as global storage without conflicts""" + a_path = tmp_path / 'a' + b_path = tmp_path / 'b' + a_path.mkdir() + b_path.mkdir() + expected_paths = { + path / storage + for path, storage in product({a_path, b_path}, {'datasets', 'key_value_stores', 'request_queues'}) + } + + configuration_a = Configuration( + crawlee_storage_dir=str(a_path), # type: ignore[call-arg] + purge_on_start=True, + ) + configuration_b = Configuration( + crawlee_storage_dir=str(b_path), # type: ignore[call-arg] + purge_on_start=True, + ) + + # Set global configuration + service_locator.set_configuration(configuration_a) + service_locator.set_storage_client(FileSystemStorageClient()) + # Create storages based on the global services + dataset = await Dataset.open() 
+ kvs = await KeyValueStore.open() + rq = await RequestQueue.open() + + # Set the crawler to use different storage client + crawler = BasicCrawler(storage_client=FileSystemStorageClient(configuration=configuration_b)) + + # Assert that the storages are different + assert dataset is not await crawler.get_dataset() + assert kvs is not await crawler.get_key_value_store() + assert rq is not await crawler.get_request_manager() + + # Assert that all storages exists on the filesystem + for path in expected_paths: + assert path.is_dir() async def test_allows_storage_client_overwrite_before_run(monkeypatch: pytest.MonkeyPatch) -> None: diff --git a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py index c5f31f144e..28aa16a286 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py @@ -27,9 +27,8 @@ def configuration(tmp_path: Path) -> Configuration: @pytest.fixture async def dataset_client(configuration: Configuration) -> AsyncGenerator[FileSystemDatasetClient, None]: """A fixture for a file system dataset client.""" - client = await FileSystemStorageClient().create_dataset_client( + client = await FileSystemStorageClient(configuration=configuration).create_dataset_client( name='test_dataset', - configuration=configuration, ) yield client await client.drop() @@ -37,9 +36,8 @@ async def dataset_client(configuration: Configuration) -> AsyncGenerator[FileSys async def test_file_and_directory_creation(configuration: Configuration) -> None: """Test that file system dataset creates proper files and directories.""" - client = await FileSystemStorageClient().create_dataset_client( + client = await FileSystemStorageClient(configuration=configuration).create_dataset_client( name='new_dataset', - configuration=configuration, ) # Verify files were created @@ -137,15 +135,12 @@ async def 
test_metadata_file_updates(dataset_client: FileSystemDatasetClient) -> assert metadata_json['item_count'] == 1 -async def test_data_persistence_across_reopens(configuration: Configuration) -> None: +async def test_data_persistence_across_reopens() -> None: """Test that data persists correctly when reopening the same dataset.""" storage_client = FileSystemStorageClient() # Create dataset and add data - original_client = await storage_client.create_dataset_client( - name='persistence-test', - configuration=configuration, - ) + original_client = await storage_client.create_dataset_client(name='persistence-test') test_data = {'test_item': 'test_value', 'id': 123} await original_client.push_data(test_data) @@ -153,10 +148,7 @@ async def test_data_persistence_across_reopens(configuration: Configuration) -> dataset_id = (await original_client.get_metadata()).id # Reopen by ID and verify data persists - reopened_client = await storage_client.create_dataset_client( - id=dataset_id, - configuration=configuration, - ) + reopened_client = await storage_client.create_dataset_client(id=dataset_id) data = await reopened_client.get_data() assert len(data.items) == 1 diff --git a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py index c5bfa96c47..b8d98d309e 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py @@ -27,20 +27,14 @@ def configuration(tmp_path: Path) -> Configuration: @pytest.fixture async def kvs_client(configuration: Configuration) -> AsyncGenerator[FileSystemKeyValueStoreClient, None]: """A fixture for a file system key-value store client.""" - client = await FileSystemStorageClient().create_kvs_client( - name='test_kvs', - configuration=configuration, - ) + client = await FileSystemStorageClient(configuration=configuration).create_kvs_client(name='test_kvs') yield client await client.drop() async def 
test_file_and_directory_creation(configuration: Configuration) -> None: """Test that file system KVS creates proper files and directories.""" - client = await FileSystemStorageClient().create_kvs_client( - name='new_kvs', - configuration=configuration, - ) + client = await FileSystemStorageClient(configuration=configuration).create_kvs_client(name='new_kvs') # Verify files were created assert client.path_to_kvs.exists() @@ -184,12 +178,11 @@ async def test_metadata_file_updates(kvs_client: FileSystemKeyValueStoreClient) async def test_data_persistence_across_reopens(configuration: Configuration) -> None: """Test that data persists correctly when reopening the same KVS.""" - storage_client = FileSystemStorageClient() + storage_client = FileSystemStorageClient(configuration=configuration) # Create KVS and add data original_client = await storage_client.create_kvs_client( name='persistence-test', - configuration=configuration, ) test_key = 'persistent-key' @@ -201,7 +194,6 @@ async def test_data_persistence_across_reopens(configuration: Configuration) -> # Reopen by ID and verify data persists reopened_client = await storage_client.create_kvs_client( id=kvs_id, - configuration=configuration, ) record = await reopened_client.get_value(key=test_key) diff --git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index 0be182fcd8..2ec3fdf1fe 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -27,9 +27,8 @@ def configuration(tmp_path: Path) -> Configuration: @pytest.fixture async def rq_client(configuration: Configuration) -> AsyncGenerator[FileSystemRequestQueueClient, None]: """A fixture for a file system request queue client.""" - client = await FileSystemStorageClient().create_rq_client( + client = await FileSystemStorageClient(configuration=configuration).create_rq_client( name='test_request_queue', - 
configuration=configuration, ) yield client await client.drop() @@ -37,10 +36,7 @@ async def rq_client(configuration: Configuration) -> AsyncGenerator[FileSystemRe async def test_file_and_directory_creation(configuration: Configuration) -> None: """Test that file system RQ creates proper files and directories.""" - client = await FileSystemStorageClient().create_rq_client( - name='new_request_queue', - configuration=configuration, - ) + client = await FileSystemStorageClient(configuration=configuration).create_rq_client(name='new_request_queue') # Verify files were created assert client.path_to_rq.exists() @@ -137,12 +133,11 @@ async def test_metadata_file_updates(rq_client: FileSystemRequestQueueClient) -> async def test_data_persistence_across_reopens(configuration: Configuration) -> None: """Test that requests persist correctly when reopening the same RQ.""" - storage_client = FileSystemStorageClient() + storage_client = FileSystemStorageClient(configuration=configuration) # Create RQ and add requests original_client = await storage_client.create_rq_client( name='persistence-test', - configuration=configuration, ) test_requests = [ @@ -156,7 +151,6 @@ async def test_data_persistence_across_reopens(configuration: Configuration) -> # Reopen by ID and verify requests persist reopened_client = await storage_client.create_rq_client( id=rq_id, - configuration=configuration, ) metadata = await reopened_client.get_metadata() diff --git a/tests/unit/storage_clients/_memory/test_memory_dataset_client.py b/tests/unit/storage_clients/_memory/test_memory_dataset_client.py index 8cc846b0f4..31aa61f0ac 100644 --- a/tests/unit/storage_clients/_memory/test_memory_dataset_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_dataset_client.py @@ -27,9 +27,8 @@ async def test_memory_specific_purge_behavior() -> None: configuration = Configuration(purge_on_start=True) # Create dataset and add data - dataset_client1 = await MemoryStorageClient().create_dataset_client( + 
dataset_client1 = await MemoryStorageClient(configuration=configuration).create_dataset_client( name='test_purge_dataset', - configuration=configuration, ) await dataset_client1.push_data({'item': 'initial data'}) @@ -38,9 +37,8 @@ async def test_memory_specific_purge_behavior() -> None: assert len(items.items) == 1 # Reopen with same storage client instance - dataset_client2 = await MemoryStorageClient().create_dataset_client( + dataset_client2 = await MemoryStorageClient(configuration=configuration).create_dataset_client( name='test_purge_dataset', - configuration=configuration, ) # Verify data was purged (memory storage specific behavior) diff --git a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py index 463fb2a14c..f8b8e755b2 100644 --- a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py @@ -27,9 +27,8 @@ async def test_memory_specific_purge_behavior() -> None: configuration = Configuration(purge_on_start=True) # Create KVS and add data - kvs_client1 = await MemoryStorageClient().create_kvs_client( + kvs_client1 = await MemoryStorageClient(configuration=configuration).create_kvs_client( name='test_purge_kvs', - configuration=configuration, ) await kvs_client1.set_value(key='test-key', value='initial value') @@ -39,9 +38,8 @@ async def test_memory_specific_purge_behavior() -> None: assert record.value == 'initial value' # Reopen with same storage client instance - kvs_client2 = await MemoryStorageClient().create_kvs_client( + kvs_client2 = await MemoryStorageClient(configuration=configuration).create_kvs_client( name='test_purge_kvs', - configuration=configuration, ) # Verify value was purged (memory storage specific behavior) diff --git a/tests/unit/storage_clients/_memory/test_memory_rq_client.py b/tests/unit/storage_clients/_memory/test_memory_rq_client.py index 7877d8af79..89b98cfac5 100644 --- 
a/tests/unit/storage_clients/_memory/test_memory_rq_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_rq_client.py @@ -28,9 +28,8 @@ async def test_memory_specific_purge_behavior() -> None: configuration = Configuration(purge_on_start=True) # Create RQ and add data - rq_client1 = await MemoryStorageClient().create_rq_client( + rq_client1 = await MemoryStorageClient(configuration=configuration).create_rq_client( name='test_purge_rq', - configuration=configuration, ) request = Request.from_url(url='https://example.com/initial') await rq_client1.add_batch_of_requests([request]) @@ -39,9 +38,8 @@ async def test_memory_specific_purge_behavior() -> None: assert await rq_client1.is_empty() is False # Reopen with same storage client instance - rq_client2 = await MemoryStorageClient().create_rq_client( + rq_client2 = await MemoryStorageClient(configuration=configuration).create_rq_client( name='test_purge_rq', - configuration=configuration, ) # Verify queue was purged (memory storage specific behavior) diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py index b4f75bc6b4..83489ff1e0 100644 --- a/tests/unit/storages/test_dataset.py +++ b/tests/unit/storages/test_dataset.py @@ -7,6 +7,7 @@ import pytest +from crawlee._service_locator import service_locator from crawlee.configuration import Configuration from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient from crawlee.storages import Dataset, KeyValueStore @@ -20,32 +21,35 @@ @pytest.fixture(params=['memory', 'file_system']) -def storage_client(request: pytest.FixtureRequest) -> StorageClient: +def storage_client(request: pytest.FixtureRequest, configuration: Configuration) -> StorageClient: """Parameterized fixture to test with different storage clients.""" + storage_client: StorageClient if request.param == 'memory': - return MemoryStorageClient() - - return FileSystemStorageClient() + storage_client = MemoryStorageClient(configuration=configuration) 
+ else: + storage_client = FileSystemStorageClient(configuration=configuration) + service_locator.set_storage_client(storage_client) + return storage_client @pytest.fixture def configuration(tmp_path: Path) -> Configuration: """Provide a configuration with a temporary storage directory.""" - return Configuration( + configuration = Configuration( crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] purge_on_start=True, ) + service_locator.set_configuration(configuration) + return configuration @pytest.fixture async def dataset( storage_client: StorageClient, - configuration: Configuration, ) -> AsyncGenerator[Dataset, None]: """Fixture that provides a dataset instance for each test.""" dataset = await Dataset.open( storage_client=storage_client, - configuration=configuration, ) yield dataset @@ -54,13 +58,11 @@ async def dataset( async def test_open_creates_new_dataset( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test that open() creates a new dataset with proper metadata.""" dataset = await Dataset.open( name='new_dataset', storage_client=storage_client, - configuration=configuration, ) # Verify dataset properties @@ -75,13 +77,11 @@ async def test_open_creates_new_dataset( async def test_reopen_default( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test reopening a dataset with default parameters.""" # Create a first dataset instance with default parameters dataset_1 = await Dataset.open( storage_client=storage_client, - configuration=configuration, ) # Verify default properties @@ -97,7 +97,6 @@ async def test_reopen_default( # Reopen the same dataset dataset_2 = await Dataset.open( storage_client=storage_client, - configuration=configuration, ) # Verify both instances reference the same dataset @@ -116,14 +115,12 @@ async def test_reopen_default( async def test_open_by_id( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test opening a dataset by its ID.""" # First 
create a dataset by name dataset1 = await Dataset.open( name='dataset_by_id_test', storage_client=storage_client, - configuration=configuration, ) # Add some data to identify it @@ -134,7 +131,6 @@ async def test_open_by_id( dataset2 = await Dataset.open( id=dataset1.id, storage_client=storage_client, - configuration=configuration, ) # Verify it's the same dataset @@ -153,13 +149,11 @@ async def test_open_by_id( async def test_open_existing_dataset( dataset: Dataset, - storage_client: StorageClient, ) -> None: """Test that open() loads an existing dataset correctly.""" # Open the same dataset again reopened_dataset = await Dataset.open( name=dataset.name, - storage_client=storage_client, ) # Verify dataset properties @@ -175,7 +169,6 @@ async def test_open_existing_dataset( async def test_open_with_id_and_name( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test that open() raises an error when both id and name are provided.""" with pytest.raises(ValueError, match='Only one of "id" or "name" can be specified'): @@ -183,7 +176,6 @@ async def test_open_with_id_and_name( id='some-id', name='some-name', storage_client=storage_client, - configuration=configuration, ) @@ -395,13 +387,11 @@ async def test_list_items_with_options(dataset: Dataset) -> None: async def test_drop( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test dropping a dataset removes it from cache and clears its data.""" dataset = await Dataset.open( name='drop_test', storage_client=storage_client, - configuration=configuration, ) # Add some data @@ -414,7 +404,6 @@ async def test_drop( new_dataset = await Dataset.open( name='drop_test', storage_client=storage_client, - configuration=configuration, ) result = await new_dataset.get_data() @@ -430,7 +419,6 @@ async def test_export_to_json( # Create a key-value store for export kvs = await KeyValueStore.open( name='export_kvs', - storage_client=storage_client, ) # Add some items to the dataset @@ 
-530,14 +518,12 @@ async def test_large_dataset(dataset: Dataset) -> None: async def test_purge( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test purging a dataset removes all data but keeps the dataset itself.""" # First create a dataset dataset = await Dataset.open( name='purge_test_dataset', storage_client=storage_client, - configuration=configuration, ) # Add some data diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index 25bbcb4fc0..1bb3de81bf 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -8,6 +8,7 @@ import pytest +from crawlee import service_locator from crawlee.configuration import Configuration from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient from crawlee.storages import KeyValueStore @@ -20,32 +21,35 @@ @pytest.fixture(params=['memory', 'file_system']) -def storage_client(request: pytest.FixtureRequest) -> StorageClient: +def storage_client(request: pytest.FixtureRequest, configuration: Configuration) -> StorageClient: """Parameterized fixture to test with different storage clients.""" + storage_client: StorageClient if request.param == 'memory': - return MemoryStorageClient() - - return FileSystemStorageClient() + storage_client = MemoryStorageClient(configuration=configuration) + else: + storage_client = FileSystemStorageClient(configuration=configuration) + service_locator.set_storage_client(storage_client) + return storage_client @pytest.fixture def configuration(tmp_path: Path) -> Configuration: """Provide a configuration with a temporary storage directory.""" - return Configuration( + configuration = Configuration( crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] purge_on_start=True, ) + service_locator.set_configuration(configuration) + return configuration @pytest.fixture async def kvs( storage_client: StorageClient, - configuration: Configuration, ) -> 
AsyncGenerator[KeyValueStore, None]: """Fixture that provides a key-value store instance for each test.""" kvs = await KeyValueStore.open( storage_client=storage_client, - configuration=configuration, ) yield kvs @@ -54,13 +58,11 @@ async def kvs( async def test_open_creates_new_kvs( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test that open() creates a new key-value store with proper metadata.""" kvs = await KeyValueStore.open( name='new_kvs', storage_client=storage_client, - configuration=configuration, ) # Verify key-value store properties @@ -91,7 +93,6 @@ async def test_open_existing_kvs( async def test_open_with_id_and_name( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test that open() raises an error when both id and name are provided.""" with pytest.raises(ValueError, match='Only one of "id" or "name" can be specified'): @@ -99,20 +100,17 @@ async def test_open_with_id_and_name( id='some-id', name='some-name', storage_client=storage_client, - configuration=configuration, ) async def test_open_by_id( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test opening a key-value store by its ID.""" # First create a key-value store by name kvs1 = await KeyValueStore.open( name='kvs_by_id_test', storage_client=storage_client, - configuration=configuration, ) # Add some data to identify it @@ -122,7 +120,6 @@ async def test_open_by_id( kvs2 = await KeyValueStore.open( id=kvs1.id, storage_client=storage_client, - configuration=configuration, ) # Verify it's the same key-value store @@ -294,13 +291,11 @@ async def test_iterate_keys_with_limit(kvs: KeyValueStore) -> None: async def test_drop( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test dropping a key-value store removes it from cache and clears its data.""" kvs = await KeyValueStore.open( name='drop_test', storage_client=storage_client, - configuration=configuration, ) # Add some data @@ 
-313,7 +308,6 @@ async def test_drop( new_kvs = await KeyValueStore.open( name='drop_test', storage_client=storage_client, - configuration=configuration, ) # Attempt to get a previously stored value @@ -324,13 +318,11 @@ async def test_drop( async def test_reopen_default( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test reopening the default key-value store.""" # Open the default key-value store kvs1 = await KeyValueStore.open( storage_client=storage_client, - configuration=configuration, ) # Set a value @@ -339,7 +331,6 @@ async def test_reopen_default( # Open the default key-value store again kvs2 = await KeyValueStore.open( storage_client=storage_client, - configuration=configuration, ) # Verify they are the same store @@ -416,16 +407,16 @@ async def test_key_with_special_characters(kvs: KeyValueStore) -> None: assert await kvs.get_value(key=special_key) is None -async def test_data_persistence_on_reopen(configuration: Configuration) -> None: +async def test_data_persistence_on_reopen() -> None: """Test that data persists when reopening a KeyValueStore.""" - kvs1 = await KeyValueStore.open(configuration=configuration) + kvs1 = await KeyValueStore.open() await kvs1.set_value('key_123', 'value_123') result1 = await kvs1.get_value('key_123') assert result1 == 'value_123' - kvs2 = await KeyValueStore.open(configuration=configuration) + kvs2 = await KeyValueStore.open() result2 = await kvs2.get_value('key_123') assert result2 == 'value_123' @@ -439,14 +430,12 @@ async def test_data_persistence_on_reopen(configuration: Configuration) -> None: async def test_purge( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test purging a key-value store removes all values but keeps the store itself.""" # First create a key-value store kvs = await KeyValueStore.open( name='purge_test_kvs', storage_client=storage_client, - configuration=configuration, ) # Add some values diff --git 
a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index 7227504a95..e28faa23e8 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -21,32 +21,35 @@ @pytest.fixture(params=['memory', 'file_system']) -def storage_client(request: pytest.FixtureRequest) -> StorageClient: +def storage_client(request: pytest.FixtureRequest, configuration: Configuration) -> StorageClient: """Parameterized fixture to test with different storage clients.""" + storage_client: StorageClient if request.param == 'memory': - return MemoryStorageClient() - - return FileSystemStorageClient() + storage_client = MemoryStorageClient(configuration=configuration) + else: + storage_client = FileSystemStorageClient(configuration=configuration) + service_locator.set_storage_client(storage_client) + return storage_client @pytest.fixture def configuration(tmp_path: Path) -> Configuration: """Provide a configuration with a temporary storage directory.""" - return Configuration( + configuration = Configuration( crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] purge_on_start=True, ) + service_locator.set_configuration(configuration) + return configuration @pytest.fixture async def rq( storage_client: StorageClient, - configuration: Configuration, ) -> AsyncGenerator[RequestQueue, None]: """Fixture that provides a request queue instance for each test.""" rq = await RequestQueue.open( storage_client=storage_client, - configuration=configuration, ) yield rq @@ -55,13 +58,11 @@ async def rq( async def test_open_creates_new_rq( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test that open() creates a new request queue with proper metadata.""" rq = await RequestQueue.open( name='new_request_queue', storage_client=storage_client, - configuration=configuration, ) # Verify request queue properties @@ -96,7 +97,6 @@ async def test_open_existing_rq( async def test_open_with_id_and_name( 
storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test that open() raises an error when both id and name are provided.""" with pytest.raises(ValueError, match='Only one of "id" or "name" can be specified'): @@ -104,20 +104,17 @@ async def test_open_with_id_and_name( id='some-id', name='some-name', storage_client=storage_client, - configuration=configuration, ) async def test_open_by_id( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test opening a request queue by its ID.""" # First create a request queue by name rq1 = await RequestQueue.open( name='rq_by_id_test', storage_client=storage_client, - configuration=configuration, ) # Add a request to identify it @@ -127,7 +124,6 @@ async def test_open_by_id( rq2 = await RequestQueue.open( id=rq1.id, storage_client=storage_client, - configuration=configuration, ) # Verify it's the same request queue @@ -498,13 +494,11 @@ async def test_reclaim_non_existent_request(rq: RequestQueue) -> None: async def test_drop( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test dropping a request queue removes it from cache and clears its data.""" rq = await RequestQueue.open( name='drop_test', storage_client=storage_client, - configuration=configuration, ) # Add a request @@ -517,7 +511,6 @@ async def test_drop( new_rq = await RequestQueue.open( name='drop_test', storage_client=storage_client, - configuration=configuration, ) # Verify the queue is empty @@ -530,7 +523,6 @@ async def test_drop( async def test_reopen_default( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test reopening the default request queue.""" # First clean up any storage instance caches @@ -540,7 +532,6 @@ async def test_reopen_default( # Open the default request queue rq1 = await RequestQueue.open( storage_client=storage_client, - configuration=configuration, ) # If a request queue already exists (due to previous test run), purge it to start fresh 
@@ -551,7 +542,6 @@ async def test_reopen_default( await rq1.drop() rq1 = await RequestQueue.open( storage_client=storage_client, - configuration=configuration, ) # Verify we're starting fresh @@ -568,7 +558,6 @@ async def test_reopen_default( # Open the default request queue again rq2 = await RequestQueue.open( storage_client=storage_client, - configuration=configuration, ) # Verify they are the same queue @@ -591,14 +580,12 @@ async def test_reopen_default( async def test_purge( storage_client: StorageClient, - configuration: Configuration, ) -> None: """Test purging a request queue removes all requests but keeps the queue itself.""" # First create a request queue rq = await RequestQueue.open( name='purge_test_queue', storage_client=storage_client, - configuration=configuration, ) # Add some requests diff --git a/tests/unit/storages/test_storage_instance_manager.py b/tests/unit/storages/test_storage_instance_manager.py new file mode 100644 index 0000000000..9a6cc955a2 --- /dev/null +++ b/tests/unit/storages/test_storage_instance_manager.py @@ -0,0 +1,83 @@ +from pathlib import Path + +from crawlee import service_locator +from crawlee.configuration import Configuration +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient +from crawlee.storages import Dataset, KeyValueStore + + +async def test_unique_storage_by_storage_client(tmp_path: Path) -> None: + config = Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + purge_on_start=True, + ) + + kvs1 = await KeyValueStore.open(storage_client=MemoryStorageClient(configuration=config)) + kvs2 = await KeyValueStore.open(storage_client=FileSystemStorageClient(configuration=config)) + assert kvs1 is not kvs2 + + +async def test_unique_storage_by_storage_client_of_same_type(tmp_path: Path) -> None: + config = Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + purge_on_start=True, + ) + + kvs1 = await 
KeyValueStore.open(storage_client=MemoryStorageClient(configuration=config)) + kvs2 = await KeyValueStore.open(storage_client=MemoryStorageClient(configuration=config)) + assert kvs1 is not kvs2 + + +async def test_unique_storage_by_storage_type(tmp_path: Path) -> None: + config = Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + purge_on_start=True, + ) + storage_client = MemoryStorageClient(configuration=config) + + kvs = await KeyValueStore.open(storage_client=storage_client) + dataset = await Dataset.open(storage_client=storage_client) + assert kvs is not dataset + + +async def test_unique_storage_by_name(tmp_path: Path) -> None: + """Test that StorageInstanceManager support different storage clients at the same time.""" + config = Configuration( + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] + purge_on_start=True, + ) + storage_client = FileSystemStorageClient(configuration=config) + + kvs1 = await KeyValueStore.open(storage_client=storage_client, name='kvs1') + kvs2 = await KeyValueStore.open(storage_client=storage_client, name='kvs2') + assert kvs1 is not kvs2 + + +async def test_identical_storage_default_config() -> None: + """Test that StorageInstanceManager correctly caches storage based on the storage client.""" + storage_client = MemoryStorageClient() + + kvs1 = await KeyValueStore.open(storage_client=storage_client) + kvs2 = await KeyValueStore.open(storage_client=storage_client) + assert kvs1 is kvs2 + + +async def test_identical_storage_default_storage() -> None: + """Test that StorageInstanceManager correctly caches storage based on the storage client.""" + kvs1 = await KeyValueStore.open() + kvs2 = await KeyValueStore.open() + assert kvs1 is kvs2 + + +async def test_identical_storage_clear_cache() -> None: + kvs1 = await KeyValueStore.open() + service_locator.storage_instance_manager.clear_cache() + kvs2 = await KeyValueStore.open() + assert kvs1 is not kvs2 + + +async def 
test_identical_storage_remove_from_cache() -> None: + kvs1 = await KeyValueStore.open() + service_locator.storage_instance_manager.remove_from_cache(kvs1) + kvs2 = await KeyValueStore.open() + assert kvs1 is not kvs2 diff --git a/tests/unit/test_service_locator.py b/tests/unit/test_service_locator.py index a4ed0620dd..15f5ee8aed 100644 --- a/tests/unit/test_service_locator.py +++ b/tests/unit/test_service_locator.py @@ -22,13 +22,13 @@ def test_custom_configuration() -> None: assert config is custom_config -def test_configuration_overwrite() -> None: +def test_configuration_overwrite_not_possible() -> None: default_config = Configuration() service_locator.set_configuration(default_config) custom_config = Configuration(default_browser_path='custom_path') - service_locator.set_configuration(custom_config) - assert service_locator.get_configuration() is custom_config + with pytest.raises(ServiceConflictError): + service_locator.set_configuration(custom_config) def test_configuration_conflict() -> None: @@ -51,15 +51,13 @@ def test_custom_event_manager() -> None: assert event_manager is custom_event_manager -def test_event_manager_overwrite() -> None: +def test_event_manager_overwrite_not_possible() -> None: custom_event_manager = LocalEventManager() service_locator.set_event_manager(custom_event_manager) another_custom_event_manager = LocalEventManager() - service_locator.set_event_manager(another_custom_event_manager) - - assert custom_event_manager != another_custom_event_manager - assert service_locator.get_event_manager() is another_custom_event_manager + with pytest.raises(ServiceConflictError): + service_locator.set_event_manager(another_custom_event_manager) def test_event_manager_conflict() -> None: @@ -82,15 +80,13 @@ def test_custom_storage_client() -> None: assert storage_client is custom_storage_client -def test_storage_client_overwrite() -> None: +def test_storage_client_overwrite_not_possible() -> None: custom_storage_client = MemoryStorageClient() 
service_locator.set_storage_client(custom_storage_client) another_custom_storage_client = MemoryStorageClient() - service_locator.set_storage_client(another_custom_storage_client) - - assert custom_storage_client != another_custom_storage_client - assert service_locator.get_storage_client() is another_custom_storage_client + with pytest.raises(ServiceConflictError): + service_locator.set_storage_client(another_custom_storage_client) def test_storage_client_conflict() -> None: diff --git a/uv.lock b/uv.lock index a58e64f995..7e502a6b21 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13'", @@ -577,7 +577,7 @@ toml = [ [[package]] name = "crawlee" -version = "0.6.13" +version = "1.0.0rc1" source = { editable = "." } dependencies = [ { name = "cachetools" }, From 9c6a5217a747905a4cece39f9202dd022e55e17c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 29 Aug 2025 14:33:18 +0200 Subject: [PATCH 03/16] Remove temp edits for e2e tests --- pyproject.toml | 2 +- src/crawlee/_service_locator.py | 18 ++++++++++++------ .../pyproject.toml | 4 ++-- .../requirements.txt | 4 ++-- uv.lock | 2 +- 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9ce99bb21c..050ac52211 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "crawlee" -version = "1.0.0rc1" +version = "0.6.13" description = "Crawlee for Python" authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }] license = { file = "LICENSE" } diff --git a/src/crawlee/_service_locator.py b/src/crawlee/_service_locator.py index a3430dfc69..a4256e1eb4 100644 --- a/src/crawlee/_service_locator.py +++ b/src/crawlee/_service_locator.py @@ -63,11 +63,12 @@ def get_event_manager(self) -> EventManager: """Get the event manager.""" if self._event_manager is None: 
logger.warning('No event manager set, implicitly creating and using default LocalEventManager.') - self._event_manager = ( - LocalEventManager().from_config(config=self._configuration) - if self._configuration - else LocalEventManager.from_config() - ) + if self._configuration is None: + logger.warning( + 'Implicit creation of event manager will implicitly set configuration as side effect. ' + 'It is advised to explicitly first set the configuration instead.' + ) + self._event_manager = LocalEventManager().from_config(config=self._configuration) return self._event_manager @@ -92,7 +93,12 @@ def get_storage_client(self) -> StorageClient: """Get the storage client.""" if self._storage_client is None: logger.warning('No storage client set, implicitly creating and using default FileSystemStorageClient.') - self._storage_client = FileSystemStorageClient() + if self._configuration is None: + logger.warning( + 'Implicit creation of storage client will implicitly set configuration as side effect. ' + 'It is advised to explicitly first set the configuration instead.' 
+ ) + self._storage_client = FileSystemStorageClient(configuration=self._configuration) return self._storage_client diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml index 14705f3095..5c2146104d 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml @@ -19,12 +19,12 @@ authors = [ readme = "README.md" requires-python = ">=3.10,<4.0" dependencies = [ - "crawlee[{{ extras|join(',') }}]==1.0.0rc1", + "crawlee[{{ extras|join(',') }}]", # % if cookiecutter.crawler_type == 'playwright-camoufox' "camoufox[geoip]~=0.4.5", # % endif # % if cookiecutter.enable_apify_integration - "apify==3.0.0rc1", + "apify", # % endif ] diff --git a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt index b969739df5..1eebd53bcf 100644 --- a/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +++ b/src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt @@ -5,9 +5,9 @@ camoufox[geoip]~=0.4.5 # % set extras = [cookiecutter.crawler_type] # % endif # % if cookiecutter.enable_apify_integration -apify==3.0.0rc1 +apify # % endif # % if cookiecutter.http_client == 'curl-impersonate' # % do extras.append('curl-impersonate') # % endif -crawlee[{{ extras | join(',') }}]==1.0.0rc1 +crawlee[{{ extras | join(',') }}] diff --git a/uv.lock b/uv.lock index 7e502a6b21..7ff212b282 100644 --- a/uv.lock +++ b/uv.lock @@ -577,7 +577,7 @@ toml = [ [[package]] name = "crawlee" -version = "1.0.0rc1" +version = "0.6.13" source = { editable = "." 
} dependencies = [ { name = "cachetools" }, From 86c42444123a9b08ad7a7aedc3713308712c7d2b Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 29 Aug 2025 16:08:55 +0200 Subject: [PATCH 04/16] Simplify types --- .../storages/_storage_instance_manager.py | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/src/crawlee/storages/_storage_instance_manager.py b/src/crawlee/storages/_storage_instance_manager.py index 14cae6223b..98faca65ca 100644 --- a/src/crawlee/storages/_storage_instance_manager.py +++ b/src/crawlee/storages/_storage_instance_manager.py @@ -1,29 +1,19 @@ from __future__ import annotations from collections import defaultdict -from collections.abc import Callable, Coroutine from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, TypeVar, cast - -from mypy_extensions import DefaultNamedArg - -from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient +from typing import TYPE_CHECKING, TypeVar, cast from . 
import Dataset, KeyValueStore, RequestQueue if TYPE_CHECKING: from crawlee.storage_clients import StorageClient + from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient from ._base import Storage T = TypeVar('T', bound='Storage') -ClientOpener = Callable[ - [DefaultNamedArg(str | None, 'id'), DefaultNamedArg(str | None, 'name')], - Coroutine[Any, Any, DatasetClient | KeyValueStoreClient | RequestQueueClient], -] -"""Type alias for the client opener function.""" - @dataclass class _StorageClientCache: @@ -95,18 +85,17 @@ async def open_storage_instance( if isinstance(cached_instance, cls): return cached_instance - client_opener: ClientOpener + client: KeyValueStoreClient | DatasetClient | RequestQueueClient # Create new instance if cls is Dataset: - client_opener = storage_client.create_dataset_client + client = await storage_client.create_dataset_client(id=id, name=name) elif cls is KeyValueStore: - client_opener = storage_client.create_kvs_client + client = await storage_client.create_kvs_client(id=id, name=name) elif cls is RequestQueue: - client_opener = storage_client.create_rq_client + client = await storage_client.create_rq_client(id=id, name=name) else: raise ValueError(f'Unsupported storage class: {cls.__name__}') - client = await client_opener(id=id, name=name) metadata = await client.get_metadata() instance = cls(client, metadata.id, metadata.name) # type: ignore[call-arg] From dd5f9146c17e04b05a814c6e0006f06ea8ded90f Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 1 Sep 2025 09:32:37 +0200 Subject: [PATCH 05/16] Update configuration according to Pydantic docs recommendation --- src/crawlee/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/configuration.py b/src/crawlee/configuration.py index c825711565..c0ee228e9d 100644 --- a/src/crawlee/configuration.py +++ b/src/crawlee/configuration.py @@ -28,7 +28,7 @@ class Configuration(BaseSettings): Settings can also 
be configured via environment variables, prefixed with `CRAWLEE_`. """ - model_config = SettingsConfigDict(populate_by_name=True) + model_config = SettingsConfigDict(validate_by_name=True, validate_by_alias=True) internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None """Timeout for the internal asynchronous operations.""" From df06b82446dba182a72afeec0bfb75484461b184 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 1 Sep 2025 15:14:05 +0200 Subject: [PATCH 06/16] Properly create custom storage clients when passing custom configuration --- src/crawlee/crawlers/_basic/_basic_crawler.py | 14 ++++++++++++-- .../storage_clients/_base/_storage_client.py | 4 ++++ .../_file_system/_storage_client.py | 5 +++++ .../storage_clients/_memory/_storage_client.py | 5 +++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 545848d16d..211d57d7a4 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -347,10 +347,20 @@ def __init__( _logger: A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
""" + global_configuration: None | Configuration = None + if not configuration: - configuration = service_locator.get_configuration() + global_configuration = service_locator.get_configuration() + configuration = global_configuration + if not storage_client: - storage_client = service_locator.get_storage_client() + if global_configuration: + # If global configuration was used, reuse its storage client too + storage_client = service_locator.get_storage_client() + else: + # If unique configuration was used, create a unique storage client based on such configuration + storage_client = service_locator.get_storage_client().create_client(configuration) + if not event_manager: event_manager = service_locator.get_event_manager() diff --git a/src/crawlee/storage_clients/_base/_storage_client.py b/src/crawlee/storage_clients/_base/_storage_client.py index ebfc9a0130..9e7a7a4547 100644 --- a/src/crawlee/storage_clients/_base/_storage_client.py +++ b/src/crawlee/storage_clients/_base/_storage_client.py @@ -55,6 +55,10 @@ async def create_rq_client( ) -> RequestQueueClient: """Create a request queue client.""" + @abstractmethod + def create_client(self, configuration: Configuration) -> StorageClient: + """Create a storage client from an existing storage.""" + def get_rate_limit_errors(self) -> dict[int, int]: """Return statistics about rate limit errors encountered by the HTTP client in storage client.""" return {} diff --git a/src/crawlee/storage_clients/_file_system/_storage_client.py b/src/crawlee/storage_clients/_file_system/_storage_client.py index 2aba740607..76b9abd1a2 100644 --- a/src/crawlee/storage_clients/_file_system/_storage_client.py +++ b/src/crawlee/storage_clients/_file_system/_storage_client.py @@ -70,3 +70,8 @@ async def create_rq_client( client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=self._configuration) await self._purge_if_needed(client, self._configuration) return client + + @override + def create_client(self, 
configuration: Configuration) -> FileSystemStorageClient: + """Create a storage client from an existing storage client potentially just replacing the configuration.""" + return FileSystemStorageClient(configuration or self._configuration) diff --git a/src/crawlee/storage_clients/_memory/_storage_client.py b/src/crawlee/storage_clients/_memory/_storage_client.py index fece22233f..fc7f157320 100644 --- a/src/crawlee/storage_clients/_memory/_storage_client.py +++ b/src/crawlee/storage_clients/_memory/_storage_client.py @@ -68,3 +68,8 @@ async def create_rq_client( client = await MemoryRequestQueueClient.open(id=id, name=name) await self._purge_if_needed(client, self._configuration) return client + + @override + def create_client(self, configuration: Configuration) -> MemoryStorageClient: + """Create a storage client from an existing storage client potentially just replacing the configuration.""" + return MemoryStorageClient(configuration or self._configuration) From 430f2ad7e7d266ad35abe97b64a9709d81aab7f1 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 1 Sep 2025 15:17:30 +0200 Subject: [PATCH 07/16] Update the create methods --- src/crawlee/storage_clients/_file_system/_storage_client.py | 2 +- src/crawlee/storage_clients/_memory/_storage_client.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/crawlee/storage_clients/_file_system/_storage_client.py b/src/crawlee/storage_clients/_file_system/_storage_client.py index 76b9abd1a2..c6612665b3 100644 --- a/src/crawlee/storage_clients/_file_system/_storage_client.py +++ b/src/crawlee/storage_clients/_file_system/_storage_client.py @@ -74,4 +74,4 @@ async def create_rq_client( @override def create_client(self, configuration: Configuration) -> FileSystemStorageClient: """Create a storage client from an existing storage client potentially just replacing the configuration.""" - return FileSystemStorageClient(configuration or self._configuration) + return 
FileSystemStorageClient(configuration) diff --git a/src/crawlee/storage_clients/_memory/_storage_client.py b/src/crawlee/storage_clients/_memory/_storage_client.py index fc7f157320..e3a92a6cfd 100644 --- a/src/crawlee/storage_clients/_memory/_storage_client.py +++ b/src/crawlee/storage_clients/_memory/_storage_client.py @@ -72,4 +72,4 @@ async def create_rq_client( @override def create_client(self, configuration: Configuration) -> MemoryStorageClient: """Create a storage client from an existing storage client potentially just replacing the configuration.""" - return MemoryStorageClient(configuration or self._configuration) + return MemoryStorageClient(configuration) From 1340c5c973b3bb2513e2e92d2357ca80390822c8 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 2 Sep 2025 10:38:17 +0200 Subject: [PATCH 08/16] Add global instanc manager --- src/crawlee/_service_locator.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/crawlee/_service_locator.py b/src/crawlee/_service_locator.py index a4256e1eb4..9336784f9e 100644 --- a/src/crawlee/_service_locator.py +++ b/src/crawlee/_service_locator.py @@ -23,6 +23,8 @@ class ServiceLocator: All services are initialized to its default value lazily. """ + global_storage_instance_manager: StorageInstanceManager | None = None + def __init__( self, configuration: Configuration | None = None, @@ -121,14 +123,14 @@ def set_storage_client(self, storage_client: StorageClient) -> None: @property def storage_instance_manager(self) -> StorageInstanceManager: - """Get the storage instance manager.""" - if self._storage_instance_manager is None: + """Get the storage instance manager. It is global manager shared by all instances of ServiceLocator.""" + if self.__class__.global_storage_instance_manager is None: # Import here to avoid circular imports. 
from crawlee.storages._storage_instance_manager import StorageInstanceManager # noqa: PLC0415 - self._storage_instance_manager = StorageInstanceManager() + self.__class__.global_storage_instance_manager = StorageInstanceManager() - return self._storage_instance_manager + return self.__class__.global_storage_instance_manager service_locator = ServiceLocator() From a1746f37014f9caccb8f2dbdeacf1e62244e1f09 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 2 Sep 2025 15:33:52 +0200 Subject: [PATCH 09/16] Try to keep config in open, but rework the instance cache --- .../service_storage_configuration.py | 4 +- src/crawlee/_service_locator.py | 3 +- src/crawlee/crawlers/_basic/_basic_crawler.py | 34 +++-- .../storage_clients/_base/_storage_client.py | 16 ++- .../_file_system/_storage_client.py | 40 +++--- .../_memory/_storage_client.py | 26 ++-- src/crawlee/storages/_dataset.py | 10 +- src/crawlee/storages/_key_value_store.py | 10 +- src/crawlee/storages/_request_queue.py | 10 +- .../storages/_storage_instance_manager.py | 128 +++++++++++------- tests/unit/conftest.py | 2 +- .../crawlers/_basic/test_basic_crawler.py | 2 +- .../_file_system/test_fs_dataset_client.py | 8 +- .../_file_system/test_fs_kvs_client.py | 10 +- .../_file_system/test_fs_rq_client.py | 12 +- .../_memory/test_memory_dataset_client.py | 7 +- .../_memory/test_memory_kvs_client.py | 6 +- .../_memory/test_memory_rq_client.py | 7 +- tests/unit/storages/test_dataset.py | 7 +- tests/unit/storages/test_key_value_store.py | 7 +- tests/unit/storages/test_request_queue.py | 7 +- .../storages/test_storage_instance_manager.py | 76 ++++++++--- 22 files changed, 256 insertions(+), 176 deletions(-) diff --git a/docs/guides/code_examples/service_locator/service_storage_configuration.py b/docs/guides/code_examples/service_locator/service_storage_configuration.py index ae42474c29..580e6d348f 100644 --- a/docs/guides/code_examples/service_locator/service_storage_configuration.py +++ 
b/docs/guides/code_examples/service_locator/service_storage_configuration.py @@ -19,10 +19,10 @@ async def main() -> None: # Use the global defaults when creating the dataset (or other storage). dataset_1 = await Dataset.open() - # Or set explicitly specific storage client if + # Or set explicitly specific configuration if # you do not want to rely on global defaults. dataset_2 = await Dataset.open( - storage_client=MemoryStorageClient(configuration=configuration) + storage_client=MemoryStorageClient(), configuration=configuration ) diff --git a/src/crawlee/_service_locator.py b/src/crawlee/_service_locator.py index 9336784f9e..15f2eb74f1 100644 --- a/src/crawlee/_service_locator.py +++ b/src/crawlee/_service_locator.py @@ -34,7 +34,6 @@ def __init__( self._configuration = configuration self._event_manager = event_manager self._storage_client = storage_client - self._storage_instance_manager: StorageInstanceManager | None = None def get_configuration(self) -> Configuration: """Get the configuration.""" @@ -100,7 +99,7 @@ def get_storage_client(self) -> StorageClient: 'Implicit creation of storage client will implicitly set configuration as side effect. ' 'It is advised to explicitly first set the configuration instead.' ) - self._storage_client = FileSystemStorageClient(configuration=self._configuration) + self._storage_client = FileSystemStorageClient() return self._storage_client diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 211d57d7a4..f587dd891e 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -347,21 +347,16 @@ def __init__( _logger: A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. 
""" - global_configuration: None | Configuration = None - if not configuration: - global_configuration = service_locator.get_configuration() - configuration = global_configuration + configuration = service_locator.get_configuration() if not storage_client: - if global_configuration: - # If global configuration was used, reuse its storage client too - storage_client = service_locator.get_storage_client() - else: - # If unique configuration was used, create a unique storage client based on such configuration - storage_client = service_locator.get_storage_client().create_client(configuration) + storage_client = service_locator.get_storage_client() if not event_manager: + # This is weird if someone passes configuration and its event manager related stuff gets ignored as the + # event manager will be used from service_locator. Maybe keep the was created flag for it? It does not have + # the use cases like the storages event_manager = service_locator.get_event_manager() self._service_locator = ServiceLocator( @@ -563,7 +558,10 @@ async def _get_proxy_info(self, request: Request, session: Session | None) -> Pr async def get_request_manager(self) -> RequestManager: """Return the configured request manager. If none is configured, open and return the default request queue.""" if not self._request_manager: - self._request_manager = await RequestQueue.open(storage_client=self._service_locator.get_storage_client()) + self._request_manager = await RequestQueue.open( + storage_client=self._service_locator.get_storage_client(), + configuration=self._service_locator.get_configuration(), + ) return self._request_manager @@ -574,7 +572,12 @@ async def get_dataset( name: str | None = None, ) -> Dataset: """Return the `Dataset` with the given ID or name. 
If none is provided, return the default one.""" - return await Dataset.open(id=id, name=name, storage_client=self._service_locator.get_storage_client()) + return await Dataset.open( + id=id, + name=name, + storage_client=self._service_locator.get_storage_client(), + configuration=self._service_locator.get_configuration(), + ) async def get_key_value_store( self, @@ -583,7 +586,12 @@ async def get_key_value_store( name: str | None = None, ) -> KeyValueStore: """Return the `KeyValueStore` with the given ID or name. If none is provided, return the default KVS.""" - return await KeyValueStore.open(id=id, name=name, storage_client=self._service_locator.get_storage_client()) + return await KeyValueStore.open( + id=id, + name=name, + storage_client=self._service_locator.get_storage_client(), + configuration=self._service_locator.get_configuration(), + ) def error_handler( self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] diff --git a/src/crawlee/storage_clients/_base/_storage_client.py b/src/crawlee/storage_clients/_base/_storage_client.py index 9e7a7a4547..0eadae50a1 100644 --- a/src/crawlee/storage_clients/_base/_storage_client.py +++ b/src/crawlee/storage_clients/_base/_storage_client.py @@ -6,6 +6,8 @@ from crawlee._utils.docs import docs_group if TYPE_CHECKING: + from collections.abc import Hashable + from crawlee.configuration import Configuration from ._dataset_client import DatasetClient @@ -28,12 +30,20 @@ class StorageClient(ABC): (where applicable), and consistent access patterns across all storage types it supports. """ + def get_additional_cache_key(self, configuration: Configuration) -> Hashable: # noqa: ARG002 + """Return a cache key that can differentiate between different storages of this client. + + Can be based on configuration or on the client itself. By default, returns an empty string. 
+ """ + return '' + @abstractmethod async def create_dataset_client( self, *, id: str | None = None, name: str | None = None, + configuration: Configuration | None = None, ) -> DatasetClient: """Create a dataset client.""" @@ -43,6 +53,7 @@ async def create_kvs_client( *, id: str | None = None, name: str | None = None, + configuration: Configuration | None = None, ) -> KeyValueStoreClient: """Create a key-value store client.""" @@ -52,13 +63,10 @@ async def create_rq_client( *, id: str | None = None, name: str | None = None, + configuration: Configuration | None = None, ) -> RequestQueueClient: """Create a request queue client.""" - @abstractmethod - def create_client(self, configuration: Configuration) -> StorageClient: - """Create a storage client from an existing storage.""" - def get_rate_limit_errors(self) -> dict[int, int]: """Return statistics about rate limit errors encountered by the HTTP client in storage client.""" return {} diff --git a/src/crawlee/storage_clients/_file_system/_storage_client.py b/src/crawlee/storage_clients/_file_system/_storage_client.py index c6612665b3..55aa601ade 100644 --- a/src/crawlee/storage_clients/_file_system/_storage_client.py +++ b/src/crawlee/storage_clients/_file_system/_storage_client.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import TYPE_CHECKING + from typing_extensions import override from crawlee._utils.docs import docs_group @@ -10,6 +12,9 @@ from ._key_value_store_client import FileSystemKeyValueStoreClient from ._request_queue_client import FileSystemRequestQueueClient +if TYPE_CHECKING: + from collections.abc import Hashable + @docs_group('Storage clients') class FileSystemStorageClient(StorageClient): @@ -29,14 +34,10 @@ class FileSystemStorageClient(StorageClient): Use it only when running a single crawler process at a time. """ - def __init__(self, configuration: Configuration | None = None) -> None: - """Initialize the file system storage client. 
- - Args: - configuration: Optional configuration instance to use with the storage client. - If not provided, the global configuration will be used. - """ - self._configuration = configuration or Configuration.get_global_configuration() + @override + def get_additional_cache_key(self, configuration: Configuration) -> Hashable: + # Even different client instances should return same storage if the storage_dir is the same. + return configuration.storage_dir @override async def create_dataset_client( @@ -44,9 +45,11 @@ async def create_dataset_client( *, id: str | None = None, name: str | None = None, + configuration: Configuration | None = None, ) -> FileSystemDatasetClient: - client = await FileSystemDatasetClient.open(id=id, name=name, configuration=self._configuration) - await self._purge_if_needed(client, self._configuration) + configuration = configuration or Configuration.get_global_configuration() + client = await FileSystemDatasetClient.open(id=id, name=name, configuration=configuration) + await self._purge_if_needed(client, configuration) return client @override @@ -55,9 +58,11 @@ async def create_kvs_client( *, id: str | None = None, name: str | None = None, + configuration: Configuration | None = None, ) -> FileSystemKeyValueStoreClient: - client = await FileSystemKeyValueStoreClient.open(id=id, name=name, configuration=self._configuration) - await self._purge_if_needed(client, self._configuration) + configuration = configuration or Configuration.get_global_configuration() + client = await FileSystemKeyValueStoreClient.open(id=id, name=name, configuration=configuration) + await self._purge_if_needed(client, configuration) return client @override @@ -66,12 +71,9 @@ async def create_rq_client( *, id: str | None = None, name: str | None = None, + configuration: Configuration | None = None, ) -> FileSystemRequestQueueClient: - client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=self._configuration) - await 
self._purge_if_needed(client, self._configuration) + configuration = configuration or Configuration.get_global_configuration() + client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=configuration) + await self._purge_if_needed(client, configuration) return client - - @override - def create_client(self, configuration: Configuration) -> FileSystemStorageClient: - """Create a storage client from an existing storage client potentially just replacing the configuration.""" - return FileSystemStorageClient(configuration) diff --git a/src/crawlee/storage_clients/_memory/_storage_client.py b/src/crawlee/storage_clients/_memory/_storage_client.py index e3a92a6cfd..f4ac73e489 100644 --- a/src/crawlee/storage_clients/_memory/_storage_client.py +++ b/src/crawlee/storage_clients/_memory/_storage_client.py @@ -27,24 +27,17 @@ class MemoryStorageClient(StorageClient): operations where persistence is not required. """ - def __init__(self, configuration: Configuration | None = None) -> None: - """Initialize the file system storage client. - - Args: - configuration: Optional configuration instance to use with the storage client. - If not provided, the global configuration will be used. 
- """ - self._configuration = configuration or Configuration.get_global_configuration() - @override async def create_dataset_client( self, *, id: str | None = None, name: str | None = None, + configuration: Configuration | None = None, ) -> MemoryDatasetClient: + configuration = configuration or Configuration.get_global_configuration() client = await MemoryDatasetClient.open(id=id, name=name) - await self._purge_if_needed(client, self._configuration) + await self._purge_if_needed(client, configuration) return client @override @@ -53,9 +46,11 @@ async def create_kvs_client( *, id: str | None = None, name: str | None = None, + configuration: Configuration | None = None, ) -> MemoryKeyValueStoreClient: + configuration = configuration or Configuration.get_global_configuration() client = await MemoryKeyValueStoreClient.open(id=id, name=name) - await self._purge_if_needed(client, self._configuration) + await self._purge_if_needed(client, configuration) return client @override @@ -64,12 +59,9 @@ async def create_rq_client( *, id: str | None = None, name: str | None = None, + configuration: Configuration | None = None, ) -> MemoryRequestQueueClient: + configuration = configuration or Configuration.get_global_configuration() client = await MemoryRequestQueueClient.open(id=id, name=name) - await self._purge_if_needed(client, self._configuration) + await self._purge_if_needed(client, configuration) return client - - @override - def create_client(self, configuration: Configuration) -> MemoryStorageClient: - """Create a storage client from an existing storage client potentially just replacing the configuration.""" - return MemoryStorageClient(configuration) diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index df6b165987..62f3e23057 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -20,6 +20,7 @@ from typing_extensions import Unpack from crawlee._types import ExportDataCsvKwargs, ExportDataJsonKwargs + from 
crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata @@ -99,15 +100,22 @@ async def open( *, id: str | None = None, name: str | None = None, + configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> Dataset: + configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client + client_opener = storage_client.create_dataset_client(id=id, name=name, configuration=configuration) + additional_cache_key = storage_client.get_additional_cache_key(configuration=configuration) + return await service_locator.storage_instance_manager.open_storage_instance( cls, id=id, name=name, - storage_client=storage_client, + client_opener=client_opener, + storage_client_type=storage_client.__class__, + additional_cache_key=additional_cache_key, ) @override diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index e46ade9c0d..ed7a0a1c41 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -19,6 +19,7 @@ if TYPE_CHECKING: from collections.abc import AsyncIterator + from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecordMetadata @@ -111,15 +112,22 @@ async def open( *, id: str | None = None, name: str | None = None, + configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> KeyValueStore: + configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = 
service_locator.get_storage_client() if storage_client is None else storage_client + client_opener = storage_client.create_kvs_client(id=id, name=name, configuration=configuration) + additional_cache_key = storage_client.get_additional_cache_key(configuration=configuration) + return await service_locator.storage_instance_manager.open_storage_instance( cls, id=id, name=name, - storage_client=storage_client, + client_opener=client_opener, + storage_client_type=storage_client.__class__, + additional_cache_key=additional_cache_key, ) @override diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index 1d60d19895..2e92d56ff0 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -18,6 +18,7 @@ from collections.abc import Sequence from crawlee import Request + from crawlee.configuration import Configuration from crawlee.storage_clients import StorageClient from crawlee.storage_clients._base import RequestQueueClient from crawlee.storage_clients.models import ProcessedRequest, RequestQueueMetadata @@ -117,15 +118,22 @@ async def open( *, id: str | None = None, name: str | None = None, + configuration: Configuration | None = None, storage_client: StorageClient | None = None, ) -> RequestQueue: + configuration = service_locator.get_configuration() if configuration is None else configuration storage_client = service_locator.get_storage_client() if storage_client is None else storage_client + client_opener = storage_client.create_rq_client(id=id, name=name, configuration=configuration) + additional_cache_key = storage_client.get_additional_cache_key(configuration=configuration) + return await service_locator.storage_instance_manager.open_storage_instance( cls, id=id, name=name, - storage_client=storage_client, + client_opener=client_opener, + storage_client_type=storage_client.__class__, + additional_cache_key=additional_cache_key, ) @override diff --git 
a/src/crawlee/storages/_storage_instance_manager.py b/src/crawlee/storages/_storage_instance_manager.py index 98faca65ca..962745e024 100644 --- a/src/crawlee/storages/_storage_instance_manager.py +++ b/src/crawlee/storages/_storage_instance_manager.py @@ -1,14 +1,16 @@ from __future__ import annotations from collections import defaultdict +from collections.abc import Awaitable, Hashable from dataclasses import dataclass, field from typing import TYPE_CHECKING, TypeVar, cast +from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient + from . import Dataset, KeyValueStore, RequestQueue if TYPE_CHECKING: from crawlee.storage_clients import StorageClient - from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient from ._base import Storage @@ -17,20 +19,34 @@ @dataclass class _StorageClientCache: - """Cache for specific storage client.""" + """Cache for specific storage client. - by_id: defaultdict[type[Storage], defaultdict[str, Storage]] = field( - default_factory=lambda: defaultdict(lambda: defaultdict()) + Example: + Storage=Dataset, id='123', additional_cache_key="some_path" will be located in + storage = by_id[Dataset]['123'][some_path] + """ + + by_id: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field( + default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict())) ) """Cache for storage instances by ID, separated by storage type.""" - by_name: defaultdict[type[Storage], defaultdict[str, Storage]] = field( - default_factory=lambda: defaultdict(lambda: defaultdict()) + by_name: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field( + default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict())) ) """Cache for storage instances by name, separated by storage type.""" - default_instances: defaultdict[type[Storage], Storage] = field(default_factory=lambda: defaultdict()) + 
default_instances: defaultdict[type[Storage], defaultdict[Hashable, Storage]] = field( + default_factory=lambda: defaultdict(lambda: defaultdict()) + ) """Cache for default instances of each storage type.""" +StorageClientType = DatasetClient | KeyValueStoreClient | RequestQueueClient +"""Type alias for the storage client types.""" + +ClientOpener = Awaitable[StorageClientType] +"""Type alias for the client opener function.""" + + class StorageInstanceManager: """Manager for caching and managing storage instances. @@ -39,7 +55,7 @@ class StorageInstanceManager: """ def __init__(self) -> None: - self._cache_by_storage_client: dict[StorageClient, _StorageClientCache] = defaultdict(_StorageClientCache) + self._cache_by_storage_client: dict[type[StorageClient], _StorageClientCache] = defaultdict(_StorageClientCache) async def open_storage_instance( self, @@ -47,7 +63,9 @@ async def open_storage_instance( *, id: str | None, name: str | None, - storage_client: StorageClient, + storage_client_type: type[StorageClient], + client_opener: ClientOpener, + additional_cache_key: Hashable = '', ) -> T: """Open a storage instance with caching support. @@ -55,7 +73,9 @@ async def open_storage_instance( cls: The storage class to instantiate. id: Storage ID. name: Storage name. - storage_client: Storage client instance. + storage_client_type: Type of storage client to use. + client_opener: Awaitable to open the storage client when storage instance not found in cache. + additional_cache_key: Additional optional key to differentiate cache entries. Returns: The storage instance. 
@@ -67,32 +87,38 @@ async def open_storage_instance( raise ValueError('Only one of "id" or "name" can be specified, not both.') # Check for default instance - if id is None and name is None and cls in self._cache_by_storage_client[storage_client].default_instances: - return cast('T', self._cache_by_storage_client[storage_client].default_instances[cls]) + if ( + id is None + and name is None + and additional_cache_key in self._cache_by_storage_client[storage_client_type].default_instances[cls] + ): + return cast( + 'T', self._cache_by_storage_client[storage_client_type].default_instances[cls][additional_cache_key] + ) # Check cache - if id is not None: - type_cache_by_id = self._cache_by_storage_client[storage_client].by_id[cls] - if id in type_cache_by_id: - cached_instance = type_cache_by_id[id] - if isinstance(cached_instance, cls): - return cached_instance - - if name is not None: - type_cache_by_name = self._cache_by_storage_client[storage_client].by_name[cls] - if name in type_cache_by_name: - cached_instance = type_cache_by_name[name] - if isinstance(cached_instance, cls): - return cached_instance + if id is not None and ( + cached_instance := self._cache_by_storage_client[storage_client_type] + .by_id[cls][id] + .get(additional_cache_key) + ): + if isinstance(cached_instance, cls): + return cached_instance + raise RuntimeError('Cached instance type mismatch.') + + if name is not None and ( + cached_instance := self._cache_by_storage_client[storage_client_type] + .by_name[cls][name] + .get(additional_cache_key) + ): + if isinstance(cached_instance, cls): + return cached_instance + raise RuntimeError('Cached instance type mismatch.') client: KeyValueStoreClient | DatasetClient | RequestQueueClient # Create new instance - if cls is Dataset: - client = await storage_client.create_dataset_client(id=id, name=name) - elif cls is KeyValueStore: - client = await storage_client.create_kvs_client(id=id, name=name) - elif cls is RequestQueue: - client = await 
storage_client.create_rq_client(id=id, name=name) + if cls is Dataset or cls is KeyValueStore or cls is RequestQueue: + client = await client_opener else: raise ValueError(f'Unsupported storage class: {cls.__name__}') @@ -102,13 +128,15 @@ async def open_storage_instance( instance_name = getattr(instance, 'name', None) # Cache the instance - self._cache_by_storage_client[storage_client].by_id[cls][instance.id] = instance + self._cache_by_storage_client[storage_client_type].by_id[cls][instance.id][additional_cache_key] = instance if instance_name is not None: - self._cache_by_storage_client[storage_client].by_name[cls][instance_name] = instance + self._cache_by_storage_client[storage_client_type].by_name[cls][instance_name][additional_cache_key] = ( + instance + ) # Set as default if no id/name specified if id is None and name is None: - self._cache_by_storage_client[storage_client].default_instances[cls] = instance + self._cache_by_storage_client[storage_client_type].default_instances[cls][additional_cache_key] = instance return instance @@ -120,23 +148,25 @@ def remove_from_cache(self, storage_instance: Storage) -> None: """ storage_type = type(storage_instance) - # Remove from ID cache - for client_cache in self._cache_by_storage_client.values(): - type_cache_by_id = client_cache.by_id[storage_type] - if storage_instance.id in type_cache_by_id: - del type_cache_by_id[storage_instance.id] + for storage_client_cache in self._cache_by_storage_client.values(): + # Remove from ID cache + for additional_key in storage_client_cache.by_id[storage_type][storage_instance.id]: + if additional_key in storage_client_cache.by_id[storage_type][storage_instance.id]: + del storage_client_cache.by_id[storage_type][storage_instance.id][additional_key] + break # Remove from name cache - type_cache_by_name = client_cache.by_name[storage_type] - if storage_instance.name in type_cache_by_name and storage_instance.name: - del type_cache_by_name[storage_instance.name] - - # Remove from 
default instances - if ( - storage_type in client_cache.default_instances - and client_cache.default_instances[storage_type] is storage_instance - ): - del client_cache.default_instances[storage_type] + if storage_instance.name is not None: + for additional_key in storage_client_cache.by_name[storage_type][storage_instance.name]: + if additional_key in storage_client_cache.by_name[storage_type][storage_instance.name]: + del storage_client_cache.by_name[storage_type][storage_instance.name][additional_key] + break + + # Remove from default instances + for additional_key in storage_client_cache.default_instances[storage_type]: + if storage_client_cache.default_instances[storage_type][additional_key] is storage_instance: + del storage_client_cache.default_instances[storage_type][additional_key] + break def clear_cache(self) -> None: """Clear all cached storage instances.""" diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index a1be6b4a71..4207810280 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -55,7 +55,7 @@ def _prepare_test_env() -> None: service_locator._configuration = None service_locator._event_manager = None service_locator._storage_client = None - service_locator._storage_instance_manager = None + service_locator.storage_instance_manager.clear_cache() # Verify that the test environment was set up correctly. 
assert os.environ.get('CRAWLEE_STORAGE_DIR') == str(tmp_path) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 7a4372f1ab..62ede11e67 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1161,7 +1161,7 @@ async def test_crawler_can_use_other_storages_of_same_type(tmp_path: Path) -> No rq = await RequestQueue.open() # Set the crawler to use different storage client - crawler = BasicCrawler(storage_client=FileSystemStorageClient(configuration=configuration_b)) + crawler = BasicCrawler(storage_client=FileSystemStorageClient(), configuration=configuration_b) # Assert that the storages are different assert dataset is not await crawler.get_dataset() diff --git a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py index 28aa16a286..d3e5c6d9cf 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py @@ -27,18 +27,14 @@ def configuration(tmp_path: Path) -> Configuration: @pytest.fixture async def dataset_client(configuration: Configuration) -> AsyncGenerator[FileSystemDatasetClient, None]: """A fixture for a file system dataset client.""" - client = await FileSystemStorageClient(configuration=configuration).create_dataset_client( - name='test_dataset', - ) + client = await FileSystemStorageClient().create_dataset_client(name='test_dataset', configuration=configuration) yield client await client.drop() async def test_file_and_directory_creation(configuration: Configuration) -> None: """Test that file system dataset creates proper files and directories.""" - client = await FileSystemStorageClient(configuration=configuration).create_dataset_client( - name='new_dataset', - ) + client = await FileSystemStorageClient().create_dataset_client(name='new_dataset', 
configuration=configuration) # Verify files were created assert client.path_to_dataset.exists() diff --git a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py index b8d98d309e..b9702299a0 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py @@ -27,14 +27,14 @@ def configuration(tmp_path: Path) -> Configuration: @pytest.fixture async def kvs_client(configuration: Configuration) -> AsyncGenerator[FileSystemKeyValueStoreClient, None]: """A fixture for a file system key-value store client.""" - client = await FileSystemStorageClient(configuration=configuration).create_kvs_client(name='test_kvs') + client = await FileSystemStorageClient().create_kvs_client(name='test_kvs', configuration=configuration) yield client await client.drop() async def test_file_and_directory_creation(configuration: Configuration) -> None: """Test that file system KVS creates proper files and directories.""" - client = await FileSystemStorageClient(configuration=configuration).create_kvs_client(name='new_kvs') + client = await FileSystemStorageClient().create_kvs_client(name='new_kvs', configuration=configuration) # Verify files were created assert client.path_to_kvs.exists() @@ -178,12 +178,10 @@ async def test_metadata_file_updates(kvs_client: FileSystemKeyValueStoreClient) async def test_data_persistence_across_reopens(configuration: Configuration) -> None: """Test that data persists correctly when reopening the same KVS.""" - storage_client = FileSystemStorageClient(configuration=configuration) + storage_client = FileSystemStorageClient() # Create KVS and add data - original_client = await storage_client.create_kvs_client( - name='persistence-test', - ) + original_client = await storage_client.create_kvs_client(name='persistence-test', configuration=configuration) test_key = 'persistent-key' test_value = 'persistent-value' diff 
--git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index 2ec3fdf1fe..dc2937a259 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -25,18 +25,18 @@ def configuration(tmp_path: Path) -> Configuration: @pytest.fixture -async def rq_client(configuration: Configuration) -> AsyncGenerator[FileSystemRequestQueueClient, None]: +async def rq_client() -> AsyncGenerator[FileSystemRequestQueueClient, None]: """A fixture for a file system request queue client.""" - client = await FileSystemStorageClient(configuration=configuration).create_rq_client( + client = await FileSystemStorageClient().create_rq_client( name='test_request_queue', ) yield client await client.drop() -async def test_file_and_directory_creation(configuration: Configuration) -> None: +async def test_file_and_directory_creation() -> None: """Test that file system RQ creates proper files and directories.""" - client = await FileSystemStorageClient(configuration=configuration).create_rq_client(name='new_request_queue') + client = await FileSystemStorageClient().create_rq_client(name='new_request_queue') # Verify files were created assert client.path_to_rq.exists() @@ -131,9 +131,9 @@ async def test_metadata_file_updates(rq_client: FileSystemRequestQueueClient) -> assert metadata_json['total_request_count'] == 1 -async def test_data_persistence_across_reopens(configuration: Configuration) -> None: +async def test_data_persistence_across_reopens() -> None: """Test that requests persist correctly when reopening the same RQ.""" - storage_client = FileSystemStorageClient(configuration=configuration) + storage_client = FileSystemStorageClient() # Create RQ and add requests original_client = await storage_client.create_rq_client( diff --git a/tests/unit/storage_clients/_memory/test_memory_dataset_client.py 
b/tests/unit/storage_clients/_memory/test_memory_dataset_client.py index 31aa61f0ac..c503374ce7 100644 --- a/tests/unit/storage_clients/_memory/test_memory_dataset_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_dataset_client.py @@ -5,7 +5,6 @@ import pytest -from crawlee.configuration import Configuration from crawlee.storage_clients import MemoryStorageClient if TYPE_CHECKING: @@ -24,10 +23,8 @@ async def dataset_client() -> AsyncGenerator[MemoryDatasetClient, None]: async def test_memory_specific_purge_behavior() -> None: """Test memory-specific purge behavior and in-memory storage characteristics.""" - configuration = Configuration(purge_on_start=True) - # Create dataset and add data - dataset_client1 = await MemoryStorageClient(configuration=configuration).create_dataset_client( + dataset_client1 = await MemoryStorageClient().create_dataset_client( name='test_purge_dataset', ) await dataset_client1.push_data({'item': 'initial data'}) @@ -37,7 +34,7 @@ async def test_memory_specific_purge_behavior() -> None: assert len(items.items) == 1 # Reopen with same storage client instance - dataset_client2 = await MemoryStorageClient(configuration=configuration).create_dataset_client( + dataset_client2 = await MemoryStorageClient().create_dataset_client( name='test_purge_dataset', ) diff --git a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py index f8b8e755b2..ef55107393 100644 --- a/tests/unit/storage_clients/_memory/test_memory_kvs_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_kvs_client.py @@ -5,7 +5,6 @@ import pytest -from crawlee.configuration import Configuration from crawlee.storage_clients import MemoryStorageClient if TYPE_CHECKING: @@ -24,10 +23,9 @@ async def kvs_client() -> AsyncGenerator[MemoryKeyValueStoreClient, None]: async def test_memory_specific_purge_behavior() -> None: """Test memory-specific purge behavior and in-memory storage 
characteristics.""" - configuration = Configuration(purge_on_start=True) # Create KVS and add data - kvs_client1 = await MemoryStorageClient(configuration=configuration).create_kvs_client( + kvs_client1 = await MemoryStorageClient().create_kvs_client( name='test_purge_kvs', ) await kvs_client1.set_value(key='test-key', value='initial value') @@ -38,7 +36,7 @@ async def test_memory_specific_purge_behavior() -> None: assert record.value == 'initial value' # Reopen with same storage client instance - kvs_client2 = await MemoryStorageClient(configuration=configuration).create_kvs_client( + kvs_client2 = await MemoryStorageClient().create_kvs_client( name='test_purge_kvs', ) diff --git a/tests/unit/storage_clients/_memory/test_memory_rq_client.py b/tests/unit/storage_clients/_memory/test_memory_rq_client.py index 89b98cfac5..8bfe8632df 100644 --- a/tests/unit/storage_clients/_memory/test_memory_rq_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_rq_client.py @@ -6,7 +6,6 @@ import pytest from crawlee import Request -from crawlee.configuration import Configuration from crawlee.storage_clients import MemoryStorageClient if TYPE_CHECKING: @@ -25,10 +24,8 @@ async def rq_client() -> AsyncGenerator[MemoryRequestQueueClient, None]: async def test_memory_specific_purge_behavior() -> None: """Test memory-specific purge behavior and in-memory storage characteristics.""" - configuration = Configuration(purge_on_start=True) - # Create RQ and add data - rq_client1 = await MemoryStorageClient(configuration=configuration).create_rq_client( + rq_client1 = await MemoryStorageClient().create_rq_client( name='test_purge_rq', ) request = Request.from_url(url='https://example.com/initial') @@ -38,7 +35,7 @@ async def test_memory_specific_purge_behavior() -> None: assert await rq_client1.is_empty() is False # Reopen with same storage client instance - rq_client2 = await MemoryStorageClient(configuration=configuration).create_rq_client( + rq_client2 = await 
MemoryStorageClient().create_rq_client( name='test_purge_rq', ) diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py index 83489ff1e0..7b8091ec0b 100644 --- a/tests/unit/storages/test_dataset.py +++ b/tests/unit/storages/test_dataset.py @@ -21,13 +21,10 @@ @pytest.fixture(params=['memory', 'file_system']) -def storage_client(request: pytest.FixtureRequest, configuration: Configuration) -> StorageClient: +def storage_client(request: pytest.FixtureRequest) -> StorageClient: """Parameterized fixture to test with different storage clients.""" storage_client: StorageClient - if request.param == 'memory': - storage_client = MemoryStorageClient(configuration=configuration) - else: - storage_client = FileSystemStorageClient(configuration=configuration) + storage_client = MemoryStorageClient() if request.param == 'memory' else FileSystemStorageClient() service_locator.set_storage_client(storage_client) return storage_client diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index 1bb3de81bf..5d989866cd 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -21,13 +21,10 @@ @pytest.fixture(params=['memory', 'file_system']) -def storage_client(request: pytest.FixtureRequest, configuration: Configuration) -> StorageClient: +def storage_client(request: pytest.FixtureRequest) -> StorageClient: """Parameterized fixture to test with different storage clients.""" storage_client: StorageClient - if request.param == 'memory': - storage_client = MemoryStorageClient(configuration=configuration) - else: - storage_client = FileSystemStorageClient(configuration=configuration) + storage_client = MemoryStorageClient() if request.param == 'memory' else FileSystemStorageClient() service_locator.set_storage_client(storage_client) return storage_client diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index 
e28faa23e8..92240462f2 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -21,13 +21,10 @@ @pytest.fixture(params=['memory', 'file_system']) -def storage_client(request: pytest.FixtureRequest, configuration: Configuration) -> StorageClient: +def storage_client(request: pytest.FixtureRequest) -> StorageClient: """Parameterized fixture to test with different storage clients.""" storage_client: StorageClient - if request.param == 'memory': - storage_client = MemoryStorageClient(configuration=configuration) - else: - storage_client = FileSystemStorageClient(configuration=configuration) + storage_client = MemoryStorageClient() if request.param == 'memory' else FileSystemStorageClient() service_locator.set_storage_client(storage_client) return storage_client diff --git a/tests/unit/storages/test_storage_instance_manager.py b/tests/unit/storages/test_storage_instance_manager.py index 9a6cc955a2..83682ca582 100644 --- a/tests/unit/storages/test_storage_instance_manager.py +++ b/tests/unit/storages/test_storage_instance_manager.py @@ -1,58 +1,98 @@ from pathlib import Path +import pytest + from crawlee import service_locator from crawlee.configuration import Configuration from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient from crawlee.storages import Dataset, KeyValueStore +@pytest.fixture(autouse=True) +def clean_storage_instance_manager() -> None: + """Helper function to clean the storage instance manager before each test.""" + service_locator.storage_instance_manager.clear_cache() + + async def test_unique_storage_by_storage_client(tmp_path: Path) -> None: config = Configuration( - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] purge_on_start=True, ) + config.storage_dir = str(tmp_path) - kvs1 = await KeyValueStore.open(storage_client=MemoryStorageClient(configuration=config)) - kvs2 = await KeyValueStore.open(storage_client=FileSystemStorageClient(configuration=config)) + 
kvs1 = await KeyValueStore.open(storage_client=MemoryStorageClient(), configuration=config) + kvs2 = await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=config) assert kvs1 is not kvs2 -async def test_unique_storage_by_storage_client_of_same_type(tmp_path: Path) -> None: +async def test_same_storage_when_different_client(tmp_path: Path) -> None: config = Configuration( - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] purge_on_start=True, ) + config.storage_dir = str(tmp_path) - kvs1 = await KeyValueStore.open(storage_client=MemoryStorageClient(configuration=config)) - kvs2 = await KeyValueStore.open(storage_client=MemoryStorageClient(configuration=config)) - assert kvs1 is not kvs2 + kvs1 = await KeyValueStore.open(storage_client=MemoryStorageClient(), configuration=config) + kvs2 = await KeyValueStore.open(storage_client=MemoryStorageClient(), configuration=config) + assert kvs1 is kvs2 async def test_unique_storage_by_storage_type(tmp_path: Path) -> None: config = Configuration( - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] purge_on_start=True, ) - storage_client = MemoryStorageClient(configuration=config) + config.storage_dir = str(tmp_path) + storage_client = MemoryStorageClient() - kvs = await KeyValueStore.open(storage_client=storage_client) - dataset = await Dataset.open(storage_client=storage_client) + kvs = await KeyValueStore.open(storage_client=storage_client, configuration=config) + dataset = await Dataset.open(storage_client=storage_client, configuration=config) assert kvs is not dataset -async def test_unique_storage_by_name(tmp_path: Path) -> None: +async def test_unique_storage_by_name() -> None: """Test that StorageInstanceManager support different storage clients at the same time.""" - config = Configuration( - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - purge_on_start=True, - ) - storage_client = FileSystemStorageClient(configuration=config) + storage_client = 
MemoryStorageClient() kvs1 = await KeyValueStore.open(storage_client=storage_client, name='kvs1') kvs2 = await KeyValueStore.open(storage_client=storage_client, name='kvs2') assert kvs1 is not kvs2 +async def test_unique_storage_by_unique_cache_key_different_path(tmp_path: Path) -> None: + """Test that StorageInstanceManager support unique cache key. Difference in storage_dir.""" + path_1 = tmp_path / 'dir1' + path_2 = tmp_path / 'dir2' + path_1.mkdir() + path_2.mkdir() + + config_1 = Configuration() + config_1.storage_dir = str(path_1) + + config_2 = Configuration() + config_2.storage_dir = str(path_2) + + storage_client = FileSystemStorageClient() + + kvs1 = await KeyValueStore.open(storage_client=storage_client, configuration=config_1) + kvs2 = await KeyValueStore.open(storage_client=storage_client, configuration=config_2) + assert kvs1 is not kvs2 + + +async def test_unique_storage_by_unique_cache_key_same_path(tmp_path: Path) -> None: + """Test that StorageInstanceManager support unique cache key. 
Different configs with same storage_dir create same + storage.""" + config_1 = Configuration() + config_1.storage_dir = str(tmp_path) + + config_2 = Configuration() + config_2.storage_dir = str(tmp_path) + + storage_client = FileSystemStorageClient() + + kvs1 = await KeyValueStore.open(storage_client=storage_client, configuration=config_1) + kvs2 = await KeyValueStore.open(storage_client=storage_client, configuration=config_2) + assert kvs1 is kvs2 + + async def test_identical_storage_default_config() -> None: """Test that StorageInstanceManager correctly caches storage based on the storage client.""" storage_client = MemoryStorageClient() From e86356d496869f5855c54d44f7db6675c39c240c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 2 Sep 2025 16:10:30 +0200 Subject: [PATCH 10/16] Properly set global_storage_instance_manager --- src/crawlee/_service_locator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/crawlee/_service_locator.py b/src/crawlee/_service_locator.py index 15f2eb74f1..9c4eebcae8 100644 --- a/src/crawlee/_service_locator.py +++ b/src/crawlee/_service_locator.py @@ -123,13 +123,13 @@ def set_storage_client(self, storage_client: StorageClient) -> None: @property def storage_instance_manager(self) -> StorageInstanceManager: """Get the storage instance manager. It is global manager shared by all instances of ServiceLocator.""" - if self.__class__.global_storage_instance_manager is None: + if ServiceLocator.global_storage_instance_manager is None: # Import here to avoid circular imports. 
from crawlee.storages._storage_instance_manager import StorageInstanceManager # noqa: PLC0415 - self.__class__.global_storage_instance_manager = StorageInstanceManager() + ServiceLocator.global_storage_instance_manager = StorageInstanceManager() - return self.__class__.global_storage_instance_manager + return ServiceLocator.global_storage_instance_manager service_locator = ServiceLocator() From 07ffa190661b026d3fa077aea8e1bc745c48aea9 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 2 Sep 2025 16:56:28 +0200 Subject: [PATCH 11/16] Avoid coroutine ... was never awaited --- .../storages/_storage_instance_manager.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/crawlee/storages/_storage_instance_manager.py b/src/crawlee/storages/_storage_instance_manager.py index 962745e024..b09154aab9 100644 --- a/src/crawlee/storages/_storage_instance_manager.py +++ b/src/crawlee/storages/_storage_instance_manager.py @@ -1,14 +1,12 @@ from __future__ import annotations from collections import defaultdict -from collections.abc import Awaitable, Hashable +from collections.abc import Coroutine, Hashable from dataclasses import dataclass, field from typing import TYPE_CHECKING, TypeVar, cast from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient -from . import Dataset, KeyValueStore, RequestQueue - if TYPE_CHECKING: from crawlee.storage_clients import StorageClient @@ -43,7 +41,7 @@ class _StorageClientCache: StorageClientType = DatasetClient | KeyValueStoreClient | RequestQueueClient """Type alias for the storage client types.""" -ClientOpener = Awaitable[StorageClientType] +ClientOpener = Coroutine[None, None, StorageClientType] """Type alias for the client opener function.""" @@ -74,7 +72,7 @@ async def open_storage_instance( id: Storage ID. name: Storage name. storage_client_type: Type of storage client to use. 
- client_opener: Awaitable to open the storage client when storage instance not found in cache. + client_opener: Coroutine to open the storage client when storage instance not found in cache. additional_cache_key: Additional optional key to differentiate cache entries. Returns: @@ -92,6 +90,7 @@ async def open_storage_instance( and name is None and additional_cache_key in self._cache_by_storage_client[storage_client_type].default_instances[cls] ): + client_opener.close() # Close the opener since we don't need it return cast( 'T', self._cache_by_storage_client[storage_client_type].default_instances[cls][additional_cache_key] ) @@ -103,6 +102,7 @@ async def open_storage_instance( .get(additional_cache_key) ): if isinstance(cached_instance, cls): + client_opener.close() # Close the opener since we don't need it return cached_instance raise RuntimeError('Cached instance type mismatch.') @@ -112,15 +112,13 @@ async def open_storage_instance( .get(additional_cache_key) ): if isinstance(cached_instance, cls): + client_opener.close() # Close the opener since we don't need it return cached_instance raise RuntimeError('Cached instance type mismatch.') client: KeyValueStoreClient | DatasetClient | RequestQueueClient # Create new instance - if cls is Dataset or cls is KeyValueStore or cls is RequestQueue: - client = await client_opener - else: - raise ValueError(f'Unsupported storage class: {cls.__name__}') + client = await client_opener metadata = await client.get_metadata() From 6026b3a202205c138d098c144797f60c3ad71fdd Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 12 Sep 2025 10:37:06 +0200 Subject: [PATCH 12/16] Revert incorrect delete, add test that would catch it, remove useless fixture --- src/crawlee/storages/_dataset.py | 5 +++ tests/unit/storages/test_dataset.py | 38 +++++++++++++++++++++++ tests/unit/storages/test_request_queue.py | 13 -------- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/src/crawlee/storages/_dataset.py 
b/src/crawlee/storages/_dataset.py index 62f3e23057..9dc3fef043 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -298,6 +298,7 @@ async def export_to( to_kvs_id: str | None = None, to_kvs_name: str | None = None, to_kvs_storage_client: StorageClient | None = None, + to_kvs_configuration: Configuration | None = None, **kwargs: Unpack[ExportDataJsonKwargs], ) -> None: ... @@ -309,6 +310,7 @@ async def export_to( to_kvs_id: str | None = None, to_kvs_name: str | None = None, to_kvs_storage_client: StorageClient | None = None, + to_kvs_configuration: Configuration | None = None, **kwargs: Unpack[ExportDataCsvKwargs], ) -> None: ... @@ -319,6 +321,7 @@ async def export_to( to_kvs_id: str | None = None, to_kvs_name: str | None = None, to_kvs_storage_client: StorageClient | None = None, + to_kvs_configuration: Configuration | None = None, **kwargs: Any, ) -> None: """Export the entire dataset into a specified file stored under a key in a key-value store. @@ -336,11 +339,13 @@ async def export_to( to_kvs_name: Name of the key-value store to save the exported file. Specify only one of ID or name. to_kvs_storage_client: Storage client to use for the key-value store. + to_kvs_configuration: Configuration for the key-value store. kwargs: Additional parameters for the export operation, specific to the chosen content type. 
""" kvs = await KeyValueStore.open( id=to_kvs_id, name=to_kvs_name, + configuration=to_kvs_configuration, storage_client=to_kvs_storage_client, ) dst = StringIO() diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py index 7b8091ec0b..b15a9cd46b 100644 --- a/tests/unit/storages/test_dataset.py +++ b/tests/unit/storages/test_dataset.py @@ -3,6 +3,7 @@ from __future__ import annotations +import json from typing import TYPE_CHECKING import pytest @@ -495,6 +496,43 @@ async def test_export_to_invalid_content_type(dataset: Dataset) -> None: ) +async def test_export_with_multiple_kwargs(dataset: Dataset, tmp_path: Path) -> None: + """Test exporting dataset using many optional arguments together.""" + target_kvs_name = 'some_kvs' + target_storage_client = FileSystemStorageClient() + export_key = 'exported_dataset' + data = {'some key': 'some data'} + + # Prepare custom directory and configuration + custom_dir_name = 'some_dir' + custom_dir = tmp_path / custom_dir_name + custom_dir.mkdir() + target_configuration = Configuration(crawlee_storage_dir=str(custom_dir)) # type: ignore[call-arg] + + # Set expected values + expected_exported_data = f'{json.dumps([{"some key": "some data"}])}' + expected_kvs_dir = custom_dir / 'key_value_stores' / target_kvs_name + + # Populate dataset and export + await dataset.push_data(data) + await dataset.export_to( + key=export_key, + content_type='json', + to_kvs_name=target_kvs_name, + to_kvs_storage_client=target_storage_client, + to_kvs_configuration=target_configuration, + ) + + # Verify the directory was created + assert expected_kvs_dir.is_dir() + # Verify that kvs contains the exported data + kvs = await KeyValueStore.open( + name=target_kvs_name, storage_client=target_storage_client, configuration=target_configuration + ) + + assert await kvs.get_value(key=export_key) == expected_exported_data + + async def test_large_dataset(dataset: Dataset) -> None: """Test handling a large dataset with many 
items.""" items = [{'id': i, 'value': f'value-{i}'} for i in range(100)] diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index 92240462f2..43105d160f 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -9,13 +9,11 @@ import pytest from crawlee import Request, service_locator -from crawlee.configuration import Configuration from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, StorageClient from crawlee.storages import RequestQueue if TYPE_CHECKING: from collections.abc import AsyncGenerator - from pathlib import Path from crawlee.storage_clients import StorageClient @@ -29,17 +27,6 @@ def storage_client(request: pytest.FixtureRequest) -> StorageClient: return storage_client -@pytest.fixture -def configuration(tmp_path: Path) -> Configuration: - """Provide a configuration with a temporary storage directory.""" - configuration = Configuration( - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - purge_on_start=True, - ) - service_locator.set_configuration(configuration) - return configuration - - @pytest.fixture async def rq( storage_client: StorageClient, From 330b598db9ba4c2cd9bfbdfc6125b8a4870d52f5 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 12 Sep 2025 10:47:39 +0200 Subject: [PATCH 13/16] Revert unintentional edit in base class --- src/crawlee/storages/_base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/crawlee/storages/_base.py b/src/crawlee/storages/_base.py index aa312787ce..174965f644 100644 --- a/src/crawlee/storages/_base.py +++ b/src/crawlee/storages/_base.py @@ -6,6 +6,7 @@ from crawlee._utils.docs import docs_group if TYPE_CHECKING: + from crawlee.configuration import Configuration from crawlee.storage_clients._base import StorageClient from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata @@ -36,6 +37,7 @@ async def open( id: str | 
None = None, name: str | None = None, storage_client: StorageClient | None = None, + configuration: Configuration | None = None, ) -> Storage: """Open a storage, either restore existing or create a new one. @@ -44,6 +46,7 @@ async def open( name: The storage name. storage_client: Underlying storage client to use. If not provided, the default global storage client from the service locator will be used. + configuration: Configuration object used during the storage creation or restoration process. """ @abstractmethod From 814327781f4dc27c5615f9d5a711e0bed4c53a6a Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 12 Sep 2025 11:02:05 +0200 Subject: [PATCH 14/16] Parametrize storage_instance_manager tests by storage type --- .../storages/test_storage_instance_manager.py | 83 ++++++++++--------- 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/tests/unit/storages/test_storage_instance_manager.py b/tests/unit/storages/test_storage_instance_manager.py index 83682ca582..730279f9a6 100644 --- a/tests/unit/storages/test_storage_instance_manager.py +++ b/tests/unit/storages/test_storage_instance_manager.py @@ -1,11 +1,13 @@ from pathlib import Path +from typing import cast import pytest from crawlee import service_locator from crawlee.configuration import Configuration from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient -from crawlee.storages import Dataset, KeyValueStore +from crawlee.storages import Dataset, KeyValueStore, RequestQueue +from crawlee.storages._base import Storage @pytest.fixture(autouse=True) @@ -14,26 +16,31 @@ def clean_storage_instance_manager() -> None: service_locator.storage_instance_manager.clear_cache() -async def test_unique_storage_by_storage_client(tmp_path: Path) -> None: +@pytest.fixture(params=[KeyValueStore, Dataset, RequestQueue]) +def storage_type(request: pytest.FixtureRequest) -> type[Storage]: + return cast('type[Storage]', request.param) + + +async def 
test_unique_storage_by_storage_client(tmp_path: Path, storage_type: type[Storage]) -> None: config = Configuration( purge_on_start=True, ) config.storage_dir = str(tmp_path) - kvs1 = await KeyValueStore.open(storage_client=MemoryStorageClient(), configuration=config) - kvs2 = await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=config) - assert kvs1 is not kvs2 + storage_1 = await storage_type.open(storage_client=MemoryStorageClient(), configuration=config) + storage_2 = await storage_type.open(storage_client=FileSystemStorageClient(), configuration=config) + assert storage_1 is not storage_2 -async def test_same_storage_when_different_client(tmp_path: Path) -> None: +async def test_same_storage_when_different_client(tmp_path: Path, storage_type: type[Storage]) -> None: config = Configuration( purge_on_start=True, ) config.storage_dir = str(tmp_path) - kvs1 = await KeyValueStore.open(storage_client=MemoryStorageClient(), configuration=config) - kvs2 = await KeyValueStore.open(storage_client=MemoryStorageClient(), configuration=config) - assert kvs1 is kvs2 + storage_1 = await storage_type.open(storage_client=MemoryStorageClient(), configuration=config) + storage_2 = await storage_type.open(storage_client=MemoryStorageClient(), configuration=config) + assert storage_1 is storage_2 async def test_unique_storage_by_storage_type(tmp_path: Path) -> None: @@ -48,16 +55,16 @@ async def test_unique_storage_by_storage_type(tmp_path: Path) -> None: assert kvs is not dataset -async def test_unique_storage_by_name() -> None: +async def test_unique_storage_by_name(storage_type: type[Storage]) -> None: """Test that StorageInstanceManager support different storage clients at the same time.""" storage_client = MemoryStorageClient() - kvs1 = await KeyValueStore.open(storage_client=storage_client, name='kvs1') - kvs2 = await KeyValueStore.open(storage_client=storage_client, name='kvs2') - assert kvs1 is not kvs2 + storage_1 = await 
storage_type.open(storage_client=storage_client, name='kvs1') + storage_2 = await storage_type.open(storage_client=storage_client, name='kvs2') + assert storage_1 is not storage_2 -async def test_unique_storage_by_unique_cache_key_different_path(tmp_path: Path) -> None: +async def test_unique_storage_by_unique_cache_key_different_path(tmp_path: Path, storage_type: type[Storage]) -> None: """Test that StorageInstanceManager support unique cache key. Difference in storage_dir.""" path_1 = tmp_path / 'dir1' path_2 = tmp_path / 'dir2' @@ -72,12 +79,12 @@ async def test_unique_storage_by_unique_cache_key_different_path(tmp_path: Path) storage_client = FileSystemStorageClient() - kvs1 = await KeyValueStore.open(storage_client=storage_client, configuration=config_1) - kvs2 = await KeyValueStore.open(storage_client=storage_client, configuration=config_2) - assert kvs1 is not kvs2 + storage_1 = await storage_type.open(storage_client=storage_client, configuration=config_1) + storage_2 = await storage_type.open(storage_client=storage_client, configuration=config_2) + assert storage_1 is not storage_2 -async def test_unique_storage_by_unique_cache_key_same_path(tmp_path: Path) -> None: +async def test_unique_storage_by_unique_cache_key_same_path(tmp_path: Path, storage_type: type[Storage]) -> None: """Test that StorageInstanceManager support unique cache key. 
Different configs with same storage_dir create same storage.""" config_1 = Configuration() @@ -88,36 +95,36 @@ async def test_unique_storage_by_unique_cache_key_same_path(tmp_path: Path) -> N storage_client = FileSystemStorageClient() - kvs1 = await KeyValueStore.open(storage_client=storage_client, configuration=config_1) - kvs2 = await KeyValueStore.open(storage_client=storage_client, configuration=config_2) - assert kvs1 is kvs2 + storage_1 = await storage_type.open(storage_client=storage_client, configuration=config_1) + storage_2 = await storage_type.open(storage_client=storage_client, configuration=config_2) + assert storage_1 is storage_2 -async def test_identical_storage_default_config() -> None: +async def test_identical_storage_default_config(storage_type: type[Storage]) -> None: """Test that StorageInstanceManager correctly caches storage based on the storage client.""" storage_client = MemoryStorageClient() - kvs1 = await KeyValueStore.open(storage_client=storage_client) - kvs2 = await KeyValueStore.open(storage_client=storage_client) - assert kvs1 is kvs2 + storage_1 = await storage_type.open(storage_client=storage_client) + storage_2 = await storage_type.open(storage_client=storage_client) + assert storage_1 is storage_2 -async def test_identical_storage_default_storage() -> None: +async def test_identical_storage_default_storage(storage_type: type[Storage]) -> None: """Test that StorageInstanceManager correctly caches storage based on the storage client.""" - kvs1 = await KeyValueStore.open() - kvs2 = await KeyValueStore.open() - assert kvs1 is kvs2 + storage_1 = await storage_type.open() + storage_2 = await storage_type.open() + assert storage_1 is storage_2 -async def test_identical_storage_clear_cache() -> None: - kvs1 = await KeyValueStore.open() +async def test_identical_storage_clear_cache(storage_type: type[Storage]) -> None: + storage_1 = await storage_type.open() service_locator.storage_instance_manager.clear_cache() - kvs2 = await 
KeyValueStore.open() - assert kvs1 is not kvs2 + storage_2 = await storage_type.open() + assert storage_1 is not storage_2 -async def test_identical_storage_remove_from_cache() -> None: - kvs1 = await KeyValueStore.open() - service_locator.storage_instance_manager.remove_from_cache(kvs1) - kvs2 = await KeyValueStore.open() - assert kvs1 is not kvs2 +async def test_identical_storage_remove_from_cache(storage_type: type[Storage]) -> None: + storage_1 = await storage_type.open() + service_locator.storage_instance_manager.remove_from_cache(storage_1) + storage_2 = await storage_type.open() + assert storage_1 is not storage_2 From ed4bca72f819f41eb8fdcec4d74e160e3695eb2f Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 12 Sep 2025 11:17:00 +0200 Subject: [PATCH 15/16] Extract shared fixture and remove unused fixtures --- tests/unit/storages/conftest.py | 13 +++++++++++ tests/unit/storages/test_dataset.py | 23 +------------------- tests/unit/storages/test_key_value_store.py | 24 --------------------- tests/unit/storages/test_request_queue.py | 11 +--------- 4 files changed, 15 insertions(+), 56 deletions(-) create mode 100644 tests/unit/storages/conftest.py diff --git a/tests/unit/storages/conftest.py b/tests/unit/storages/conftest.py new file mode 100644 index 0000000000..e7fd22f6b5 --- /dev/null +++ b/tests/unit/storages/conftest.py @@ -0,0 +1,13 @@ +import pytest + +from crawlee import service_locator +from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, StorageClient + + +@pytest.fixture(params=['memory', 'file_system']) +def storage_client(request: pytest.FixtureRequest) -> StorageClient: + """Parameterized fixture to test with different storage clients.""" + storage_client: StorageClient + storage_client = MemoryStorageClient() if request.param == 'memory' else FileSystemStorageClient() + service_locator.set_storage_client(storage_client) + return storage_client diff --git a/tests/unit/storages/test_dataset.py 
b/tests/unit/storages/test_dataset.py index b15a9cd46b..50ad92c0ec 100644 --- a/tests/unit/storages/test_dataset.py +++ b/tests/unit/storages/test_dataset.py @@ -8,9 +8,8 @@ import pytest -from crawlee._service_locator import service_locator from crawlee.configuration import Configuration -from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient +from crawlee.storage_clients import FileSystemStorageClient from crawlee.storages import Dataset, KeyValueStore if TYPE_CHECKING: @@ -21,26 +20,6 @@ from crawlee.storage_clients import StorageClient -@pytest.fixture(params=['memory', 'file_system']) -def storage_client(request: pytest.FixtureRequest) -> StorageClient: - """Parameterized fixture to test with different storage clients.""" - storage_client: StorageClient - storage_client = MemoryStorageClient() if request.param == 'memory' else FileSystemStorageClient() - service_locator.set_storage_client(storage_client) - return storage_client - - -@pytest.fixture -def configuration(tmp_path: Path) -> Configuration: - """Provide a configuration with a temporary storage directory.""" - configuration = Configuration( - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - purge_on_start=True, - ) - service_locator.set_configuration(configuration) - return configuration - - @pytest.fixture async def dataset( storage_client: StorageClient, diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index 5d989866cd..f6eb9debed 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -8,38 +8,14 @@ import pytest -from crawlee import service_locator -from crawlee.configuration import Configuration -from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient from crawlee.storages import KeyValueStore if TYPE_CHECKING: from collections.abc import AsyncGenerator - from pathlib import Path from crawlee.storage_clients import 
StorageClient -@pytest.fixture(params=['memory', 'file_system']) -def storage_client(request: pytest.FixtureRequest) -> StorageClient: - """Parameterized fixture to test with different storage clients.""" - storage_client: StorageClient - storage_client = MemoryStorageClient() if request.param == 'memory' else FileSystemStorageClient() - service_locator.set_storage_client(storage_client) - return storage_client - - -@pytest.fixture -def configuration(tmp_path: Path) -> Configuration: - """Provide a configuration with a temporary storage directory.""" - configuration = Configuration( - crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] - purge_on_start=True, - ) - service_locator.set_configuration(configuration) - return configuration - - @pytest.fixture async def kvs( storage_client: StorageClient, diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index 43105d160f..c5b1c0648c 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -9,7 +9,7 @@ import pytest from crawlee import Request, service_locator -from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient, StorageClient +from crawlee.storage_clients import StorageClient from crawlee.storages import RequestQueue if TYPE_CHECKING: @@ -18,15 +18,6 @@ from crawlee.storage_clients import StorageClient -@pytest.fixture(params=['memory', 'file_system']) -def storage_client(request: pytest.FixtureRequest) -> StorageClient: - """Parameterized fixture to test with different storage clients.""" - storage_client: StorageClient - storage_client = MemoryStorageClient() if request.param == 'memory' else FileSystemStorageClient() - service_locator.set_storage_client(storage_client) - return storage_client - - @pytest.fixture async def rq( storage_client: StorageClient, From b06054b63c6c2e2e7e5e5f4dd82e1921a80614fc Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 12 Sep 2025 13:58:53 +0200 
Subject: [PATCH 16/16] Update upgrading guide --- docs/upgrading/upgrading_to_v1.md | 89 +++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/docs/upgrading/upgrading_to_v1.md b/docs/upgrading/upgrading_to_v1.md index 3a03e8e1e7..edb5840cac 100644 --- a/docs/upgrading/upgrading_to_v1.md +++ b/docs/upgrading/upgrading_to_v1.md @@ -5,6 +5,95 @@ title: Upgrading to v1 This page summarizes the breaking changes between Crawlee for Python v0.6 and v1.0. +## ServiceLocator changes + +### ServiceLocator is stricter with registering services. +You can register the services just once, and you can no longer override already registered services. + +**Before (v0.6):** +```python +from crawlee import service_locator +from crawlee.storage_clients import MemoryStorageClient + +service_locator.set_storage_client(MemoryStorageClient()) +service_locator.set_storage_client(MemoryStorageClient()) +``` +**Now (v1.0):** + +```python +from crawlee import service_locator +from crawlee.storage_clients import MemoryStorageClient + +service_locator.set_storage_client(MemoryStorageClient()) +service_locator.set_storage_client(MemoryStorageClient()) # Raises an error +``` + +### BasicCrawler has its own instance of ServiceLocator to track its own services. +Explicitly passed services to the crawler can be different from the global ones accessible in `crawlee.service_locator`. `BasicCrawler` no longer causes the global services in `service_locator` to be set to the crawler's explicitly passed services. 
+ +```python +from crawlee import service_locator +from crawlee.crawlers import BasicCrawler +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storages import Dataset + + +async def main() -> None: + custom_storage_client = MemoryStorageClient() + crawler = BasicCrawler(storage_client=custom_storage_client) + + assert service_locator.get_storage_client() is custom_storage_client + assert await crawler.get_dataset() is await Dataset.open() +``` +**Now (v1.0):** + +```python +from crawlee import service_locator +from crawlee.crawlers import BasicCrawler +from crawlee.storage_clients import MemoryStorageClient +from crawlee.storages import Dataset + + +async def main() -> None: + custom_storage_client = MemoryStorageClient() + crawler = BasicCrawler(storage_client=custom_storage_client) + + assert service_locator.get_storage_client() is not custom_storage_client + assert await crawler.get_dataset() is not await Dataset.open() +``` +### There can be two crawlers with different services at the same time. + +**Now (v1.0):** + +```python +from crawlee.crawlers import BasicCrawler +from crawlee.storage_clients import MemoryStorageClient, FileSystemStorageClient +from crawlee.configuration import Configuration +from crawlee.events import LocalEventManager + +custom_configuration_1 = Configuration() +custom_event_manager_1 = LocalEventManager.from_config(custom_configuration_1) +custom_storage_client_1 = MemoryStorageClient() + +custom_configuration_2 = Configuration() +custom_event_manager_2 = LocalEventManager.from_config(custom_configuration_2) +custom_storage_client_2 = FileSystemStorageClient() + +crawler_1 = BasicCrawler( + configuration=custom_configuration_1, + event_manager=custom_event_manager_1, + storage_client=custom_storage_client_1, +) + +crawler_2 = BasicCrawler( + configuration=custom_configuration_2, + event_manager=custom_event_manager_2, + storage_client=custom_storage_client_2, + ) + +# use crawlers without runtime crash... 
+``` + ## Terminology change: "browser" in different contexts The word "browser" is now used distinctly in two contexts: