Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
75 commits
Select commit Hold shift + click to select a range
a3c5fa0
base implementation sql client
Mantisus Jul 29, 2025
3142bdd
resolve
Mantisus Jul 29, 2025
b056505
add dataset tests
Mantisus Jul 29, 2025
ae3bc3d
add kvs tests
Mantisus Jul 30, 2025
49f2643
add rq tests
Mantisus Jul 30, 2025
35a27fc
fix docs in tests
Mantisus Jul 30, 2025
52e1ad2
wrap `SQLStorageClient` in _try_import
Mantisus Jul 30, 2025
df41c45
update db models
Mantisus Jul 30, 2025
342c65a
dataset optimization
Mantisus Jul 30, 2025
61a2666
kvs optimization
Mantisus Jul 31, 2025
7055f7d
optimization
Mantisus Aug 1, 2025
1884f7d
reduce the refresh rate of `accessed_at`
Mantisus Aug 1, 2025
a10e3cf
up docs
Mantisus Aug 1, 2025
f7ebbe5
Update src/crawlee/storage_clients/_sql/_request_queue_client.py
Mantisus Aug 1, 2025
83ca6d3
fix tests
Mantisus Aug 1, 2025
1e3474c
Merge master
Mantisus Aug 3, 2025
8086ab2
same updates
Mantisus Aug 19, 2025
9ee93ab
resolve
Mantisus Aug 19, 2025
2934836
Merge branch 'master' into sql-client
Mantisus Aug 20, 2025
6401b65
up pyproject
Mantisus Aug 20, 2025
1c11d97
Merge branch 'master' into sql-client
Mantisus Aug 21, 2025
df927d1
refactor
Mantisus Aug 21, 2025
9f5e640
fix len strict for metadata_id in kvs_record
Mantisus Aug 21, 2025
77c1894
fix cache
Mantisus Aug 22, 2025
b3c1aad
update queue for support multi-clients
Mantisus Aug 23, 2025
fb8ce7d
fix metadata calculate
Mantisus Aug 23, 2025
63249bb
Add experimental warning
Mantisus Aug 23, 2025
0d62dcf
remove mysql
Mantisus Aug 24, 2025
dffeb76
raise Error for unsupported dialects
Mantisus Aug 24, 2025
61ba512
optimize update timestamps in metadata
Mantisus Aug 24, 2025
46e12b4
add docs
Mantisus Aug 24, 2025
41fcb35
Merge branch 'master' into sql-client
Mantisus Aug 25, 2025
b92e385
Update pyproject.toml
Mantisus Aug 25, 2025
045fe9c
up docs
Mantisus Aug 25, 2025
1a7618e
up database types
Mantisus Aug 26, 2025
cf1f722
Up names
Mantisus Aug 26, 2025
9328d9d
Update src/crawlee/storage_clients/_sql/_key_value_store_client.py
Mantisus Aug 26, 2025
9296d90
save session maker
Mantisus Aug 26, 2025
bdc1258
some updates
Mantisus Aug 26, 2025
9d47cff
Apply suggestion from @vdusek
Mantisus Aug 27, 2025
f69771e
Apply suggestion from @vdusek
Mantisus Aug 27, 2025
3d53ac2
Update docs/guides/storage_clients.mdx
Mantisus Aug 27, 2025
c7e3f8c
Update docs/guides/storage_clients.mdx
Mantisus Aug 27, 2025
5d05c06
Update src/crawlee/storage_clients/_sql/_client_mixin.py
Mantisus Aug 27, 2025
7a999a4
Update src/crawlee/storage_clients/_sql/_client_mixin.py
Mantisus Aug 27, 2025
bfec174
Update src/crawlee/storage_clients/_sql/_db_models.py
Mantisus Aug 27, 2025
a9b466f
Update src/crawlee/storage_clients/_sql/_db_models.py
Mantisus Aug 27, 2025
4443e98
Update src/crawlee/storage_clients/_sql/_storage_client.py
Mantisus Aug 27, 2025
245a4f9
Update docs/guides/storage_clients.mdx
Mantisus Aug 27, 2025
fb2937b
Update src/crawlee/storage_clients/_sql/_storage_client.py
Mantisus Aug 27, 2025
c3cc554
Update tests/unit/storages/test_request_queue.py
Mantisus Aug 27, 2025
05f59ca
polish sql-client
Mantisus Aug 27, 2025
473610d
Update docs/guides/storage_clients.mdx
Mantisus Aug 30, 2025
f17f6ca
Update docs/guides/storage_clients.mdx
Mantisus Aug 30, 2025
2ed4f06
Update docs/guides/storage_clients.mdx
Mantisus Aug 30, 2025
88a60f3
Update docs/guides/storage_clients.mdx
Mantisus Aug 30, 2025
a9b9671
chore(deps): update typescript-eslint monorepo to v8.41.0 (#1375)
renovate[bot] Aug 26, 2025
f8b2879
docs: Update `RequestLoader.fetch_next_request` docblock (#1374)
janbuchar Aug 26, 2025
4ba3a2e
chore(release): Update changelog and package version [skip ci]
Aug 26, 2025
1d0e531
chore(deps): update dependency types-cachetools to ~=6.2.0.20250827 (…
renovate[bot] Aug 27, 2025
5ae2c38
chore(deps): update yarn to v4.9.4 (#1377)
renovate[bot] Aug 27, 2025
ceaa9b5
docs: Update Request loaders guide (#1376)
vdusek Aug 27, 2025
3f0bf8a
chore: Fix accidentally missing name of the test (#1380)
Pijukatel Aug 28, 2025
3241785
feat: Persist the `SitemapRequestLoader` state (#1347)
Mantisus Aug 29, 2025
caff701
chore(release): Update changelog and package version [skip ci]
Aug 29, 2025
29cf5af
suppose warning
Mantisus Aug 30, 2025
bf47625
up code block
Mantisus Aug 30, 2025
b0e9f66
Merge branch 'master' into sql-client
Mantisus Sep 1, 2025
4d5ade3
up docs
Mantisus Sep 1, 2025
74f8825
drop cast
Mantisus Sep 1, 2025
d3a2ebc
fix docs
Mantisus Sep 1, 2025
7081fe4
clean docstrings
Mantisus Sep 1, 2025
b1a877e
extra optimization
Mantisus Sep 3, 2025
582adb0
Merge branch 'master' into sql-client
Mantisus Sep 3, 2025
d14c43a
handle create tables from several parallel processes
Mantisus Sep 3, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import SqlStorageClient


async def main() -> None:
    # Create a new instance of storage client.
    # This will create an SQLite database file crawlee.db or create tables in your
    # database if you pass `connection_string` or `engine`.
    # Use the context manager to ensure that connections are properly cleaned up.
    async with SqlStorageClient() as storage_client:
        # And pass it to the crawler.
        crawler = ParselCrawler(storage_client=storage_client)
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from sqlalchemy.ext.asyncio import create_async_engine

from crawlee.configuration import Configuration
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import SqlStorageClient


async def main() -> None:
    # Configure an async engine for PostgreSQL with connection pooling.
    # On first run, tables are also created in the database.
    engine = create_async_engine(
        'postgresql+asyncpg://myuser:mypassword@localhost:5432/postgres',
        future=True,
        pool_size=5,
        max_overflow=10,
        pool_recycle=3600,
        pool_pre_ping=True,
        echo=False,
    )

    # Create a new instance of storage client backed by the engine above.
    # Use the context manager to ensure that connections are properly cleaned up.
    async with SqlStorageClient(engine=engine) as storage_client:
        # Create a configuration with custom settings.
        configuration = Configuration(purge_on_start=False)

        # And pass them to the crawler.
        crawler = ParselCrawler(
            storage_client=storage_client,
            configuration=configuration,
        )
184 changes: 184 additions & 0 deletions docs/guides/storage_clients.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,15 @@ import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
import CodeBlock from '@theme/CodeBlock';

import MemoryStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/memory_storage_client_basic_example.py';
import FileSystemStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_basic_example.py';
import FileSystemStorageClientConfigurationExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_configuration_example.py';
import CustomStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/custom_storage_client_example.py';
import RegisteringStorageClientsExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/registering_storage_clients_example.py';
import SQLStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/sql_storage_client_basic_example.py';
import SQLStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/sql_storage_client_configuration_example.py';

Storage clients provide a unified interface for interacting with <ApiLink to="class/Dataset">`Dataset`</ApiLink>, <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, and <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, regardless of the underlying implementation. They handle operations like creating, reading, updating, and deleting storage instances, as well as managing data persistence and cleanup. This abstraction makes it easy to switch between different environments, such as local development and cloud production setups.

Expand All @@ -23,6 +26,7 @@ Crawlee provides three main storage client implementations:

- <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink> - Provides persistent file system storage with in-memory caching.
- <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> - Stores data in memory with no persistence.
- <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> – Provides persistent storage using a SQL database ([SQLite](https://sqlite.org/) or [PostgreSQL](https://www.postgresql.org/)). Requires installing the extra dependency: 'crawlee[sql_sqlite]' for SQLite or 'crawlee[sql_postgres]' for PostgreSQL.
- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python).

```mermaid
Expand Down Expand Up @@ -50,6 +54,8 @@ class FileSystemStorageClient

class MemoryStorageClient

class SqlStorageClient

class ApifyStorageClient

%% ========================
Expand All @@ -58,6 +64,7 @@ class ApifyStorageClient

StorageClient --|> FileSystemStorageClient
StorageClient --|> MemoryStorageClient
StorageClient --|> SqlStorageClient
StorageClient --|> ApifyStorageClient
```

Expand Down Expand Up @@ -125,6 +132,183 @@ The `MemoryStorageClient` does not persist data between runs. All data is lost w
{MemoryStorageClientBasicExample}
</RunnableCodeBlock>

### SQL storage client

:::warning Experimental feature
The `SqlStorageClient` is experimental. Its API and behavior may change in future releases.
:::

The <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> provides persistent storage using a SQL database (SQLite by default, or PostgreSQL). It supports all Crawlee storage types and enables concurrent access from multiple independent clients or processes.

:::note dependencies
The <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> is not included in the core Crawlee package.
To use it, you need to install Crawlee with the appropriate extra dependency:

- For SQLite support, run:
<code>pip install 'crawlee[sql_sqlite]'</code>
- For PostgreSQL support, run:
<code>pip install 'crawlee[sql_postgres]'</code>
:::

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add some context before the code snippet

By default, <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> uses SQLite.
To use PostgreSQL instead, just provide a PostgreSQL connection string via the `connection_string` parameter. No other code changes are needed—the same client works for both databases.

<RunnableCodeBlock className="language-python" language="python">
{SQLStorageClientBasicExample}
</RunnableCodeBlock>

Data is organized in relational tables. Below are the main tables and columns used for each storage type:

```mermaid
---
config:
class:
hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Clients
%% ========================

class SqlDatasetClient {
<<Dataset>>
}

class SqlKeyValueStoreClient {
<<Key-value store>>
}

%% ========================
%% Dataset Tables
%% ========================

class datasets {
<<table>>
+ id (PK)
+ name
+ accessed_at
+ created_at
+ modified_at
+ item_count
}

class dataset_records {
<<table>>
+ order_id (PK)
+ metadata_id (FK)
+ data
}

%% ========================
%% Key-Value Store Tables
%% ========================

class key_value_stores {
<<table>>
+ id (PK)
+ name
+ accessed_at
+ created_at
+ modified_at
}

class key_value_store_records {
<<table>>
+ metadata_id (FK, PK)
+ key (PK)
+ value
+ content_type
+ size
}

%% ========================
%% Client to Table arrows
%% ========================

SqlDatasetClient --> datasets
SqlDatasetClient --> dataset_records

SqlKeyValueStoreClient --> key_value_stores
SqlKeyValueStoreClient --> key_value_store_records
```
```mermaid
---
config:
class:
hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Clients
%% ========================

class SqlRequestQueueClient {
<<Request queue>>
}

%% ========================
%% Request Queue Tables
%% ========================

class request_queues {
<<table>>
+ id (PK)
+ name
+ accessed_at
+ created_at
+ modified_at
+ had_multiple_clients
+ handled_request_count
+ pending_request_count
+ total_request_count
}

class request_queue_records {
<<table>>
+ request_id (PK)
+ metadata_id (FK, PK)
+ data
+ sequence_number
+ is_handled
+ time_blocked_until
}

class request_queue_state {
<<table>>
+ metadata_id (FK, PK)
+ sequence_counter
+ forefront_sequence_counter
}

%% ========================
%% Client to Table arrows
%% ========================

SqlRequestQueueClient --> request_queues
SqlRequestQueueClient --> request_queue_records
SqlRequestQueueClient --> request_queue_state
```

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we also explain somewhere in this section that switching between sqlite and postgres is only done by providing a proper connection string? But the storage client remains the same.

Configuration options for the <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> can be set through environment variables or the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class:

- **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`) - The root directory where the default SQLite database will be created if no connection string is provided.
- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start.

Configuration options for the <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> can be set via constructor arguments:

- **`connection_string`** (default: SQLite in <ApiLink to="class/Configuration">`Configuration`</ApiLink> storage dir) – SQLAlchemy connection string, e.g. `sqlite+aiosqlite:///my.db` or `postgresql+asyncpg://user:pass@host/db`.
- **`engine`** – Pre-configured SQLAlchemy AsyncEngine (optional).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The following example is specific to PostgreSQL. Please provide some context before the code snippet.

For advanced scenarios, you can configure <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> with a custom SQLAlchemy engine and additional options via the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class. This is useful, for example, when connecting to an external PostgreSQL database or customizing connection pooling.

<CodeBlock className="language-python" language="python">
{SQLStorageClientConfigurationExample}
</CodeBlock>

## Creating a custom storage client

A storage client consists of two parts: the storage client factory and individual storage type clients. The <ApiLink to="class/StorageClient">`StorageClient`</ApiLink> acts as a factory that creates specific clients (<ApiLink to="class/DatasetClient">`DatasetClient`</ApiLink>, <ApiLink to="class/KeyValueStoreClient">`KeyValueStoreClient`</ApiLink>, <ApiLink to="class/RequestQueueClient">`RequestQueueClient`</ApiLink>) where the actual storage logic is implemented.
Expand Down
10 changes: 9 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ dependencies = [
]

[project.optional-dependencies]
all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel]"]
all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres]"]
adaptive-crawler = [
"jaro-winkler>=2.0.3",
"playwright>=1.27.0",
Expand All @@ -71,6 +71,14 @@ otel = [
"opentelemetry-semantic-conventions>=0.54",
"wrapt>=1.17.0",
]
sql_postgres = [
"sqlalchemy[asyncio]>=2.0.0,<3.0.0",
"asyncpg>=0.24.0"
]
sql_sqlite = [
"sqlalchemy[asyncio]>=2.0.0,<3.0.0",
"aiosqlite>=0.21.0",
]

[project.scripts]
crawlee = "crawlee._cli:cli"
Expand Down
12 changes: 12 additions & 0 deletions src/crawlee/storage_clients/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

# These imports have only mandatory dependencies, so they are imported directly.
from ._base import StorageClient
from ._file_system import FileSystemStorageClient
from ._memory import MemoryStorageClient

# Register this package with the import hook.
# NOTE(review): presumably this defers the ImportError for the optional names
# guarded below until they are actually accessed — confirm in _utils.try_import.
_install_import_hook(__name__)

# The following imports are wrapped in try_import to handle optional dependencies,
# ensuring the module can still function even if these dependencies are missing.
with _try_import(__name__, 'SqlStorageClient'):
    from ._sql import SqlStorageClient

__all__ = [
    'FileSystemStorageClient',
    'MemoryStorageClient',
    'SqlStorageClient',
    'StorageClient',
]
6 changes: 6 additions & 0 deletions src/crawlee/storage_clients/_sql/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# SQL-backed storage client implementations. Importing this subpackage requires
# the optional SQL dependencies (SQLAlchemy plus an async driver) to be installed.
from ._dataset_client import SqlDatasetClient
from ._key_value_store_client import SqlKeyValueStoreClient
from ._request_queue_client import SqlRequestQueueClient
from ._storage_client import SqlStorageClient

__all__ = ['SqlDatasetClient', 'SqlKeyValueStoreClient', 'SqlRequestQueueClient', 'SqlStorageClient']
Loading
Loading