Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,20 @@
exclude: ^tests/files # these are raw test files, no need to mess with them
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- repo: https://github.com/psf/black
rev: '25.9.0'
rev: '25.11.0'
hooks:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.13.1
rev: v0.14.5
hooks:
- id: ruff
- repo: https://github.com/RobertCraigie/pyright-python
rev: v1.1.405
rev: v1.1.407
hooks:
- id: pyright
name: pyright (system)
Expand Down
2 changes: 1 addition & 1 deletion .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ version: 2
build:
os: ubuntu-24.04
tools:
python: '3.13'
python: '3.14'

# custom commands to run mkdocs build within hatch, as suggested by maintainer in
# https://github.com/readthedocs/readthedocs.org/issues/10706
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- Upgrade to support only Python 3.14 (#266)
- Upgrade dependencies (#269)

## [5.2.0] - 2025-10-02

### Added
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ zimscraperlib>=1.1,<1.2

See documentation at [Read the Docs](https://python-scraperlib.readthedocs.io/) for details.

> [!WARNING]
> While this library brings support for downloading videos with yt-dlp, recent changes at YouTube have forced the yt-dlp team
> to require new dependencies for YouTube videos (see https://github.com/yt-dlp/yt-dlp/issues/15012). These dependencies
> are quite large and are not needed by any of the other backends supported by yt-dlp (only YouTube needs them). They
> are hence not included in this library's dependencies (yet, see https://github.com/openzim/python-scraperlib/issues/268);
> you have to install them on your own if you intend to download videos from YouTube.

# Dependencies

- libmagic
Expand Down
48 changes: 21 additions & 27 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
[build-system]
# jinja2 is required to generate JS and Python rules at build time
# PyYAML is used to parse fuzzy rules and generate Python/JS code
requires = ["hatchling", "hatch-openzim>=0.2", "jinja2==3.1.6", "PyYAML==6.0.2"]
requires = ["hatchling", "hatch-openzim>=0.2", "jinja2==3.1.6", "PyYAML==6.0.3"]
build-backend = "hatchling.build"

[project]
name = "zimscraperlib"
requires-python = ">=3.13,<3.14"
requires-python = ">=3.14,<3.15"
description = "Collection of python tools to re-use common code across scrapers"
readme = "README.md"
dependencies = [
Expand All @@ -16,7 +16,7 @@ dependencies = [
"python-resize-image>=1.1.19,<1.2",
"Babel>=2.9,<3.0",
"python-magic>=0.4.3,<0.5",
"libzim>=3.4.0,<4.0",
"libzim>=3.8.0,<4.0",
"beautifulsoup4>=4.9.3,<5.0",
"lxml>=4.6.3,<7.0",
"optimize-images>=1.3.6,<2.0",
Expand All @@ -26,10 +26,10 @@ dependencies = [
"regex>=2020.7.14",
"pymupdf>=1.24.0,<2.0",
"CairoSVG>=2.2.0,<3.0",
"beartype>=0.19,<0.22",
"beartype>=0.19,<0.23",
# yt-dlp should be updated as frequently as possible
"yt-dlp",
"pillow>=7.0.0,<12.0",
"pillow>=7.0.0,<13.0",
"urllib3>=1.26.5,<2.6.0",
"piexif==1.1.3", # this dep is a nightmare in terms of release management, better pinned just like in optimize-images anyway
"idna>=2.5,<4.0",
Expand All @@ -49,39 +49,39 @@ additional-classifiers = [

[project.optional-dependencies]
scripts = [
"invoke==2.2.0",
"invoke==2.2.1",
# jinja2 is required to generate JS and Python rules at build time
# PyYAML is used to parse fuzzy rules and generate Python/JS code
# also update version in build-system above
"jinja2==3.1.6",
"PyYAML==6.0.2",
"PyYAML==6.0.3",

]
lint = [
"black==25.9.0",
"ruff==0.13.1",
"black==25.11.0",
"ruff==0.14.5",
]
check = [
"pyright==1.1.405",
"pytest==8.4.2",
"pyright==1.1.407",
"pytest==9.0.1",
]
test = [
"pytest==8.4.2",
"pytest==9.0.1",
"pytest-mock==3.15.1",
"coverage==7.10.7",
"coverage==7.11.3",
]
docs = [
"mkdocs==1.6.1",
"mkdocs-include-markdown-plugin==7.1.7",
"mkdocs-material==9.6.20",
"mkdocs-include-markdown-plugin==7.2.0",
"mkdocs-material==9.7.0",
"mkdocstrings[python]==0.30.1",
"pymdown-extensions==10.16.1",
"pymdown-extensions==10.17.1",
"mkdocs-gen-files==0.5.0",
"mkdocs-literate-nav==0.6.2",
]
dev = [
"ipython==9.5.0",
"pre-commit==4.3.0",
"ipython==9.7.0",
"pre-commit==4.4.0",
"zimscraperlib[scripts]",
"zimscraperlib[lint]",
"zimscraperlib[test]",
Expand Down Expand Up @@ -157,10 +157,10 @@ build = "inv docs-build --args '{args}'"

[tool.black]
line-length = 88
target-version = ['py313']
target-version = ['py314']

[tool.ruff]
target-version = "py313"
target-version = "py314"
line-length = 88
src = ["src", "contrib"]

Expand Down Expand Up @@ -293,12 +293,6 @@ exclude_lines = [
include = ["contrib", "src", "tests", "tasks.py"]
exclude = [".env/**", ".venv/**"]
extraPaths = ["src"]
pythonVersion = "3.13"
pythonVersion = "3.14"
typeCheckingMode="strict"
disableBytesTypePromotions = true

[[tool.pyright.overrides.files]]
files = [
"src/zimscraperlib/rewriting**/*.py",
"tests/rewriting/**/*.py"
]
4 changes: 2 additions & 2 deletions rules/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
# ones) but just rewriting to proper path.
#
# This file is in sync with content at commit
# https://github.com/webrecorder/wabac.js/commit/1c3acfce39e0dc127acf455b04237e9a82062730
# from October 17, 2024
# https://github.com/webrecorder/wabac.js/commit/f62756661d06e721bc57ff25199c73ce51227916
# from October 29, 2025
#
# This file should be updated at every release of scraperlib
#
Expand Down
4 changes: 2 additions & 2 deletions src/zimscraperlib/rewriting/js.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
ZIM at `_zim_static/__wb_module_decl.js`

This code is based on https://github.com/webrecorder/wabac.js/blob/main/src/rewrite/jsrewriter.ts
Last backport of upstream changes is from Sept 13, 2025
Commit 6dd2d9ae664cfcd2ea8637d7d6c7ed7a0ca332a0
Last backport of upstream changes is from Oct 12, 2025
Commit 1849552c3dbcbc065c05afac2dd80061db37b64d
"""

import re
Expand Down
13 changes: 9 additions & 4 deletions src/zimscraperlib/zim/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,15 @@ def get_pdf_index_data(
if parts: # pragma: no branch (always metadata in test PDFs)
title = " - ".join(parts)

content = "\n".join(
page.get_text() # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType, reportAttributeAccessIssue]
for page in doc
)
def get_pdf_content(page: pymupdf.Page) -> str:
text = ( # pyright: ignore[reportUnknownVariableType]
page.get_text() # pyright: ignore[reportUnknownMemberType]
)
if not isinstance(text, str):
raise Exception("Unexpected text content")
return text

content = "\n".join(get_pdf_content(page) for page in doc)

# build list of messages and filter messages which are known to not be relevant
# in our use-case
Expand Down
6 changes: 6 additions & 0 deletions tests/rewriting/test_js_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,12 @@ def test_import_rewrite(rewrite_import_content: ImportTestContent):
`;
}

""",
""""use strict";(function() {
const text = `
export { a };
`;
})
""",
"let a = 7; var b = 5; const foo = 4;\n\n",
]
Expand Down