diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bbbefc0..8d27748 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,20 +3,20 @@ exclude: ^tests/files # these are raw test files, no need to mess with them repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - repo: https://github.com/psf/black - rev: '25.9.0' + rev: '25.11.0' hooks: - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.13.1 + rev: v0.14.5 hooks: - id: ruff - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.405 + rev: v1.1.407 hooks: - id: pyright name: pyright (system) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 8c1c416..8f110d1 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -6,7 +6,7 @@ version: 2 build: os: ubuntu-24.04 tools: - python: '3.13' + python: '3.14' # custom commands to run mkdocs build within hatch, as suggested by maintainer in # https://github.com/readthedocs/readthedocs.org/issues/10706 diff --git a/CHANGELOG.md b/CHANGELOG.md index e2aa498..cd35e0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Upgrade to support only Python 3.14 (#266) +- Upgrade dependencies (#269) + ## [5.2.0] - 2025-10-02 ### Added diff --git a/README.md b/README.md index 982e784..bce8752 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,13 @@ zimscraperlib>=1.1,<1.2 See documentation at [Read the Docs](https://python-scraperlib.readthedocs.io/) for details. +> [!WARNING] +> While this library brings support for downloading videos with yt-dlp, recent changes in YouTube have forced the yt-dlp team +> to require new dependencies for YouTube videos (see https://github.com/yt-dlp/yt-dlp/issues/15012). 
These dependencies +> are quite large and not needed by any of the other backends supported by yt-dlp (only YouTube needs them). These dependencies +> are hence not included in this library's dependencies (yet, see https://github.com/openzim/python-scraperlib/issues/268); +> you have to install them on your own if you intend to download videos from YouTube. + # Dependencies - libmagic diff --git a/pyproject.toml b/pyproject.toml index 754e150..933b816 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,12 @@ [build-system] # jinja2 is required to generate JS and Python rules at build time # PyYAML is used to parse fuzzy rules and generate Python/JS code -requires = ["hatchling", "hatch-openzim>=0.2", "jinja2==3.1.6", "PyYAML==6.0.2"] +requires = ["hatchling", "hatch-openzim>=0.2", "jinja2==3.1.6", "PyYAML==6.0.3"] build-backend = "hatchling.build" [project] name = "zimscraperlib" -requires-python = ">=3.13,<3.14" +requires-python = ">=3.14,<3.15" description = "Collection of python tools to re-use common code across scrapers" readme = "README.md" dependencies = [ @@ -16,7 +16,7 @@ dependencies = [ "python-resize-image>=1.1.19,<1.2", "Babel>=2.9,<3.0", "python-magic>=0.4.3,<0.5", - "libzim>=3.4.0,<4.0", + "libzim>=3.8.0,<4.0", "beautifulsoup4>=4.9.3,<5.0", "lxml>=4.6.3,<7.0", "optimize-images>=1.3.6,<2.0", @@ -26,10 +26,10 @@ dependencies = [ "regex>=2020.7.14", "pymupdf>=1.24.0,<2.0", "CairoSVG>=2.2.0,<3.0", - "beartype>=0.19,<0.22", + "beartype>=0.19,<0.23", # youtube-dl should be updated as frequently as possible "yt-dlp", - "pillow>=7.0.0,<12.0", + "pillow>=7.0.0,<13.0", "urllib3>=1.26.5,<2.6.0", "piexif==1.1.3", # this dep is a nightmare in terms of release management, better pinned just like in optimize-images anyway "idna>=2.5,<4.0", @@ -49,39 +49,39 @@ additional-classifiers = [ [project.optional-dependencies] scripts = [ - "invoke==2.2.0", + "invoke==2.2.1", # jinja2 is required to generate JS and Python rules at build time # PyYAML is used to parse 
fuzzy rules and generate Python/JS code # also update version in build-system above "jinja2==3.1.6", - "PyYAML==6.0.2", + "PyYAML==6.0.3", ] lint = [ - "black==25.9.0", - "ruff==0.13.1", + "black==25.11.0", + "ruff==0.14.5", ] check = [ - "pyright==1.1.405", - "pytest==8.4.2", + "pyright==1.1.407", + "pytest==9.0.1", ] test = [ - "pytest==8.4.2", + "pytest==9.0.1", "pytest-mock==3.15.1", - "coverage==7.10.7", + "coverage==7.11.3", ] docs = [ "mkdocs==1.6.1", - "mkdocs-include-markdown-plugin==7.1.7", - "mkdocs-material==9.6.20", + "mkdocs-include-markdown-plugin==7.2.0", + "mkdocs-material==9.7.0", "mkdocstrings[python]==0.30.1", - "pymdown-extensions==10.16.1", + "pymdown-extensions==10.17.1", "mkdocs-gen-files==0.5.0", "mkdocs-literate-nav==0.6.2", ] dev = [ - "ipython==9.5.0", - "pre-commit==4.3.0", + "ipython==9.7.0", + "pre-commit==4.4.0", "zimscraperlib[scripts]", "zimscraperlib[lint]", "zimscraperlib[test]", @@ -157,10 +157,10 @@ build = "inv docs-build --args '{args}'" [tool.black] line-length = 88 -target-version = ['py313'] +target-version = ['py314'] [tool.ruff] -target-version = "py313" +target-version = "py314" line-length = 88 src = ["src", "contrib"] @@ -293,12 +293,6 @@ exclude_lines = [ include = ["contrib", "src", "tests", "tasks.py"] exclude = [".env/**", ".venv/**"] extraPaths = ["src"] -pythonVersion = "3.13" +pythonVersion = "3.14" typeCheckingMode="strict" disableBytesTypePromotions = true - -[[tool.pyright.overrides.files]] -files = [ - "src/zimscraperlib/rewriting**/*.py", - "tests/rewriting/**/*.py" -] diff --git a/rules/rules.yaml b/rules/rules.yaml index 3403cb9..d703986 100644 --- a/rules/rules.yaml +++ b/rules/rules.yaml @@ -6,8 +6,8 @@ # ones) but just rewriting to proper path. 
# # This file is in sync with content at commit -# https://github.com/webrecorder/wabac.js/commit/1c3acfce39e0dc127acf455b04237e9a82062730 -# from October 17, 2024 +# https://github.com/webrecorder/wabac.js/commit/f62756661d06e721bc57ff25199c73ce51227916 +# from October 29, 2025 # # This file should be updated at every release of scraperlib # diff --git a/src/zimscraperlib/rewriting/js.py b/src/zimscraperlib/rewriting/js.py index 22ffbed..89cfcd6 100644 --- a/src/zimscraperlib/rewriting/js.py +++ b/src/zimscraperlib/rewriting/js.py @@ -13,8 +13,8 @@ ZIM at `_zim_static/__wb_module_decl.js` This code is based on https://github.com/webrecorder/wabac.js/blob/main/src/rewrite/jsrewriter.ts -Last backport of upstream changes is from Sept 13, 2025 -Commit 6dd2d9ae664cfcd2ea8637d7d6c7ed7a0ca332a0 +Last backport of upstream changes is from Oct 12, 2025 +Commit 1849552c3dbcbc065c05afac2dd80061db37b64d """ import re diff --git a/src/zimscraperlib/zim/indexing.py b/src/zimscraperlib/zim/indexing.py index 9dd590e..51de209 100644 --- a/src/zimscraperlib/zim/indexing.py +++ b/src/zimscraperlib/zim/indexing.py @@ -101,10 +101,15 @@ def get_pdf_index_data( if parts: # pragma: no branch (always metadata in test PDFs) title = " - ".join(parts) - content = "\n".join( - page.get_text() # pyright: ignore[reportUnknownArgumentType, reportUnknownMemberType, reportAttributeAccessIssue] - for page in doc - ) + def get_pdf_content(page: pymupdf.Page) -> str: + text = ( # pyright: ignore[reportUnknownVariableType] + page.get_text() # pyright: ignore[reportUnknownMemberType] + ) + if not isinstance(text, str): + raise Exception("Unexpected text content") + return text + + content = "\n".join(get_pdf_content(page) for page in doc) # build list of messages and filter messages which are known to not be relevant # in our use-case diff --git a/tests/rewriting/test_js_rewriting.py b/tests/rewriting/test_js_rewriting.py index c75e90e..2d60c06 100644 --- a/tests/rewriting/test_js_rewriting.py +++ 
b/tests/rewriting/test_js_rewriting.py @@ -449,6 +449,12 @@ def test_import_rewrite(rewrite_import_content: ImportTestContent): `; } +""", + """"use strict";(function() { + const text = ` +export { a }; +`; + }) """, "let a = 7; var b = 5; const foo = 4;\n\n", ]