Skip to content

Commit eef27e9

Browse files
committed
store recipe similarity data on create/update
1 parent f84564f commit eef27e9

File tree

28 files changed

+942
-64
lines changed

28 files changed

+942
-64
lines changed

backend/maint-scripts/update_scraper_version.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python3
2+
23
"""
34
Script to update schedules and requested tasks with new image tags or offliner
45
definitions.
@@ -54,6 +55,7 @@
5455
from sqlalchemy.orm.attributes import flag_modified
5556

5657
from zimfarm_backend import logger
58+
from zimfarm_backend.common.schemas.offliners.builder import generate_similarity_data
5759
from zimfarm_backend.common.schemas.orms import OfflinerDefinitionSchema, OfflinerSchema
5860
from zimfarm_backend.db import Session
5961
from zimfarm_backend.db.models import RequestedTask, Schedule
@@ -103,23 +105,32 @@ def update_entries(
103105
model.config["offliner"]["offliner_id"].astext == offliner.id
104106
)
105107
).scalars():
108+
obj_name = getattr(obj, "name", obj.id)
106109
if image_tag:
107-
logger.info(f"setting {offliner.id} image tag to {args.image_tag}...")
110+
logger.info(
111+
f"setting {offliner.id} image tag for {model.__tablename__} {obj_name} "
112+
f"to {args.image_tag}..."
113+
)
108114
obj.config["image"]["tag"] = image_tag
109115
flag_modified(obj, "config")
110116

111117
if offliner_definition:
112-
logger.info(f"setting offliner defintion for {model.__tablename__}...")
118+
logger.info(
119+
f"setting offliner defintion for {model.__tablename__} {obj_name} ..."
120+
)
113121
if name_mappings:
114122
obj.config["offliner"] = update_offliner_flags(
115123
offliner=offliner,
116124
offliner_definition=offliner_definition,
117125
data=obj.config["offliner"],
118126
name_mappings=name_mappings,
119127
)
120-
flag_modified(obj, "config")
121128
obj.offliner_definition_id = offliner_definition.id
122-
129+
if isinstance(obj, Schedule):
130+
obj.similarity_data = generate_similarity_data(
131+
obj.config["offliner"], offliner, offliner_definition.schema_
132+
)
133+
flag_modified(obj, "config")
123134
# Just needed to ensure our model still works
124135
if isinstance(obj, Schedule):
125136
create_schedule_full_schema(obj, offliner)
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
Script to update similarity data of offliner definitions and schedules
5+
6+
EXAMPLES:
7+
8+
1. Update the similarity data for all offliner definitions. This updates the recipes
9+
similarity data too. Input can be a JSON file or sent via stdin.
10+
./update_scraper_version.py -o mwoffliner -s similarity_data.in
11+
12+
2. To update via stdin:
13+
echo '[
14+
{
15+
"flag": "mwUrl",
16+
"transformers": [
17+
{
18+
"name": "hostname",
19+
"operand": null
20+
}
21+
]
22+
}
23+
]' \
24+
| ./update_scraper_version.py \
25+
-o mwoffliner \
26+
-s
27+
"""
28+
29+
import argparse
30+
import json
31+
import sys
32+
33+
import sqlalchemy as sa
34+
from sqlalchemy import select
35+
from sqlalchemy.orm import Session as OrmSession
36+
from sqlalchemy.orm.attributes import flag_modified
37+
38+
from zimfarm_backend import logger
39+
from zimfarm_backend.common.schemas.offliners.builder import generate_similarity_data
40+
from zimfarm_backend.common.schemas.offliners.models import (
41+
OfflinerSpecSchema,
42+
SimilarityDataSchema,
43+
)
44+
from zimfarm_backend.common.schemas.orms import OfflinerDefinitionSchema, OfflinerSchema
45+
from zimfarm_backend.db import Session
46+
from zimfarm_backend.db.models import OfflinerDefinition, Schedule
47+
from zimfarm_backend.db.offliner import get_offliner
48+
from zimfarm_backend.db.offliner_definition import (
49+
create_offliner_definition_schema,
50+
)
51+
from zimfarm_backend.db.schedule import create_schedule_full_schema
52+
53+
54+
def update_schedules(
    session: OrmSession,
    *,
    offliner: OfflinerSchema,
    offliner_definition: OfflinerDefinitionSchema,
) -> int:
    """Regenerate the similarity data of the schedules using a definition.

    Only schedules whose config references ``offliner.id`` and whose
    ``offliner_definition_id`` equals ``offliner_definition.id`` are touched.

    Returns the number of schedules that were processed.
    """
    matching_schedules = sa.select(Schedule).where(
        Schedule.config["offliner"]["offliner_id"].astext == offliner.id,
        Schedule.offliner_definition_id == offliner_definition.id,
    )

    nb_modified = 0
    for nb_modified, schedule in enumerate(
        session.scalars(matching_schedules), start=1
    ):
        logger.info(f"seting similarity data for schedule {schedule.name}")
        new_similarity_data = generate_similarity_data(
            schedule.config["offliner"], offliner, offliner_definition.schema_
        )
        schedule.similarity_data = new_similarity_data
        flag_modified(schedule, "config")
        # building the full schema is a sanity check that the schedule is still
        # loadable; the returned schema itself is not used
        create_schedule_full_schema(schedule, offliner)
    return nb_modified
76+
77+
78+
if __name__ == "__main__":
    # Command-line entry point: read a list of similarity data rules (from a
    # JSON file or stdin), store it in the spec of every definition of the
    # given offliner, then regenerate the similarity data of the schedules
    # attached to each of those definitions.
    parser = argparse.ArgumentParser(
        description="Update the similarity data of offliner definitions and "
        "of their associated schedules",
    )

    # Required offliner specification
    parser.add_argument(
        "-o", "--offliner", required=True, help="Specify which offliner to update"
    )

    # `-s FILE` reads the JSON from FILE, a bare `-s` reads it from stdin
    # (const=sys.stdin) and omitting the option leaves the value as None.
    parser.add_argument(
        "-s",
        "--similarity-data",
        metavar="SIMILARITY DATA",
        type=argparse.FileType("r", encoding="utf-8"),
        const=sys.stdin,
        nargs="?",
        help=(
            "List of similarity data transformers to apply to all definitions of the "
            "offliner. This updates the similarity data array of the recipes "
            "associated with the definitions too."
        ),
    )

    args = parser.parse_args()

    with Session.begin() as session:
        offliner = get_offliner(session, args.offliner)

        if args.similarity_data:
            # validate every entry up-front so that malformed input aborts
            # before any definition is modified
            similarity_data = [
                SimilarityDataSchema.model_validate(data)
                for data in json.load(args.similarity_data)
            ]
            for offliner_definition in session.scalars(
                select(OfflinerDefinition).where(
                    OfflinerDefinition.offliner == args.offliner
                )
            ):
                # update the spec for the offliner definition to use the new
                # similarity data transformers list
                offliner_spec = OfflinerSpecSchema.model_validate(
                    offliner_definition.schema
                )
                offliner_spec.similarity_data = similarity_data
                # update the db offliner definition and all of its associated recipes
                offliner_definition.schema = offliner_spec.model_dump(mode="json")
                nb_schedules_modified = update_schedules(
                    session,
                    offliner=offliner,
                    offliner_definition=create_offliner_definition_schema(
                        offliner_definition
                    ),
                )
                logger.info(f"updated {nb_schedules_modified} schedule(s) ")
        else:
            # make the no-op explicit instead of silently reporting success
            logger.warning("no similarity data provided, nothing to update")

    logger.info("FINISH!")

backend/src/zimfarm_backend/common/schemas/offliners/builder.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# ruff: noqa: UP007
22
from collections.abc import Callable
33
from enum import Enum, StrEnum
4+
from itertools import chain
45
from typing import Annotated, Any, Literal, Optional, cast
56

67
from pydantic import (
@@ -12,6 +13,7 @@
1213
model_validator,
1314
)
1415

16+
from zimfarm_backend import logger
1517
from zimfarm_backend.common.constants import getenv, parse_bool
1618
from zimfarm_backend.common.schemas import CamelModel, DashModel
1719
from zimfarm_backend.common.schemas.fields import (
@@ -30,6 +32,7 @@
3032
FlagSchema,
3133
OfflinerSpecSchema,
3234
)
35+
from zimfarm_backend.common.schemas.offliners.transformers import transform_data
3336
from zimfarm_backend.common.schemas.offliners.validators import (
3437
check_exclusive_fields,
3538
validate_ted_links,
@@ -169,6 +172,34 @@ def generate_field_type(offliner: str, flag: FlagSchema, label: str):
169172
return Annotated[py_type, pydantic_field, WrapValidator(skip_validation)]
170173

171174

175+
def generate_similarity_data(
    flags: dict[str, Any], offliner: OfflinerSchema, spec: OfflinerSpecSchema
) -> list[str]:
    """Generate the similarity list of flags data.

    For every entry of ``spec.similarity_data``, the value of the referenced
    flag is looked up in ``flags`` and passed through the entry's transformers.
    Flags without a value are skipped with a warning.

    Returns the distinct transformed values in a deterministic (sorted) order,
    so that the same input always produces the same list.

    Raises KeyError if a similarity data entry references a flag which is not a
    field of the model built from ``spec``.
    """
    schema_cls = build_offliner_model(offliner, spec)
    unique_values: set[str] = set()
    for similarity_data in spec.similarity_data:
        # find the pydantic field info from the schema class
        field_info = schema_cls.model_fields[similarity_data.flag]
        # find the value of the flag in the data. This is typically at the alias of the
        # field given we always dump with aliases. Sometimes, it turns out to be at the
        # name of the python identifier because we originally dumped without aliases.
        value: Any | None = flags.get(similarity_data.flag)
        if field_info.alias:
            value = flags.get(field_info.alias, value)

        if value is None:
            logger.warning(
                f"Could not find value in data that matched the keys: "
                f"'{field_info.alias}', '{similarity_data.flag}'"
            )
            continue
        unique_values.update(transform_data([value], similarity_data.transformers))
    # iteration order of a set of strings changes between processes (hash
    # randomization); sort so the result is stable. key=str keeps this from
    # raising should an untransformed non-string value end up in the set.
    return sorted(unique_values, key=str)
201+
202+
172203
def build_offliner_model(
173204
offliner: OfflinerSchema,
174205
schema: OfflinerSpecSchema,

backend/src/zimfarm_backend/common/schemas/offliners/models.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,29 @@ class ModelValidatorSchema(CamelModel):
7878
model_config = ConfigDict(extra="forbid")
7979

8080

81+
class TransformerSchema(CamelModel):
    # One transformation step, referenced from the `transformers` list of a
    # similarity data entry.

    # the name of the transformer function to use for the field. If None, the field
    # will be used as it is
    name: Literal["split", "hostname"] | None = None
    # the operand to use for the transformer function (if the function
    # takes an operand)
    operand: str | None = None
88+
89+
90+
class SimilarityDataSchema(CamelModel):
    # Pairs a flag with the ordered list of transformers to apply to its value.

    # the name of the flag to use for the similarity data
    flag: str
    # transformers are applied in sequential order
    transformers: list[TransformerSchema]
95+
96+
8197
class OfflinerSpecSchema(CamelModel):
    # flag definitions, keyed by a string
    # NOTE(review): presumably the key is the flag name - confirm
    flags: dict[str, FlagSchema]
    # defaults to an empty list
    model_validators: list[ModelValidatorSchema] = Field(  # pyright: ignore
        default_factory=list,
    )
    std_output: str | bool = Field(default=False)
    std_stats: str | bool = Field(default=False)
    # similarity data entries (a flag plus its transformers); defaults to an
    # empty list
    similarity_data: list[SimilarityDataSchema] = Field(  # pyright: ignore
        default_factory=list
    )
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
from collections.abc import Callable
2+
from functools import partial
3+
from itertools import chain
4+
from urllib.parse import urlparse
5+
6+
from zimfarm_backend.common.schemas.offliners.models import TransformerSchema
7+
8+
9+
def get_transformer_function(
    transformer: TransformerSchema,
) -> Callable[[str], list[str]]:
    """Return the callable implementing ``transformer``.

    The returned callable takes a single string and returns a list of strings.

    Raises ValueError if ``transformer.name`` is not a known transformer.
    """
    kind = transformer.name

    if kind is None:
        # return the value as it is if there is no transformer associated
        def _passthrough(value: str) -> list[str]:
            return [value]

        return _passthrough

    if kind == "split":
        return partial(str.split, sep=transformer.operand)

    if kind == "hostname":

        def _hostname_of(raw: str) -> list[str]:
            # values without an http(s) scheme get one prepended before parsing
            has_scheme = raw.startswith(("https://", "http://"))
            target = raw if has_scheme else "https://" + raw
            host = urlparse(target).hostname
            return [host or ""]

        return _hostname_of

    raise ValueError(
        f"No transformer function registered for '{transformer.name}'"
    )
29+
30+
31+
def transform_data(data: list[str], transformers: list[TransformerSchema]) -> list[str]:
    """Generate the output by applying each transformer to data, in order.

    Each transformer is applied to every non-blank entry of the current list
    and the flattened results become the input of the next transformer.
    Blank entries are only dropped from the *input* of a transformer: the list
    returned by the last transformer is not filtered, and when ``transformers``
    is empty ``data`` is returned unchanged.
    """
    current = data
    for transformer in transformers:
        # resolve the function once per step instead of once per entry
        apply = get_transformer_function(transformer)
        # apply the transformer function to each entry in the list
        # and feed the new list as input to the next transformer function
        current = list(
            chain.from_iterable(apply(entry) for entry in current if entry.strip())
        )
    return current

backend/src/zimfarm_backend/common/schemas/orms.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ class ScheduleFullSchema(BaseModel):
239239
archived: bool
240240
context: str
241241
offliner_definition_id: UUID = Field(exclude=True)
242+
similarity_data: list[str] = Field(exclude=True)
242243
offliner: str
243244
version: str
244245

backend/src/zimfarm_backend/db/models.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,9 @@ class Schedule(Base):
208208
# context that a worker must have to run this schedule
209209
context: Mapped[str] = mapped_column(default="", server_default="", index=True)
210210
archived: Mapped[bool] = mapped_column(default=False, server_default=false())
211+
similarity_data: Mapped[list[str]] = mapped_column(
212+
default_factory=list, server_default="{}", index=True
213+
)
211214

212215
# use_alter is mandatory for alembic to break the dependency cycle
213216
# but it is still not totally handled automatically, the migration

0 commit comments

Comments
 (0)