Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,36 @@ media.claims.add(Item(prop_nr='P180', value='Q3146211'))
media.write()
```

## Entity validation
Two different validators for entities are available.

### Entityshape
This validator is in a beta state. For simple entity schemas it has proven reliable.

See https://github.com/dpriskorn/entityshape#limitations for a list of limitations

```python
from wikibaseintegrator import WikibaseIntegrator

wbi = WikibaseIntegrator()
item = wbi.item.get('Q1')
result = item.entityshape_schema_validator(entity_schema_id="E1")
print(result)
```

### PyShex
This validator is considered highly experimental (alpha state).
We have not yet been able to successfully validate an item or lexeme with this library.

```python
from wikibaseintegrator import WikibaseIntegrator

wbi = WikibaseIntegrator()
item = wbi.item.get('Q1')
result = item.pyshex_schema_validator(entity_schema_id="E1")
print(result)
```

# More than Wikibase #

WikibaseIntegrator natively supports some extensions:
Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@ oauthlib~=3.2.2
requests~=2.31.0
requests-oauthlib~=1.3.1
ujson~=5.8.0
entityshape~=0.1.0
pyshex~=0.8.1
rdflib~=6.3.2
pydantic~=1.10.9
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ install_requires =
requests>=2.27.1,<2.29.0
requests-oauthlib~=1.3.1
ujson>=5.4,<5.6
entityshape~=0.1.0
python_requires = >=3.8, <3.13

[options.extras_require]
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
"oauthlib ~= 3.2.0",
"requests >= 2.27.1,< 2.32.0",
"requests-oauthlib ~= 1.3.1",
"ujson >= 5.4,< 5.9"
"ujson >= 5.4,< 5.9",
"entityshape ~= 0.1.0"
],
extras_require={
"dev": [
Expand Down
50 changes: 50 additions & 0 deletions test/test_entity_item.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,53 @@ def test_new_lines(self):
item.claims.add(MonolingualText(prop_nr=123, text="Multi\r\nline"))
item.claims.add(MonolingualText(prop_nr=123, text="Multi\rline"))
item.claims.add(MonolingualText(prop_nr=123, text="Multi\nline"))

def test_entityshape_entity_validation(self):
    """
    Check entityshape validation against EntitySchema E376.

    Q119156070 must validate for every accepted spelling of the schema ID
    (prefixed string, bare numeric string, int); Q582 must not validate.
    """
    random_campsite = wbi.item.get('Q119156070')
    for schema_id in ("E376", "376", 376):
        outcome = random_campsite.entityshape_schema_validator(entity_schema_id=schema_id)
        assert outcome.is_valid

    other_item = wbi.item.get('Q582')
    outcome = other_item.entityshape_schema_validator(entity_schema_id="E376")
    assert not outcome.is_valid

def test_pyshex_entity_validation(self):
# Exercises pyshex_schema_validator() on one lexeme and two items and pins the
# exact failure reasons returned. Every case here is expected to be invalid.
# NOTE(review): the reason strings are compared with ==, so any change in the
# message text produced by the validator will break this test.
# TODO find a combination of shex and entity that is valid
# danish noun
result = wbi.lexeme.get('L41172').pyshex_schema_validator(entity_schema_id="E34")
assert result.valid is False
# This error makes no sense TODO report upstream to pyshex
assert result.reason == 'Import failure on https://www.wikidata.org/wiki/Special:EntitySchemaText/E68'
# Item Q119156070 checked against schema E376: expected invalid, with the exact reason below.
random_campsite = wbi.item.get('Q119156070')
result = random_campsite.pyshex_schema_validator(entity_schema_id="E376")
assert result.valid is False
assert result.reason == (' Testing wd:Q119156070 against shape campsite\n'
' Datatype constraint (http://www.w3.org/2001/XMLSchema#string) does not '
'match URIRef '
'<http://commons.wikimedia.org/wiki/Special:FilePath/Grenforsen%2005.jpg>\n'
' Testing wd:Q119156070 against shape campsite\n'
' Datatype constraint (http://www.w3.org/2001/XMLSchema#string) does not '
'match URIRef '
'<http://commons.wikimedia.org/wiki/Special:FilePath/Grenforsen%2005.jpg>\n'
' Testing wd:Q119156070 against shape campsite\n'
' No matching triples found for predicate wdt:P31')
# The schema ID is also passed as a bare numeric string and as an int; both must give an invalid result too.
assert random_campsite.pyshex_schema_validator(entity_schema_id="376").valid is False
assert random_campsite.pyshex_schema_validator(entity_schema_id=376).valid is False
# Item Q582 checked against the same schema: expected invalid, with the exact reason below.
result2 = wbi.item.get('Q582').pyshex_schema_validator(entity_schema_id="E376")
assert not result2.valid
assert result2.reason == (' Testing wd:Q582 against shape campsite\n'
' Triples:\n'
' wd:Q582 wdt:P31 wd:Q1549591 .\n'
' wd:Q582 wdt:P31 wd:Q484170 .\n'
' 2 triples exceeds max {1,1}\n'
' Testing wd:Q582 against shape campsite\n'
' Node: wd:Q1549591 not in value set:\n'
'\t {"values": ["http://www.wikidata.org/entity/Q832778", "http:...\n'
' Testing wd:Q582 against shape campsite\n'
' Node: wd:Q484170 not in value set:\n'
'\t {"values": ["http://www.wikidata.org/entity/Q832778", "http:...\n'
' Testing wd:Q582 against shape campsite\n'
' Triples:\n'
' wd:Q582 wdt:P31 wd:Q1549591 .\n'
' wd:Q582 wdt:P31 wd:Q484170 .\n'
' 2 triples exceeds max {1,1}\n'
' Testing wd:Q582 against shape campsite\n'
' No matching triples found for predicate wdt:P31')
52 changes: 52 additions & 0 deletions test/test_entityshape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from unittest import TestCase

from entityshape import EntityShape

from wikibaseintegrator import WikibaseIntegrator
from wikibaseintegrator.wbi_config import config as wbi_config
from wikibaseintegrator.wbi_helpers import execute_sparql_query

# Identify this test module in the User-Agent sent with requests; the previous
# value named test_entity_item.py, which is a different test file.
wbi_config['USER_AGENT'] = 'WikibaseIntegrator-pytest/1.0 (test_entityshape.py)'
# Shared client instance used by every test in this module.
wbi = WikibaseIntegrator()
class TestEntityShape(TestCase):
    """Tests that call EntityShape directly, validating items against EntitySchema E376."""

    def test_validate_one_item(self):
        """Q96620548 must be reported invalid against E376 with only P137 missing."""
        item = wbi.item.get("Q96620548")
        # NOTE(review): this calls get_result(); confirm which result method the
        # pinned entityshape version actually provides.
        e = EntityShape(qid=item.id, eid="E376", lang="en")
        result = e.get_result()
        assert result.is_valid is False
        assert result.required_properties_that_are_missing == ["P137"]

    def test_validate_all_campsite_shelter_items(self):
        """
        Validate up to 100 items returned by a SPARQL query against E376 and print the interesting failures.

        This test makes no assertions; it only reports items that are invalid
        for a reason other than a missing P137.
        """
        # This query was build in a few seconds using https://query.wikidata.org/querybuilder/?uselang=en :)
        results = execute_sparql_query("""
            SELECT DISTINCT ?item ?itemLabel WHERE {
              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
              {
                SELECT DISTINCT ?item WHERE {
                  ?item p:P31 ?statement0.
                  ?statement0 (ps:P31/(wdt:P279*)) wd:Q96620652.
                }
                LIMIT 100
              }
            }
            """)
        bindings = results["results"]["bindings"]
        print(f"Found {len(bindings)} results")
        for binding in bindings:
            # Take the entity ID from the ?item URI (its last path segment).
            # ?itemLabel is a label and is not guaranteed to be an entity ID.
            qid = binding["item"]["value"].rsplit("/", 1)[-1]
            print(f"Working on: {qid}")
            item = wbi.item.get(qid)
            e = EntityShape(qid=item.id, eid="E376", lang="en")
            result = e.get_result()
            # Ignore the invalid shelters missing an operator P137.
            # Compared against a list, matching the assertion in test_validate_one_item
            # (the previous comparison was against a set).
            if result.is_valid is False and result.required_properties_that_are_missing == ["P137"]:
                print("Skipping campsite only missing and operator")
            elif result.is_valid is True:
                print("Skipping valid campsite - they are boring!")
            else:
                print(f"is_valid: {result.is_valid}, required_properties_that_are_missing:{result.required_properties_that_are_missing}, statements_with_property_that_is_not_allowed:{result.statements_with_property_that_is_not_allowed}, properties_with_too_many_statements:{result.properties_with_too_many_statements}, see {item.get_entity_url()}")
109 changes: 108 additions & 1 deletion wikibaseintegrator/entities/baseentity.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
from __future__ import annotations

import logging
import re
from copy import copy
from typing import TYPE_CHECKING, Any

import requests
from entityshape import EntityShape, Result
from pydantic import BaseModel
from pyshex import ShExEvaluator
from pyshex.shex_evaluator import EvaluationResult
from rdflib import Graph

from wikibaseintegrator import wbi_fastrun
from wikibaseintegrator.datatypes import BaseDataType
from wikibaseintegrator.models.claims import Claim, Claims
from wikibaseintegrator.wbi_config import config
from wikibaseintegrator.wbi_enums import ActionIfExists
from wikibaseintegrator.wbi_exceptions import MissingEntityException
from wikibaseintegrator.wbi_exceptions import EntitySchemaDownloadError, MissingEntityException, TtlDownloadError
from wikibaseintegrator.wbi_helpers import delete_page, edit_entity, mediawiki_api_call_helper
from wikibaseintegrator.wbi_login import _Login

Expand All @@ -18,6 +27,17 @@
log = logging.getLogger(__name__)


class PyshexResult(BaseModel):
    """Outcome of a PyShEx validation: a validity flag and the accompanying reason text."""

    reason: str
    valid: bool

    def __str__(self):
        """Render the result as two lines: validity first, then the reason."""
        lines = (f"Valid: {self.valid}", f"Reason: {self.reason}")
        return "\n".join(lines)


class BaseEntity:
ETYPE = 'base-entity'
subclasses: list[type[BaseEntity]] = []
Expand Down Expand Up @@ -299,6 +319,93 @@ def get_entity_url(self, wikibase_url: str | None = None) -> str:

raise ValueError('wikibase_url or entity ID is null.')

def _get_valid_entity_schema_id(self, entity_schema_id) -> str:
if isinstance(entity_schema_id, str):
pattern = re.compile(r'^(?:[a-zA-Z]+:)?E?([0-9]+)$')
matches = pattern.match(entity_schema_id)

if not matches:
raise ValueError(f"Invalid EntitySchema ID ({entity_schema_id}), format must be 'E[0-9]+'")

entity_schema_id = f'E{matches.group(1)}'
elif isinstance(entity_schema_id, int):
entity_schema_id = f'E{entity_schema_id}'
else:
raise ValueError(f"Invalid EntitySchema ID ({entity_schema_id}), format must be 'E[0-9]+'")
return entity_schema_id

def _get_ttl_data(self) -> str:
    """
    Download this entity's data in Turtle (ttl) format.

    The URL is built from the hard-coded Wikidata Special:EntityData endpoint
    and ``self.id``.

    :return: the response body as text
    :raises TtlDownloadError: if the HTTP status code is not 200
    """
    api_endpoint = 'https://www.wikidata.org/wiki/Special:EntityData/'
    api_url = f'{api_endpoint}{self.id}.ttl'
    # TODO fix timeout
    response = requests.get(api_url, timeout=10)
    if response.status_code != 200:
        # Include the URL and status so a failed download can be diagnosed.
        raise TtlDownloadError(f"Could not download {api_url} (HTTP status {response.status_code})")
    return response.text

def _get_schema_text(self, entity_schema_id) -> str:
    """
    Download the schema text of an EntitySchema from Wikidata.

    The page is requested with ``action=raw``; the response is decoded as JSON
    and the value stored under the ``schemaText`` key is returned.

    :param entity_schema_id: the EntitySchema ID to be downloaded
    :return: the value of the ``schemaText`` key of the JSON response
    :raises EntitySchemaDownloadError: if the HTTP status code is not 200
    """
    url: str = f"https://www.wikidata.org/wiki/EntitySchema:{entity_schema_id}?action=raw"
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        # Include the URL and status so a failed download can be diagnosed.
        raise EntitySchemaDownloadError(f"Could not download {url} (HTTP status {response.status_code})")
    schema_json: dict = response.json()
    return schema_json["schemaText"]

# TODO make an interface for the validator so the user
# does not have to think about how the internals of the validators work
# The users should get similar output no matter which validator they choose
def entityshape_schema_validator(self, entity_schema_id: str, language: str | None = None) -> Result:
    """
    Validate this entity against an EntitySchema using the entityshape library.

    :param entity_schema_id: the EntitySchema ID; it is normalized with _get_valid_entity_schema_id()
    :param language: language passed to EntityShape; falls back to config['DEFAULT_LANGUAGE'] when not given
    :return: the object returned by EntityShape.validate_and_get_result()
    """
    schema_id = self._get_valid_entity_schema_id(entity_schema_id=entity_schema_id)
    lang = str(language if language else config['DEFAULT_LANGUAGE'])
    validator = EntityShape(qid=self.id, eid=schema_id, lang=lang)
    return validator.validate_and_get_result()

def pyshex_schema_validator(self, entity_schema_id: str) -> PyshexResult:
    """
    Validate this entity against an EntitySchema using PyShEx.

    :param entity_schema_id: the EntitySchema ID; it is normalized with _get_valid_entity_schema_id()
    :return: the PyshexResult produced by _check_shex_conformance()
    """
    schema_id = self._get_valid_entity_schema_id(entity_schema_id=entity_schema_id)
    return self._check_shex_conformance(entity_schema_id=schema_id)

def _check_shex_conformance(self, entity_schema_id: str = "", data: str = "") -> PyshexResult:
    """
    Check whether this entity conforms to an EntitySchema using PyShEx.

    The schema text is downloaded with _get_schema_text() and the entity
    (``http://www.wikidata.org/entity/<self.id>``) is used as the focus node.

    :param entity_schema_id: ID of the EntitySchema to validate against
    :param data: Turtle data to be validated (optional); when empty, the data
        is downloaded with _get_ttl_data()
    :return: a PyshexResult built from the first evaluation result
    """
    if not data:
        data = self._get_ttl_data()

    # Load the ttl string into an rdf graph, which is what ShExEvaluator expects.
    rdfdata = Graph()
    rdfdata.parse(data=data, format="turtle")

    evaluator = ShExEvaluator(
        rdf=rdfdata,
        schema=self._get_schema_text(entity_schema_id=entity_schema_id),
        focus=f"http://www.wikidata.org/entity/{self.id}",
    )

    # EvaluationResult is a named tuple (result, focus, start, reason).
    # Only the first one is used because a single result is expected.
    first: EvaluationResult | None = next(iter(evaluator.evaluate()), None)
    if first is None:
        # Keep the declared return type instead of implicitly returning None.
        return PyshexResult(valid=False, reason="ShExEvaluator returned no result")

    # focus and start are ignored for now as they seem overcomplicated.
    # reason is optional on EvaluationResult but required on PyshexResult,
    # so a missing reason is stored as an empty string.
    return PyshexResult(valid=first.result, reason=first.reason or "")

def __repr__(self):
"""A mixin implementing a simple __repr__."""
return "<{klass} @{id:x} {attrs}>".format( # pylint: disable=consider-using-f-string
Expand Down
8 changes: 8 additions & 0 deletions wikibaseintegrator/wbi_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,11 @@ class MissingEntityException(Exception):

class SearchError(Exception):
    """Exception type for search errors (where it is raised is not visible in this file excerpt)."""


class TtlDownloadError(Exception):
    """
    Raised when downloading an entity's Turtle (ttl) data does not return HTTP 200.

    Derives from Exception (not BaseException) so that ordinary
    ``except Exception`` handlers can catch it, like the other exceptions
    in this module.
    """


class EntitySchemaDownloadError(Exception):
    """
    Raised when downloading an EntitySchema's text does not return HTTP 200.

    Derives from Exception (not BaseException) so that ordinary
    ``except Exception`` handlers can catch it, like the other exceptions
    in this module.
    """