Skip to content

Use fastjsonschema for json schema validation #64

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 12 commits into
base: master
Choose a base branch
from
1 change: 0 additions & 1 deletion jupyter_telemetry/categories.py

This file was deleted.

24 changes: 19 additions & 5 deletions jupyter_telemetry/eventlog.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@
# conda install the 'real' ruamel.yaml to fix
raise ImportError("Missing dependency ruamel.yaml. Try: `conda install ruamel.yaml`")

from traitlets import CaselessStrEnum
from traitlets.config import Configurable, Config

from .eventschema import EventSchema, JSON_SCHEMA_VALIDATORS
from .traits import Handlers, SchemaOptions
from . import TELEMETRY_METADATA_VERSION

from .categories import JSONSchemaValidator, filter_categories_from_event
from ._categories import filter_categories_from_event

yaml = YAML(typ='safe')

Expand Down Expand Up @@ -67,6 +69,16 @@ class EventLog(Configurable):
"""
).tag(config=True)

json_validator = CaselessStrEnum(
JSON_SCHEMA_VALIDATORS.keys(),
'fastjsonschema',
help="""
JSON schema validation implementation.

Valid choices are `jsonschema` and `fastjsonschema`.
"""
).tag(allowed_none=False)
Comment on lines +72 to +80
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Zsailer Should we allow users to subclass here? They might want to customize the default validator or use their own. Or we can compromise and allow them to past a class name but restricted to only JSONSchemaValidator and FastJSONSchemaValidator and later can consider passing any class if someone asks about that use case. Is there any downside to allowing subclassing?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. I don’t see any downside to allowing custom validators by subclassing EventSchemaValidator. We can make this trait a Type(default_value='FastJSONSchemaValidator', klass='EventSchemaValidator').


def __init__(self, *args, **kwargs):
# We need to initialize the configurable before
# adding the logging handlers.
Expand All @@ -86,6 +98,7 @@ def __init__(self, *args, **kwargs):
for handler in self.handlers:
handler.setFormatter(formatter)
self.log.addHandler(handler)
self.json_validator_cls = JSON_SCHEMA_VALIDATORS[self.json_validator.lower()]

def _load_config(self, cfg, section_names=None, traits=None):
"""Load EventLog traits from a Config object, patching the
Expand Down Expand Up @@ -132,7 +145,7 @@ def register_schema(self, schema):
"""
# Check if our schema itself is valid
# This throws an exception if it isn't valid
JSONSchemaValidator.check_schema(schema)
event_schema = EventSchema(schema, validator_cls=self.json_validator_cls)

# Check that the properties we require are present
required_schema_fields = {'$id', 'version', 'properties'}
Expand Down Expand Up @@ -170,7 +183,7 @@ def register_schema(self, schema):
'have a category field.'.format(p)
)

self.schemas[(schema['$id'], schema['version'])] = schema
self.schemas[(schema['$id'], schema['version'])] = event_schema

def get_allowed_properties(self, schema_name):
"""Get the allowed properties for an allowed schema."""
Expand Down Expand Up @@ -223,10 +236,11 @@ def record_event(self, schema_name, version, event, timestamp_override=None):
schema_name=schema_name, version=version
))

schema = self.schemas[(schema_name, version)]
event_schema = self.schemas[(schema_name, version)]
schema = event_schema.schema

# Validate the event data.
JSONSchemaValidator(schema).validate(event)
event_schema.validate(event)

# Generate the empty event capsule.
if timestamp_override is None:
Expand Down
75 changes: 75 additions & 0 deletions jupyter_telemetry/eventschema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import jsonschema
import fastjsonschema


class JSONSchemaError(Exception):
pass


class JSONValidationError(Exception):
pass


class EventSchemaValidator:
def __init__(self, schema):
pass

def validate(self, instance):
pass


class JSONSchemaValidator(EventSchemaValidator):
def __init__(self, schema):
ivalidator = jsonschema.validators.validator_for(schema)

try:
ivalidator.check_schema(schema)
except jsonschema.SchemaError as e:
raise(JSONSchemaError(e.message))
except Exception as e:
raise JSONSchemaError(str(e))

self._validator = ivalidator(schema)

def validate(self, instance):
try:
self._validator.validate(instance)
except jsonschema.ValidationError as e:
raise(JSONValidationError(e.message))
except Exception as e:
raise JSONValidationError(str(e))


class FastJSONSchemaValidator(EventSchemaValidator):
def __init__(self, schema):
try:
self._validate = fastjsonschema.compile(schema)
except fastjsonschema.JsonSchemaDefinitionException as e:
raise JSONSchemaError(e.message)
except Exception as e:
raise JSONSchemaError(str(e))

def validate(self, instance):
try:
self._validate(instance)
except fastjsonschema.JsonSchemaValueException as e:
raise JSONValidationError(e.message)
except Exception as e:
raise JSONValidationError(str(e))


JSON_SCHEMA_VALIDATORS = {
'jsonschema': JSONSchemaValidator,
'fastjsonschema': FastJSONSchemaValidator
}

DEFAULT_SCHEMA_VALIDATOR = FastJSONSchemaValidator


class EventSchema:
def __init__(self, schema, validator_cls=DEFAULT_SCHEMA_VALIDATOR):
self.schema = schema
self._validator = validator_cls(schema)

def validate(self, instance):
self._validator.validate(instance)
4 changes: 2 additions & 2 deletions jupyter_telemetry/traits.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ def validate(self, obj, val):
raise TraitError(
"The schema option, {schema_name}, includes "
"unknown key(s): {unknown_keys}".format(
schema_name=schema_name,
unknown_keys=",".join(unknown_keys)
schema_name=schema_name,
unknown_keys=",".join(unknown_keys)
)
)
validated_val = val
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
'jsonschema',
'python-json-logger',
'traitlets',
'ruamel.yaml'
'ruamel.yaml',
'fastjsonschema'
],
)
6 changes: 6 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from jupyter_telemetry.eventschema import JSON_SCHEMA_VALIDATORS


def pytest_generate_tests(metafunc):
if "json_validator" in metafunc.fixturenames:
metafunc.parametrize("json_validator", JSON_SCHEMA_VALIDATORS.keys())
15 changes: 8 additions & 7 deletions tests/test_allowed_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def schema():
}


def test_raised_exception_for_nonlist_categories():
def test_raised_exception_for_nonlist_categories(json_validator):
# Bad schema in yaml form.
yaml_schema = _("""\
$id: test.schema
Expand All @@ -63,6 +63,7 @@ def test_raised_exception_for_nonlist_categories():
"allowed_categories": ["user-identifier"]
}
},
json_validator=json_validator
)

# This schema does not have categories as a list.
Expand All @@ -72,7 +73,7 @@ def test_raised_exception_for_nonlist_categories():
assert 'must be a list.' in str(err.value)


def test_missing_categories_label():
def test_missing_categories_label(json_validator):
# Bad schema in yaml form.
yaml_schema = _("""\
$id: test.schema
Expand All @@ -93,7 +94,8 @@ def test_missing_categories_label():
SCHEMA_ID: {
"allowed_categories": ["random-category"]
}
}
},
json_validator=json_validator
)

# This schema does not have categories as a list.
Expand Down Expand Up @@ -197,13 +199,12 @@ def test_missing_categories_label():
),
]
)
def test_allowed_schemas(schema, allowed_schemas, expected_output):
def test_allowed_schemas(json_validator, schema, allowed_schemas, expected_output):
event_data = get_event_data(
EVENT_DATA,
schema,
SCHEMA_ID,
VERSION,
allowed_schemas
allowed_schemas=allowed_schemas,
json_validator=json_validator
)

# Verify that *exactly* the right properties are recorded.
Expand Down
35 changes: 15 additions & 20 deletions tests/test_category_filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,13 +127,12 @@
@pytest.mark.parametrize(
'allowed_schemas,expected_output', NESTED_CATEGORY_TEST_CASES
)
def test_category_filtering(allowed_schemas, expected_output):
def test_category_filtering(json_validator, allowed_schemas, expected_output):
event_data = get_event_data(
NESTED_EVENT_DATA,
NESTED_CATEGORY_SCHEMA,
SCHEMA_ID,
VERSION,
allowed_schemas
allowed_schemas=allowed_schemas,
json_validator=json_validator
)

# Verify that *exactly* the right properties are recorded.
Expand Down Expand Up @@ -282,13 +281,12 @@ def test_category_filtering(allowed_schemas, expected_output):
),
]
)
def test_array_category_filtering(allowed_schemas, expected_output):
def test_array_category_filtering(json_validator, allowed_schemas, expected_output):
event_data = get_event_data(
ARRAY_EVENT_DATA,
NESTED_CATEGORY_ARRAY_SCHEMA,
SCHEMA_ID,
VERSION,
allowed_schemas
allowed_schemas=allowed_schemas,
json_validator=json_validator
)

# Verify that *exactly* the right properties are recorded.
Expand Down Expand Up @@ -386,13 +384,12 @@ def test_array_category_filtering(allowed_schemas, expected_output):
),
]
)
def test_no_additional_properties(allowed_schemas, expected_output):
def test_no_additional_properties(json_validator, allowed_schemas, expected_output):
event_data = get_event_data(
ADDITIONAL_PROP_EVENT_DATA,
NESTED_CATEGORY_SCHEMA,
SCHEMA_ID,
VERSION,
allowed_schemas
allowed_schemas=allowed_schemas,
json_validator=json_validator
)

# Verify that *exactly* the right properties are recorded.
Expand Down Expand Up @@ -479,13 +476,12 @@ def test_no_additional_properties(allowed_schemas, expected_output):
@pytest.mark.parametrize(
'allowed_schemas,expected_output', NESTED_CATEGORY_TEST_CASES
)
def test_category_filtering_ref(allowed_schemas, expected_output):
def test_category_filtering_ref(json_validator, allowed_schemas, expected_output):
event_data = get_event_data(
NESTED_EVENT_DATA,
NESTED_CATEGORY_SCHEMA_REF,
SCHEMA_ID,
VERSION,
allowed_schemas
allowed_schemas=allowed_schemas,
json_validator=json_validator
)

# Verify that *exactly* the right properties are recorded.
Expand All @@ -495,13 +491,12 @@ def test_category_filtering_ref(allowed_schemas, expected_output):
@pytest.mark.parametrize(
'allowed_schemas,expected_output', NESTED_CATEGORY_TEST_CASES
)
def test_category_filtering_allof(allowed_schemas, expected_output):
def test_category_filtering_allof(json_validator, allowed_schemas, expected_output):
event_data = get_event_data(
NESTED_EVENT_DATA,
NESTED_CATEGORY_SCHEMA_ALLOF,
SCHEMA_ID,
VERSION,
allowed_schemas
allowed_schemas=allowed_schemas,
json_validator=json_validator
)

# Verify that *exactly* the right properties are recorded.
Expand Down
Loading