Skip to content

Mitch/kv search #314

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion config/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
scraper_dashboard,
)
from documentcloud.core.views import FileServer, account_logout, mailgun
from documentcloud.documents.constants import DATA_KEY_REGEX
from documentcloud.documents.views import (
DataViewSet,
DocumentDataViewSet,
DocumentErrorViewSet,
DocumentViewSet,
EntityDateViewSet,
Expand Down Expand Up @@ -59,6 +61,7 @@ class BulkNestedDefaultRouter(BulkRouterMixin, NestedDefaultRouter):


router = BulkDefaultRouter()
# NOTE(review): "documents/data" is registered ahead of "documents" -- presumably
# so the literal "data" path segment is matched before the document detail
# route can capture it as a lookup value; confirm against the router's URL order.
router.register("documents/data", DocumentDataViewSet, basename="documents-data")
router.register("documents", DocumentViewSet)
router.register("organizations", OrganizationViewSet)
router.register("projects", ProjectViewSet)
Expand All @@ -70,7 +73,6 @@ class BulkNestedDefaultRouter(BulkRouterMixin, NestedDefaultRouter):
router.register("flatpages", FlatPageViewSet)
router.register("statistics", StatisticsViewSet)


documents_router = BulkNestedDefaultRouter(router, "documents", lookup="document")
documents_router.register("notes", NoteViewSet)
documents_router.register("sections", SectionViewSet)
Expand Down
2 changes: 0 additions & 2 deletions documentcloud/documents/models/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@

logger = logging.getLogger(__name__)

# pylint:disable = too-many-lines


class Document(models.Model):
"""A document uploaded to DocumentCloud"""
Expand Down
101 changes: 99 additions & 2 deletions documentcloud/documents/views.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Django
from django.conf import settings
from django.db import transaction
from django.db.models import Q, prefetch_related_objects
from django.db import connection, transaction
from django.db.models import Func, Q, prefetch_related_objects
from django.db.models.query import Prefetch
from django.utils.decorators import method_decorator
from django.utils.translation import gettext_lazy as _
Expand All @@ -12,6 +12,7 @@
from rest_framework.response import Response

# Standard Library
import json
import logging
import sys
from functools import lru_cache
Expand Down Expand Up @@ -1784,3 +1785,99 @@ def post_process(self, request, document_pk=None):
post_process.delay(document_pk, request.data)

return Response("OK", status=status.HTTP_200_OK)


class DocumentDataViewSet(viewsets.GenericViewSet):
    """Expose the key/value ``data`` stored on documents visible to the user.

    ``list`` returns every key together with its distinct values, and
    ``retrieve`` returns the distinct values for a single key.  Both actions
    require a ``project`` query parameter.
    """

    # Restrict the URL lookup value (the data key) to the project's key regex
    lookup_value_regex = DATA_KEY_REGEX

    class Filter(django_filters.FilterSet):
        project = ModelMultipleChoiceFilter(
            model=Project,
            field_name="projects",
            help_text=("Filter by which projects a document belongs to"),
        )

        class Meta:
            model = Document
            fields = ["project"]

    filterset_class = Filter

    def get_queryset(self):
        """Return only the ``data`` column of documents the user may view.

        The default ordering is cleared with ``order_by()`` so that it does
        not leak into the DISTINCT / sub-select queries built on top of this.
        """
        return (
            Document.objects.get_viewable(self.request.user).order_by().values("data")
        )

    @staticmethod
    def _require_project(request):
        """Raise a validation error unless a ``project`` filter was supplied."""
        # You must specify a project for now for performance reasons
        # It may be possible to use an index to remove this restriction
        # if desired
        if "project" not in request.GET:
            raise serializers.ValidationError("You must specify a project")

    def list(self, request):
        """Return all keys and all of their values
        for documents visible to the user. You may filter
        by project and partial key name
        """
        # XXX Should the results be paginated?
        self._require_project(request)

        queryset = self.filter_queryset(self.get_queryset())

        # Embed the queryset as a sub-select of the custom SQL.  We ask the
        # query for its SQL *and* parameters instead of using `str(query)`:
        # the string form substitutes the parameters without any quoting,
        # which can produce invalid (or unsafe) SQL, and any literal `%` in
        # it would clash with the parameter formatting done by `execute`.
        query, query_params = queryset.query.sql_with_params()
        sql = f"""
            SELECT key, jsonb_agg(values)
            FROM (
                SELECT DISTINCT key, jsonb_array_elements(data->key) AS values
                FROM ({query}) d, jsonb_object_keys(data) AS key
                WHERE jsonb_typeof(data->key) = 'array'
            ) kv
            WHERE key LIKE %s
            GROUP BY key
            ORDER BY key
            """
        # Prefix match on the key name; an empty prefix matches every key
        key_pattern = request.GET.get("key", "") + "%"
        with connection.cursor() as cursor:
            # The sub-select's placeholders appear before the LIKE placeholder
            # in the SQL text, so its parameters must come first
            cursor.execute(sql, [*query_params, key_pattern])
            rows = cursor.fetchall()

        # The aggregated JSON is returned as a string for some reason, so we
        # parse it; values which already arrive decoded are passed through
        return Response(
            [
                {key: json.loads(values) if isinstance(values, (str, bytes)) else values}
                for key, values in rows
            ]
        )

    def retrieve(self, request, pk=None):
        """Given a key, this will return values
        present for that key in the documents visible
        to the requesting user. You may filter the resulting values
        by which project they are present in or by a partial value name.
        """
        # XXX Should the results be paginated?
        self._require_project(request)

        queryset = self.filter_queryset(self.get_queryset())

        # NOTE(review): `pk` is interpolated into a field lookup path; this
        # relies on `lookup_value_regex` keeping it to a plain key -- confirm
        # that DATA_KEY_REGEX cannot match a `__` separated path.
        values = (
            queryset.annotate(
                values=Func(
                    f"data__{pk}",
                    function="jsonb_array_elements",
                )
            )
            .values_list("values", flat=True)
            .order_by("values")
            .distinct()
        )
        if "value" in request.GET:
            # We could do this filter using custom SQL, but embedding the above
            # query resulted in SQL errors for me. It is also a bit much
            # and might be considered a premature optimization
            prefix = request.GET["value"]
            # Non-string elements cannot match a text prefix, skip them
            # rather than failing on the missing `startswith` method
            data = [v for v in values if isinstance(v, str) and v.startswith(prefix)]
        else:
            # Evaluate the queryset here so the response holds plain data
            data = list(values)

        return Response(data)