Skip to content

Commit 744683e

Browse files
Handle force_ocr and re-record tests
1 parent 02a0b60 commit 744683e

File tree

80 files changed

+63873
-3080
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

80 files changed

+63873
-3080
lines changed

.github/workflows/main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ jobs:
1313
runs-on: ubuntu-latest
1414
strategy:
1515
matrix:
16-
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
16+
python-version: ["3.10", "3.11"]
1717
steps:
1818
- name: Check out code
1919
uses: actions/checkout@v2

docs/conf.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,16 +48,16 @@
4848

4949
# General information about the project.
5050
project = "documentcloud"
51-
copyright = "2023, MuckRock Foundation"
51+
copyright = "2025, MuckRock Foundation"
5252

5353
# The version info for the project you're documenting, acts as replacement for
5454
# |version| and |release|, also used in various other places throughout the
5555
# built documents.
5656
#
5757
# The short X.Y version.
58-
version = "4.3"
58+
version = "4.5"
5959
# The full version, including alpha/beta/rc tags.
60-
release = "4.3.0"
60+
release = "4.5.0"
6161

6262
# The language for content autogenerated by Sphinx. Refer to documentation
6363
# for a list of supported languages.

documentcloud/documents.py

Lines changed: 68 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,11 @@ def __str__(self):
7474
def __getattr__(self, attr):
7575
"""Generate methods for fetching resources"""
7676
p_image = re.compile(
77-
r"^get_(?P<size>thumbnail|small|normal|large|xlarge)_image_url(?P<list>_list)?$"
77+
r"^get_"
78+
r"(?P<size>thumbnail|small|normal|large|xlarge)_image_url"
79+
r"(?P<list>_list)?$"
7880
)
81+
7982
get = attr.startswith("get_")
8083
url = attr.endswith("_url")
8184
text = attr.endswith("_text")
@@ -230,9 +233,15 @@ def get_errors(self):
230233

231234
return all_results
232235

233-
def process(self):
234-
"""Reprocess the document"""
235-
self._client.post(f"{self.api_path}/{self.id}/process/")
236+
def process(self, **kwargs):
237+
"""Process the document, used on upload and for reprocessing"""
238+
payload = {}
239+
if "force_ocr" in kwargs:
240+
payload["force_ocr"] = kwargs["force_ocr"]
241+
if "ocr_engine" in kwargs:
242+
payload["ocr_engine"] = kwargs["ocr_engine"]
243+
244+
self._client.post(f"{self.api_path}/{self.id}/process/", json=payload)
236245

237246

238247
class DocumentClient(BaseAPIClient):
@@ -310,6 +319,7 @@ def _format_upload_parameters(self, name, **kwargs):
310319
"title",
311320
"data",
312321
"force_ocr",
322+
"ocr_engine",
313323
"projects",
314324
"delayed_index",
315325
"revision_control",
@@ -333,21 +343,55 @@ def _format_upload_parameters(self, name, **kwargs):
333343

334344
return params
335345

346+
def _extract_ocr_options(self, kwargs):
347+
"""
348+
Extract and validate OCR options from kwargs.
349+
350+
Returns:
351+
force_ocr (bool)
352+
ocr_engine (str)
353+
"""
354+
force_ocr = kwargs.pop("force_ocr", False)
355+
ocr_engine = kwargs.pop("ocr_engine", "tess4")
356+
357+
if not isinstance(force_ocr, bool):
358+
raise ValueError("force_ocr must be a boolean")
359+
360+
if ocr_engine and ocr_engine not in ("tess4", "textract"):
361+
raise ValueError(
362+
"ocr_engine must be either 'tess4' for tesseract or 'textract'"
363+
)
364+
365+
return force_ocr, ocr_engine
366+
336367
def _get_title(self, name):
337368
"""Get the default title for a document from its path"""
338369
return name.split(os.sep)[-1].rsplit(".", 1)[0]
339370

340371
def _upload_url(self, file_url, **kwargs):
341372
"""Upload a document from a publicly accessible URL"""
373+
# extract process-related args
374+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
375+
376+
# create the document
342377
params = self._format_upload_parameters(file_url, **kwargs)
343378
params["file_url"] = file_url
379+
if force_ocr:
380+
params["force_ocr"] = force_ocr
381+
params["ocr_engine"] = ocr_engine
344382
response = self.client.post("documents/", json=params)
345-
return Document(self.client, response.json())
383+
create_json = response.json()
384+
385+
# wrap in Document object
386+
doc = Document(self.client, create_json)
387+
388+
return doc
346389

347390
def _upload_file(self, file_, **kwargs):
348391
"""Upload a document directly"""
349392
# create the document
350-
force_ocr = kwargs.pop("force_ocr", False)
393+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
394+
351395
params = self._format_upload_parameters(file_.name, **kwargs)
352396
response = self.client.post("documents/", json=params)
353397

@@ -357,12 +401,12 @@ def _upload_file(self, file_, **kwargs):
357401
response = requests_retry_session().put(presigned_url, data=file_.read())
358402

359403
# begin processing the document
360-
doc_id = create_json["id"]
361-
response = self.client.post(
362-
f"documents/{doc_id}/process/", json={"force_ocr": force_ocr}
363-
)
404+
doc = Document(self.client, create_json)
364405

365-
return Document(self.client, create_json)
406+
# begin processing
407+
doc.process(force_ocr=force_ocr, ocr_engine=ocr_engine)
408+
409+
return doc
366410

367411
def _collect_files(self, path, extensions):
368412
"""Find the paths to files with specified extensions under a directory"""
@@ -410,7 +454,9 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
410454
# Upload all the files using the bulk API to reduce the number
411455
# of API calls and improve performance
412456
obj_list = []
457+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
413458
params = self._format_upload_parameters("", **kwargs)
459+
414460
for i, file_paths in enumerate(grouper(path_list, BULK_LIMIT)):
415461
# Grouper will put None's on the end of the last group
416462
file_paths = [p for p in file_paths if p is not None]
@@ -471,9 +517,13 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
471517

472518
# Begin processing the documents
473519
logger.info("Processing the documents...")
474-
doc_ids = [j["id"] for j in create_json]
520+
process_payload = [
521+
{"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
522+
for j in create_json
523+
]
524+
475525
try:
476-
response = self.client.post("documents/process/", json={"ids": doc_ids})
526+
response = self.client.post("documents/process/", json=process_payload)
477527
except (APIError, RequestException) as exc:
478528
if handle_errors:
479529
logger.info(
@@ -484,7 +534,6 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
484534
continue
485535
else:
486536
raise
487-
488537
logger.info("Upload directory complete")
489538

490539
# Pass back the list of documents
@@ -496,8 +545,13 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
496545
# Do not set the same title for all documents
497546
kwargs.pop("title", None)
498547

548+
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
499549
obj_list = []
500550
params = self._format_upload_parameters("", **kwargs)
551+
# Add OCR options directly to params if needed
552+
if force_ocr:
553+
params["force_ocr"] = force_ocr
554+
params["ocr_engine"] = ocr_engine
501555
for i, url_group in enumerate(grouper(url_list, BULK_LIMIT)):
502556
# Grouper will put None's on the end of the last group
503557
url_group = [url for url in url_group if url is not None]

0 commit comments

Comments
 (0)