@@ -74,8 +74,11 @@ def __str__(self):
74
74
def __getattr__ (self , attr ):
75
75
"""Generate methods for fetching resources"""
76
76
p_image = re .compile (
77
- r"^get_(?P<size>thumbnail|small|normal|large|xlarge)_image_url(?P<list>_list)?$"
77
+ r"^get_"
78
+ r"(?P<size>thumbnail|small|normal|large|xlarge)_image_url"
79
+ r"(?P<list>_list)?$"
78
80
)
81
+
79
82
get = attr .startswith ("get_" )
80
83
url = attr .endswith ("_url" )
81
84
text = attr .endswith ("_text" )
@@ -230,9 +233,15 @@ def get_errors(self):
230
233
231
234
return all_results
232
235
233
def process(self, **kwargs):
    """Process the document, used on upload and for reprocessing.

    Posts to this document's ``process/`` endpoint.  Only the
    ``force_ocr`` and ``ocr_engine`` keyword arguments are forwarded, and
    only when the caller actually supplied them; any other keyword
    argument is ignored.  Values are passed through without validation.

    Args:
        **kwargs: Optional ``force_ocr`` and ``ocr_engine`` settings.
    """
    # Forward only the options the caller explicitly provided, so the
    # request body never carries a value the caller did not set.
    payload = {
        key: kwargs[key] for key in ("force_ocr", "ocr_engine") if key in kwargs
    }

    self._client.post(f"{self.api_path}/{self.id}/process/", json=payload)
236
245
237
246
238
247
class DocumentClient (BaseAPIClient ):
@@ -310,6 +319,7 @@ def _format_upload_parameters(self, name, **kwargs):
310
319
"title" ,
311
320
"data" ,
312
321
"force_ocr" ,
322
+ "ocr_engine" ,
313
323
"projects" ,
314
324
"delayed_index" ,
315
325
"revision_control" ,
@@ -333,21 +343,55 @@ def _format_upload_parameters(self, name, **kwargs):
333
343
334
344
return params
335
345
346
+ def _extract_ocr_options (self , kwargs ):
347
+ """
348
+ Extract and validate OCR options from kwargs.
349
+
350
+ Returns:
351
+ force_ocr (bool)
352
+ ocr_engine (str)
353
+ """
354
+ force_ocr = kwargs .pop ("force_ocr" , False )
355
+ ocr_engine = kwargs .pop ("ocr_engine" , "tess4" )
356
+
357
+ if not isinstance (force_ocr , bool ):
358
+ raise ValueError ("force_ocr must be a boolean" )
359
+
360
+ if ocr_engine and ocr_engine not in ("tess4" , "textract" ):
361
+ raise ValueError (
362
+ "ocr_engine must be either 'tess4' for tesseract or 'textract'"
363
+ )
364
+
365
+ return force_ocr , ocr_engine
366
+
336
367
def _get_title (self , name ):
337
368
"""Get the default title for a document from its path"""
338
369
return name .split (os .sep )[- 1 ].rsplit ("." , 1 )[0 ]
339
370
340
371
def _upload_url(self, file_url, **kwargs):
    """Upload a document from a publicly accessible URL.

    Args:
        file_url: URL of the file, sent as ``file_url`` in the create
            request.
        **kwargs: Upload options.  ``force_ocr`` and ``ocr_engine`` are
            extracted and validated first; the remainder is handed to
            ``_format_upload_parameters``.

    Returns:
        Document: Wrapper around the JSON returned by the create request.
    """
    # Pull the OCR options out of kwargs before the remaining upload
    # parameters are formatted.
    force_ocr, ocr_engine = self._extract_ocr_options(kwargs)

    params = self._format_upload_parameters(file_url, **kwargs)
    params["file_url"] = file_url
    # The OCR settings are only added to the request when OCR is forced.
    if force_ocr:
        params.update(force_ocr=force_ocr, ocr_engine=ocr_engine)

    response = self.client.post("documents/", json=params)
    return Document(self.client, response.json())
346
389
347
390
def _upload_file (self , file_ , ** kwargs ):
348
391
"""Upload a document directly"""
349
392
# create the document
350
- force_ocr = kwargs .pop ("force_ocr" , False )
393
+ force_ocr , ocr_engine = self ._extract_ocr_options (kwargs )
394
+
351
395
params = self ._format_upload_parameters (file_ .name , ** kwargs )
352
396
response = self .client .post ("documents/" , json = params )
353
397
@@ -357,12 +401,12 @@ def _upload_file(self, file_, **kwargs):
357
401
response = requests_retry_session ().put (presigned_url , data = file_ .read ())
358
402
359
403
# begin processing the document
360
- doc_id = create_json ["id" ]
361
- response = self .client .post (
362
- f"documents/{ doc_id } /process/" , json = {"force_ocr" : force_ocr }
363
- )
404
+ doc = Document (self .client , create_json )
364
405
365
- return Document (self .client , create_json )
406
+ # begin processing
407
+ doc .process (force_ocr = force_ocr , ocr_engine = ocr_engine )
408
+
409
+ return doc
366
410
367
411
def _collect_files (self , path , extensions ):
368
412
"""Find the paths to files with specified extensions under a directory"""
@@ -410,7 +454,9 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
410
454
# Upload all the files using the bulk API to reduce the number
411
455
# of API calls and improve performance
412
456
obj_list = []
457
+ force_ocr , ocr_engine = self ._extract_ocr_options (kwargs )
413
458
params = self ._format_upload_parameters ("" , ** kwargs )
459
+
414
460
for i , file_paths in enumerate (grouper (path_list , BULK_LIMIT )):
415
461
# Grouper will put None's on the end of the last group
416
462
file_paths = [p for p in file_paths if p is not None ]
@@ -471,9 +517,13 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
471
517
472
518
# Begin processing the documents
473
519
logger .info ("Processing the documents..." )
474
- doc_ids = [j ["id" ] for j in create_json ]
520
+ process_payload = [
521
+ {"id" : j ["id" ], "force_ocr" : force_ocr , "ocr_engine" : ocr_engine }
522
+ for j in create_json
523
+ ]
524
+
475
525
try :
476
- response = self .client .post ("documents/process/" , json = { "ids" : doc_ids } )
526
+ response = self .client .post ("documents/process/" , json = process_payload )
477
527
except (APIError , RequestException ) as exc :
478
528
if handle_errors :
479
529
logger .info (
@@ -484,7 +534,6 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
484
534
continue
485
535
else :
486
536
raise
487
-
488
537
logger .info ("Upload directory complete" )
489
538
490
539
# Pass back the list of documents
@@ -496,8 +545,13 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
496
545
# Do not set the same title for all documents
497
546
kwargs .pop ("title" , None )
498
547
548
+ force_ocr , ocr_engine = self ._extract_ocr_options (kwargs )
499
549
obj_list = []
500
550
params = self ._format_upload_parameters ("" , ** kwargs )
551
+ # Add OCR options directly to params if needed
552
+ if force_ocr :
553
+ params ["force_ocr" ] = force_ocr
554
+ params ["ocr_engine" ] = ocr_engine
501
555
for i , url_group in enumerate (grouper (url_list , BULK_LIMIT )):
502
556
# Grouper will put None's on the end of the last group
503
557
url_group = [url for url in url_group if url is not None ]
0 commit comments