@@ -423,175 +423,98 @@ def _collect_files(self, path, extensions):
     def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwargs):
         """Upload files with specified extensions in a directory"""
-        # pylint: disable=too-many-locals, too-many-branches
-
-        # Do not set the same title for all documents
+        # pylint:disable=too-many-locals
         kwargs.pop("title", None)

-        # If extensions are specified as None, it will check for all supported
-        # filetypes.
         if extensions is None:
             extensions = SUPPORTED_EXTENSIONS
-
-        # Convert single extension to a list if provided
         if extensions and not isinstance(extensions, list):
             extensions = [extensions]
-
-        # Checks to see if the extensions are supported, raises an error if not.
         invalid_extensions = set(extensions) - set(SUPPORTED_EXTENSIONS)
         if invalid_extensions:
             raise ValueError(
                 f"Invalid extensions provided: {', '.join(invalid_extensions)}"
             )

-        # Loop through the path and get all the files with matching extensions
         path_list = self._collect_files(path, extensions)
-
         logger.info(
             "Upload directory on %s: Found %d files to upload", path, len(path_list)
         )

-        # Upload all the files using the bulk API to reduce the number
-        # of API calls and improve performance
         obj_list = []
         force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
         params = self._format_upload_parameters("", **kwargs)

         for i, file_paths in enumerate(grouper(path_list, BULK_LIMIT)):
-            # Grouper will put None's on the end of the last group
             file_paths = [p for p in file_paths if p is not None]
-
             logger.info("Uploading group %d:\n%s", i + 1, "\n".join(file_paths))

-            # Create the documents
-            logger.info("Creating the documents...")
-            body = [
-                merge_dicts(
-                    params,
-                    {
-                        "title": self._get_title(p),
-                        "original_extension": os.path.splitext(os.path.basename(p))[1]
-                        .lower()
-                        .lstrip("."),
-                    },
-                )
-                for p in sorted(file_paths)
-            ]
-            try:
-                response = self.client.post("documents/", json=body)
-            except (APIError, RequestException) as exc:
-                if handle_errors:
-                    logger.info(
-                        "Error creating the following documents: %s\n%s",
-                        exc,
-                        "\n".join(file_paths),
-                    )
-                    continue
-                else:
-                    raise
-
-            # Upload the files directly to storage
-            create_json = response.json()
-
+            create_json = self._create_documents(file_paths, params, handle_errors)
             sorted_create_json = sorted(create_json, key=lambda j: j["title"])
-            sorted_file_paths = sorted(file_paths, key=lambda p: self._get_title(p))
+            sorted_file_paths = sorted(file_paths, key=self._get_title)
             obj_list.extend(sorted_create_json)
             presigned_urls = [j["presigned_url"] for j in sorted_create_json]
-
-            for url, file_path in zip(presigned_urls, sorted_file_paths):
-                logger.info("Uploading %s to S3...", file_path)
-                try:
-                    with open(file_path, "rb") as file:
-                        response = requests_retry_session().put(url, data=file.read())
-                    self.client.raise_for_status(response)
-                except (APIError, RequestException) as exc:
-                    if handle_errors:
-                        logger.info(
-                            "Error uploading the following document: %s %s",
-                            exc,
-                            file_path,
-                        )
-                        continue
-                    else:
-                        raise
-
-            # Begin processing the documents
-            logger.info("Processing the documents...")
-            process_payload = [
-                {"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
-                for j in create_json
-            ]

-            try:
-                response = self.client.post("documents/process/", json=process_payload)
-            except (APIError, RequestException) as exc:
-                if handle_errors:
-                    logger.info(
-                        "Error creating the following documents: %s\n%s",
-                        exc,
-                        "\n".join(file_paths),
-                    )
-                    continue
-                else:
-                    raise
-        logger.info("Upload directory complete")
+            self._upload_files_to_s3(sorted_file_paths, presigned_urls, handle_errors)
+            self._process_documents(create_json, force_ocr, ocr_engine, handle_errors)

-        # Pass back the list of documents
+        logger.info("Upload directory complete")
         return [Document(self.client, d) for d in obj_list]

-    def upload_urls(self, url_list, handle_errors=False, **kwargs):
-        """Upload documents from a list of URLs"""
-
-        # Do not set the same title for all documents
-        kwargs.pop("title", None)
-
-        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
-        obj_list = []
-        params = self._format_upload_parameters("", **kwargs)
-        # Add OCR options directly to params if needed
-        if force_ocr:
-            params["force_ocr"] = force_ocr
-            params["ocr_engine"] = ocr_engine
-        for i, url_group in enumerate(grouper(url_list, BULK_LIMIT)):
-            # Grouper will put None's on the end of the last group
-            url_group = [url for url in url_group if url is not None]
-
-            logger.info("Uploading group %d: %s", i + 1, "\n".join(url_group))
-
-            # Create the documents
-            logger.info("Creating the documents...")
-            try:
-                response = self.client.post(
-                    "documents/",
-                    json=[
-                        merge_dicts(
-                            params,
-                            {
-                                "title": self._get_title(url),
-                                "file_url": url,
-                            },
-                        )
-                        for url in url_group
-                    ],
+    def _create_documents(self, file_paths, params, handle_errors):
+        body = [
+            merge_dicts(
+                params,
+                {
+                    "title": self._get_title(p),
+                    "original_extension": os.path.splitext(os.path.basename(p))[1]
+                    .lower()
+                    .lstrip("."),
+                },
+            )
+            for p in sorted(file_paths)
+        ]
+        try:
+            response = self.client.post("documents/", json=body)
+        except (APIError, RequestException) as exc:
+            if handle_errors:
+                logger.info(
+                    "Error creating the following documents: %s\n%s",
+                    exc,
+                    "\n".join(file_paths),
                 )
+                return []
+            else:
+                raise
+        return response.json()
+
+    def _upload_files_to_s3(self, file_paths, presigned_urls, handle_errors):
+        for url, file_path in zip(presigned_urls, file_paths):
+            logger.info("Uploading %s to S3...", file_path)
+            try:
+                with open(file_path, "rb") as f:
+                    response = requests_retry_session().put(url, data=f.read())
+                self.client.raise_for_status(response)
             except (APIError, RequestException) as exc:
                 if handle_errors:
                     logger.info(
-                        "Error creating the following documents: %s\n%s",
-                        str(exc),
-                        "\n".join(url_group),
+                        "Error uploading the following document: %s %s", exc, file_path
                     )
-                    continue
                 else:
                     raise

-            create_json = response.json()
-            obj_list.extend(create_json)
-
-        logger.info("Upload URLs complete")
-
-        # Pass back the list of documents
-        return [Document(self.client, d) for d in obj_list]
+    def _process_documents(self, create_json, force_ocr, ocr_engine, handle_errors):
+        payload = [
+            {"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
+            for j in create_json
+        ]
+        try:
+            self.client.post("documents/process/", json=payload)
+        except (APIError, RequestException) as exc:
+            if handle_errors:
+                logger.info("Error processing documents: %s", exc)
+            else:
+                raise


 class Mention:
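
Note on the `grouper` helper used in both loops above: it is defined elsewhere in the module and not shown in this diff. The removed comment ("Grouper will put None's on the end of the last group") and the `is not None` filters suggest it behaves like the classic itertools recipe; the sketch below is an assumed reconstruction for illustration, not the library's actual definition.

from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    # Classic itertools recipe (assumed): yields n-sized chunks,
    # padding the final chunk with fillvalue when the input does not
    # divide evenly -- hence the None filtering seen in the diff.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

print(list(grouper(["a.pdf", "b.pdf", "c.pdf"], 2)))
# [('a.pdf', 'b.pdf'), ('c.pdf', None)]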
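For context, here is a minimal sketch of how the refactored method might be driven through the client. The credentials, directory path, and printed attributes below are placeholder assumptions for illustration, not part of this change:

from documentcloud import DocumentCloud

client = DocumentCloud("user@example.com", "password")  # placeholder credentials
documents = client.documents.upload_directory(
    "./filings",           # placeholder directory of files to upload
    extensions=[".pdf"],   # must be a subset of SUPPORTED_EXTENSIONS
    handle_errors=True,    # log per-group failures and continue instead of raising
)
for document in documents:
    print(document.id, document.title)

Batching through `grouper(path_list, BULK_LIMIT)` keeps the create, upload, and process steps to a few bulk API calls per group rather than one call per file, and the extracted `_create_documents`, `_upload_files_to_s3`, and `_process_documents` helpers give each step a single error-handling path.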