Skip to content

Commit ff16909

Browse files
Refactor upload_directory
1 parent c9f9e00 commit ff16909

File tree

73 files changed

+2628
-1794
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

73 files changed

+2628
-1794
lines changed

documentcloud/documents.py

Lines changed: 52 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -423,175 +423,98 @@ def _collect_files(self, path, extensions):
423423

424424
def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwargs):
425425
"""Upload files with specified extensions in a directory"""
426-
# pylint: disable=too-many-locals, too-many-branches
427-
428-
# Do not set the same title for all documents
426+
# pylint:disable=too-many-locals
429427
kwargs.pop("title", None)
430428

431-
# If extensions are specified as None, it will check for all supported
432-
# filetypes.
433429
if extensions is None:
434430
extensions = SUPPORTED_EXTENSIONS
435-
436-
# Convert single extension to a list if provided
437431
if extensions and not isinstance(extensions, list):
438432
extensions = [extensions]
439-
440-
# Checks to see if the extensions are supported, raises an error if not.
441433
invalid_extensions = set(extensions) - set(SUPPORTED_EXTENSIONS)
442434
if invalid_extensions:
443435
raise ValueError(
444436
f"Invalid extensions provided: {', '.join(invalid_extensions)}"
445437
)
446438

447-
# Loop through the path and get all the files with matching extensions
448439
path_list = self._collect_files(path, extensions)
449-
450440
logger.info(
451441
"Upload directory on %s: Found %d files to upload", path, len(path_list)
452442
)
453443

454-
# Upload all the files using the bulk API to reduce the number
455-
# of API calls and improve performance
456444
obj_list = []
457445
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
458446
params = self._format_upload_parameters("", **kwargs)
459447

460448
for i, file_paths in enumerate(grouper(path_list, BULK_LIMIT)):
461-
# Grouper will put None's on the end of the last group
462449
file_paths = [p for p in file_paths if p is not None]
463-
464450
logger.info("Uploading group %d:\n%s", i + 1, "\n".join(file_paths))
465451

466-
# Create the documents
467-
logger.info("Creating the documents...")
468-
body = [
469-
merge_dicts(
470-
params,
471-
{
472-
"title": self._get_title(p),
473-
"original_extension": os.path.splitext(os.path.basename(p))[1]
474-
.lower()
475-
.lstrip("."),
476-
},
477-
)
478-
for p in sorted(file_paths)
479-
]
480-
try:
481-
response = self.client.post("documents/", json=body)
482-
except (APIError, RequestException) as exc:
483-
if handle_errors:
484-
logger.info(
485-
"Error creating the following documents: %s\n%s",
486-
exc,
487-
"\n".join(file_paths),
488-
)
489-
continue
490-
else:
491-
raise
492-
493-
# Upload the files directly to storage
494-
create_json = response.json()
495-
452+
create_json = self._create_documents(file_paths, params, handle_errors)
496453
sorted_create_json = sorted(create_json, key=lambda j: j["title"])
497-
sorted_file_paths = sorted(file_paths, key=lambda p: self._get_title(p))
454+
sorted_file_paths = sorted(file_paths, key=self._get_title)
498455
obj_list.extend(sorted_create_json)
499456
presigned_urls = [j["presigned_url"] for j in sorted_create_json]
500-
501-
for url, file_path in zip(presigned_urls, sorted_file_paths):
502-
logger.info("Uploading %s to S3...", file_path)
503-
try:
504-
with open(file_path, "rb") as file:
505-
response = requests_retry_session().put(url, data=file.read())
506-
self.client.raise_for_status(response)
507-
except (APIError, RequestException) as exc:
508-
if handle_errors:
509-
logger.info(
510-
"Error uploading the following document: %s %s",
511-
exc,
512-
file_path,
513-
)
514-
continue
515-
else:
516-
raise
517-
518-
# Begin processing the documents
519-
logger.info("Processing the documents...")
520-
process_payload = [
521-
{"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
522-
for j in create_json
523-
]
524457

525-
try:
526-
response = self.client.post("documents/process/", json=process_payload)
527-
except (APIError, RequestException) as exc:
528-
if handle_errors:
529-
logger.info(
530-
"Error creating the following documents: %s\n%s",
531-
exc,
532-
"\n".join(file_paths),
533-
)
534-
continue
535-
else:
536-
raise
537-
logger.info("Upload directory complete")
458+
self._upload_files_to_s3(sorted_file_paths, presigned_urls, handle_errors)
459+
self._process_documents(create_json, force_ocr, ocr_engine, handle_errors)
538460

539-
# Pass back the list of documents
461+
logger.info("Upload directory complete")
540462
return [Document(self.client, d) for d in obj_list]
541463

542-
def upload_urls(self, url_list, handle_errors=False, **kwargs):
543-
"""Upload documents from a list of URLs"""
544-
545-
# Do not set the same title for all documents
546-
kwargs.pop("title", None)
547-
548-
force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
549-
obj_list = []
550-
params = self._format_upload_parameters("", **kwargs)
551-
# Add OCR options directly to params if needed
552-
if force_ocr:
553-
params["force_ocr"] = force_ocr
554-
params["ocr_engine"] = ocr_engine
555-
for i, url_group in enumerate(grouper(url_list, BULK_LIMIT)):
556-
# Grouper will put None's on the end of the last group
557-
url_group = [url for url in url_group if url is not None]
558-
559-
logger.info("Uploading group %d: %s", i + 1, "\n".join(url_group))
560-
561-
# Create the documents
562-
logger.info("Creating the documents...")
563-
try:
564-
response = self.client.post(
565-
"documents/",
566-
json=[
567-
merge_dicts(
568-
params,
569-
{
570-
"title": self._get_title(url),
571-
"file_url": url,
572-
},
573-
)
574-
for url in url_group
575-
],
464+
def _create_documents(self, file_paths, params, handle_errors):
465+
body = [
466+
merge_dicts(
467+
params,
468+
{
469+
"title": self._get_title(p),
470+
"original_extension": os.path.splitext(os.path.basename(p))[1]
471+
.lower()
472+
.lstrip("."),
473+
},
474+
)
475+
for p in sorted(file_paths)
476+
]
477+
try:
478+
response = self.client.post("documents/", json=body)
479+
except (APIError, RequestException) as exc:
480+
if handle_errors:
481+
logger.info(
482+
"Error creating the following documents: %s\n%s",
483+
exc,
484+
"\n".join(file_paths),
576485
)
486+
return []
487+
else:
488+
raise
489+
return response.json()
490+
491+
def _upload_files_to_s3(self, file_paths, presigned_urls, handle_errors):
492+
for url, file_path in zip(presigned_urls, file_paths):
493+
logger.info("Uploading %s to S3...", file_path)
494+
try:
495+
with open(file_path, "rb") as f:
496+
response = requests_retry_session().put(url, data=f.read())
497+
self.client.raise_for_status(response)
577498
except (APIError, RequestException) as exc:
578499
if handle_errors:
579500
logger.info(
580-
"Error creating the following documents: %s\n%s",
581-
str(exc),
582-
"\n".join(url_group),
501+
"Error uploading the following document: %s %s", exc, file_path
583502
)
584-
continue
585503
else:
586504
raise
587505

588-
create_json = response.json()
589-
obj_list.extend(create_json)
590-
591-
logger.info("Upload URLs complete")
592-
593-
# Pass back the list of documents
594-
return [Document(self.client, d) for d in obj_list]
506+
def _process_documents(self, create_json, force_ocr, ocr_engine, handle_errors):
507+
payload = [
508+
{"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
509+
for j in create_json
510+
]
511+
try:
512+
self.client.post("documents/process/", json=payload)
513+
except (APIError, RequestException) as exc:
514+
if handle_errors:
515+
logger.info("Error processing documents: %s", exc)
516+
else:
517+
raise
595518

596519

597520
class Mention:

tests/cassettes/fixtures/client.yaml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ interactions:
1818
uri: https://dev.squarelet.com/api/token/
1919
response:
2020
body:
21-
string: '{"refresh":"eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6MTc1ODgxNTMyMiwiaWF0IjoxNzU4NzI4OTIyLCJqdGkiOiIyZGIyYjk2MGRiMTg0MjIwOTY0YWY5YTI2ZjUyNjQ3YyIsInVzZXJfaWQiOiJmNjAyOWU4YS1kOGEyLTQxODgtOGEyMS1jZDA3M2EwMmVlNWMiLCJhdWQiOlsic3F1YXJlbGV0IiwibXVja3JvY2siLCJkb2N1bWVudGNsb3VkIl0sImlzcyI6WyJzcXVhcmVsZXQiXX0.OBpX4-A48BbNHKr75vpgy7Ssdj-Ag63eJZ8Zo-aiHBoq5bGrA1dTcHkrglh61KI6Ji6w3zksv4DmfL1kvEkocd_cMB4bYRunfIKU2VMPn51F9Pm3YcDV27ogydh_tZykt3I4YfdIq4Ct5c97I8SyI0AS_7i347pKlaGoJynTZPezTZWUKIM9EtriQ9iT47HEDWH09fmcyvqA3afsgR70vSxxB_OLcHAnH0fZ1DP6F9oQh-nwOmooNlS7rTGikkVjZqlOXWJl3FLHsi4R_GD05b37XCyAEjo9RVf5fxEVbaeegKwCWEpRetQABrVnOFOdaeeVfjz8iAwVXBBV5o9yjQ","access":"eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzU4NzI5MjIyLCJpYXQiOjE3NTg3Mjg5MjIsImp0aSI6IjVjZjJjNTY0MmM5MjQ0YWNiOGI1MTE4ZDIzYzQxZjAyIiwidXNlcl9pZCI6ImY2MDI5ZThhLWQ4YTItNDE4OC04YTIxLWNkMDczYTAyZWU1YyIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjpbInNxdWFyZWxldCJdfQ.ox04nvr_QyJh5z249PZuvQiy4YLLKKba_yF73J_osL6UcQcv-viGFqxXcMgObbcugvVPqSW4VEiCiEMPTgNhrrl7KzhFwtZlZv7QXRHcJP8N5hJbahaFwKHCJwUpHpM9qliAE0rVSQ1U3MfaEWGZTsj8JN2tM3gSp1fGguoY76N2AvD0KlykTsQZYHEbkO2gTrCm25WVFGxe-f7s58n_sSBgdvvM9vpNf3to6_AeZ2QwpUOG_rWplgvDE1ugILOdur2TSL5_RGgqI5KQltGdpURVYpeURiycZ-f1w-JPP7D8KDfawXV2N9lhBE7kP-Lb98qgh-vLAzazLLVBWDnB5Q"}'
21+
string: '{"refresh":"eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6MTc1ODgxNjUyOSwiaWF0IjoxNzU4NzMwMTI5LCJqdGkiOiIzZWI4MzExNmQzNTA0Njc0YTViZGFiOGEyYjFkNDA0ZiIsInVzZXJfaWQiOiJmNjAyOWU4YS1kOGEyLTQxODgtOGEyMS1jZDA3M2EwMmVlNWMiLCJhdWQiOlsic3F1YXJlbGV0IiwibXVja3JvY2siLCJkb2N1bWVudGNsb3VkIl0sImlzcyI6WyJzcXVhcmVsZXQiXX0.jl0ql4G-9ZFn0yOWSyzlXLjBXaQF9ZzWoHf7vxfrK9e4MnQ5jZyCLrFR7-nkjbKy9q9WAjlO3u3ZV3bzYW0xobiOuZcvTEiucy8qnzQlXLDLOjMy1JLnyh7VJI4Si40BSs5l-UfSvUv3854l6V_fxwcx0asLFVclT0PrDnAuNt50uxxgsSAwzrsquqPOASuG_6DHiD-DIE-MrWYiNc2Z5fy7eQFRt600oTOPRfLLmVixlqN33QfHO6GZQsM20vinJxyOXWvjtsGmcaJooxIkyU56HLObx6fxokzEGKzvHXLeF7zbrZuHaww8fPmFTtq-QjaY7Pt2vxmJnbBIIvRh-g","access":"eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzU4NzMwNDI5LCJpYXQiOjE3NTg3MzAxMjksImp0aSI6IjEyMzE1OGFkOWExYjQ3YTE5ZDdhMjdlYjMwNTdmMzRkIiwidXNlcl9pZCI6ImY2MDI5ZThhLWQ4YTItNDE4OC04YTIxLWNkMDczYTAyZWU1YyIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjpbInNxdWFyZWxldCJdfQ.LduwXStVTHJGEoNw9eO-xQ0OVLF6b-kaeq-qXOCGJ06mc6JygXWOE7D9NnV7m-PzdtTKCGyQtOCe1RvU51ho5yFNJUwlLcYlmABbOQnvWOuuyqYvzoGXwULm99lV8KWW2Aetmldo_pUU3LY0VJS4Qe13ZaussP1Emsr68_uvy99FU8Xkm_jQqt4qoUDWuj-IeXkVj024Mv70VrCOBiTs_fmRHJixo12aNadUe9D0MhHoJxgBmRiRDqlq0Tu-jjP5I2_wY-c7iVSMUJTFInXHKRJadBaavvllrLZ4t47nc9G-ustuqnT_i8vrkN95LJmL87EK6NoVzS_O84c6DySZDA"}'
2222
headers:
2323
Allow:
2424
- POST, OPTIONS
@@ -31,16 +31,16 @@ interactions:
3131
Cross-Origin-Opener-Policy:
3232
- same-origin
3333
Date:
34-
- Wed, 24 Sep 2025 15:48:42 GMT
34+
- Wed, 24 Sep 2025 16:08:49 GMT
3535
Referrer-Policy:
3636
- same-origin
3737
Server:
3838
- nginx/1.25.2
3939
Server-Timing:
40-
- TimerPanel_utime;dur=244.6600000000103;desc="User CPU time", TimerPanel_stime;dur=47.13300000000231;desc="System
41-
CPU time", TimerPanel_total;dur=291.7930000000126;desc="Total CPU time", TimerPanel_total_time;dur=135.79398800357012;desc="Elapsed
42-
time", SQLPanel_sql_time;dur=4.552477003016975;desc="SQL 4 queries", CachePanel_total_time;dur=0;desc="Cache
43-
0 Calls"
40+
- TimerPanel_utime;dur=223.58000000002676;desc="User CPU time", TimerPanel_stime;dur=41.8860000000052;desc="System
41+
CPU time", TimerPanel_total;dur=265.46600000003195;desc="Total CPU time",
42+
TimerPanel_total_time;dur=129.28208100493066;desc="Elapsed time", SQLPanel_sql_time;dur=4.6200070064514875;desc="SQL
43+
4 queries", CachePanel_total_time;dur=0;desc="Cache 0 Calls"
4444
Set-Cookie:
4545
- op_browser_state=f5448717470b879d75a31d1e1e832e10c24a7586f91c49d672dea82f;
4646
Path=/
@@ -51,7 +51,7 @@ interactions:
5151
X-Frame-Options:
5252
- DENY
5353
djdt-store-id:
54-
- 7d41427a9d1846d6be4975c3f0b284b0
54+
- 2262d688c46f4da090ad828bac248f04
5555
status:
5656
code: 200
5757
message: OK

0 commit comments

Comments
 (0)