MuckRock
diff --git a/documentcloud/documents.py b/documentcloud/documents.py
Lines changed: 52 additions & 129 deletions
diff --git a/tests/cassettes/fixtures/client.yaml b/tests/cassettes/fixtures/client.yaml
Lines changed: 7 additions & 7 deletions
@@ -423,175 +423,98 @@ def _collect_files(self, path, extensions):
 
     def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwargs):
         """Upload files with specified extensions in a directory"""
-        # pylint: disable=too-many-locals, too-many-branches
-
-        # Do not set the same title for all documents
+        # pylint:disable=too-many-locals
         kwargs.pop("title", None)
 
-        # If extensions are specified as None, it will check for all supported
-        # filetypes.
         if extensions is None:
             extensions = SUPPORTED_EXTENSIONS
-
-        # Convert single extension to a list if provided
         if extensions and not isinstance(extensions, list):
             extensions = [extensions]
-
-        # Checks to see if the extensions are supported, raises an error if not.
         invalid_extensions = set(extensions) - set(SUPPORTED_EXTENSIONS)
         if invalid_extensions:
             raise ValueError(
                 f"Invalid extensions provided: {', '.join(invalid_extensions)}"
             )
 
-        # Loop through the path and get all the files with matching extensions
         path_list = self._collect_files(path, extensions)
-
         logger.info(
             "Upload directory on %s: Found %d files to upload", path, len(path_list)
         )
 
-        # Upload all the files using the bulk API to reduce the number
-        # of API calls and improve performance
         obj_list = []
         force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
         params = self._format_upload_parameters("", **kwargs)
 
         for i, file_paths in enumerate(grouper(path_list, BULK_LIMIT)):
-            # Grouper will put None's on the end of the last group
             file_paths = [p for p in file_paths if p is not None]
-
             logger.info("Uploading group %d:\n%s", i + 1, "\n".join(file_paths))
 
-            # Create the documents
-            logger.info("Creating the documents...")
-            body = [
-                merge_dicts(
-                    params,
-                    {
-                        "title": self._get_title(p),
-                        "original_extension": os.path.splitext(os.path.basename(p))[1]
-                        .lower()
-                        .lstrip("."),
-                    },
-                )
-                for p in sorted(file_paths)
-            ]
-            try:
-                response = self.client.post("documents/", json=body)
-            except (APIError, RequestException) as exc:
-                if handle_errors:
-                    logger.info(
-                        "Error creating the following documents: %s\n%s",
-                        exc,
-                        "\n".join(file_paths),
-                    )
-                    continue
-                else:
-                    raise
-
-            # Upload the files directly to storage
-            create_json = response.json()
-
+            create_json = self._create_documents(file_paths, params, handle_errors)
             sorted_create_json = sorted(create_json, key=lambda j: j["title"])
-            sorted_file_paths = sorted(file_paths, key=lambda p: self._get_title(p))
+            sorted_file_paths = sorted(file_paths, key=self._get_title)
             obj_list.extend(sorted_create_json)
             presigned_urls = [j["presigned_url"] for j in sorted_create_json]
-            
-            for url, file_path in zip(presigned_urls, sorted_file_paths):
-                logger.info("Uploading %s to S3...", file_path)
-                try:
-                    with open(file_path, "rb") as file:
-                        response = requests_retry_session().put(url, data=file.read())
-                    self.client.raise_for_status(response)
-                except (APIError, RequestException) as exc:
-                    if handle_errors:
-                        logger.info(
-                            "Error uploading the following document: %s %s",
-                            exc,
-                            file_path,
-                        )
-                        continue
-                    else:
-                        raise
-
-            # Begin processing the documents
-            logger.info("Processing the documents...")
-            process_payload = [
-                {"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
-                for j in create_json
-            ]
 
-            try:
-                response = self.client.post("documents/process/", json=process_payload)
-            except (APIError, RequestException) as exc:
-                if handle_errors:
-                    logger.info(
-                        "Error creating the following documents: %s\n%s",
-                        exc,
-                        "\n".join(file_paths),
-                    )
-                    continue
-                else:
-                    raise
-        logger.info("Upload directory complete")
+            self._upload_files_to_s3(sorted_file_paths, presigned_urls, handle_errors)
+            self._process_documents(create_json, force_ocr, ocr_engine, handle_errors)
 
-        # Pass back the list of documents
+        logger.info("Upload directory complete")
         return [Document(self.client, d) for d in obj_list]
 
-    def upload_urls(self, url_list, handle_errors=False, **kwargs):
-        """Upload documents from a list of URLs"""
-
-        # Do not set the same title for all documents
-        kwargs.pop("title", None)
-
-        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
-        obj_list = []
-        params = self._format_upload_parameters("", **kwargs)
-        # Add OCR options directly to params if needed
-        if force_ocr:
-            params["force_ocr"] = force_ocr
-            params["ocr_engine"] = ocr_engine
-        for i, url_group in enumerate(grouper(url_list, BULK_LIMIT)):
-            # Grouper will put None's on the end of the last group
-            url_group = [url for url in url_group if url is not None]
-
-            logger.info("Uploading group %d: %s", i + 1, "\n".join(url_group))
-
-            # Create the documents
-            logger.info("Creating the documents...")
-            try:
-                response = self.client.post(
-                    "documents/",
-                    json=[
-                        merge_dicts(
-                            params,
-                            {
-                                "title": self._get_title(url),
-                                "file_url": url,
-                            },
-                        )
-                        for url in url_group
-                    ],
+    def _create_documents(self, file_paths, params, handle_errors):
+        body = [
+            merge_dicts(
+                params,
+                {
+                    "title": self._get_title(p),
+                    "original_extension": os.path.splitext(os.path.basename(p))[1]
+                    .lower()
+                    .lstrip("."),
+                },
+            )
+            for p in sorted(file_paths)
+        ]
+        try:
+            response = self.client.post("documents/", json=body)
+        except (APIError, RequestException) as exc:
+            if handle_errors:
+                logger.info(
+                    "Error creating the following documents: %s\n%s",
+                    exc,
+                    "\n".join(file_paths),
                 )
+                return []
+            else:
+                raise
+        return response.json()
+
+    def _upload_files_to_s3(self, file_paths, presigned_urls, handle_errors):
+        for url, file_path in zip(presigned_urls, file_paths):
+            logger.info("Uploading %s to S3...", file_path)
+            try:
+                with open(file_path, "rb") as f:
+                    response = requests_retry_session().put(url, data=f.read())
+                self.client.raise_for_status(response)
             except (APIError, RequestException) as exc:
                 if handle_errors:
                     logger.info(
-                        "Error creating the following documents: %s\n%s",
-                        str(exc),
-                        "\n".join(url_group),
+                        "Error uploading the following document: %s %s", exc, file_path
                     )
-                    continue
                 else:
                     raise
 
-            create_json = response.json()
-            obj_list.extend(create_json)
-
-        logger.info("Upload URLs complete")
-
-        # Pass back the list of documents
-        return [Document(self.client, d) for d in obj_list]
+    def _process_documents(self, create_json, force_ocr, ocr_engine, handle_errors):
+        payload = [
+            {"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
+            for j in create_json
+        ]
+        try:
+            self.client.post("documents/process/", json=payload)
+        except (APIError, RequestException) as exc:
+            if handle_errors:
+                logger.info("Error processing documents: %s", exc)
+            else:
+                raise
 
 
 class Mention:
 
@@ -18,7 +18,7 @@ interactions:
     uri: https://dev.squarelet.com/api/token/
   response:
     body:
-      string: '{"refresh":"eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6MTc1ODgxNTMyMiwiaWF0IjoxNzU4NzI4OTIyLCJqdGkiOiIyZGIyYjk2MGRiMTg0MjIwOTY0YWY5YTI2ZjUyNjQ3YyIsInVzZXJfaWQiOiJmNjAyOWU4YS1kOGEyLTQxODgtOGEyMS1jZDA3M2EwMmVlNWMiLCJhdWQiOlsic3F1YXJlbGV0IiwibXVja3JvY2siLCJkb2N1bWVudGNsb3VkIl0sImlzcyI6WyJzcXVhcmVsZXQiXX0.OBpX4-A48BbNHKr75vpgy7Ssdj-Ag63eJZ8Zo-aiHBoq5bGrA1dTcHkrglh61KI6Ji6w3zksv4DmfL1kvEkocd_cMB4bYRunfIKU2VMPn51F9Pm3YcDV27ogydh_tZykt3I4YfdIq4Ct5c97I8SyI0AS_7i347pKlaGoJynTZPezTZWUKIM9EtriQ9iT47HEDWH09fmcyvqA3afsgR70vSxxB_OLcHAnH0fZ1DP6F9oQh-nwOmooNlS7rTGikkVjZqlOXWJl3FLHsi4R_GD05b37XCyAEjo9RVf5fxEVbaeegKwCWEpRetQABrVnOFOdaeeVfjz8iAwVXBBV5o9yjQ","access":"eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzU4NzI5MjIyLCJpYXQiOjE3NTg3Mjg5MjIsImp0aSI6IjVjZjJjNTY0MmM5MjQ0YWNiOGI1MTE4ZDIzYzQxZjAyIiwidXNlcl9pZCI6ImY2MDI5ZThhLWQ4YTItNDE4OC04YTIxLWNkMDczYTAyZWU1YyIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjpbInNxdWFyZWxldCJdfQ.ox04nvr_QyJh5z249PZuvQiy4YLLKKba_yF73J_osL6UcQcv-viGFqxXcMgObbcugvVPqSW4VEiCiEMPTgNhrrl7KzhFwtZlZv7QXRHcJP8N5hJbahaFwKHCJwUpHpM9qliAE0rVSQ1U3MfaEWGZTsj8JN2tM3gSp1fGguoY76N2AvD0KlykTsQZYHEbkO2gTrCm25WVFGxe-f7s58n_sSBgdvvM9vpNf3to6_AeZ2QwpUOG_rWplgvDE1ugILOdur2TSL5_RGgqI5KQltGdpURVYpeURiycZ-f1w-JPP7D8KDfawXV2N9lhBE7kP-Lb98qgh-vLAzazLLVBWDnB5Q"}'
+      string: '{"refresh":"eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6MTc1ODgxNjUyOSwiaWF0IjoxNzU4NzMwMTI5LCJqdGkiOiIzZWI4MzExNmQzNTA0Njc0YTViZGFiOGEyYjFkNDA0ZiIsInVzZXJfaWQiOiJmNjAyOWU4YS1kOGEyLTQxODgtOGEyMS1jZDA3M2EwMmVlNWMiLCJhdWQiOlsic3F1YXJlbGV0IiwibXVja3JvY2siLCJkb2N1bWVudGNsb3VkIl0sImlzcyI6WyJzcXVhcmVsZXQiXX0.jl0ql4G-9ZFn0yOWSyzlXLjBXaQF9ZzWoHf7vxfrK9e4MnQ5jZyCLrFR7-nkjbKy9q9WAjlO3u3ZV3bzYW0xobiOuZcvTEiucy8qnzQlXLDLOjMy1JLnyh7VJI4Si40BSs5l-UfSvUv3854l6V_fxwcx0asLFVclT0PrDnAuNt50uxxgsSAwzrsquqPOASuG_6DHiD-DIE-MrWYiNc2Z5fy7eQFRt600oTOPRfLLmVixlqN33QfHO6GZQsM20vinJxyOXWvjtsGmcaJooxIkyU56HLObx6fxokzEGKzvHXLeF7zbrZuHaww8fPmFTtq-QjaY7Pt2vxmJnbBIIvRh-g","access":"eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzU4NzMwNDI5LCJpYXQiOjE3NTg3MzAxMjksImp0aSI6IjEyMzE1OGFkOWExYjQ3YTE5ZDdhMjdlYjMwNTdmMzRkIiwidXNlcl9pZCI6ImY2MDI5ZThhLWQ4YTItNDE4OC04YTIxLWNkMDczYTAyZWU1YyIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjpbInNxdWFyZWxldCJdfQ.LduwXStVTHJGEoNw9eO-xQ0OVLF6b-kaeq-qXOCGJ06mc6JygXWOE7D9NnV7m-PzdtTKCGyQtOCe1RvU51ho5yFNJUwlLcYlmABbOQnvWOuuyqYvzoGXwULm99lV8KWW2Aetmldo_pUU3LY0VJS4Qe13ZaussP1Emsr68_uvy99FU8Xkm_jQqt4qoUDWuj-IeXkVj024Mv70VrCOBiTs_fmRHJixo12aNadUe9D0MhHoJxgBmRiRDqlq0Tu-jjP5I2_wY-c7iVSMUJTFInXHKRJadBaavvllrLZ4t47nc9G-ustuqnT_i8vrkN95LJmL87EK6NoVzS_O84c6DySZDA"}'
     headers:
       Allow:
       - POST, OPTIONS
@@ -31,16 +31,16 @@ interactions:
       Cross-Origin-Opener-Policy:
       - same-origin
       Date:
-      - Wed, 24 Sep 2025 15:48:42 GMT
+      - Wed, 24 Sep 2025 16:08:49 GMT
       Referrer-Policy:
       - same-origin
       Server:
       - nginx/1.25.2
       Server-Timing:
-      - TimerPanel_utime;dur=244.6600000000103;desc="User CPU time", TimerPanel_stime;dur=47.13300000000231;desc="System
-        CPU time", TimerPanel_total;dur=291.7930000000126;desc="Total CPU time", TimerPanel_total_time;dur=135.79398800357012;desc="Elapsed
-        time", SQLPanel_sql_time;dur=4.552477003016975;desc="SQL 4 queries", CachePanel_total_time;dur=0;desc="Cache
-        0 Calls"
+      - TimerPanel_utime;dur=223.58000000002676;desc="User CPU time", TimerPanel_stime;dur=41.8860000000052;desc="System
+        CPU time", TimerPanel_total;dur=265.46600000003195;desc="Total CPU time",
+        TimerPanel_total_time;dur=129.28208100493066;desc="Elapsed time", SQLPanel_sql_time;dur=4.6200070064514875;desc="SQL
+        4 queries", CachePanel_total_time;dur=0;desc="Cache 0 Calls"
       Set-Cookie:
       - op_browser_state=f5448717470b879d75a31d1e1e832e10c24a7586f91c49d672dea82f;
         Path=/
@@ -51,7 +51,7 @@ interactions:
       X-Frame-Options:
       - DENY
       djdt-store-id:
-      - 7d41427a9d1846d6be4975c3f0b284b0
+      - 2262d688c46f4da090ad828bac248f04
     status:
       code: 200
       message: OK