MuckRock
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
Lines changed: 1 addition & 1 deletion
diff --git a/docs/conf.py b/docs/conf.py
Lines changed: 3 additions & 3 deletions
diff --git a/documentcloud/documents.py b/documentcloud/documents.py
Lines changed: 68 additions & 14 deletions
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11"]
     steps:
       - name: Check out code
         uses: actions/checkout@v2
 
@@ -48,16 +48,16 @@
 
 # General information about the project.
 project = "documentcloud"
-copyright = "2023, MuckRock Foundation"
+copyright = "2025, MuckRock Foundation"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
 # The short X.Y version.
-version = "4.3"
+version = "4.5"
 # The full version, including alpha/beta/rc tags.
-release = "4.3.0"
+release = "4.5.0"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
 
@@ -74,8 +74,11 @@ def __str__(self):
     def __getattr__(self, attr):
         """Generate methods for fetching resources"""
         p_image = re.compile(
-            r"^get_(?P<size>thumbnail|small|normal|large|xlarge)_image_url(?P<list>_list)?$"
+            r"^get_"
+            r"(?P<size>thumbnail|small|normal|large|xlarge)_image_url"
+            r"(?P<list>_list)?$"
         )
+
         get = attr.startswith("get_")
         url = attr.endswith("_url")
         text = attr.endswith("_text")
@@ -230,9 +233,15 @@ def get_errors(self):
 
         return all_results
 
-    def process(self):
-        """Reprocess the document"""
-        self._client.post(f"{self.api_path}/{self.id}/process/")
+    def process(self, **kwargs):
+        """Process the document, used on upload and for reprocessing"""
+        payload = {}
+        if "force_ocr" in kwargs:
+            payload["force_ocr"] = kwargs["force_ocr"]
+        if "ocr_engine" in kwargs:
+            payload["ocr_engine"] = kwargs["ocr_engine"]
+
+        self._client.post(f"{self.api_path}/{self.id}/process/", json=payload)
 
 
 class DocumentClient(BaseAPIClient):
@@ -310,6 +319,7 @@ def _format_upload_parameters(self, name, **kwargs):
             "title",
             "data",
             "force_ocr",
+            "ocr_engine",
             "projects",
             "delayed_index",
             "revision_control",
@@ -333,21 +343,55 @@ def _format_upload_parameters(self, name, **kwargs):
 
         return params
 
+    def _extract_ocr_options(self, kwargs):
+        """
+        Pop "force_ocr" and "ocr_engine" out of kwargs and validate them.
+
+        Returns the tuple (force_ocr, ocr_engine); the defaults are False
+        and "tess4". Raises ValueError if force_ocr is not a bool or if a
+        truthy ocr_engine is neither "tess4" nor "textract".
+        """
+        force_ocr = kwargs.pop("force_ocr", False)
+        ocr_engine = kwargs.pop("ocr_engine", "tess4")
+
+        if not isinstance(force_ocr, bool):
+            raise ValueError("force_ocr must be a boolean")
+
+        if ocr_engine and ocr_engine not in ("tess4", "textract"):
+            raise ValueError(
+                "ocr_engine must be either 'tess4' for tesseract or 'textract'"
+            )
+
+        return force_ocr, ocr_engine
+
     def _get_title(self, name):
         """Get the default title for a document from its path"""
         return name.split(os.sep)[-1].rsplit(".", 1)[0]
 
     def _upload_url(self, file_url, **kwargs):
         """Upload a document from a publicly accessible URL"""
+        # extract process-related args
+        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
+
+        # create the document
         params = self._format_upload_parameters(file_url, **kwargs)
         params["file_url"] = file_url
+        if force_ocr:
+            params["force_ocr"] = force_ocr
+            params["ocr_engine"] = ocr_engine
         response = self.client.post("documents/", json=params)
-        return Document(self.client, response.json())
+        create_json = response.json()
+
+        # wrap in Document object
+        doc = Document(self.client, create_json)
+
+        return doc
 
     def _upload_file(self, file_, **kwargs):
         """Upload a document directly"""
         # create the document
-        force_ocr = kwargs.pop("force_ocr", False)
+        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
+
         params = self._format_upload_parameters(file_.name, **kwargs)
         response = self.client.post("documents/", json=params)
 
@@ -357,12 +401,12 @@ def _upload_file(self, file_, **kwargs):
         response = requests_retry_session().put(presigned_url, data=file_.read())
 
         # begin processing the document
-        doc_id = create_json["id"]
-        response = self.client.post(
-            f"documents/{doc_id}/process/", json={"force_ocr": force_ocr}
-        )
+        doc = Document(self.client, create_json)
 
-        return Document(self.client, create_json)
+        # begin processing
+        doc.process(force_ocr=force_ocr, ocr_engine=ocr_engine)
+
+        return doc
 
     def _collect_files(self, path, extensions):
         """Find the paths to files with specified extensions under a directory"""
@@ -410,7 +454,9 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
         # Upload all the files using the bulk API to reduce the number
         # of API calls and improve performance
         obj_list = []
+        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
         params = self._format_upload_parameters("", **kwargs)
+
         for i, file_paths in enumerate(grouper(path_list, BULK_LIMIT)):
             # Grouper will put None's on the end of the last group
             file_paths = [p for p in file_paths if p is not None]
@@ -471,9 +517,13 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
 
             # Begin processing the documents
             logger.info("Processing the documents...")
-            doc_ids = [j["id"] for j in create_json]
+            process_payload = [
+                {"id": j["id"], "force_ocr": force_ocr, "ocr_engine": ocr_engine}
+                for j in create_json
+            ]
+
             try:
-                response = self.client.post("documents/process/", json={"ids": doc_ids})
+                response = self.client.post("documents/process/", json=process_payload)
             except (APIError, RequestException) as exc:
                 if handle_errors:
                     logger.info(
@@ -484,7 +534,6 @@ def upload_directory(self, path, handle_errors=False, extensions=".pdf", **kwarg
                     continue
                 else:
                     raise
-
         logger.info("Upload directory complete")
 
         # Pass back the list of documents
@@ -496,8 +545,13 @@ def upload_urls(self, url_list, handle_errors=False, **kwargs):
         # Do not set the same title for all documents
         kwargs.pop("title", None)
 
+        force_ocr, ocr_engine = self._extract_ocr_options(kwargs)
         obj_list = []
         params = self._format_upload_parameters("", **kwargs)
+        # Add OCR options directly to params if needed
+        if force_ocr:
+            params["force_ocr"] = force_ocr
+            params["ocr_engine"] = ocr_engine
         for i, url_group in enumerate(grouper(url_list, BULK_LIMIT)):
             # Grouper will put None's on the end of the last group
             url_group = [url for url in url_group if url is not None]