[DEV-11721]: create dataset with email options (#333)

jacobmanderson · web-flow · commit 71d17a958244 · 2024-09-19T12:08:17.000-07:00
* create dataset with email options

* comment on email options

* does this trigger new harness
diff --git a/indico/queries/datasets.py b/indico/queries/datasets.py
@@ -5,7 +5,6 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Union
 
-import deprecation
 import jsons
 import pandas as pd
 
@@ -22,6 +21,7 @@
 from indico.queries.storage import UploadBatched, UploadImages
 from indico.types.dataset import (
     Dataset,
+    EmailOptions,
     OcrEngine,
     OcrInputLanguage,
     OmnipageOcrOptionsInput,
@@ -228,6 +228,7 @@ def __init__(
         omnipage_ocr_options: OmnipageOcrOptionsInput = None,
         read_api_ocr_options: ReadApiOcrOptionsInput = None,
         request_interval: Union[int, float] = 5,
+        email_options: EmailOptions = None,
     ):
         self.files = files
         self.name = name
@@ -240,6 +241,7 @@ def __init__(
         self.omnipage_ocr_options = omnipage_ocr_options
         self.read_api_ocr_options = read_api_ocr_options
         self.request_interval = request_interval
+        self.email_options = email_options
         if omnipage_ocr_options is not None and read_api_ocr_options is not None:
             raise IndicoInputError(
                 "Must supply either omnipage or readapi options but not both."
@@ -279,6 +281,7 @@ def requests(self):
             readapi_ocr_options=self.read_api_ocr_options,
             omnipage_ocr_options=self.omnipage_ocr_options,
             ocr_engine=self.ocr_engine,
+            email_options=self.email_options,
         )
         yield _AddFiles(
             dataset_id=self.previous.id, metadata=file_metadata, autoprocess=True
@@ -376,6 +379,7 @@ def __init__(
         ocr_engine: OcrEngine = None,
         omnipage_ocr_options: OmnipageOcrOptionsInput = None,
         readapi_ocr_options: ReadApiOcrOptionsInput = None,
+        email_options: EmailOptions = None,
     ):
         if not dataset_type:
             dataset_type = "TEXT"
@@ -386,7 +390,8 @@ def __init__(
                     "ocrEngine": ocr_engine.name,
                     "omnipageOptions": omnipage_ocr_options,
                     "readapiOptions": readapi_ocr_options,
-                }
+                },
+                "emailOptions": email_options,
             }
         super().__init__(
             self.query,
diff --git a/indico/types/dataset.py b/indico/types/dataset.py
@@ -1,9 +1,9 @@
 from enum import Enum
-from typing import List
+from typing import List, Optional
 
+from indico.errors import IndicoInputError
 from indico.types.base import BaseType
 from indico.types.datafile import Datafile
-from indico.errors import IndicoInputError
 
 
 class DataColumn(BaseType):
@@ -57,28 +57,35 @@ class Dataset(BaseType):
 
     def labelset_by_name(self, name: str) -> LabelSet:
         if name not in [lab.name for lab in self.labelsets]:
-            raise IndicoInputError(f"No labelset found for {name}. Current labelset names include {[lab.name for lab in self.labelsets]}.")
+            raise IndicoInputError(
+                f"No labelset found for {name}. Current labelset names include {[lab.name for lab in self.labelsets]}."
+            )
         return next(lab for lab in self.labelsets if lab.name == name)
 
     def datacolumn_by_name(self, name: str) -> DataColumn:
         if name not in [datacol.name for datacol in self.datacolumns]:
-            raise IndicoInputError(f"No datacolumn found for {name}. Current datacolumn names include {[datacol.name for datacol in self.datacolumns]}.")
+            raise IndicoInputError(
+                f"No datacolumn found for {name}. Current datacolumn names include {[datacol.name for datacol in self.datacolumns]}."
+            )
         return next(datacol for datacol in self.datacolumns if datacol.name == name)
 
 
 class TableReadOrder(Enum):
     ROW = 0
     COLUMN = 1
 
+
 class OcrEngine(Enum):
     """
     Enum representing available OCR engines.
     """
+
     OMNIPAGE = 0
     READAPI = 1
     READAPI_V2 = 2
     READAPI_TABLES_V1 = 3
 
+
 class OmnipageOcrOptionsInput(BaseType):
     """
     Omnipage specific OCR options for dataset creation.
@@ -95,6 +102,7 @@ class OmnipageOcrOptionsInput(BaseType):
         table_read_order(TableReadOrder): Read table by row or column.
 
     """
+
     auto_rotate: bool
     single_column: bool
     upscale_images: bool
@@ -105,6 +113,7 @@ class OmnipageOcrOptionsInput(BaseType):
     native_pdf: bool
     table_read_order: TableReadOrder
 
+
 class ReadApiOcrOptionsInput(BaseType):
     """
     Read API OCR options.
@@ -115,20 +124,44 @@ class ReadApiOcrOptionsInput(BaseType):
         upscale_images(bool): Scale up low resolution images.
         languages(List[str]): List of languages to use.
     """
+
     auto_rotate: bool
     single_column: bool
     upscale_images: bool
     languages: List[str]
 
+
 class OcrInputLanguage(BaseType):
     name: str
     code: str
 
-class OcrOptionsInput():
+
+class IncludeSections(BaseType):
+    header: Optional[bool]
+    body: Optional[bool]
+    attachments: Optional[bool]
+
+
+class EmailOptions(BaseType):
+    """
+    Email parsing options for dataset creation.
+
+    Args:
+        include_sections(IncludeSections): Sections of the email to include after parsing (header, body, attachments).
+        unpack(bool): Unpack an email and treat it as a multi-file Submission.
+        preserve_body_whitespace(bool): Preserve whitespace in the body of the email.
+    """
+
+    include_sections: Optional[IncludeSections]
+    unpack: Optional[bool]
+    preserve_body_whitespace: Optional[bool]
+
+
+class OcrOptionsInput:
     """
     Input options for OCR engine.
     """
+
     ocr_engine: OcrEngine
     omnipage_options: OmnipageOcrOptionsInput
     readapi_options: ReadApiOcrOptionsInput
-    
diff --git a/tests/integration/queries/test_dataset.py b/tests/integration/queries/test_dataset.py
@@ -425,3 +425,26 @@ def test_bad_csv_create_dataset(indico):
     assert dataset.status == "CREATING"
     dataset = client.call(GetDatasetFileStatus(id=dataset.id))
     assert all([f.status == "FAILED" for f in dataset.files])
+
+
+@pytest.mark.ocr("readapi")
+def test_create_with_email_options_readapi(indico):
+    client = IndicoClient()
+    readapi_config: ReadApiOcrOptionsInput = {
+        "auto_rotate": True,
+        "single_column": False,
+        "upscale_images": True,
+        "languages": ["AUTO"],
+    }
+    email_config = {
+        "include_sections": {"header": True, "body": True, "attachments": True},
+        "unpack": True,
+    }
+    dataset = client.call(
+        CreateEmptyDataset(
+            name=f"dataset-{int(time.time())}",
+            ocr_engine=OcrEngine.READAPI,
+            readapi_ocr_options=readapi_config,
+            email_options=email_config,
+        )
+    )