Skip to content

Commit 71d17a9

Browse files
[DEV-11721]: create dataset with email options (#333)
* create dataset with email options
* comment on email options
* does this trigger new harness
1 parent 4235c4f commit 71d17a9

File tree

3 files changed

+69
-8
lines changed

3 files changed

+69
-8
lines changed

indico/queries/datasets.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
from pathlib import Path
66
from typing import Dict, List, Optional, Union
77

8-
import deprecation
98
import jsons
109
import pandas as pd
1110

@@ -22,6 +21,7 @@
2221
from indico.queries.storage import UploadBatched, UploadImages
2322
from indico.types.dataset import (
2423
Dataset,
24+
EmailOptions,
2525
OcrEngine,
2626
OcrInputLanguage,
2727
OmnipageOcrOptionsInput,
@@ -228,6 +228,7 @@ def __init__(
228228
omnipage_ocr_options: OmnipageOcrOptionsInput = None,
229229
read_api_ocr_options: ReadApiOcrOptionsInput = None,
230230
request_interval: Union[int, float] = 5,
231+
email_options: EmailOptions = None,
231232
):
232233
self.files = files
233234
self.name = name
@@ -240,6 +241,7 @@ def __init__(
240241
self.omnipage_ocr_options = omnipage_ocr_options
241242
self.read_api_ocr_options = read_api_ocr_options
242243
self.request_interval = request_interval
244+
self.email_options = email_options
243245
if omnipage_ocr_options is not None and read_api_ocr_options is not None:
244246
raise IndicoInputError(
245247
"Must supply either omnipage or readapi options but not both."
@@ -279,6 +281,7 @@ def requests(self):
279281
readapi_ocr_options=self.read_api_ocr_options,
280282
omnipage_ocr_options=self.omnipage_ocr_options,
281283
ocr_engine=self.ocr_engine,
284+
email_options=self.email_options,
282285
)
283286
yield _AddFiles(
284287
dataset_id=self.previous.id, metadata=file_metadata, autoprocess=True
@@ -376,6 +379,7 @@ def __init__(
376379
ocr_engine: OcrEngine = None,
377380
omnipage_ocr_options: OmnipageOcrOptionsInput = None,
378381
readapi_ocr_options: ReadApiOcrOptionsInput = None,
382+
email_options: EmailOptions = None,
379383
):
380384
if not dataset_type:
381385
dataset_type = "TEXT"
@@ -386,7 +390,8 @@ def __init__(
386390
"ocrEngine": ocr_engine.name,
387391
"omnipageOptions": omnipage_ocr_options,
388392
"readapiOptions": readapi_ocr_options,
389-
}
393+
},
394+
"emailOptions": email_options,
390395
}
391396
super().__init__(
392397
self.query,

indico/types/dataset.py

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from enum import Enum
2-
from typing import List
2+
from typing import List, Optional
33

4+
from indico.errors import IndicoInputError
45
from indico.types.base import BaseType
56
from indico.types.datafile import Datafile
6-
from indico.errors import IndicoInputError
77

88

99
class DataColumn(BaseType):
@@ -57,28 +57,35 @@ class Dataset(BaseType):
5757

5858
def labelset_by_name(self, name: str) -> LabelSet:
5959
if name not in [lab.name for lab in self.labelsets]:
60-
raise IndicoInputError(f"No labelset found for {name}. Current labelset names include {[lab.name for lab in self.labelsets]}.")
60+
raise IndicoInputError(
61+
f"No labelset found for {name}. Current labelset names include {[lab.name for lab in self.labelsets]}."
62+
)
6163
return next(lab for lab in self.labelsets if lab.name == name)
6264

6365
def datacolumn_by_name(self, name: str) -> DataColumn:
6466
if name not in [datacol.name for datacol in self.datacolumns]:
65-
raise IndicoInputError(f"No datacolumn found for {name}. Current datacolumn names include {[datacol.name for datacol in self.datacolumns]}.")
67+
raise IndicoInputError(
68+
f"No datacolumn found for {name}. Current datacolumn names include {[datacol.name for datacol in self.datacolumns]}."
69+
)
6670
return next(datacol for datacol in self.datacolumns if datacol.name == name)
6771

6872

6973
class TableReadOrder(Enum):
7074
ROW = 0
7175
COLUMN = 1
7276

77+
7378
class OcrEngine(Enum):
7479
"""
7580
Enum representing available OCR engines.
7681
"""
82+
7783
OMNIPAGE = 0
7884
READAPI = 1
7985
READAPI_V2 = 2
8086
READAPI_TABLES_V1 = 3
8187

88+
8289
class OmnipageOcrOptionsInput(BaseType):
8390
"""
8491
Omnipage specific OCR options for dataset creation.
@@ -95,6 +102,7 @@ class OmnipageOcrOptionsInput(BaseType):
95102
table_read_order(TableReadOrder): Read table by row or column.
96103
97104
"""
105+
98106
auto_rotate: bool
99107
single_column: bool
100108
upscale_images: bool
@@ -105,6 +113,7 @@ class OmnipageOcrOptionsInput(BaseType):
105113
native_pdf: bool
106114
table_read_order: TableReadOrder
107115

116+
108117
class ReadApiOcrOptionsInput(BaseType):
109118
"""
110119
Read API OCR options.
@@ -115,20 +124,44 @@ class ReadApiOcrOptionsInput(BaseType):
115124
upscale_images(bool): Scale up low resolution images.
116125
languages(List[str]): List of languages to use.
117126
"""
127+
118128
auto_rotate: bool
119129
single_column: bool
120130
upscale_images: bool
121131
languages: List[str]
122132

133+
123134
class OcrInputLanguage(BaseType):
124135
name: str
125136
code: str
126137

127-
class OcrOptionsInput():
138+
139+
class IncludeSections(BaseType):
140+
header: Optional[bool]
141+
body: Optional[bool]
142+
attachments: Optional[bool]
143+
144+
145+
class EmailOptions(BaseType):
146+
"""
147+
Email options
148+
149+
Args:
150+
include_sections: Sections of the email to include after parsing (header, body, attachments)
151+
unpack: Unpack an email and treat it as a multi-file Submission
152+
preserve_body_whitespace: Preserve whitespace in the body of the email
153+
"""
154+
155+
include_sections: Optional[IncludeSections]
156+
unpack: Optional[bool]
157+
preserve_body_whitespace: Optional[bool]
158+
159+
160+
class OcrOptionsInput:
128161
"""
129162
Input options for OCR engine.
130163
"""
164+
131165
ocr_engine: OcrEngine
132166
omnipage_options: OmnipageOcrOptionsInput
133167
readapi_options: ReadApiOcrOptionsInput
134-

tests/integration/queries/test_dataset.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,3 +425,26 @@ def test_bad_csv_create_dataset(indico):
425425
assert dataset.status == "CREATING"
426426
dataset = client.call(GetDatasetFileStatus(id=dataset.id))
427427
assert all([f.status == "FAILED" for f in dataset.files])
428+
429+
430+
@pytest.mark.ocr("readapi")
431+
def test_create_with_email_options_readapi(indico):
432+
client = IndicoClient()
433+
readapi_config: ReadApiOcrOptionsInput = {
434+
"auto_rotate": True,
435+
"single_column": False,
436+
"upscale_images": True,
437+
"languages": ["AUTO"],
438+
}
439+
email_config = {
440+
"include_sections": {"header": True, "body": True, "attachments": True},
441+
"unpack": True,
442+
}
443+
dataset = client.call(
444+
CreateEmptyDataset(
445+
name=f"dataset-{int(time.time())}",
446+
ocr_engine=OcrEngine.READAPI,
447+
readapi_ocr_options=readapi_config,
448+
email_options=email_config,
449+
)
450+
)

0 commit comments

Comments
 (0)