 from paperqa.utils import ImpossibleParsingError
 from paperqa.version import __version__ as pqa_version
 
+BLOCK_TEXT_INDEX = 4
+
 
 def parse_pdf_to_pages(
-    path: str | os.PathLike, page_size_limit: int | None = None
+    path: str | os.PathLike,
+    page_size_limit: int | None = None,
+    use_block_parsing: bool = False,
 ) -> ParsedText:
 
     with pymupdf.open(path) as file:
@@ -39,7 +43,25 @@ def parse_pdf_to_pages(
                     f" {file.page_count} for the PDF at path {path}, likely this PDF"
                     " file is corrupt."
                 ) from exc
-            text = page.get_text("text", sort=True)
+
+            if use_block_parsing:
+                # NOTE: this block-based parsing appears to be better, but until
+                # fully validated on 1+ benchmarks, it's considered experimental
+
+                # Extract text blocks from the page
+                # Note: sort=False is important to preserve the order of text blocks
+                # as they appear in the PDF
+                blocks = page.get_text("blocks", sort=False)
+
+                # Concatenate text blocks into a single string
+                text = "\n".join(
+                    block[BLOCK_TEXT_INDEX]
+                    for block in blocks
+                    if len(block) > BLOCK_TEXT_INDEX
+                )
+            else:
+                text = page.get_text("text", sort=True)
+
             if page_size_limit and len(text) > page_size_limit:
                 raise ImpossibleParsingError(
                     f"The text in page {i} of {file.page_count} was {len(text)} chars"
@@ -267,7 +289,7 @@ async def read_doc(
     include_metadata: Literal[True],
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> ParsedText: ...
 @overload
 async def read_doc(
@@ -277,7 +299,7 @@ async def read_doc(
     include_metadata: Literal[False] = ...,
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> ParsedText: ...
 @overload
 async def read_doc(
@@ -287,7 +309,7 @@ async def read_doc(
     include_metadata: Literal[True],
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> tuple[list[Text], ParsedMetadata]: ...
 @overload
 async def read_doc(
@@ -297,7 +319,7 @@ async def read_doc(
     include_metadata: Literal[False] = ...,
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> list[Text]: ...
 @overload
 async def read_doc(
@@ -307,7 +329,7 @@ async def read_doc(
     include_metadata: Literal[True],
     chunk_chars: int = ...,
     overlap: int = ...,
-    page_size_limit: int | None = ...,
+    **parser_kwargs,
 ) -> tuple[list[Text], ParsedMetadata]: ...
 async def read_doc(
     path: str | os.PathLike,
@@ -316,7 +338,7 @@ async def read_doc(
     include_metadata: bool = False,
     chunk_chars: int = 3000,
     overlap: int = 100,
-    page_size_limit: int | None = None,
+    **parser_kwargs,
 ) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
     """Parse a document and split into chunks.
 
@@ -328,32 +350,27 @@ async def read_doc(
         include_metadata: return a tuple
         chunk_chars: size of chunks
         overlap: size of overlap between chunks
-        page_size_limit: optional limit on the number of characters per page
+        parser_kwargs: Keyword arguments to pass to the used parsing function.
     """
     str_path = str(path)
 
     # start with parsing -- users may want to store this separately
     if str_path.endswith(".pdf"):
         # TODO: Make parse_pdf_to_pages async
-        parsed_text = await asyncio.to_thread(
-            parse_pdf_to_pages, path, page_size_limit=page_size_limit
-        )
+        parsed_text = await asyncio.to_thread(parse_pdf_to_pages, path, **parser_kwargs)
     elif str_path.endswith(".txt"):
         # TODO: Make parse_text async
-        parsed_text = await asyncio.to_thread(
-            parse_text, path, page_size_limit=page_size_limit
-        )
+        parser_kwargs.pop("use_block_parsing", None)  # Not a parse_text kwarg
+        parsed_text = await asyncio.to_thread(parse_text, path, **parser_kwargs)
     elif str_path.endswith(".html"):
+        parser_kwargs.pop("use_block_parsing", None)  # Not a parse_text kwarg
         parsed_text = await asyncio.to_thread(
-            parse_text, path, html=True, page_size_limit=page_size_limit
+            parse_text, path, html=True, **parser_kwargs
         )
     else:
+        parser_kwargs.pop("use_block_parsing", None)  # Not a parse_text kwarg
         parsed_text = await asyncio.to_thread(
-            parse_text,
-            path,
-            split_lines=True,
-            use_tiktoken=False,
-            page_size_limit=page_size_limit,
+            parse_text, path, split_lines=True, use_tiktoken=False, **parser_kwargs
         )
 
     if parsed_text_only:
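After this change, read_doc no longer names page_size_limit explicitly; parser options are swept into **parser_kwargs and forwarded to parse_pdf_to_pages (for PDFs) or parse_text (for other formats), with use_block_parsing popped off before the parse_text calls since only the PDF parser accepts it. A hedged call sketch, assuming parse_pdf_to_pages is importable from paperqa.readers and using an illustrative file path:

from paperqa.readers import parse_pdf_to_pages  # import path assumed

# Opt into the experimental block-based extraction; omitting use_block_parsing
# (or passing False) keeps the original sorted "text" extraction
parsed = parse_pdf_to_pages(
    "paper.pdf",  # illustrative path
    page_size_limit=1_250_000,  # illustrative per-page character limit
    use_block_parsing=True,
)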