Skip to content

Commit efa41d4

Browse files
markenki, markenc, Copilot, pre-commit-ci-lite[bot], and jamesbraza
authored
Fix pdf parsing bug (#938)
Co-authored-by: Mark Encarnación <[email protected]>
Co-authored-by: Copilot <[email protected]>
Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Co-authored-by: James Braza <[email protected]>
1 parent 462f775 commit efa41d4

File tree

7 files changed

+70
-24
lines changed

7 files changed

+70
-24
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -802,6 +802,7 @@ will return much faster than the first query and we'll be certain the authors ma
802802
| `answer.get_evidence_if_no_contexts` | `True` | Allow lazy evidence gathering. |
803803
| `parsing.chunk_size` | `5000` | Characters per chunk (0 for no chunking). |
804804
| `parsing.page_size_limit` | `1,280,000` | Character limit per page. |
805+
| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. |
805806
| `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
806807
| `parsing.overlap` | `250` | Characters to overlap chunks. |
807808
| `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |

paperqa/docs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ async def aadd( # noqa: PLR0912
298298
chunk_chars=parse_config.chunk_size,
299299
overlap=parse_config.overlap,
300300
page_size_limit=parse_config.page_size_limit,
301+
use_block_parsing=parse_config.pdfs_use_block_parsing,
301302
)
302303
if not texts:
303304
raise ValueError(f"Could not read document {path}. Is it empty?")
@@ -390,6 +391,7 @@ async def aadd( # noqa: PLR0912
390391
chunk_chars=parse_config.chunk_size,
391392
overlap=parse_config.overlap,
392393
page_size_limit=parse_config.page_size_limit,
394+
use_block_parsing=parse_config.pdfs_use_block_parsing,
393395
)
394396
# loose check to see if document was loaded
395397
if (

paperqa/readers.py

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,13 @@
2121
from paperqa.utils import ImpossibleParsingError
2222
from paperqa.version import __version__ as pqa_version
2323

24+
BLOCK_TEXT_INDEX = 4
25+
2426

2527
def parse_pdf_to_pages(
26-
path: str | os.PathLike, page_size_limit: int | None = None
28+
path: str | os.PathLike,
29+
page_size_limit: int | None = None,
30+
use_block_parsing: bool = False,
2731
) -> ParsedText:
2832

2933
with pymupdf.open(path) as file:
@@ -39,7 +43,25 @@ def parse_pdf_to_pages(
3943
f" {file.page_count} for the PDF at path {path}, likely this PDF"
4044
" file is corrupt."
4145
) from exc
42-
text = page.get_text("text", sort=True)
46+
47+
if use_block_parsing:
48+
# NOTE: this block-based parsing appears to be better, but until
49+
# fully validated on 1+ benchmarks, it's considered experimental
50+
51+
# Extract text blocks from the page
52+
# Note: sort=False is important to preserve the order of text blocks
53+
# as they appear in the PDF
54+
blocks = page.get_text("blocks", sort=False)
55+
56+
# Concatenate text blocks into a single string
57+
text = "\n".join(
58+
block[BLOCK_TEXT_INDEX]
59+
for block in blocks
60+
if len(block) > BLOCK_TEXT_INDEX
61+
)
62+
else:
63+
text = page.get_text("text", sort=True)
64+
4365
if page_size_limit and len(text) > page_size_limit:
4466
raise ImpossibleParsingError(
4567
f"The text in page {i} of {file.page_count} was {len(text)} chars"
@@ -267,7 +289,7 @@ async def read_doc(
267289
include_metadata: Literal[True],
268290
chunk_chars: int = ...,
269291
overlap: int = ...,
270-
page_size_limit: int | None = ...,
292+
**parser_kwargs,
271293
) -> ParsedText: ...
272294
@overload
273295
async def read_doc(
@@ -277,7 +299,7 @@ async def read_doc(
277299
include_metadata: Literal[False] = ...,
278300
chunk_chars: int = ...,
279301
overlap: int = ...,
280-
page_size_limit: int | None = ...,
302+
**parser_kwargs,
281303
) -> ParsedText: ...
282304
@overload
283305
async def read_doc(
@@ -287,7 +309,7 @@ async def read_doc(
287309
include_metadata: Literal[True],
288310
chunk_chars: int = ...,
289311
overlap: int = ...,
290-
page_size_limit: int | None = ...,
312+
**parser_kwargs,
291313
) -> tuple[list[Text], ParsedMetadata]: ...
292314
@overload
293315
async def read_doc(
@@ -297,7 +319,7 @@ async def read_doc(
297319
include_metadata: Literal[False] = ...,
298320
chunk_chars: int = ...,
299321
overlap: int = ...,
300-
page_size_limit: int | None = ...,
322+
**parser_kwargs,
301323
) -> list[Text]: ...
302324
@overload
303325
async def read_doc(
@@ -307,7 +329,7 @@ async def read_doc(
307329
include_metadata: Literal[True],
308330
chunk_chars: int = ...,
309331
overlap: int = ...,
310-
page_size_limit: int | None = ...,
332+
**parser_kwargs,
311333
) -> tuple[list[Text], ParsedMetadata]: ...
312334
async def read_doc(
313335
path: str | os.PathLike,
@@ -316,7 +338,7 @@ async def read_doc(
316338
include_metadata: bool = False,
317339
chunk_chars: int = 3000,
318340
overlap: int = 100,
319-
page_size_limit: int | None = None,
341+
**parser_kwargs,
320342
) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
321343
"""Parse a document and split into chunks.
322344
@@ -328,32 +350,27 @@ async def read_doc(
328350
include_metadata: return a tuple
329351
chunk_chars: size of chunks
330352
overlap: size of overlap between chunks
331-
page_size_limit: optional limit on the number of characters per page
353+
parser_kwargs: Keyword arguments to pass to the used parsing function.
332354
"""
333355
str_path = str(path)
334356

335357
# start with parsing -- users may want to store this separately
336358
if str_path.endswith(".pdf"):
337359
# TODO: Make parse_pdf_to_pages async
338-
parsed_text = await asyncio.to_thread(
339-
parse_pdf_to_pages, path, page_size_limit=page_size_limit
340-
)
360+
parsed_text = await asyncio.to_thread(parse_pdf_to_pages, path, **parser_kwargs)
341361
elif str_path.endswith(".txt"):
342362
# TODO: Make parse_text async
343-
parsed_text = await asyncio.to_thread(
344-
parse_text, path, page_size_limit=page_size_limit
345-
)
363+
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
364+
parsed_text = await asyncio.to_thread(parse_text, path, **parser_kwargs)
346365
elif str_path.endswith(".html"):
366+
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
347367
parsed_text = await asyncio.to_thread(
348-
parse_text, path, html=True, page_size_limit=page_size_limit
368+
parse_text, path, html=True, **parser_kwargs
349369
)
350370
else:
371+
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
351372
parsed_text = await asyncio.to_thread(
352-
parse_text,
353-
path,
354-
split_lines=True,
355-
use_tiktoken=False,
356-
page_size_limit=page_size_limit,
373+
parse_text, path, split_lines=True, use_tiktoken=False, **parser_kwargs
357374
)
358375

359376
if parsed_text_only:

paperqa/settings.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,13 @@ class ParsingSettings(BaseModel):
165165
" (ignoring chars vs tokens difference)."
166166
),
167167
)
168+
pdfs_use_block_parsing: bool = Field(
169+
default=False,
170+
description=(
171+
"Opt-in flag to use block-based parsing for PDFs instead of"
172+
" text-based parsing, which is known to be better for some PDFs."
173+
),
174+
)
168175
use_doc_details: bool = Field(
169176
default=True, description="Whether to try to get metadata details for a Doc."
170177
)

tests/stub_data/pasa.pdf

666 KB
Binary file not shown.

tests/test_agents.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,11 +89,11 @@ async def test_get_directory_index(
8989
"year",
9090
], "Incorrect fields in index"
9191
assert not index.changed, "Expected index to not have changes at this point"
92-
# bates.txt + empty.txt + flag_day.html + gravity_hill.md + obama.txt + paper.pdf,
92+
# bates.txt + empty.txt + flag_day.html + gravity_hill.md + obama.txt + paper.pdf + pasa.pdf,
9393
# but empty.txt fails to be added
9494
path_to_id = await index.index_files
9595
assert (
96-
sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 5
96+
sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 6
9797
), "Incorrect number of parsed index files"
9898

9999
with subtests.test(msg="check-txt-query"):
@@ -248,6 +248,7 @@ async def test_getting_manifest(
248248
"gravity_hill.md",
249249
"obama.txt",
250250
"paper.pdf",
251+
"pasa.pdf",
251252
}
252253

253254

@@ -565,6 +566,13 @@ async def gen_answer(self, state) -> str: # noqa: ARG001, RUF029
565566
async def test_agent_sharing_state(
566567
agent_test_settings: Settings, subtests: SubTests, callback_type: str | None
567568
) -> None:
569+
def files_filter(f) -> bool:
570+
# Filter out pasa.pdf just to speed the test and save API costs
571+
return f.name != "pasa.pdf" and IndexSettings.model_fields[
572+
"files_filter"
573+
].default(f)
574+
575+
agent_test_settings.agent.index.files_filter = files_filter
568576
agent_test_settings.agent.search_count = 3 # Keep low for speed
569577
agent_test_settings.answer.evidence_k = 2
570578
agent_test_settings.answer.answer_max_sources = 1

tests/test_paperqa.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
from paperqa.core import llm_parse_json
4848
from paperqa.prompts import CANNOT_ANSWER_PHRASE
4949
from paperqa.prompts import qa_prompt as default_qa_prompt
50-
from paperqa.readers import read_doc
50+
from paperqa.readers import parse_pdf_to_pages, read_doc
5151
from paperqa.types import ChunkMetadata
5252
from paperqa.utils import (
5353
extract_score,
@@ -989,6 +989,17 @@ async def test_pdf_reader_w_no_chunks(stub_data_dir: Path) -> None:
989989
assert docs.texts[0].embedding is None, "Should have deferred the embedding"
990990

991991

992+
def test_parse_pdf_to_pages(stub_data_dir: Path) -> None:
993+
filepath = stub_data_dir / "pasa.pdf"
994+
parsed_text = parse_pdf_to_pages(filepath, use_block_parsing=True)
995+
assert isinstance(parsed_text.content, dict)
996+
assert "1" in parsed_text.content, "Parsed text should contain page 1"
997+
assert (
998+
"Abstract\n\nWe introduce PaSa, an advanced Paper Search"
999+
"\nagent powered by large language models."
1000+
) in parsed_text.content["1"]
1001+
1002+
9921003
@pytest.mark.vcr
9931004
@pytest.mark.parametrize("defer_embeddings", [True, False])
9941005
@pytest.mark.asyncio

0 commit comments

Comments
 (0)