Skip to content

Commit efa41d4

Browse files
markenki, markenc, Copilot, pre-commit-ci-lite[bot], and jamesbraza
authored
Fix pdf parsing bug (#938)
Co-authored-by: Mark Encarnación <[email protected]>
Co-authored-by: Copilot <[email protected]>
Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Co-authored-by: James Braza <[email protected]>
1 parent 462f775 commit efa41d4

File tree

7 files changed

+70
-24
lines changed

7 files changed

+70
-24
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -802,6 +802,7 @@ will return much faster than the first query and we'll be certain the authors ma
802802
| `answer.get_evidence_if_no_contexts` | `True` | Allow lazy evidence gathering. |
803803
| `parsing.chunk_size` | `5000` | Characters per chunk (0 for no chunking). |
804804
| `parsing.page_size_limit` | `1,280,000` | Character limit per page. |
805+
| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. |
805806
| `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
806807
| `parsing.overlap` | `250` | Characters to overlap chunks. |
807808
| `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |

paperqa/docs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ async def aadd( # noqa: PLR0912
298298
chunk_chars=parse_config.chunk_size,
299299
overlap=parse_config.overlap,
300300
page_size_limit=parse_config.page_size_limit,
301+
use_block_parsing=parse_config.pdfs_use_block_parsing,
301302
)
302303
if not texts:
303304
raise ValueError(f"Could not read document {path}. Is it empty?")
@@ -390,6 +391,7 @@ async def aadd( # noqa: PLR0912
390391
chunk_chars=parse_config.chunk_size,
391392
overlap=parse_config.overlap,
392393
page_size_limit=parse_config.page_size_limit,
394+
use_block_parsing=parse_config.pdfs_use_block_parsing,
393395
)
394396
# loose check to see if document was loaded
395397
if (

paperqa/readers.py

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,13 @@
2121
from paperqa.utils import ImpossibleParsingError
2222
from paperqa.version import __version__ as pqa_version
2323

24+
BLOCK_TEXT_INDEX = 4
25+
2426

2527
def parse_pdf_to_pages(
26-
path: str | os.PathLike, page_size_limit: int | None = None
28+
path: str | os.PathLike,
29+
page_size_limit: int | None = None,
30+
use_block_parsing: bool = False,
2731
) -> ParsedText:
2832

2933
with pymupdf.open(path) as file:
@@ -39,7 +43,25 @@ def parse_pdf_to_pages(
3943
f" {file.page_count} for the PDF at path {path}, likely this PDF"
4044
" file is corrupt."
4145
) from exc
42-
text = page.get_text("text", sort=True)
46+
47+
if use_block_parsing:
48+
# NOTE: this block-based parsing appears to be better, but until
49+
# fully validated on 1+ benchmarks, it's considered experimental
50+
51+
# Extract text blocks from the page
52+
# Note: sort=False is important to preserve the order of text blocks
53+
# as they appear in the PDF
54+
blocks = page.get_text("blocks", sort=False)
55+
56+
# Concatenate text blocks into a single string
57+
text = "\n".join(
58+
block[BLOCK_TEXT_INDEX]
59+
for block in blocks
60+
if len(block) > BLOCK_TEXT_INDEX
61+
)
62+
else:
63+
text = page.get_text("text", sort=True)
64+
4365
if page_size_limit and len(text) > page_size_limit:
4466
raise ImpossibleParsingError(
4567
f"The text in page {i} of {file.page_count} was {len(text)} chars"
@@ -267,7 +289,7 @@ async def read_doc(
267289
include_metadata: Literal[True],
268290
chunk_chars: int = ...,
269291
overlap: int = ...,
270-
page_size_limit: int | None = ...,
292+
**parser_kwargs,
271293
) -> ParsedText: ...
272294
@overload
273295
async def read_doc(
@@ -277,7 +299,7 @@ async def read_doc(
277299
include_metadata: Literal[False] = ...,
278300
chunk_chars: int = ...,
279301
overlap: int = ...,
280-
page_size_limit: int | None = ...,
302+
**parser_kwargs,
281303
) -> ParsedText: ...
282304
@overload
283305
async def read_doc(
@@ -287,7 +309,7 @@ async def read_doc(
287309
include_metadata: Literal[True],
288310
chunk_chars: int = ...,
289311
overlap: int = ...,
290-
page_size_limit: int | None = ...,
312+
**parser_kwargs,
291313
) -> tuple[list[Text], ParsedMetadata]: ...
292314
@overload
293315
async def read_doc(
@@ -297,7 +319,7 @@ async def read_doc(
297319
include_metadata: Literal[False] = ...,
298320
chunk_chars: int = ...,
299321
overlap: int = ...,
300-
page_size_limit: int | None = ...,
322+
**parser_kwargs,
301323
) -> list[Text]: ...
302324
@overload
303325
async def read_doc(
@@ -307,7 +329,7 @@ async def read_doc(
307329
include_metadata: Literal[True],
308330
chunk_chars: int = ...,
309331
overlap: int = ...,
310-
page_size_limit: int | None = ...,
332+
**parser_kwargs,
311333
) -> tuple[list[Text], ParsedMetadata]: ...
312334
async def read_doc(
313335
path: str | os.PathLike,
@@ -316,7 +338,7 @@ async def read_doc(
316338
include_metadata: bool = False,
317339
chunk_chars: int = 3000,
318340
overlap: int = 100,
319-
page_size_limit: int | None = None,
341+
**parser_kwargs,
320342
) -> list[Text] | ParsedText | tuple[list[Text], ParsedMetadata]:
321343
"""Parse a document and split into chunks.
322344
@@ -328,32 +350,27 @@ async def read_doc(
328350
include_metadata: return a tuple
329351
chunk_chars: size of chunks
330352
overlap: size of overlap between chunks
331-
page_size_limit: optional limit on the number of characters per page
353+
parser_kwargs: Keyword arguments to pass to the used parsing function.
332354
"""
333355
str_path = str(path)
334356

335357
# start with parsing -- users may want to store this separately
336358
if str_path.endswith(".pdf"):
337359
# TODO: Make parse_pdf_to_pages async
338-
parsed_text = await asyncio.to_thread(
339-
parse_pdf_to_pages, path, page_size_limit=page_size_limit
340-
)
360+
parsed_text = await asyncio.to_thread(parse_pdf_to_pages, path, **parser_kwargs)
341361
elif str_path.endswith(".txt"):
342362
# TODO: Make parse_text async
343-
parsed_text = await asyncio.to_thread(
344-
parse_text, path, page_size_limit=page_size_limit
345-
)
363+
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
364+
parsed_text = await asyncio.to_thread(parse_text, path, **parser_kwargs)
346365
elif str_path.endswith(".html"):
366+
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
347367
parsed_text = await asyncio.to_thread(
348-
parse_text, path, html=True, page_size_limit=page_size_limit
368+
parse_text, path, html=True, **parser_kwargs
349369
)
350370
else:
371+
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
351372
parsed_text = await asyncio.to_thread(
352-
parse_text,
353-
path,
354-
split_lines=True,
355-
use_tiktoken=False,
356-
page_size_limit=page_size_limit,
373+
parse_text, path, split_lines=True, use_tiktoken=False, **parser_kwargs
357374
)
358375

359376
if parsed_text_only:

paperqa/settings.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,13 @@ class ParsingSettings(BaseModel):
165165
" (ignoring chars vs tokens difference)."
166166
),
167167
)
168+
pdfs_use_block_parsing: bool = Field(
169+
default=False,
170+
description=(
171+
"Opt-in flag to use block-based parsing for PDFs instead of"
172+
" text-based parsing, which is known to be better for some PDFs."
173+
),
174+
)
168175
use_doc_details: bool = Field(
169176
default=True, description="Whether to try to get metadata details for a Doc."
170177
)

tests/stub_data/pasa.pdf

666 KB
Binary file not shown.

tests/test_agents.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,11 +89,11 @@ async def test_get_directory_index(
8989
"year",
9090
], "Incorrect fields in index"
9191
assert not index.changed, "Expected index to not have changes at this point"
92-
# bates.txt + empty.txt + flag_day.html + gravity_hill.md + obama.txt + paper.pdf,
92+
# bates.txt + empty.txt + flag_day.html + gravity_hill.md + obama.txt + paper.pdf + pasa.pdf,
9393
# but empty.txt fails to be added
9494
path_to_id = await index.index_files
9595
assert (
96-
sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 5
96+
sum(id_ != FAILED_DOCUMENT_ADD_ID for id_ in path_to_id.values()) == 6
9797
), "Incorrect number of parsed index files"
9898

9999
with subtests.test(msg="check-txt-query"):
@@ -248,6 +248,7 @@ async def test_getting_manifest(
248248
"gravity_hill.md",
249249
"obama.txt",
250250
"paper.pdf",
251+
"pasa.pdf",
251252
}
252253

253254

@@ -565,6 +566,13 @@ async def gen_answer(self, state) -> str: # noqa: ARG001, RUF029
565566
async def test_agent_sharing_state(
566567
agent_test_settings: Settings, subtests: SubTests, callback_type: str | None
567568
) -> None:
569+
def files_filter(f) -> bool:
570+
# Filter out pasa.pdf just to speed the test and save API costs
571+
return f.name != "pasa.pdf" and IndexSettings.model_fields[
572+
"files_filter"
573+
].default(f)
574+
575+
agent_test_settings.agent.index.files_filter = files_filter
568576
agent_test_settings.agent.search_count = 3 # Keep low for speed
569577
agent_test_settings.answer.evidence_k = 2
570578
agent_test_settings.answer.answer_max_sources = 1

tests/test_paperqa.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
from paperqa.core import llm_parse_json
4848
from paperqa.prompts import CANNOT_ANSWER_PHRASE
4949
from paperqa.prompts import qa_prompt as default_qa_prompt
50-
from paperqa.readers import read_doc
50+
from paperqa.readers import parse_pdf_to_pages, read_doc
5151
from paperqa.types import ChunkMetadata
5252
from paperqa.utils import (
5353
extract_score,
@@ -989,6 +989,17 @@ async def test_pdf_reader_w_no_chunks(stub_data_dir: Path) -> None:
989989
assert docs.texts[0].embedding is None, "Should have deferred the embedding"
990990

991991

992+
def test_parse_pdf_to_pages(stub_data_dir: Path) -> None:
993+
filepath = stub_data_dir / "pasa.pdf"
994+
parsed_text = parse_pdf_to_pages(filepath, use_block_parsing=True)
995+
assert isinstance(parsed_text.content, dict)
996+
assert "1" in parsed_text.content, "Parsed text should contain page 1"
997+
assert (
998+
"Abstract\n\nWe introduce PaSa, an advanced Paper Search"
999+
"\nagent powered by large language models."
1000+
) in parsed_text.content["1"]
1001+
1002+
9921003
@pytest.mark.vcr
9931004
@pytest.mark.parametrize("defer_embeddings", [True, False])
9941005
@pytest.mark.asyncio

0 commit comments

Comments
 (0)