From c2094264671dcff2032ff919bff285f662424db6 Mon Sep 17 00:00:00 2001
From: AnkitRaut <ankitraut87@gmail.com>
Date: Sat, 2 Aug 2025 23:07:42 +0530
Subject: [PATCH 1/3] Add _process_table and to_list methods. Refactor
 to_pandas method to use _process_table method

---
 textractor/entities/table.py | 82 ++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/textractor/entities/table.py b/textractor/entities/table.py
index e8fccdf..d6f386b 100644
--- a/textractor/entities/table.py
+++ b/textractor/entities/table.py
@@ -497,45 +497,32 @@ def __getitem__(self, key):
 
         return new_table
 
-    def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLinearizationConfig()):
+    def _process_table(self, use_columns=False, config: TextLinearizationConfig = TextLinearizationConfig()):
         """
-        Converts the table to a pandas DataFrame
-
-        :param use_columns: If the first row of the table is made of column headers, use them for the pandas dataframe. Only supports single row header.
-        :param config: Text linearization configuration object for the table content
-        :return:
+        Processes the table into a list of rows for consumption by to_pandas and to_list.
+        Returns (table: List[List[str]], columns: Optional[List[str]])
         """
-        try:
-            from pandas import DataFrame
-        except ImportError:
-            raise MissingDependencyException(
-                "pandas library is required for exporting tables to DataFrame objects or markdown"
-            )
-
+        import itertools
         rows = sorted([(key, list(group)) for key, group in itertools.groupby(
             self.table_cells, key=lambda cell: cell.row_index
         )], key=lambda r: r[0])
         row_offset = 0
-
         columns = None
         processed_cells = set()
+        table = []
         if use_columns:
-            # Try to automatically get the columns if they are in the first row
             columns = [[] for _ in range(self.column_count)]
             is_header_count = 0
             for _, row in rows:
                 if not any([c.is_column_header for c in row]):
-                    # There is not header in that row, we are done
                     break
                 for i, cell in enumerate(row):
                     if (
-                        cell not in processed_cells or
-                        config.table_duplicate_text_in_merged_cells or
-                        config.table_flatten_headers
+                            cell not in processed_cells or
+                            config.table_duplicate_text_in_merged_cells or
+                            config.table_flatten_headers
                     ):
                         if cell.siblings:
-                            # This handles the edge case where we are flattening the headers
-                            # so we want to duplicate the cell text but only in its first row
                             first_row, _, _, _ = cell._get_merged_cell_range()
                             if cell in processed_cells and first_row != cell.row_index:
                                 continue
@@ -557,26 +544,20 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin
                     else:
                         columns[i].append("")
                 row_offset += 1
-            # If we have the correct number of column and at least half the row is tagged as a header
-            if len(columns) == self.column_count and is_header_count / len(columns) >= config.table_column_header_threshold:
+            if len(columns) == self.column_count and is_header_count / len(
+                    columns) >= config.table_column_header_threshold:
                 use_columns = True
             else:
                 use_columns = False
-                logger.info(
-                    f"The number of column header cell do not match the column count, ignoring them, {len(columns)} vs {self.column_count}"
-                )
-
+                columns = None
+                row_offset = 0
         if columns and any([c for c in columns]) and config.table_flatten_headers:
             columns = ["".join(c) for c in columns]
             table = [columns]
         elif columns and any([c for c in columns]):
-            # We reset the row offset as only the first line will be taken as header
             columns = [c[0] for c in columns]
             table = [columns]
             row_offset = 1
-        else:
-            table = []
-
         for _, row in rows[row_offset:]:
             table.append([])
             for cell in row:
@@ -584,7 +565,8 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin
                 if cell.siblings:
                     children = []
                     first_row, first_col, last_row, last_col = cell._get_merged_cell_range()
-                    if (cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells:
+                    if (
+                            cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells:
                         for sib in cell.siblings:
                             children.extend(sib.children)
                             processed_cells.add(sib)
@@ -599,12 +581,38 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin
                         text = config.table_cell_empty_cell_placeholder if config.table_cell_empty_cell_placeholder else ""
                 else:
                     text = cell.get_text(config)
-                table[-1][cell.col_index - 1] = text if text or not config.table_cell_empty_cell_placeholder else config.table_cell_empty_cell_placeholder
+                table[-1][
+                    cell.col_index - 1] = text if text or not config.table_cell_empty_cell_placeholder else config.table_cell_empty_cell_placeholder
+        return table, columns
 
-        return DataFrame(
-            table[1:] if use_columns else table,
-            columns=columns if use_columns else None,
-        )
+    def to_list(self, config: TextLinearizationConfig = TextLinearizationConfig()):
+        """
+        Converts the table to a list of lists.
+        :param config: Text linearization configuration object for the table content
+        :return: List of rows representing the table.
+        :rtype: List[List[str]]
+        """
+        table, columns = self._process_table(use_columns=False, config=config)
+        return table
+
+    def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLinearizationConfig()):
+        """
+        Converts the table to a pandas DataFrame
+        :param use_columns: If the first row of the table is made of column headers, use them for the pandas dataframe. Only supports single row header.
+        :param config: Text linearization configuration object for the table content
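+        :return: pandas DataFrame built from the processed table rows (header row used as columns when use_columns applies)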
+        """
+        try:
+            from pandas import DataFrame
+        except ImportError:
+            raise MissingDependencyException(
+                "pandas library is required for exporting tables to DataFrame objects or markdown"
+            )
+        table, columns = self._process_table(use_columns=use_columns, config=config)
+        if columns is not None and use_columns:
+            return DataFrame(table[1:], columns=columns)
+        else:
+            return DataFrame(table)
 
     def to_csv(self, use_columns = False, config: TextLinearizationConfig = TextLinearizationConfig()) -> str:
         """Returns the table in the Comma-Separated-Value (CSV) format

From b72ef1c676b73d3cb5e22f9d17b0a12af9a8ceeb Mon Sep 17 00:00:00 2001
From: AnkitRaut <ankitraut87@gmail.com>
Date: Sat, 2 Aug 2025 23:09:33 +0530
Subject: [PATCH 2/3] Remove redundant function-local itertools import from
 _process_table

---
 textractor/entities/table.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/textractor/entities/table.py b/textractor/entities/table.py
index d6f386b..965c8b4 100644
--- a/textractor/entities/table.py
+++ b/textractor/entities/table.py
@@ -502,7 +502,6 @@ def _process_table(self, use_columns=False, config: TextLinearizationConfig = Te
         Processes the table into a list of rows for consumption by to_pandas and to_list.
         Returns (table: List[List[str]], columns: Optional[List[str]])
         """
-        import itertools
         rows = sorted([(key, list(group)) for key, group in itertools.groupby(
             self.table_cells, key=lambda cell: cell.row_index
         )], key=lambda r: r[0])

From 0a8c036850613061ae66c7604760fe2dac2fa7ad Mon Sep 17 00:00:00 2001
From: AnkitRaut <ankitraut87@gmail.com>
Date: Sat, 2 Aug 2025 23:15:00 +0530
Subject: [PATCH 3/3] Restore original comments and logger.info call in _process_table

---
 textractor/entities/table.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/textractor/entities/table.py b/textractor/entities/table.py
index 965c8b4..bf7a5d2 100644
--- a/textractor/entities/table.py
+++ b/textractor/entities/table.py
@@ -510,10 +510,12 @@ def _process_table(self, use_columns=False, config: TextLinearizationConfig = Te
         processed_cells = set()
         table = []
         if use_columns:
+            # Try to automatically get the columns if they are in the first row
             columns = [[] for _ in range(self.column_count)]
             is_header_count = 0
             for _, row in rows:
                 if not any([c.is_column_header for c in row]):
+                    # There is no header in that row, we are done
                     break
                 for i, cell in enumerate(row):
                     if (
@@ -522,6 +524,8 @@ def _process_table(self, use_columns=False, config: TextLinearizationConfig = Te
                             config.table_flatten_headers
                     ):
                         if cell.siblings:
+                            # This handles the edge case where we are flattening the headers
+                            # so we want to duplicate the cell text but only in its first row
                             first_row, _, _, _ = cell._get_merged_cell_range()
                             if cell in processed_cells and first_row != cell.row_index:
                                 continue
@@ -548,12 +552,16 @@ def _process_table(self, use_columns=False, config: TextLinearizationConfig = Te
                 use_columns = True
             else:
                 use_columns = False
+                logger.info(
+                    f"The number of column header cell do not match the column count, ignoring them, {len(columns)} vs {self.column_count}"
+                )
                 columns = None
                 row_offset = 0
         if columns and any([c for c in columns]) and config.table_flatten_headers:
             columns = ["".join(c) for c in columns]
             table = [columns]
         elif columns and any([c for c in columns]):
+            # We reset the row offset as only the first line will be taken as header
             columns = [c[0] for c in columns]
             table = [columns]
             row_offset = 1