From c2094264671dcff2032ff919bff285f662424db6 Mon Sep 17 00:00:00 2001 From: AnkitRaut Date: Sat, 2 Aug 2025 23:07:42 +0530 Subject: [PATCH 1/3] Add _process_table and to_list methods. Refactor to_pandas method to use _process_table method --- textractor/entities/table.py | 82 ++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/textractor/entities/table.py b/textractor/entities/table.py index e8fccdf..d6f386b 100644 --- a/textractor/entities/table.py +++ b/textractor/entities/table.py @@ -497,45 +497,32 @@ def __getitem__(self, key): return new_table - def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLinearizationConfig()): + def _process_table(self, use_columns=False, config: TextLinearizationConfig = TextLinearizationConfig()): """ - Converts the table to a pandas DataFrame - - :param use_columns: If the first row of the table is made of column headers, use them for the pandas dataframe. Only supports single row header. - :param config: Text linearization configuration object for the table content - :return: + Processes the table into a list of rows for consumption by to_pandas and to_list. + Returns (table: List[List[str]], columns: Optional[List[str]]) """ - try: - from pandas import DataFrame - except ImportError: - raise MissingDependencyException( - "pandas library is required for exporting tables to DataFrame objects or markdown" - ) - + import itertools rows = sorted([(key, list(group)) for key, group in itertools.groupby( self.table_cells, key=lambda cell: cell.row_index )], key=lambda r: r[0]) row_offset = 0 - columns = None processed_cells = set() + table = [] if use_columns: - # Try to automatically get the columns if they are in the first row columns = [[] for _ in range(self.column_count)] is_header_count = 0 for _, row in rows: if not any([c.is_column_header for c in row]): - # There is not header in that row, we are done break for i, cell in enumerate(row): if ( - cell not in processed_cells or - config.table_duplicate_text_in_merged_cells or - config.table_flatten_headers + cell not in processed_cells or + config.table_duplicate_text_in_merged_cells or + config.table_flatten_headers ): if cell.siblings: - # This handles the edge case where we are flattening the headers - # so we want to duplicate the cell text but only in its first row first_row, _, _, _ = cell._get_merged_cell_range() if cell in processed_cells and first_row != cell.row_index: continue @@ -557,26 +544,20 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin else: columns[i].append("") row_offset += 1 - # If we have the correct number of column and at least half the row is tagged as a header - if len(columns) == self.column_count and is_header_count / len(columns) >= config.table_column_header_threshold: + if len(columns) == self.column_count and is_header_count / len( + columns) >= config.table_column_header_threshold: use_columns = True else: use_columns = False - logger.info( - f"The number of column header cell do not match the column count, ignoring them, {len(columns)} vs {self.column_count}" - ) - + columns = None + row_offset = 0 if columns and any([c for c in columns]) and config.table_flatten_headers: columns = ["".join(c) for c in columns] table = [columns] elif columns and any([c for c in columns]): - # We reset the row offset as only the first line will be taken as header columns = [c[0] for c in columns] table = [columns] row_offset = 1 - else: - table = [] - for _, row in rows[row_offset:]: table.append([]) for cell in row: @@ -584,7 +565,8 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin if cell.siblings: children = [] first_row, first_col, last_row, last_col = cell._get_merged_cell_range() - if (cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells: + if ( + cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells: for sib in cell.siblings: children.extend(sib.children) processed_cells.add(sib) @@ -599,12 +581,38 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin text = config.table_cell_empty_cell_placeholder if config.table_cell_empty_cell_placeholder else "" else: text = cell.get_text(config) - table[-1][cell.col_index - 1] = text if text or not config.table_cell_empty_cell_placeholder else config.table_cell_empty_cell_placeholder + table[-1][ + cell.col_index - 1] = text if text or not config.table_cell_empty_cell_placeholder else config.table_cell_empty_cell_placeholder + return table, columns - return DataFrame( - table[1:] if use_columns else table, - columns=columns if use_columns else None, - ) + def to_list(self, config: TextLinearizationConfig = TextLinearizationConfig()): + """ + Converts the table to a list of lists. + :param config: Text linearization configuration object for the table content + :return: List of rows representing the table. + :rtype: List[List[str]] + """ + table, columns = self._process_table(use_columns=False, config=config) + return table + + def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLinearizationConfig()): + """ + Converts the table to a pandas DataFrame + :param use_columns: If the first row of the table is made of column headers, use them for the pandas dataframe. Only supports single row header. + :param config: Text linearization configuration object for the table content + :return: + """ + try: + from pandas import DataFrame + except ImportError: + raise MissingDependencyException( + "pandas library is required for exporting tables to DataFrame objects or markdown" + ) + table, columns = self._process_table(use_columns=use_columns, config=config) + if columns is not None and use_columns: + return DataFrame(table[1:], columns=columns) + else: + return DataFrame(table) def to_csv(self, use_columns = False, config: TextLinearizationConfig = TextLinearizationConfig()) -> str: """Returns the table in the Comma-Separated-Value (CSV) format From b72ef1c676b73d3cb5e22f9d17b0a12af9a8ceeb Mon Sep 17 00:00:00 2001 From: AnkitRaut Date: Sat, 2 Aug 2025 23:09:33 +0530 Subject: [PATCH 2/3] Add _process_table and to_list methods. Refactor to_pandas method to use _process_table method --- textractor/entities/table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/textractor/entities/table.py b/textractor/entities/table.py index d6f386b..965c8b4 100644 --- a/textractor/entities/table.py +++ b/textractor/entities/table.py @@ -502,7 +502,6 @@ def _process_table(self, use_columns=False, config: TextLinearizationConfig = Te Processes the table into a list of rows for consumption by to_pandas and to_list. Returns (table: List[List[str]], columns: Optional[List[str]]) """ - import itertools rows = sorted([(key, list(group)) for key, group in itertools.groupby( self.table_cells, key=lambda cell: cell.row_index )], key=lambda r: r[0]) From 0a8c036850613061ae66c7604760fe2dac2fa7ad Mon Sep 17 00:00:00 2001 From: AnkitRaut Date: Sat, 2 Aug 2025 23:15:00 +0530 Subject: [PATCH 3/3] adding back original comments --- textractor/entities/table.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/textractor/entities/table.py b/textractor/entities/table.py index 965c8b4..bf7a5d2 100644 --- a/textractor/entities/table.py +++ b/textractor/entities/table.py @@ -510,10 +510,12 @@ def _process_table(self, use_columns=False, config: TextLinearizationConfig = Te processed_cells = set() table = [] if use_columns: + # Try to automatically get the columns if they are in the first row columns = [[] for _ in range(self.column_count)] is_header_count = 0 for _, row in rows: if not any([c.is_column_header for c in row]): + # There is not header in that row, we are done break for i, cell in enumerate(row): if ( @@ -522,6 +524,8 @@ def _process_table(self, use_columns=False, config: TextLinearizationConfig = Te config.table_flatten_headers ): if cell.siblings: + # This handles the edge case where we are flattening the headers + # so we want to duplicate the cell text but only in its first row first_row, _, _, _ = cell._get_merged_cell_range() if cell in processed_cells and first_row != cell.row_index: continue @@ -548,12 +552,16 @@ def _process_table(self, use_columns=False, config: TextLinearizationConfig = Te use_columns = True else: use_columns = False + logger.info( + f"The number of column header cell do not match the column count, ignoring them, {len(columns)} vs {self.column_count}" + ) columns = None row_offset = 0 if columns and any([c for c in columns]) and config.table_flatten_headers: columns = ["".join(c) for c in columns] table = [columns] elif columns and any([c for c in columns]): + # We reset the row offset as only the first line will be taken as header columns = [c[0] for c in columns] table = [columns] row_offset = 1