Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 44 additions & 29 deletions textractor/entities/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,28 +497,18 @@ def __getitem__(self, key):

return new_table

def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLinearizationConfig()):
def _process_table(self, use_columns=False, config: TextLinearizationConfig = TextLinearizationConfig()):
"""
Converts the table to a pandas DataFrame

:param use_columns: If the first row of the table is made of column headers, use them for the pandas dataframe. Only supports single row header.
:param config: Text linearization configuration object for the table content
:return:
Processes the table into a list of rows for consumption by to_pandas and to_list.
Returns (table: List[List[str]], columns: Optional[List[str]])
"""
try:
from pandas import DataFrame
except ImportError:
raise MissingDependencyException(
"pandas library is required for exporting tables to DataFrame objects or markdown"
)

rows = sorted([(key, list(group)) for key, group in itertools.groupby(
self.table_cells, key=lambda cell: cell.row_index
)], key=lambda r: r[0])
row_offset = 0

columns = None
processed_cells = set()
table = []
if use_columns:
# Try to automatically get the columns if they are in the first row
columns = [[] for _ in range(self.column_count)]
Expand All @@ -529,9 +519,9 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin
break
for i, cell in enumerate(row):
if (
cell not in processed_cells or
config.table_duplicate_text_in_merged_cells or
config.table_flatten_headers
cell not in processed_cells or
config.table_duplicate_text_in_merged_cells or
config.table_flatten_headers
):
if cell.siblings:
# This handles the edge case where we are flattening the headers
Expand All @@ -557,15 +547,16 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin
else:
columns[i].append("")
row_offset += 1
# If we have the correct number of columns and the fraction of cells tagged as headers reaches config.table_column_header_threshold
if len(columns) == self.column_count and is_header_count / len(columns) >= config.table_column_header_threshold:
if len(columns) == self.column_count and is_header_count / len(
columns) >= config.table_column_header_threshold:
use_columns = True
else:
use_columns = False
logger.info(
f"The number of column header cell do not match the column count, ignoring them, {len(columns)} vs {self.column_count}"
)

columns = None
row_offset = 0
if columns and any([c for c in columns]) and config.table_flatten_headers:
columns = ["".join(c) for c in columns]
table = [columns]
Expand All @@ -574,17 +565,15 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin
columns = [c[0] for c in columns]
table = [columns]
row_offset = 1
else:
table = []

for _, row in rows[row_offset:]:
table.append([])
for cell in row:
table[-1].append("")
if cell.siblings:
children = []
first_row, first_col, last_row, last_col = cell._get_merged_cell_range()
if (cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells:
if (
cell.col_index == first_col and cell.row_index == first_row) or config.table_duplicate_text_in_merged_cells:
for sib in cell.siblings:
children.extend(sib.children)
processed_cells.add(sib)
Expand All @@ -599,12 +588,38 @@ def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLin
text = config.table_cell_empty_cell_placeholder if config.table_cell_empty_cell_placeholder else ""
else:
text = cell.get_text(config)
table[-1][cell.col_index - 1] = text if text or not config.table_cell_empty_cell_placeholder else config.table_cell_empty_cell_placeholder
table[-1][
cell.col_index - 1] = text if text or not config.table_cell_empty_cell_placeholder else config.table_cell_empty_cell_placeholder
return table, columns

return DataFrame(
table[1:] if use_columns else table,
columns=columns if use_columns else None,
)
def to_list(self, config: TextLinearizationConfig = TextLinearizationConfig()):
    """
    Returns the table content as a list of rows, each row being a list of cell texts.

    Delegates to ``_process_table`` with ``use_columns=False`` and discards the
    column names it returns alongside the rows.

    :param config: Text linearization configuration object for the table content
    :return: List of rows representing the table.
    :rtype: List[List[str]]
    """
    # _process_table returns a (table, columns) pair; only the rows are needed here.
    return self._process_table(use_columns=False, config=config)[0]

def to_pandas(self, use_columns=False, config: TextLinearizationConfig = TextLinearizationConfig()):
    """
    Converts the table to a pandas DataFrame

    :param use_columns: If the first row of the table is made of column headers, use them for the pandas dataframe. Only supports single row header.
    :param config: Text linearization configuration object for the table content
    :return: DataFrame built from the rows produced by ``_process_table``. When ``use_columns``
             is set and column names were resolved, those names become the DataFrame columns
             and the first processed row is left out of the data; otherwise every processed
             row is data and pandas assigns default column labels.
    :raises MissingDependencyException: If pandas cannot be imported.
    """
    try:
        from pandas import DataFrame
    except ImportError as err:
        # Chain explicitly so the original import failure stays attached as the cause.
        raise MissingDependencyException(
            "pandas library is required for exporting tables to DataFrame objects or markdown"
        ) from err

    table, columns = self._process_table(use_columns=use_columns, config=config)
    if use_columns and columns is not None:
        # When _process_table resolves column names it places them as the first
        # entry of `table`, so skip that entry to avoid duplicating the header as data.
        return DataFrame(table[1:], columns=columns)
    return DataFrame(table)

def to_csv(self, use_columns = False, config: TextLinearizationConfig = TextLinearizationConfig()) -> str:
"""Returns the table in the Comma-Separated-Value (CSV) format
Expand Down