From 2fb2b5a98b2ba71991307ece34de60146ea2c35b Mon Sep 17 00:00:00 2001 From: Ian Su Date: Tue, 3 Sep 2024 14:00:07 +0800 Subject: [PATCH 1/5] format t_pretty_print_layout --- .../t_pretty_print_layout.py | 337 ++++++++++++------ 1 file changed, 230 insertions(+), 107 deletions(-) diff --git a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py index b0f997bc..e9a3a8c1 100644 --- a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py +++ b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py @@ -6,17 +6,20 @@ logger = logging.getLogger(__name__) + class LinearizeLayout: - def __init__(self, - textract_json: dict, - table_format: str = "grid", - exclude_figure_text: bool=True, - exclude_page_header: bool=False, - exclude_page_footer: bool=False, - exclude_page_number: bool=False, - skip_table: bool=False, - save_txt_path: str=None, - generate_markdown: bool=False): + def __init__( + self, + textract_json: dict, + table_format: str = "grid", + exclude_figure_text: bool = True, + exclude_page_header: bool = False, + exclude_page_footer: bool = False, + exclude_page_number: bool = False, + skip_table: bool = False, + save_txt_path: str = None, + generate_markdown: bool = False, + ): self.j = textract_json self.table_format = table_format self.exclude_figure_text = exclude_figure_text @@ -27,23 +30,31 @@ def __init__(self, self.save_txt_path = save_txt_path self.generate_markdown = generate_markdown self.figures = [] - + def _get_layout_blocks(self) -> tuple: """Get all blocks of type 'LAYOUT' and a dictionary of Ids mapped to their corresponding block.""" - layouts = [{"Id": x['Id'], "Page": x.get('Page',1)} for x in self.j['Blocks'] if x['BlockType'].startswith('LAYOUT')] - id2block = {x['Id']: x for x in self.j['Blocks']} - self.figures = [{"page": block.get('Page', 1), "geometry": block['Geometry']['BoundingBox']} \ - for block in self.j['Blocks'] \ - if block['BlockType'] == 'LAYOUT_FIGURE'] + layouts = [ + {"Id": x["Id"], "Page": x.get("Page", 1)} + for x in self.j["Blocks"] + if x["BlockType"].startswith("LAYOUT") + ] + id2block = {x["Id"]: x for x in self.j["Blocks"]} + self.figures = [ + {"page": block.get("Page", 1), "geometry": block["Geometry"]["BoundingBox"]} + for block in self.j["Blocks"] + if block["BlockType"] == "LAYOUT_FIGURE" + ] if not layouts: - logger.warning("No LAYOUT information found in Textract response. \ + logger.warning( + "No LAYOUT information found in Textract response. \ Please use LAYOUT feature for AnalyzeDocument API call \ - for optimum output") + for optimum output" + ) return layouts, id2block def _geometry_match(self, geom1, geom2, tolerance=0.1): """Check if two geometries match within a given tolerance.""" - for key in ['Width', 'Height', 'Left', 'Top']: + for key in ["Width", "Height", "Left", "Top"]: if abs(geom1[key] - geom2[key]) > tolerance: return False return True @@ -51,22 +62,26 @@ def _geometry_match(self, geom1, geom2, tolerance=0.1): def _is_inside(self, inner_geom, outer_geom): """Check if inner geometry is fully contained within the outer geometry.""" inner_left, inner_top, inner_right, inner_bottom = ( - inner_geom['Left'], - inner_geom['Top'], - inner_geom['Left'] + inner_geom['Width'], - inner_geom['Top'] + inner_geom['Height'] + inner_geom["Left"], + inner_geom["Top"], + inner_geom["Left"] + inner_geom["Width"], + inner_geom["Top"] + inner_geom["Height"], ) - + outer_left, outer_top, outer_right, outer_bottom = ( - outer_geom['Left'], - outer_geom['Top'], - outer_geom['Left'] + outer_geom['Width'], - outer_geom['Top'] + outer_geom['Height'] + outer_geom["Left"], + outer_geom["Top"], + outer_geom["Left"] + outer_geom["Width"], + outer_geom["Top"] + outer_geom["Height"], + ) + + return ( + inner_left >= outer_left + and inner_right <= outer_right + and inner_top >= outer_top + and inner_bottom <= outer_bottom ) - - return (inner_left >= outer_left and inner_right <= outer_right and - inner_top >= outer_top and inner_bottom <= outer_bottom) - + def _validate_block_skip(self, blockType: str) -> bool: if self.exclude_page_header and blockType == "LAYOUT_HEADER": return True @@ -76,7 +91,7 @@ def _validate_block_skip(self, blockType: str) -> bool: return True else: return False - + def _dfs(self, root, id2block): texts = [] stack = [(root, 0)] @@ -84,17 +99,25 @@ def _dfs(self, root, id2block): while stack: block_id, depth = stack.pop() block = id2block[block_id] - + if self._validate_block_skip(block["BlockType"]): continue - + # Handle LAYOUT_TABLE type if not self.skip_table and block["BlockType"] == "LAYOUT_TABLE": table_data = [] # Find the matching TABLE block for the LAYOUT_TABLE table_block = None - for potential_table in [b for b in self.j['Blocks'] if b['BlockType'] == 'TABLE' and b.get('Page',1) == block.get('Page', 1)]: - if self._geometry_match(block['Geometry']['BoundingBox'], potential_table['Geometry']['BoundingBox']): + for potential_table in [ + b + for b in self.j["Blocks"] + if b["BlockType"] == "TABLE" + and b.get("Page", 1) == block.get("Page", 1) + ]: + if self._geometry_match( + block["Geometry"]["BoundingBox"], + potential_table["Geometry"]["BoundingBox"], + ): table_block = potential_table break @@ -104,22 +127,36 @@ def _dfs(self, root, id2block): max_row = 0 max_col = 0 for cell_rel in table_block["Relationships"]: - if cell_rel['Type'] == "CHILD": - for cell_id in cell_rel['Ids']: + if cell_rel["Type"] == "CHILD": + for cell_id in cell_rel["Ids"]: cell_block = id2block[cell_id] if "Relationships" in cell_block: - cell_text = " ".join([id2block[line_id]['Text'] for line_id in cell_block["Relationships"][0]['Ids'] if 'Text' in id2block[line_id]]) - row_idx = cell_block['RowIndex'] - col_idx = cell_block['ColumnIndex'] + cell_text = " ".join( + [ + id2block[line_id]["Text"] + for line_id in cell_block["Relationships"][ + 0 + ]["Ids"] + if "Text" in id2block[line_id] + ] + ) + row_idx = cell_block["RowIndex"] + col_idx = cell_block["ColumnIndex"] max_row = max(max_row, row_idx) max_col = max(max_col, col_idx) - for r in range(cell_block.get('RowSpan', 1)): - for c in range(cell_block.get('ColumnSpan', 1)): - if "EntityTypes" in cell_block and "COLUMN_HEADER" in cell_block["EntityTypes"]: + for r in range(cell_block.get("RowSpan", 1)): + for c in range(cell_block.get("ColumnSpan", 1)): + if ( + "EntityTypes" in cell_block + and "COLUMN_HEADER" + in cell_block["EntityTypes"] + ): headers[col_idx + c] = cell_text else: - table_content[(row_idx + r, col_idx + c)] = cell_text - + table_content[ + (row_idx + r, col_idx + c) + ] = cell_text + table_data = [] start_row = 2 if headers else 1 for r in range(start_row, max_row + 1): @@ -129,7 +166,7 @@ def _dfs(self, root, id2block): table_data.append(row_data) header_list = [headers.get(c, "") for c in range(1, max_col + 1)] - + try: from tabulate import tabulate except ImportError: @@ -137,27 +174,42 @@ def _dfs(self, root, id2block): "Could not import tabulate python package. " "Please install it with `pip install tabulate`." ) - + tab_fmt = "pipe" if self.generate_markdown else self.table_format - '''If Markdown is enabled then default to pipe for tables''' - - table_text = tabulate(table_data, headers=header_list, tablefmt=tab_fmt) + """If Markdown is enabled then default to pipe for tables""" + + table_text = tabulate( + table_data, headers=header_list, tablefmt=tab_fmt + ) yield table_text continue else: - logger.warning("LAYOUT_TABLE detected but TABLES feature was not provided in API call. \ - Inlcuding TABLES feature may improve the layout output") - + logger.warning( + "LAYOUT_TABLE detected but TABLES feature was not provided in API call. \ + Inlcuding TABLES feature may improve the layout output" + ) + if block["BlockType"] == "LINE" and "Text" in block: if self.exclude_figure_text and self.figures: - if any(self._is_inside(block['Geometry']['BoundingBox'], figure_geom["geometry"]) \ - for figure_geom in self.figures if figure_geom["page"] == block.get("Page",1)): + if any( + self._is_inside( + block["Geometry"]["BoundingBox"], figure_geom["geometry"] + ) + for figure_geom in self.figures + if figure_geom["page"] == block.get("Page", 1) + ): continue - yield block['Text'] - elif block["BlockType"] in ["LAYOUT_TITLE", "LAYOUT_SECTION_HEADER"] and "Relationships" in block: + yield block["Text"] + elif ( + block["BlockType"] in ["LAYOUT_TITLE", "LAYOUT_SECTION_HEADER"] + and "Relationships" in block + ): # Get the associated LINE text for the layout - line_texts = [id2block[line_id]['Text'] for line_id in block["Relationships"][0]['Ids']] - combined_text = ' '.join(line_texts) + line_texts = [ + id2block[line_id]["Text"] + for line_id in block["Relationships"][0]["Ids"] + ] + combined_text = " ".join(line_texts) # Prefix with appropriate markdown if self.generate_markdown: @@ -166,32 +218,38 @@ def _dfs(self, root, id2block): elif block["BlockType"] == "LAYOUT_SECTION_HEADER": combined_text = f"## {combined_text}" yield combined_text - - if block["BlockType"].startswith('LAYOUT') and block["BlockType"] not in ["LAYOUT_TITLE", "LAYOUT_SECTION_HEADER"]: + + if block["BlockType"].startswith("LAYOUT") and block["BlockType"] not in [ + "LAYOUT_TITLE", + "LAYOUT_SECTION_HEADER", + ]: if "Relationships" in block: relationships = block["Relationships"] - children = [(x, depth + 1) for x in relationships[0]['Ids']] + children = [(x, depth + 1) for x in relationships[0]["Ids"]] stack.extend(reversed(children)) - + def _save_to_s3(self, page_texts: dict) -> None: try: import boto3 import re - s3 = boto3.client('s3') - match = re.match(r's3://([^/]+)(?:/(.*))?', self.save_txt_path) + + s3 = boto3.client("s3") + match = re.match(r"s3://([^/]+)(?:/(.*))?", self.save_txt_path) bucket = match.group(1) prefix = match.group(2) if match.group(2) else "" - + for page_number, content in page_texts.items(): file_name = f"{page_number}.txt" s3_key = os.path.join(prefix, file_name) - logger.debug(f"Writing linearized text for page {page_number} to bucket {bucket} file {s3_key}") - s3.put_object(Body=content, - Bucket=bucket, - Key=s3_key) + logger.debug( + f"Writing linearized text for page {page_number} to bucket {bucket} file {s3_key}" + ) + s3.put_object(Body=content, Bucket=bucket, Key=s3_key) except ImportError: - logger.error("Could not import boto3 python package. \ - Please install it with `pip install boto3`.") + logger.error( + "Could not import boto3 python package. \ + Please install it with `pip install boto3`." + ) raise ModuleNotFoundError( "Could not import boto3 python package. " "Please install it with `pip install boto3`." @@ -199,36 +257,40 @@ def _save_to_s3(self, page_texts: dict) -> None: except Exception as e: logger.error(e) raise e - + def _save_to_files(self, page_texts: dict) -> None: path = self.save_txt_path.rstrip(os.sep) - if path.startswith('s3://'): + if path.startswith("s3://"): self._save_to_s3(page_texts=page_texts) else: - for page_number, content in page_texts.items(): + for page_number, content in page_texts.items(): file_path = os.path.join(path, f"{page_number}.txt") - logger.debug(f"Writing linearized text for page {page_number} to file {file_path}") + logger.debug( + f"Writing linearized text for page {page_number} to file {file_path}" + ) with open(file_path, "w") as f: f.write(content) - + def get_text(self) -> dict: """Retrieve the text content in specified format. Default is CSV. Options: "csv", "markdown".""" # texts = [] page_texts = {} layouts, id2block = self._get_layout_blocks() for layout in layouts: - root = layout['Id'] - page_number = layout.get('Page', 1) + root = layout["Id"] + page_number = layout.get("Page", 1) if page_number not in page_texts: page_texts[page_number] = "" - page_texts[page_number] += '\n'.join(self._dfs(root, id2block))+ "\n\n" + page_texts[page_number] += "\n".join(self._dfs(root, id2block)) + "\n\n" if self.save_txt_path: self._save_to_files(page_texts) return page_texts + def string_counter(): # Dictionary to keep track of the occurrences of each string occurrences = {} + def counter(string): if string in occurrences: occurrences[string] += 1 @@ -238,6 +300,7 @@ def counter(string): return counter + def get_layout_csv_from_trp2(trp2_doc: TDocument) -> List[List[List[str]]]: """ Generate the layout.csv from the Amazon Textract Web Console download @@ -246,25 +309,40 @@ def get_layout_csv_from_trp2(trp2_doc: TDocument) -> List[List[List[str]]]: 'Page number','Layout','Text','Reading Order','Confidence score' Page number : Starting at 1, incrementing for each page - Layout : The BlockType + a number indicating the sequence for - this BlockType starting at 1 and for LAYOUT_LIST elements + Layout : The BlockType + a number indicating the sequence for + this BlockType starting at 1 and for LAYOUT_LIST elements the string: "- part of LAYOUT_LIST (index)" is added Text : The underlying text (except LAYOUT_LIST and LAYOUT_FIGURE ) Reading Order : Increasing int for each LAYOUT element starting with 0 Confidence score: Confidence in this being a LAYOUT element """ - result_value:List[List[List[str]]] = list() + result_value: List[List[List[str]]] = list() counter_instance = string_counter() for page_number, page in enumerate(trp2_doc.pages): - page_result:List[List[str]] = list() + page_result: List[List[str]] = list() processed_ids = [] relationships: t2.TRelationship = page.get_relationships_for_type() - blocks = [trp2_doc.get_block_by_id(id) for id in relationships.ids if relationships.ids] + blocks = [ + trp2_doc.get_block_by_id(id) + for id in relationships.ids + if relationships.ids + ] layout_blocks = [ - block for block in blocks if block.block_type in [ - "LAYOUT_TITLE", "LAYOUT_HEADER", "LAYOUT_FOOTER", "LAYOUT_SECTION_HEADER", "LAYOUT_PAGE_NUMBER", - "LAYOUT_LIST", "LAYOUT_FIGURE", "LAYOUT_TABLE", "LAYOUT_KEY_VALUE", "LAYOUT_TEXT" + block + for block in blocks + if block.block_type + in [ + "LAYOUT_TITLE", + "LAYOUT_HEADER", + "LAYOUT_FOOTER", + "LAYOUT_SECTION_HEADER", + "LAYOUT_PAGE_NUMBER", + "LAYOUT_LIST", + "LAYOUT_FIGURE", + "LAYOUT_TABLE", + "LAYOUT_KEY_VALUE", + "LAYOUT_TEXT", ] ] for idx, layout_block in enumerate(layout_blocks): @@ -275,31 +353,63 @@ def get_layout_csv_from_trp2(trp2_doc: TDocument) -> List[List[List[str]]]: if layout_block.block_type == "LAYOUT_LIST": # first print out the LAYOUT_LIST block_type_count = counter_instance(layout_block.block_type) - page_result.append([str(page_number + 1), layout_block.block_type + " " + str(block_type_count), "", str(idx), - layout_block.block_type, str(layout_block.confidence)]) + page_result.append( + [ + str(page_number + 1), + layout_block.block_type + " " + str(block_type_count), + "", + str(idx), + layout_block.block_type, + str(layout_block.confidence), + ] + ) # print(page_number + 1, layout_block.block_type + " " + str(block_type_count), "", idx, layout_block.block_type) - list_context_name = layout_block.block_type + " " + str(block_type_count) + list_context_name = ( + layout_block.block_type + " " + str(block_type_count) + ) # now get the relationships list_block_rel = layout_block.get_relationships_for_type() if list_block_rel: # get the text relationships - list_child_blocks = [trp2_doc.get_block_by_id(id) for id in list_block_rel.ids] + list_child_blocks = [ + trp2_doc.get_block_by_id(id) for id in list_block_rel.ids + ] for child_idx, list_child_block in enumerate(list_child_blocks): block_type_count = counter_instance(list_child_block.block_type) - child_block_relation_text = list_child_block.block_type + " " + str( - block_type_count) + " - part of " + list_context_name + child_block_relation_text = ( + list_child_block.block_type + + " " + + str(block_type_count) + + " - part of " + + list_context_name + ) # get the text, meaning get all the child relationships - layout_child_block_rel = list_child_block.get_relationships_for_type() + layout_child_block_rel = ( + list_child_block.get_relationships_for_type() + ) layout_child_line_blocks = [ - trp2_doc.get_block_by_id(id) for id in layout_child_block_rel.ids + trp2_doc.get_block_by_id(id) + for id in layout_child_block_rel.ids if layout_child_block_rel.ids ] # get the text, but not for figures - layout_text = trp2_doc.get_text_for_tblocks( - layout_child_line_blocks) if list_child_block.block_type != "LAYOUT_FIGURE" else "" + layout_text = ( + trp2_doc.get_text_for_tblocks(layout_child_line_blocks) + if list_child_block.block_type != "LAYOUT_FIGURE" + else "" + ) - page_result.append([str(page_number + 1), child_block_relation_text, layout_text, str(idx + child_idx + 1), list_child_block.block_type, str(list_child_block.confidence)]) + page_result.append( + [ + str(page_number + 1), + child_block_relation_text, + layout_text, + str(idx + child_idx + 1), + list_child_block.block_type, + str(list_child_block.confidence), + ] + ) # print(page_number + 1, child_block_relation_text, layout_text, idx + child_idx + 1, # list_child_block.block_type) processed_ids.append(list_child_block.id) @@ -308,18 +418,31 @@ def get_layout_csv_from_trp2(trp2_doc: TDocument) -> List[List[List[str]]]: # Bug fix - #284 if layout_block_rel is None: - logger.info (f'Block {layout_block} has no relationships') + logger.info(f"Block {layout_block} has no relationships") continue layout_blocks = [ - trp2_doc.get_block_by_id(id) for id in layout_block_rel.ids if layout_block_rel.ids + trp2_doc.get_block_by_id(id) + for id in layout_block_rel.ids + if layout_block_rel.ids ] # get the text, but not for figures - layout_text = trp2_doc.get_text_for_tblocks( - layout_blocks) if layout_block.block_type != "LAYOUT_FIGURE" else "" + layout_text = ( + trp2_doc.get_text_for_tblocks(layout_blocks) + if layout_block.block_type != "LAYOUT_FIGURE" + else "" + ) block_type_count = counter_instance(layout_block.block_type) - page_result.append([str(page_number + 1), layout_block.block_type + " " + str(block_type_count), layout_text, str(idx), - layout_block.block_type, str(layout_block.confidence)]) + page_result.append( + [ + str(page_number + 1), + layout_block.block_type + " " + str(block_type_count), + layout_text, + str(idx), + layout_block.block_type, + str(layout_block.confidence), + ] + ) # print(page_number + 1, layout_block.block_type + " " + str(block_type_count), layout_text, idx, layout_block.block_type) result_value.append(page_result) From 78f6de3d123aa771d6410e329f348c3907901cd1 Mon Sep 17 00:00:00 2001 From: Ian Su Date: Tue, 3 Sep 2024 14:00:28 +0800 Subject: [PATCH 2/5] add LAYOUT_FIGURE --- .../t_pretty_print_layout.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py index e9a3a8c1..835c397a 100644 --- a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py +++ b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py @@ -189,6 +189,36 @@ def _dfs(self, root, id2block): Inlcuding TABLES feature may improve the layout output" ) + elif block["BlockType"] == "LAYOUT_FIGURE": + figure_caption = "" + if "Relationships" in block: + for child_id in block["Relationships"][0]["Ids"]: + child_block = id2block[child_id] + if child_block["BlockType"] == "LINE": + figure_caption += child_block.get("Text", "") + " " + if not figure_caption: + figure_caption = "No caption" + + # Extract geometry information + geometry = block["Geometry"] + bounding_box = geometry["BoundingBox"] + polygon = geometry["Polygon"] + + # Create a dictionary with figure information + figure_info = { + "bounding_box": bounding_box, + "polygon": polygon, + "page": block.get("Page", 1), + } + + # Convert figure_info to a string representation + figure_info_str = str(figure_info) + + if self.generate_markdown: + yield f"![Figure]({figure_caption.strip()})\n" + else: + yield f"[Figure: {figure_caption.strip()}]\n// {figure_info_str}" + if block["BlockType"] == "LINE" and "Text" in block: if self.exclude_figure_text and self.figures: if any( From 0a6695cef980798b569be395400dbb0d299c513c Mon Sep 17 00:00:00 2001 From: Ian Su Date: Tue, 3 Sep 2024 16:11:37 +0800 Subject: [PATCH 3/5] able to generate table under figure --- .../t_pretty_print_layout.py | 185 +++++++++++------- 1 file changed, 118 insertions(+), 67 deletions(-) diff --git a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py index 835c397a..f7d9c174 100644 --- a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py +++ b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py @@ -2,7 +2,7 @@ import warnings import logging from trp.trp2 import TDocument -from typing import List +from typing import List, Dict, Any, Tuple logger = logging.getLogger(__name__) @@ -30,6 +30,7 @@ def __init__( self.save_txt_path = save_txt_path self.generate_markdown = generate_markdown self.figures = [] + self.tables = [] def _get_layout_blocks(self) -> tuple: """Get all blocks of type 'LAYOUT' and a dictionary of Ids mapped to their corresponding block.""" @@ -44,6 +45,23 @@ def _get_layout_blocks(self) -> tuple: for block in self.j["Blocks"] if block["BlockType"] == "LAYOUT_FIGURE" ] + self.tables = [ + { + "page": block.get("Page", 1), + "id": block.get("Id", ""), + "word_ids": [ + word_id + for relationship in block.get("Relationships", []) + if relationship["Type"] == "CHILD" + for cell_id in relationship.get("Ids", []) + for cell_rel in id2block[cell_id].get("Relationships", []) + if cell_rel["Type"] == "CHILD" + for word_id in cell_rel.get("Ids", []) + ], + } + for block in self.j["Blocks"] + if block["BlockType"] == "TABLE" + ] if not layouts: logger.warning( "No LAYOUT information found in Textract response. \ @@ -92,6 +110,87 @@ def _validate_block_skip(self, blockType: str) -> bool: else: return False + def _find_words_in_tables( + self, word_ids: List[str] + ) -> Tuple[List[str], Dict[int, List[str]]]: + """ + Check which word_ids are part of table cells and which are not. + + Args: + word_ids (List[str]): List of word IDs to check. + + Returns: + Tuple[List[str], Dict[int, List[str]]]: A tuple containing: + - List of word IDs not in any table + - Dictionary mapping table indices to lists of word IDs they contain + """ + words_not_in_table = set(word_ids) + relevant_tables = set() + for table in self.tables: + table_words = set(table["word_ids"]) & set(word_ids) + + if table_words: + relevant_tables.add(table["id"]) + words_not_in_table -= table_words + + return list(words_not_in_table), relevant_tables + + def _generate_table_string(self, table_block, id2block): + table_content = {} + headers = {} + max_row = 0 + max_col = 0 + for cell_rel in table_block["Relationships"]: + if cell_rel["Type"] == "CHILD": + for cell_id in cell_rel["Ids"]: + cell_block = id2block[cell_id] + if "Relationships" in cell_block: + cell_text = " ".join( + [ + id2block[line_id]["Text"] + for line_id in cell_block["Relationships"][0]["Ids"] + if "Text" in id2block[line_id] + ] + ) + row_idx = cell_block["RowIndex"] + col_idx = cell_block["ColumnIndex"] + max_row = max(max_row, row_idx) + max_col = max(max_col, col_idx) + for r in range(cell_block.get("RowSpan", 1)): + for c in range(cell_block.get("ColumnSpan", 1)): + if ( + "EntityTypes" in cell_block + and "COLUMN_HEADER" in cell_block["EntityTypes"] + ): + headers[col_idx + c] = cell_text + else: + table_content[(row_idx + r, col_idx + c)] = ( + cell_text + ) + + table_data = [] + start_row = 2 if headers else 1 + for r in range(start_row, max_row + 1): + row_data = [] + for c in range(1, max_col + 1): + row_data.append(table_content.get((r, c), "")) + table_data.append(row_data) + + header_list = [headers.get(c, "") for c in range(1, max_col + 1)] + + try: + from tabulate import tabulate + except ImportError: + raise ModuleNotFoundError( + "Could not import tabulate python package. " + "Please install it with `pip install tabulate`." + ) + + tab_fmt = "pipe" if self.generate_markdown else self.table_format + """If Markdown is enabled then default to pipe for tables""" + + return tabulate(table_data, headers=header_list, tablefmt=tab_fmt) + def _dfs(self, root, id2block): texts = [] stack = [(root, 0)] @@ -122,65 +221,7 @@ def _dfs(self, root, id2block): break if table_block and "Relationships" in table_block: - table_content = {} - headers = {} - max_row = 0 - max_col = 0 - for cell_rel in table_block["Relationships"]: - if cell_rel["Type"] == "CHILD": - for cell_id in cell_rel["Ids"]: - cell_block = id2block[cell_id] - if "Relationships" in cell_block: - cell_text = " ".join( - [ - id2block[line_id]["Text"] - for line_id in cell_block["Relationships"][ - 0 - ]["Ids"] - if "Text" in id2block[line_id] - ] - ) - row_idx = cell_block["RowIndex"] - col_idx = cell_block["ColumnIndex"] - max_row = max(max_row, row_idx) - max_col = max(max_col, col_idx) - for r in range(cell_block.get("RowSpan", 1)): - for c in range(cell_block.get("ColumnSpan", 1)): - if ( - "EntityTypes" in cell_block - and "COLUMN_HEADER" - in cell_block["EntityTypes"] - ): - headers[col_idx + c] = cell_text - else: - table_content[ - (row_idx + r, col_idx + c) - ] = cell_text - - table_data = [] - start_row = 2 if headers else 1 - for r in range(start_row, max_row + 1): - row_data = [] - for c in range(1, max_col + 1): - row_data.append(table_content.get((r, c), "")) - table_data.append(row_data) - - header_list = [headers.get(c, "") for c in range(1, max_col + 1)] - - try: - from tabulate import tabulate - except ImportError: - raise ModuleNotFoundError( - "Could not import tabulate python package. " - "Please install it with `pip install tabulate`." - ) - - tab_fmt = "pipe" if self.generate_markdown else self.table_format - """If Markdown is enabled then default to pipe for tables""" - - table_text = tabulate( - table_data, headers=header_list, tablefmt=tab_fmt - ) + table_text = self._generate_table_string(table_block, id2block) yield table_text continue else: @@ -190,14 +231,24 @@ def _dfs(self, root, id2block): ) elif block["BlockType"] == "LAYOUT_FIGURE": - figure_caption = "" + figure_caption = None + if "Relationships" in block: + word_ids = [] for child_id in block["Relationships"][0]["Ids"]: child_block = id2block[child_id] - if child_block["BlockType"] == "LINE": - figure_caption += child_block.get("Text", "") + " " - if not figure_caption: - figure_caption = "No caption" + for word_id in child_block["Relationships"][0]["Ids"]: + word_ids.append(word_id) + words_not_in_table, relevant_table_ids = self._find_words_in_tables( + word_ids + ) + figure_caption = " ".join( + [id2block[word_id]["Text"] for word_id in words_not_in_table] + ) + for table_id in relevant_table_ids: + table_block = id2block[table_id] + table_text = self._generate_table_string(table_block, id2block) + figure_caption += f"\n\n{table_text}" # Extract geometry information geometry = block["Geometry"] @@ -215,9 +266,9 @@ def _dfs(self, root, id2block): figure_info_str = str(figure_info) if self.generate_markdown: - yield f"![Figure]({figure_caption.strip()})\n" + yield f"![Figure]({(figure_caption or '').strip() })\n" else: - yield f"[Figure: {figure_caption.strip()}]\n// {figure_info_str}" + yield f"[Figure: {(figure_caption or '').strip()}]\n// {figure_info_str}" if block["BlockType"] == "LINE" and "Text" in block: if self.exclude_figure_text and self.figures: From 58fe81bc3a05a9e52b0e560b45966e9ffb866984 Mon Sep 17 00:00:00 2001 From: Ian Su Date: Tue, 3 Sep 2024 23:24:24 +0800 Subject: [PATCH 4/5] update way to store caption --- .../textractprettyprinter/t_pretty_print_layout.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py index f7d9c174..946adeb3 100644 --- a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py +++ b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py @@ -1,5 +1,6 @@ import os import warnings +import json import logging from trp.trp2 import TDocument from typing import List, Dict, Any, Tuple @@ -231,7 +232,7 @@ def _dfs(self, root, id2block): ) elif block["BlockType"] == "LAYOUT_FIGURE": - figure_caption = None + figure_caption = "" if "Relationships" in block: word_ids = [] @@ -260,15 +261,16 @@ def _dfs(self, root, id2block): "bounding_box": bounding_box, "polygon": polygon, "page": block.get("Page", 1), + "caption": figure_caption, } # Convert figure_info to a string representation - figure_info_str = str(figure_info) + figure_info_str = json.dumps(figure_info) if self.generate_markdown: - yield f"![Figure]({(figure_caption or '').strip() })\n" + yield f"![Figure]\n" else: - yield f"[Figure: {(figure_caption or '').strip()}]\n// {figure_info_str}" + yield f"[Figure]\n// {figure_info_str}" if block["BlockType"] == "LINE" and "Text" in block: if self.exclude_figure_text and self.figures: From b7181c6bd0dc268418127d5ddbf6825e56bb33ff Mon Sep 17 00:00:00 2001 From: Ian Su Date: Tue, 3 Sep 2024 23:34:23 +0800 Subject: [PATCH 5/5] only show table with enough word_ids --- .../t_pretty_print_layout.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py index 946adeb3..ac26d62a 100644 --- a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py +++ b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py @@ -113,7 +113,7 @@ def _validate_block_skip(self, blockType: str) -> bool: def _find_words_in_tables( self, word_ids: List[str] - ) -> Tuple[List[str], Dict[int, List[str]]]: + ) -> Tuple[List[str], List[Tuple[str, float]]]: """ Check which word_ids are part of table cells and which are not. @@ -121,17 +121,18 @@ def _find_words_in_tables( word_ids (List[str]): List of word IDs to check. Returns: - Tuple[List[str], Dict[int, List[str]]]: A tuple containing: + Tuple[List[str], List[Tuple[str, float]]]: A tuple containing: - List of word IDs not in any table - - Dictionary mapping table indices to lists of word IDs they contain + - List of tuples containing table IDs and the ratio of word IDs in the table """ words_not_in_table = set(word_ids) - relevant_tables = set() + relevant_tables = [] for table in self.tables: table_words = set(table["word_ids"]) & set(word_ids) if table_words: - relevant_tables.add(table["id"]) + ratio = len(table_words) / len(table["word_ids"]) + relevant_tables.append((table["id"], ratio)) words_not_in_table -= table_words return list(words_not_in_table), relevant_tables @@ -240,16 +241,19 @@ def _dfs(self, root, id2block): child_block = id2block[child_id] for word_id in child_block["Relationships"][0]["Ids"]: word_ids.append(word_id) - words_not_in_table, relevant_table_ids = self._find_words_in_tables( - word_ids + words_not_in_table, relevant_table_infos = ( + self._find_words_in_tables(word_ids) ) figure_caption = " ".join( [id2block[word_id]["Text"] for word_id in words_not_in_table] ) - for table_id in relevant_table_ids: - table_block = id2block[table_id] - table_text = self._generate_table_string(table_block, id2block) - figure_caption += f"\n\n{table_text}" + for table_id, ratio in relevant_table_infos: + if ratio > 0.9: + table_block = id2block[table_id] + table_text = self._generate_table_string( + table_block, id2block + ) + figure_caption += f"\n\n{table_text}" # Extract geometry information geometry = block["Geometry"]