From 2fb2b5a98b2ba71991307ece34de60146ea2c35b Mon Sep 17 00:00:00 2001
From: Ian Su <ensu.tw@gmail.com>
Date: Tue, 3 Sep 2024 14:00:07 +0800
Subject: [PATCH 1/5] format t_pretty_print_layout

---
 .../t_pretty_print_layout.py                  | 337 ++++++++++++------
 1 file changed, 230 insertions(+), 107 deletions(-)

diff --git a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
index b0f997bc..e9a3a8c1 100644
--- a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
+++ b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
@@ -6,17 +6,20 @@
 
 logger = logging.getLogger(__name__)
 
+
 class LinearizeLayout:
-    def __init__(self, 
-                 textract_json: dict, 
-                 table_format: str = "grid", 
-                 exclude_figure_text: bool=True,
-                 exclude_page_header: bool=False, 
-                 exclude_page_footer: bool=False, 
-                 exclude_page_number: bool=False,
-                 skip_table: bool=False,
-                 save_txt_path: str=None, 
-                 generate_markdown: bool=False):
+    def __init__(
+        self,
+        textract_json: dict,
+        table_format: str = "grid",
+        exclude_figure_text: bool = True,
+        exclude_page_header: bool = False,
+        exclude_page_footer: bool = False,
+        exclude_page_number: bool = False,
+        skip_table: bool = False,
+        save_txt_path: str = None,
+        generate_markdown: bool = False,
+    ):
         self.j = textract_json
         self.table_format = table_format
         self.exclude_figure_text = exclude_figure_text
@@ -27,23 +30,31 @@ def __init__(self,
         self.save_txt_path = save_txt_path
         self.generate_markdown = generate_markdown
         self.figures = []
-        
+
     def _get_layout_blocks(self) -> tuple:
         """Get all blocks of type 'LAYOUT' and a dictionary of Ids mapped to their corresponding block."""
-        layouts = [{"Id": x['Id'], "Page": x.get('Page',1)} for x in self.j['Blocks'] if x['BlockType'].startswith('LAYOUT')]
-        id2block = {x['Id']: x for x in self.j['Blocks']}
-        self.figures = [{"page": block.get('Page', 1), "geometry": block['Geometry']['BoundingBox']} \
-                        for block in self.j['Blocks'] \
-                        if block['BlockType'] == 'LAYOUT_FIGURE']
+        layouts = [
+            {"Id": x["Id"], "Page": x.get("Page", 1)}
+            for x in self.j["Blocks"]
+            if x["BlockType"].startswith("LAYOUT")
+        ]
+        id2block = {x["Id"]: x for x in self.j["Blocks"]}
+        self.figures = [
+            {"page": block.get("Page", 1), "geometry": block["Geometry"]["BoundingBox"]}
+            for block in self.j["Blocks"]
+            if block["BlockType"] == "LAYOUT_FIGURE"
+        ]
         if not layouts:
-            logger.warning("No LAYOUT information found in Textract response. \
+            logger.warning(
+                "No LAYOUT information found in Textract response. \
                            Please use LAYOUT feature for AnalyzeDocument API call \
-                           for optimum output")
+                           for optimum output"
+            )
         return layouts, id2block
 
     def _geometry_match(self, geom1, geom2, tolerance=0.1):
         """Check if two geometries match within a given tolerance."""
-        for key in ['Width', 'Height', 'Left', 'Top']:
+        for key in ["Width", "Height", "Left", "Top"]:
             if abs(geom1[key] - geom2[key]) > tolerance:
                 return False
         return True
@@ -51,22 +62,26 @@ def _geometry_match(self, geom1, geom2, tolerance=0.1):
     def _is_inside(self, inner_geom, outer_geom):
         """Check if inner geometry is fully contained within the outer geometry."""
         inner_left, inner_top, inner_right, inner_bottom = (
-            inner_geom['Left'], 
-            inner_geom['Top'], 
-            inner_geom['Left'] + inner_geom['Width'], 
-            inner_geom['Top'] + inner_geom['Height']
+            inner_geom["Left"],
+            inner_geom["Top"],
+            inner_geom["Left"] + inner_geom["Width"],
+            inner_geom["Top"] + inner_geom["Height"],
         )
-        
+
         outer_left, outer_top, outer_right, outer_bottom = (
-            outer_geom['Left'], 
-            outer_geom['Top'], 
-            outer_geom['Left'] + outer_geom['Width'], 
-            outer_geom['Top'] + outer_geom['Height']
+            outer_geom["Left"],
+            outer_geom["Top"],
+            outer_geom["Left"] + outer_geom["Width"],
+            outer_geom["Top"] + outer_geom["Height"],
+        )
+
+        return (
+            inner_left >= outer_left
+            and inner_right <= outer_right
+            and inner_top >= outer_top
+            and inner_bottom <= outer_bottom
         )
-        
-        return (inner_left >= outer_left and inner_right <= outer_right and 
-                inner_top >= outer_top and inner_bottom <= outer_bottom)
-    
+
     def _validate_block_skip(self, blockType: str) -> bool:
         if self.exclude_page_header and blockType == "LAYOUT_HEADER":
             return True
@@ -76,7 +91,7 @@ def _validate_block_skip(self, blockType: str) -> bool:
             return True
         else:
             return False
-    
+
     def _dfs(self, root, id2block):
         texts = []
         stack = [(root, 0)]
@@ -84,17 +99,25 @@ def _dfs(self, root, id2block):
         while stack:
             block_id, depth = stack.pop()
             block = id2block[block_id]
-            
+
             if self._validate_block_skip(block["BlockType"]):
                 continue
-            
+
             # Handle LAYOUT_TABLE type
             if not self.skip_table and block["BlockType"] == "LAYOUT_TABLE":
                 table_data = []
                 # Find the matching TABLE block for the LAYOUT_TABLE
                 table_block = None
-                for potential_table in [b for b in self.j['Blocks'] if b['BlockType'] == 'TABLE' and b.get('Page',1) == block.get('Page', 1)]:
-                    if self._geometry_match(block['Geometry']['BoundingBox'], potential_table['Geometry']['BoundingBox']):
+                for potential_table in [
+                    b
+                    for b in self.j["Blocks"]
+                    if b["BlockType"] == "TABLE"
+                    and b.get("Page", 1) == block.get("Page", 1)
+                ]:
+                    if self._geometry_match(
+                        block["Geometry"]["BoundingBox"],
+                        potential_table["Geometry"]["BoundingBox"],
+                    ):
                         table_block = potential_table
                         break
 
@@ -104,22 +127,36 @@ def _dfs(self, root, id2block):
                     max_row = 0
                     max_col = 0
                     for cell_rel in table_block["Relationships"]:
-                        if cell_rel['Type'] == "CHILD":
-                            for cell_id in cell_rel['Ids']:
+                        if cell_rel["Type"] == "CHILD":
+                            for cell_id in cell_rel["Ids"]:
                                 cell_block = id2block[cell_id]
                                 if "Relationships" in cell_block:
-                                    cell_text = " ".join([id2block[line_id]['Text'] for line_id in cell_block["Relationships"][0]['Ids'] if 'Text' in id2block[line_id]])
-                                    row_idx = cell_block['RowIndex']
-                                    col_idx = cell_block['ColumnIndex']
+                                    cell_text = " ".join(
+                                        [
+                                            id2block[line_id]["Text"]
+                                            for line_id in cell_block["Relationships"][
+                                                0
+                                            ]["Ids"]
+                                            if "Text" in id2block[line_id]
+                                        ]
+                                    )
+                                    row_idx = cell_block["RowIndex"]
+                                    col_idx = cell_block["ColumnIndex"]
                                     max_row = max(max_row, row_idx)
                                     max_col = max(max_col, col_idx)
-                                    for r in range(cell_block.get('RowSpan', 1)):
-                                        for c in range(cell_block.get('ColumnSpan', 1)):
-                                            if "EntityTypes" in cell_block and "COLUMN_HEADER" in cell_block["EntityTypes"]:
+                                    for r in range(cell_block.get("RowSpan", 1)):
+                                        for c in range(cell_block.get("ColumnSpan", 1)):
+                                            if (
+                                                "EntityTypes" in cell_block
+                                                and "COLUMN_HEADER"
+                                                in cell_block["EntityTypes"]
+                                            ):
                                                 headers[col_idx + c] = cell_text
                                             else:
-                                                table_content[(row_idx + r, col_idx + c)] = cell_text
-                    
+                                                table_content[
+                                                    (row_idx + r, col_idx + c)
+                                                ] = cell_text
+
                     table_data = []
                     start_row = 2 if headers else 1
                     for r in range(start_row, max_row + 1):
@@ -129,7 +166,7 @@ def _dfs(self, root, id2block):
                         table_data.append(row_data)
 
                     header_list = [headers.get(c, "") for c in range(1, max_col + 1)]
-                
+
                     try:
                         from tabulate import tabulate
                     except ImportError:
@@ -137,27 +174,42 @@ def _dfs(self, root, id2block):
                             "Could not import tabulate python package. "
                             "Please install it with `pip install tabulate`."
                         )
-                        
+
                     tab_fmt = "pipe" if self.generate_markdown else self.table_format
-                    '''If Markdown is enabled then default to pipe for tables'''
-                    
-                    table_text = tabulate(table_data, headers=header_list, tablefmt=tab_fmt)
+                    """If Markdown is enabled then default to pipe for tables"""
+
+                    table_text = tabulate(
+                        table_data, headers=header_list, tablefmt=tab_fmt
+                    )
                     yield table_text
                     continue
                 else:
-                    logger.warning("LAYOUT_TABLE detected but TABLES feature was not provided in API call. \
-                                  Inlcuding TABLES feature may improve the layout output")
-                    
+                    logger.warning(
+                        "LAYOUT_TABLE detected but TABLES feature was not provided in API call. \
+                                  Inlcuding TABLES feature may improve the layout output"
+                    )
+
             if block["BlockType"] == "LINE" and "Text" in block:
                 if self.exclude_figure_text and self.figures:
-                    if any(self._is_inside(block['Geometry']['BoundingBox'], figure_geom["geometry"]) \
-                           for figure_geom in self.figures if figure_geom["page"] == block.get("Page",1)):
+                    if any(
+                        self._is_inside(
+                            block["Geometry"]["BoundingBox"], figure_geom["geometry"]
+                        )
+                        for figure_geom in self.figures
+                        if figure_geom["page"] == block.get("Page", 1)
+                    ):
                         continue
-                yield block['Text']
-            elif block["BlockType"] in ["LAYOUT_TITLE", "LAYOUT_SECTION_HEADER"] and "Relationships" in block:
+                yield block["Text"]
+            elif (
+                block["BlockType"] in ["LAYOUT_TITLE", "LAYOUT_SECTION_HEADER"]
+                and "Relationships" in block
+            ):
                 # Get the associated LINE text for the layout
-                line_texts = [id2block[line_id]['Text'] for line_id in block["Relationships"][0]['Ids']]
-                combined_text = ' '.join(line_texts)
+                line_texts = [
+                    id2block[line_id]["Text"]
+                    for line_id in block["Relationships"][0]["Ids"]
+                ]
+                combined_text = " ".join(line_texts)
 
                 # Prefix with appropriate markdown
                 if self.generate_markdown:
@@ -166,32 +218,38 @@ def _dfs(self, root, id2block):
                     elif block["BlockType"] == "LAYOUT_SECTION_HEADER":
                         combined_text = f"## {combined_text}"
                 yield combined_text
-                
-            if block["BlockType"].startswith('LAYOUT') and block["BlockType"] not in ["LAYOUT_TITLE", "LAYOUT_SECTION_HEADER"]:
+
+            if block["BlockType"].startswith("LAYOUT") and block["BlockType"] not in [
+                "LAYOUT_TITLE",
+                "LAYOUT_SECTION_HEADER",
+            ]:
                 if "Relationships" in block:
                     relationships = block["Relationships"]
-                    children = [(x, depth + 1) for x in relationships[0]['Ids']]            
+                    children = [(x, depth + 1) for x in relationships[0]["Ids"]]
                     stack.extend(reversed(children))
-    
+
     def _save_to_s3(self, page_texts: dict) -> None:
         try:
             import boto3
             import re
-            s3 = boto3.client('s3')            
-            match = re.match(r's3://([^/]+)(?:/(.*))?', self.save_txt_path)
+
+            s3 = boto3.client("s3")
+            match = re.match(r"s3://([^/]+)(?:/(.*))?", self.save_txt_path)
             bucket = match.group(1)
             prefix = match.group(2) if match.group(2) else ""
-            
+
             for page_number, content in page_texts.items():
                 file_name = f"{page_number}.txt"
                 s3_key = os.path.join(prefix, file_name)
-                logger.debug(f"Writing linearized text for page {page_number} to bucket {bucket} file {s3_key}")
-                s3.put_object(Body=content, 
-                              Bucket=bucket, 
-                              Key=s3_key)
+                logger.debug(
+                    f"Writing linearized text for page {page_number} to bucket {bucket} file {s3_key}"
+                )
+                s3.put_object(Body=content, Bucket=bucket, Key=s3_key)
         except ImportError:
-            logger.error("Could not import boto3 python package. \
-                          Please install it with `pip install boto3`.")
+            logger.error(
+                "Could not import boto3 python package. \
+                          Please install it with `pip install boto3`."
+            )
             raise ModuleNotFoundError(
                 "Could not import boto3 python package. "
                 "Please install it with `pip install boto3`."
@@ -199,36 +257,40 @@ def _save_to_s3(self, page_texts: dict) -> None:
         except Exception as e:
             logger.error(e)
             raise e
-    
+
     def _save_to_files(self, page_texts: dict) -> None:
         path = self.save_txt_path.rstrip(os.sep)
-        if path.startswith('s3://'):
+        if path.startswith("s3://"):
             self._save_to_s3(page_texts=page_texts)
         else:
-            for page_number, content in page_texts.items():            
+            for page_number, content in page_texts.items():
                 file_path = os.path.join(path, f"{page_number}.txt")
-                logger.debug(f"Writing linearized text for page {page_number} to file {file_path}")
+                logger.debug(
+                    f"Writing linearized text for page {page_number} to file {file_path}"
+                )
                 with open(file_path, "w") as f:
                     f.write(content)
-                
+
     def get_text(self) -> dict:
         """Retrieve the text content in specified format. Default is CSV. Options: "csv", "markdown"."""
         # texts = []
         page_texts = {}
         layouts, id2block = self._get_layout_blocks()
         for layout in layouts:
-            root = layout['Id']
-            page_number = layout.get('Page', 1)
+            root = layout["Id"]
+            page_number = layout.get("Page", 1)
             if page_number not in page_texts:
                 page_texts[page_number] = ""
-            page_texts[page_number] += '\n'.join(self._dfs(root, id2block))+ "\n\n"
+            page_texts[page_number] += "\n".join(self._dfs(root, id2block)) + "\n\n"
         if self.save_txt_path:
             self._save_to_files(page_texts)
         return page_texts
 
+
 def string_counter():
     # Dictionary to keep track of the occurrences of each string
     occurrences = {}
+
     def counter(string):
         if string in occurrences:
             occurrences[string] += 1
@@ -238,6 +300,7 @@ def counter(string):
 
     return counter
 
+
 def get_layout_csv_from_trp2(trp2_doc: TDocument) -> List[List[List[str]]]:
     """
     Generate the layout.csv from the Amazon Textract Web Console download
@@ -246,25 +309,40 @@ def get_layout_csv_from_trp2(trp2_doc: TDocument) -> List[List[List[str]]]:
     'Page number','Layout','Text','Reading Order','Confidence score'
 
     Page number     : Starting at 1, incrementing for each page
-    Layout          : The BlockType + a number indicating the sequence for 
-                      this BlockType starting at 1 and for LAYOUT_LIST elements 
+    Layout          : The BlockType + a number indicating the sequence for
+                      this BlockType starting at 1 and for LAYOUT_LIST elements
                       the string:  "- part of LAYOUT_LIST (index)" is added
     Text            : The underlying text (except LAYOUT_LIST and LAYOUT_FIGURE )
     Reading Order   : Increasing int for each LAYOUT element starting with 0
     Confidence score: Confidence in this being a LAYOUT element
     """
-    result_value:List[List[List[str]]] = list()
+    result_value: List[List[List[str]]] = list()
 
     counter_instance = string_counter()
     for page_number, page in enumerate(trp2_doc.pages):
-        page_result:List[List[str]] = list()
+        page_result: List[List[str]] = list()
         processed_ids = []
         relationships: t2.TRelationship = page.get_relationships_for_type()
-        blocks = [trp2_doc.get_block_by_id(id) for id in relationships.ids if relationships.ids]
+        blocks = [
+            trp2_doc.get_block_by_id(id)
+            for id in relationships.ids
+            if relationships.ids
+        ]
         layout_blocks = [
-            block for block in blocks if block.block_type in [
-                "LAYOUT_TITLE", "LAYOUT_HEADER", "LAYOUT_FOOTER", "LAYOUT_SECTION_HEADER", "LAYOUT_PAGE_NUMBER",
-                "LAYOUT_LIST", "LAYOUT_FIGURE", "LAYOUT_TABLE", "LAYOUT_KEY_VALUE", "LAYOUT_TEXT"
+            block
+            for block in blocks
+            if block.block_type
+            in [
+                "LAYOUT_TITLE",
+                "LAYOUT_HEADER",
+                "LAYOUT_FOOTER",
+                "LAYOUT_SECTION_HEADER",
+                "LAYOUT_PAGE_NUMBER",
+                "LAYOUT_LIST",
+                "LAYOUT_FIGURE",
+                "LAYOUT_TABLE",
+                "LAYOUT_KEY_VALUE",
+                "LAYOUT_TEXT",
             ]
         ]
         for idx, layout_block in enumerate(layout_blocks):
@@ -275,31 +353,63 @@ def get_layout_csv_from_trp2(trp2_doc: TDocument) -> List[List[List[str]]]:
             if layout_block.block_type == "LAYOUT_LIST":
                 # first print out the LAYOUT_LIST
                 block_type_count = counter_instance(layout_block.block_type)
-                page_result.append([str(page_number + 1), layout_block.block_type + " " + str(block_type_count), "", str(idx),
-                                              layout_block.block_type, str(layout_block.confidence)])
+                page_result.append(
+                    [
+                        str(page_number + 1),
+                        layout_block.block_type + " " + str(block_type_count),
+                        "",
+                        str(idx),
+                        layout_block.block_type,
+                        str(layout_block.confidence),
+                    ]
+                )
 
                 # print(page_number + 1, layout_block.block_type + " " + str(block_type_count), "", idx, layout_block.block_type)
-                list_context_name = layout_block.block_type + " " + str(block_type_count)
+                list_context_name = (
+                    layout_block.block_type + " " + str(block_type_count)
+                )
                 # now get the relationships
                 list_block_rel = layout_block.get_relationships_for_type()
                 if list_block_rel:
                     # get the text relationships
-                    list_child_blocks = [trp2_doc.get_block_by_id(id) for id in list_block_rel.ids]
+                    list_child_blocks = [
+                        trp2_doc.get_block_by_id(id) for id in list_block_rel.ids
+                    ]
                     for child_idx, list_child_block in enumerate(list_child_blocks):
                         block_type_count = counter_instance(list_child_block.block_type)
-                        child_block_relation_text = list_child_block.block_type + " " + str(
-                            block_type_count) + " - part of " + list_context_name
+                        child_block_relation_text = (
+                            list_child_block.block_type
+                            + " "
+                            + str(block_type_count)
+                            + " - part of "
+                            + list_context_name
+                        )
                         # get the text, meaning get all the child relationships
-                        layout_child_block_rel = list_child_block.get_relationships_for_type()
+                        layout_child_block_rel = (
+                            list_child_block.get_relationships_for_type()
+                        )
                         layout_child_line_blocks = [
-                            trp2_doc.get_block_by_id(id) for id in layout_child_block_rel.ids
+                            trp2_doc.get_block_by_id(id)
+                            for id in layout_child_block_rel.ids
                             if layout_child_block_rel.ids
                         ]
                         # get the text, but not for figures
-                        layout_text = trp2_doc.get_text_for_tblocks(
-                            layout_child_line_blocks) if list_child_block.block_type != "LAYOUT_FIGURE" else ""
+                        layout_text = (
+                            trp2_doc.get_text_for_tblocks(layout_child_line_blocks)
+                            if list_child_block.block_type != "LAYOUT_FIGURE"
+                            else ""
+                        )
 
-                        page_result.append([str(page_number + 1), child_block_relation_text, layout_text, str(idx + child_idx + 1), list_child_block.block_type, str(list_child_block.confidence)])
+                        page_result.append(
+                            [
+                                str(page_number + 1),
+                                child_block_relation_text,
+                                layout_text,
+                                str(idx + child_idx + 1),
+                                list_child_block.block_type,
+                                str(list_child_block.confidence),
+                            ]
+                        )
                         # print(page_number + 1, child_block_relation_text, layout_text, idx + child_idx + 1,
                         #         list_child_block.block_type)
                         processed_ids.append(list_child_block.id)
@@ -308,18 +418,31 @@ def get_layout_csv_from_trp2(trp2_doc: TDocument) -> List[List[List[str]]]:
 
                 # Bug fix - #284
                 if layout_block_rel is None:
-                    logger.info (f'Block {layout_block} has no relationships')
+                    logger.info(f"Block {layout_block} has no relationships")
                     continue
 
                 layout_blocks = [
-                    trp2_doc.get_block_by_id(id) for id in layout_block_rel.ids if layout_block_rel.ids
+                    trp2_doc.get_block_by_id(id)
+                    for id in layout_block_rel.ids
+                    if layout_block_rel.ids
                 ]
                 # get the text, but not for figures
-                layout_text = trp2_doc.get_text_for_tblocks(
-                    layout_blocks) if layout_block.block_type != "LAYOUT_FIGURE" else ""
+                layout_text = (
+                    trp2_doc.get_text_for_tblocks(layout_blocks)
+                    if layout_block.block_type != "LAYOUT_FIGURE"
+                    else ""
+                )
                 block_type_count = counter_instance(layout_block.block_type)
-                page_result.append([str(page_number + 1), layout_block.block_type + " " + str(block_type_count), layout_text, str(idx),
-                                              layout_block.block_type, str(layout_block.confidence)])
+                page_result.append(
+                    [
+                        str(page_number + 1),
+                        layout_block.block_type + " " + str(block_type_count),
+                        layout_text,
+                        str(idx),
+                        layout_block.block_type,
+                        str(layout_block.confidence),
+                    ]
+                )
                 # print(page_number + 1, layout_block.block_type + " " + str(block_type_count), layout_text, idx, layout_block.block_type)
 
         result_value.append(page_result)

From 78f6de3d123aa771d6410e329f348c3907901cd1 Mon Sep 17 00:00:00 2001
From: Ian Su <ensu.tw@gmail.com>
Date: Tue, 3 Sep 2024 14:00:28 +0800
Subject: [PATCH 2/5] Emit a figure placeholder for LAYOUT_FIGURE blocks in _dfs

---
 .../t_pretty_print_layout.py                  | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
index e9a3a8c1..835c397a 100644
--- a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
+++ b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
@@ -189,6 +189,36 @@ def _dfs(self, root, id2block):
                                   Inlcuding TABLES feature may improve the layout output"
                     )
 
+            elif block["BlockType"] == "LAYOUT_FIGURE":
+                figure_caption = ""
+                if "Relationships" in block:
+                    for child_id in block["Relationships"][0]["Ids"]:
+                        child_block = id2block[child_id]
+                        if child_block["BlockType"] == "LINE":
+                            figure_caption += child_block.get("Text", "") + " "
+                if not figure_caption:
+                    figure_caption = "No caption"
+
+                # Extract geometry information
+                geometry = block["Geometry"]
+                bounding_box = geometry["BoundingBox"]
+                polygon = geometry["Polygon"]
+
+                # Create a dictionary with figure information
+                figure_info = {
+                    "bounding_box": bounding_box,
+                    "polygon": polygon,
+                    "page": block.get("Page", 1),
+                }
+
+                # Convert figure_info to a string representation
+                figure_info_str = str(figure_info)
+
+                if self.generate_markdown:
+                    yield f"![Figure]({figure_caption.strip()})\n<!-- {figure_info_str} -->"
+                else:
+                    yield f"[Figure: {figure_caption.strip()}]\n// {figure_info_str}"
+
             if block["BlockType"] == "LINE" and "Text" in block:
                 if self.exclude_figure_text and self.figures:
                     if any(

From 0a6695cef980798b569be395400dbb0d299c513c Mon Sep 17 00:00:00 2001
From: Ian Su <ensu.tw@gmail.com>
Date: Tue, 3 Sep 2024 16:11:37 +0800
Subject: [PATCH 3/5] Add ability to generate tables under figures

---
 .../t_pretty_print_layout.py                  | 185 +++++++++++-------
 1 file changed, 118 insertions(+), 67 deletions(-)

diff --git a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
index 835c397a..f7d9c174 100644
--- a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
+++ b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
@@ -2,7 +2,7 @@
 import warnings
 import logging
 from trp.trp2 import TDocument
-from typing import List
+from typing import List, Dict, Any, Tuple
 
 logger = logging.getLogger(__name__)
 
@@ -30,6 +30,7 @@ def __init__(
         self.save_txt_path = save_txt_path
         self.generate_markdown = generate_markdown
         self.figures = []
+        self.tables = []
 
     def _get_layout_blocks(self) -> tuple:
         """Get all blocks of type 'LAYOUT' and a dictionary of Ids mapped to their corresponding block."""
@@ -44,6 +45,23 @@ def _get_layout_blocks(self) -> tuple:
             for block in self.j["Blocks"]
             if block["BlockType"] == "LAYOUT_FIGURE"
         ]
+        self.tables = [
+            {
+                "page": block.get("Page", 1),
+                "id": block.get("Id", ""),
+                "word_ids": [
+                    word_id
+                    for relationship in block.get("Relationships", [])
+                    if relationship["Type"] == "CHILD"
+                    for cell_id in relationship.get("Ids", [])
+                    for cell_rel in id2block[cell_id].get("Relationships", [])
+                    if cell_rel["Type"] == "CHILD"
+                    for word_id in cell_rel.get("Ids", [])
+                ],
+            }
+            for block in self.j["Blocks"]
+            if block["BlockType"] == "TABLE"
+        ]
         if not layouts:
             logger.warning(
                 "No LAYOUT information found in Textract response. \
@@ -92,6 +110,87 @@ def _validate_block_skip(self, blockType: str) -> bool:
         else:
             return False
 
+    def _find_words_in_tables(
+        self, word_ids: List[str]
+    ) -> Tuple[List[str], Dict[int, List[str]]]:
+        """
+        Check which word_ids are part of table cells and which are not.
+
+        Args:
+            word_ids (List[str]): List of word IDs to check.
+
+        Returns:
+            Tuple[List[str], Dict[int, List[str]]]: A tuple containing:
+                - List of word IDs not in any table
+                - Dictionary mapping table indices to lists of word IDs they contain
+        """
+        words_not_in_table = set(word_ids)
+        relevant_tables = set()
+        for table in self.tables:
+            table_words = set(table["word_ids"]) & set(word_ids)
+
+            if table_words:
+                relevant_tables.add(table["id"])
+                words_not_in_table -= table_words
+
+        return list(words_not_in_table), relevant_tables
+
+    def _generate_table_string(self, table_block, id2block):
+        table_content = {}
+        headers = {}
+        max_row = 0
+        max_col = 0
+        for cell_rel in table_block["Relationships"]:
+            if cell_rel["Type"] == "CHILD":
+                for cell_id in cell_rel["Ids"]:
+                    cell_block = id2block[cell_id]
+                    if "Relationships" in cell_block:
+                        cell_text = " ".join(
+                            [
+                                id2block[line_id]["Text"]
+                                for line_id in cell_block["Relationships"][0]["Ids"]
+                                if "Text" in id2block[line_id]
+                            ]
+                        )
+                        row_idx = cell_block["RowIndex"]
+                        col_idx = cell_block["ColumnIndex"]
+                        max_row = max(max_row, row_idx)
+                        max_col = max(max_col, col_idx)
+                        for r in range(cell_block.get("RowSpan", 1)):
+                            for c in range(cell_block.get("ColumnSpan", 1)):
+                                if (
+                                    "EntityTypes" in cell_block
+                                    and "COLUMN_HEADER" in cell_block["EntityTypes"]
+                                ):
+                                    headers[col_idx + c] = cell_text
+                                else:
+                                    table_content[(row_idx + r, col_idx + c)] = (
+                                        cell_text
+                                    )
+
+        table_data = []
+        start_row = 2 if headers else 1
+        for r in range(start_row, max_row + 1):
+            row_data = []
+            for c in range(1, max_col + 1):
+                row_data.append(table_content.get((r, c), ""))
+            table_data.append(row_data)
+
+        header_list = [headers.get(c, "") for c in range(1, max_col + 1)]
+
+        try:
+            from tabulate import tabulate
+        except ImportError:
+            raise ModuleNotFoundError(
+                "Could not import tabulate python package. "
+                "Please install it with `pip install tabulate`."
+            )
+
+        tab_fmt = "pipe" if self.generate_markdown else self.table_format
+        """If Markdown is enabled then default to pipe for tables"""
+
+        return tabulate(table_data, headers=header_list, tablefmt=tab_fmt)
+
     def _dfs(self, root, id2block):
         texts = []
         stack = [(root, 0)]
@@ -122,65 +221,7 @@ def _dfs(self, root, id2block):
                         break
 
                 if table_block and "Relationships" in table_block:
-                    table_content = {}
-                    headers = {}
-                    max_row = 0
-                    max_col = 0
-                    for cell_rel in table_block["Relationships"]:
-                        if cell_rel["Type"] == "CHILD":
-                            for cell_id in cell_rel["Ids"]:
-                                cell_block = id2block[cell_id]
-                                if "Relationships" in cell_block:
-                                    cell_text = " ".join(
-                                        [
-                                            id2block[line_id]["Text"]
-                                            for line_id in cell_block["Relationships"][
-                                                0
-                                            ]["Ids"]
-                                            if "Text" in id2block[line_id]
-                                        ]
-                                    )
-                                    row_idx = cell_block["RowIndex"]
-                                    col_idx = cell_block["ColumnIndex"]
-                                    max_row = max(max_row, row_idx)
-                                    max_col = max(max_col, col_idx)
-                                    for r in range(cell_block.get("RowSpan", 1)):
-                                        for c in range(cell_block.get("ColumnSpan", 1)):
-                                            if (
-                                                "EntityTypes" in cell_block
-                                                and "COLUMN_HEADER"
-                                                in cell_block["EntityTypes"]
-                                            ):
-                                                headers[col_idx + c] = cell_text
-                                            else:
-                                                table_content[
-                                                    (row_idx + r, col_idx + c)
-                                                ] = cell_text
-
-                    table_data = []
-                    start_row = 2 if headers else 1
-                    for r in range(start_row, max_row + 1):
-                        row_data = []
-                        for c in range(1, max_col + 1):
-                            row_data.append(table_content.get((r, c), ""))
-                        table_data.append(row_data)
-
-                    header_list = [headers.get(c, "") for c in range(1, max_col + 1)]
-
-                    try:
-                        from tabulate import tabulate
-                    except ImportError:
-                        raise ModuleNotFoundError(
-                            "Could not import tabulate python package. "
-                            "Please install it with `pip install tabulate`."
-                        )
-
-                    tab_fmt = "pipe" if self.generate_markdown else self.table_format
-                    """If Markdown is enabled then default to pipe for tables"""
-
-                    table_text = tabulate(
-                        table_data, headers=header_list, tablefmt=tab_fmt
-                    )
+                    table_text = self._generate_table_string(table_block, id2block)
                     yield table_text
                     continue
                 else:
@@ -190,14 +231,24 @@ def _dfs(self, root, id2block):
                     )
 
             elif block["BlockType"] == "LAYOUT_FIGURE":
-                figure_caption = ""
+                figure_caption = None
+
                 if "Relationships" in block:
+                    word_ids = []
                     for child_id in block["Relationships"][0]["Ids"]:
                         child_block = id2block[child_id]
-                        if child_block["BlockType"] == "LINE":
-                            figure_caption += child_block.get("Text", "") + " "
-                if not figure_caption:
-                    figure_caption = "No caption"
+                        for word_id in child_block["Relationships"][0]["Ids"]:
+                            word_ids.append(word_id)
+                    words_not_in_table, relevant_table_ids = self._find_words_in_tables(
+                        word_ids
+                    )
+                    figure_caption = " ".join(
+                        [id2block[word_id]["Text"] for word_id in words_not_in_table]
+                    )
+                    for table_id in relevant_table_ids:
+                        table_block = id2block[table_id]
+                        table_text = self._generate_table_string(table_block, id2block)
+                        figure_caption += f"\n\n{table_text}"
 
                 # Extract geometry information
                 geometry = block["Geometry"]
@@ -215,9 +266,9 @@ def _dfs(self, root, id2block):
                 figure_info_str = str(figure_info)
 
                 if self.generate_markdown:
-                    yield f"![Figure]({figure_caption.strip()})\n<!-- {figure_info_str} -->"
+                    yield f"![Figure]({(figure_caption or '').strip() })\n<!-- {figure_info_str} -->"
                 else:
-                    yield f"[Figure: {figure_caption.strip()}]\n// {figure_info_str}"
+                    yield f"[Figure: {(figure_caption or '').strip()}]\n// {figure_info_str}"
 
             if block["BlockType"] == "LINE" and "Text" in block:
                 if self.exclude_figure_text and self.figures:

From 58fe81bc3a05a9e52b0e560b45966e9ffb866984 Mon Sep 17 00:00:00 2001
From: Ian Su <ensu.tw@gmail.com>
Date: Tue, 3 Sep 2024 23:24:24 +0800
Subject: [PATCH 4/5] Update how the figure caption is stored

---
 .../textractprettyprinter/t_pretty_print_layout.py     | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
index f7d9c174..946adeb3 100644
--- a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
+++ b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
@@ -1,5 +1,6 @@
 import os
 import warnings
+import json
 import logging
 from trp.trp2 import TDocument
 from typing import List, Dict, Any, Tuple
@@ -231,7 +232,7 @@ def _dfs(self, root, id2block):
                     )
 
             elif block["BlockType"] == "LAYOUT_FIGURE":
-                figure_caption = None
+                figure_caption = ""
 
                 if "Relationships" in block:
                     word_ids = []
@@ -260,15 +261,16 @@ def _dfs(self, root, id2block):
                     "bounding_box": bounding_box,
                     "polygon": polygon,
                     "page": block.get("Page", 1),
+                    "caption": figure_caption,
                 }
 
                 # Convert figure_info to a string representation
-                figure_info_str = str(figure_info)
+                figure_info_str = json.dumps(figure_info)
 
                 if self.generate_markdown:
-                    yield f"![Figure]({(figure_caption or '').strip() })\n<!-- {figure_info_str} -->"
+                    yield f"![Figure]\n<!-- {figure_info_str} -->"
                 else:
-                    yield f"[Figure: {(figure_caption or '').strip()}]\n// {figure_info_str}"
+                    yield f"[Figure]\n// {figure_info_str}"
 
             if block["BlockType"] == "LINE" and "Text" in block:
                 if self.exclude_figure_text and self.figures:

From b7181c6bd0dc268418127d5ddbf6825e56bb33ff Mon Sep 17 00:00:00 2001
From: Ian Su <ensu.tw@gmail.com>
Date: Tue, 3 Sep 2024 23:34:23 +0800
Subject: [PATCH 5/5] Only show tables with enough matching word_ids

---
 .../t_pretty_print_layout.py                  | 26 +++++++++++--------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
index 946adeb3..ac26d62a 100644
--- a/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
+++ b/prettyprinter/textractprettyprinter/t_pretty_print_layout.py
@@ -113,7 +113,7 @@ def _validate_block_skip(self, blockType: str) -> bool:
 
     def _find_words_in_tables(
         self, word_ids: List[str]
-    ) -> Tuple[List[str], Dict[int, List[str]]]:
+    ) -> Tuple[List[str], List[Tuple[str, float]]]:
         """
         Check which word_ids are part of table cells and which are not.
 
@@ -121,17 +121,18 @@ def _find_words_in_tables(
             word_ids (List[str]): List of word IDs to check.
 
         Returns:
-            Tuple[List[str], Dict[int, List[str]]]: A tuple containing:
+            Tuple[List[str], List[Tuple[str, float]]]: A tuple containing:
                 - List of word IDs not in any table
-                - Dictionary mapping table indices to lists of word IDs they contain
+                - List of (table ID, ratio) tuples, where ratio is the fraction of that table's word IDs found in word_ids
         """
         words_not_in_table = set(word_ids)
-        relevant_tables = set()
+        relevant_tables = []
         for table in self.tables:
             table_words = set(table["word_ids"]) & set(word_ids)
 
             if table_words:
-                relevant_tables.add(table["id"])
+                ratio = len(table_words) / len(table["word_ids"])
+                relevant_tables.append((table["id"], ratio))
                 words_not_in_table -= table_words
 
         return list(words_not_in_table), relevant_tables
@@ -240,16 +241,19 @@ def _dfs(self, root, id2block):
                         child_block = id2block[child_id]
                         for word_id in child_block["Relationships"][0]["Ids"]:
                             word_ids.append(word_id)
-                    words_not_in_table, relevant_table_ids = self._find_words_in_tables(
-                        word_ids
+                    words_not_in_table, relevant_table_infos = (
+                        self._find_words_in_tables(word_ids)
                     )
                     figure_caption = " ".join(
                         [id2block[word_id]["Text"] for word_id in words_not_in_table]
                     )
-                    for table_id in relevant_table_ids:
-                        table_block = id2block[table_id]
-                        table_text = self._generate_table_string(table_block, id2block)
-                        figure_caption += f"\n\n{table_text}"
+                    for table_id, ratio in relevant_table_infos:
+                        if ratio > 0.9:
+                            table_block = id2block[table_id]
+                            table_text = self._generate_table_string(
+                                table_block, id2block
+                            )
+                            figure_caption += f"\n\n{table_text}"
 
                 # Extract geometry information
                 geometry = block["Geometry"]