Added PDF extraction code

NeuralNine · NeuralNine · commit 353dbe3cd0c8 · 2025-03-07T19:31:13.000+01:00
diff --git a/PDF Table Extraction/extract_camelot.py b/PDF Table Extraction/extract_camelot.py
@@ -0,0 +1,19 @@
+import camelot
+import pandas as pd
+
+
+# Both flavors are run over every page of the same document.
+PDF_PATH = 'documents/safari.pdf'
+
+# Lattice -> relies on visible ruling lines that form a grid between rows and columns
+lattice_tables = camelot.read_pdf(PDF_PATH, pages='all', flavor='lattice', suppress_stdout=False)
+# Stream -> relies on text positioning and the spacing between text when no borders are drawn
+stream_tables = camelot.read_pdf(PDF_PATH, pages='all', flavor='stream', suppress_stdout=False)
+
+# Print each flavor's tables under a heading naming that flavor.
+labelled_results = (('Lattice Table', lattice_tables), ('Stream Table', stream_tables))
+for heading, extracted in labelled_results:
+    for table in extracted:
+        print(heading)
+        print(table.df)
+
diff --git a/PDF Table Extraction/extract_llmwhisperer.py b/PDF Table Extraction/extract_llmwhisperer.py
@@ -0,0 +1,23 @@
+# pip3 install llmwhisperer-client
+
+import time
+from unstract.llmwhisperer import LLMWhispererClientV2
+
+client = LLMWhispererClientV2(base_url="https://llmwhisperer-api.us-central.unstract.com/api/v2", api_key='<YOUR API KEY>')
+
+# Submit the document; the response carries the hash that identifies the job below.
+result = client.whisper(file_path="documents/scan-biogenx.pdf")
+print(result)
+
+# Poll the job every 5 seconds and fetch the extraction once it is processed.
+# NOTE(review): a job that never reaches "processed" keeps this loop running - confirm
+# the API's terminal status values and bail out on them.
+while True:
+    status = client.whisper_status(whisper_hash=result["whisper_hash"])
+    if status["status"] == "processed":
+        resultx = client.whisper_retrieve(whisper_hash=result["whisper_hash"])
+        break
+    time.sleep(5)
+
+extracted_text = resultx['extraction']['result_text']
+print(extracted_text)
diff --git a/PDF Table Extraction/extract_pdfplumber.py b/PDF Table Extraction/extract_pdfplumber.py
@@ -0,0 +1,26 @@
+import pdfplumber
+import pandas as pd
+
+# Table-finder settings, passed to every page unchanged. The "text" strategies
+# derive cell borders from the text itself rather than from drawn lines.
+TABLE_SETTINGS = {
+    "vertical_strategy": "text",
+    "horizontal_strategy": "text",
+    "intersection_x_tolerance": 10,
+    "intersection_y_tolerance": 10,
+}
+
+# Each entry records the 1-based page position and the raw rows of one table.
+tables = []
+
+with pdfplumber.open('documents/safari.pdf') as pdf:
+    # enumerate() supplies the page position without re-searching pdf.pages per table.
+    for page_number, page in enumerate(pdf.pages, start=1):
+        for rows in page.extract_tables(TABLE_SETTINGS):
+            if rows:  # skip tables that came back empty
+                tables.append({'page': page_number, 'data': rows})
+
+for table in tables:
+    print(f"\nTable from page {table['page']}:")
+    print(pd.DataFrame(table['data']))
+    print("-" * 50)
diff --git a/PDF Table Extraction/extract_py2pdf.py b/PDF Table Extraction/extract_py2pdf.py
@@ -0,0 +1,17 @@
+import PyPDF2
+
+# Collect the extracted text of every page in the document, then print it
+# as one string.
+with open('documents/safari.pdf', 'rb') as file:
+    pdf_reader = PyPDF2.PdfReader(file)
+
+    # Pages are read while the file is still open.
+    page_texts = [page.extract_text() for page in pdf_reader.pages]
+
+# Follow each page's text with a blank line so page boundaries stay visible.
+text = "".join(page_text + "\n\n" for page_text in page_texts)
+
+# A module-level `return` is a SyntaxError; this is a top-level script, so print
+# the result instead.
+print(text)
+        
diff --git a/PDF Table Extraction/extract_tabula.py b/PDF Table Extraction/extract_tabula.py
@@ -0,0 +1,16 @@
+import tabula
+import pandas as pd
+
+# NOTE(review): lattice and stream select alternative extraction modes; confirm which applies when both are set.
+tables = tabula.read_pdf(
+    'documents/safari.pdf',
+    pages='all',
+    multiple_tables=True,
+    lattice=True,      # For tables with borders
+    stream=True,       # For tables without borders
+    guess=False,
+    pandas_options={'header': None},
+)
+
+for table in tables:
+    print(table)