Skip to content

Commit 353dbe3

Browse files
committed
Added PDF extraction code
1 parent 3d9af42 commit 353dbe3

File tree

5 files changed

+101
-0
lines changed

5 files changed

+101
-0
lines changed
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
import camelot
2+
import pandas as pd
3+
4+
5+
# Run camelot over every page of the PDF with both table-detection flavors so
# the results can be compared:
#   lattice -> expects visible ruling lines between rows and columns (a grid)
#   stream  -> infers the table from text positioning and the gaps between text
lattice_tables = camelot.read_pdf(
    'documents/safari.pdf',
    pages='all',
    flavor='lattice',
    suppress_stdout=False,
)
stream_tables = camelot.read_pdf(
    'documents/safari.pdf',
    pages='all',
    flavor='stream',
    suppress_stdout=False,
)

# Print each detected table as a DataFrame under its flavor label; all lattice
# results come first, then all stream results.
for label, found in (
    ('Lattice Table', lattice_tables),
    ('Stream Table', stream_tables),
):
    for table in found:
        print(label)
        print(table.df)
19+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# pip3 install llmwhisperer-client
2+
3+
import time
4+
from unstract.llmwhisperer import LLMWhispererClientV2
5+
6+
# Client for the hosted LLMWhisperer v2 API. Replace the placeholder key
# before running.
client = LLMWhispererClientV2(
    base_url="https://llmwhisperer-api.us-central.unstract.com/api/v2",
    api_key='<YOUR API KEY>',
)

# Submit the scanned PDF. The response carries the "whisper_hash" that the
# status and retrieve calls below use to identify this job.
result = client.whisper(file_path="documents/scan-biogenx.pdf")

print(result)

# Poll every 5 seconds until the job is reported as processed, then fetch the
# extraction result.
while True:
    status = client.whisper_status(whisper_hash=result["whisper_hash"])
    if status["status"] == "processed":
        resultx = client.whisper_retrieve(
            whisper_hash=result["whisper_hash"]
        )
        break
    # Stop instead of polling forever when the job cannot complete.
    # NOTE(review): "error" is assumed to be the failure status reported by
    # the v2 status API -- confirm against the service documentation.
    if status["status"] == "error":
        raise RuntimeError(f"LLMWhisperer extraction failed: {status}")
    time.sleep(5)

# The plain text of the document sits under extraction -> result_text.
extracted_text = resultx['extraction']['result_text']

print(extracted_text)
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import pdfplumber
2+
import pandas as pd
3+
4+
# Collected tables as {'page': 1-based page number, 'data': table rows}.
tables = []

with pdfplumber.open('documents/safari.pdf') as pdf:
    # enumerate() supplies the 1-based page number directly, avoiding a
    # linear pdf.pages.index(page) search for every table found.
    for page_number, page in enumerate(pdf.pages, start=1):
        # Text-based strategies: infer row/column boundaries from the text
        # itself, with a tolerance of 10 when matching intersections.
        tables_on_page = page.extract_tables({
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
            "intersection_x_tolerance": 10,
            "intersection_y_tolerance": 10,
        })

        # Skip pages without tables and drop empty tables.
        for table in tables_on_page or []:
            if table:
                tables.append({
                    'page': page_number,
                    'data': table,
                })

# Print every table as a DataFrame, followed by a separator line.
for table in tables:
    print(f"\nTable from page {table['page']}:")
    print(pd.DataFrame(table['data']))
    print("-" * 50)
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import PyPDF2
2+
3+
# Extract the text of every page of the PDF, separating pages with a blank line.
with open('documents/safari.pdf', 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)

    # Extraction happens while the file is still open because the reader
    # pulls page content from the file handle. Each page's text is followed
    # by "\n\n", exactly as the accumulated string was built before.
    text = "".join(
        page.extract_text() + "\n\n" for page in pdf_reader.pages
    )

# A module-level `return` is a SyntaxError; print the text as the other
# extraction scripts do.
print(text)
17+
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import tabula
2+
import pandas as pd
3+
4+
# Options shared by both extraction passes: scan every page, allow several
# tables per page, disable area guessing, and treat no row as a header.
read_options = {
    'pages': 'all',
    'multiple_tables': True,
    'guess': False,
    'pandas_options': {'header': None},
}

# `lattice` and `stream` each force one extraction mode, so asking for both in
# a single call is contradictory. Run one pass per mode instead.
lattice_tables = tabula.read_pdf(
    'documents/safari.pdf',
    lattice=True,  # For tables with borders
    **read_options,
)
stream_tables = tabula.read_pdf(
    'documents/safari.pdf',
    stream=True,  # For tables without borders
    **read_options,
)

# Lattice-mode results first, then stream-mode results.
tables = lattice_tables + stream_tables

for table in tables:
    print(table)
16+

0 commit comments

Comments
 (0)