File tree Expand file tree Collapse file tree 5 files changed +101
-0
lines changed Expand file tree Collapse file tree 5 files changed +101
-0
lines changed Original file line number Diff line number Diff line change
1
+ import camelot
2
+ import pandas as pd
3
+
4
+
5
# Camelot is run twice over the same PDF, once per parsing flavor:
#   * lattice -> looks for clearly defined borders / ruling lines that form
#     a grid between rows and columns.
#   * stream  -> analyzes text positioning and the whitespace between text,
#     for tables whose structure is maintained only through spacing.
lattice_tables = camelot.read_pdf(
    'documents/safari.pdf',
    pages='all',
    flavor='lattice',
    suppress_stdout=False,
)
stream_tables = camelot.read_pdf(
    'documents/safari.pdf',
    pages='all',
    flavor='stream',
    suppress_stdout=False,
)

# Dump every detected table as a DataFrame, lattice results first, each one
# preceded by a label naming the flavor that produced it.
for label, found in (
    ('Lattice Table', lattice_tables),
    ('Stream Table', stream_tables),
):
    for table in found:
        print(label)
        print(table.df)
19
+
Original file line number Diff line number Diff line change
1
+ # pip3 install llmwhisperer-client
2
+
3
+ import time
4
+ from unstract .llmwhisperer import LLMWhispererClientV2
5
+
6
# Client for the LLMWhisperer v2 API. Replace the placeholder with a real key
# before running.
client = LLMWhispererClientV2(
    base_url="https://llmwhisperer-api.us-central.unstract.com/api/v2",
    api_key='<YOUR API KEY>',
)

# Submit the document for extraction. The response carries the
# "whisper_hash" that identifies this job in the status/retrieve calls below.
result = client.whisper(file_path="documents/scan-biogenx.pdf")

print(result)

# Poll the job every 5 seconds until it reports "processed", then fetch the
# extraction result.
# NOTE(review): there is no exit for a job that fails or never reaches
# "processed"; consider handling the service's other terminal statuses or
# adding a timeout -- confirm the status values against the client docs.
while True:
    status = client.whisper_status(whisper_hash=result["whisper_hash"])
    # This branch was an `elif` with no preceding `if`, which is a
    # SyntaxError; it has to be a plain `if`.
    if status["status"] == "processed":
        resultx = client.whisper_retrieve(
            whisper_hash=result["whisper_hash"]
        )
        break
    time.sleep(5)

# The extracted text is nested under extraction -> result_text.
extracted_text = resultx['extraction']['result_text']

print(extracted_text)
Original file line number Diff line number Diff line change
1
+ import pdfplumber
2
+ import pandas as pd
3
+
4
# Collected tables as {'page': 1-based page number, 'data': table rows}.
tables = []

with pdfplumber.open('documents/safari.pdf') as pdf:
    # enumerate() supplies the 1-based page number directly instead of
    # searching pdf.pages with .index(page) for every table found.
    for page_number, page in enumerate(pdf.pages, start=1):
        # Text-based strategies: column/row boundaries are derived from the
        # text layout rather than from drawn lines.
        tables_on_page = page.extract_tables({
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
            "intersection_x_tolerance": 10,
            "intersection_y_tolerance": 10,
        })

        # Skip pages with no tables and drop empty tables.
        for table in tables_on_page or ():
            if table:
                tables.append({
                    'page': page_number,
                    'data': table,
                })

# Print each table as a DataFrame, followed by a separator line.
for table in tables:
    print(f"\n Table from page {table['page']} :")
    print(pd.DataFrame(table['data']))
    print("-" * 50)
Original file line number Diff line number Diff line change
1
+ import PyPDF2
2
+
3
def extract_pdf_text(pdf_path='documents/safari.pdf'):
    """Return the text of every page in the PDF at *pdf_path*.

    Each page's text is followed by the same separator string the original
    script appended after every page.

    The original script ended with a module-level ``return text``, which is
    a SyntaxError; wrapping the logic in a function makes the ``return``
    valid.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Iterate the pages directly instead of indexing via range(len(...)).
        return "".join(
            page.extract_text() + "\n \n " for page in pdf_reader.pages
        )


text = extract_pdf_text()

# Show the result, as the other extraction scripts do with theirs.
print(text)
17
+
Original file line number Diff line number Diff line change
1
+ import tabula
2
+ import pandas as pd
3
+
4
# Extract every table from every page of the PDF as a list of DataFrames.
# NOTE(review): lattice and stream are both enabled here; they look like
# alternative extraction modes, so one of them may be ignored -- confirm
# against the tabula-py documentation and keep only the intended one.
tables = tabula.read_pdf(
    'documents/safari.pdf',
    pages='all',
    multiple_tables=True,
    lattice=True,  # For tables with borders
    stream=True,  # For tables without borders
    guess=False,
    pandas_options={'header': None},
)

# The index produced by enumerate() was never used, so iterate directly.
for table in tables:
    print(table)
16
+
You can’t perform that action at this time.
0 commit comments