1
1
import os
2
+ import fitz
3
+ import pandas as pd
2
4
import tkinter as tk
3
5
from tkinter .filedialog import askdirectory
4
6
from pathlib import Path
5
- from pdfrw import PdfReader
6
7
7
8
class XLSX :
8
9
9
10
def __init__(self):
    """Initialise the state used to track the 'Consultant/Agency' section."""
    # Number of 'Consultant/Agency' header lines seen; incremented by
    # search_for_headers and reset by read_pdf_content.
    self.consultant_check = 0
    # Line index of the most recent 'Consultant/Agency' header, as recorded by
    # search_for_headers; 0 doubles as "no header pending".
    self.consultant_idx = 0
11
13
12
- def find_pdfs_in_downloads (self ):
14
+ def find_pdfs (self ) -> list :
13
15
_default = str (os .path .join (Path .home (), "Downloads" ))
14
16
15
17
tk .Tk ().withdraw ()
@@ -18,25 +20,72 @@ def find_pdfs_in_downloads(self):
18
20
pdf_files = []
19
21
20
22
for root , dirs , files in os .walk (folder ):
21
- sep = str (root ).split ('\\ ' )
23
+ root = str (root ).replace ('\\ ' , '/' )
24
+ sep = str (root ).split ('/' )
22
25
if str (sep [- 1 ]).startswith ('International' ):
23
26
for file in files :
24
27
pdf_files .append (os .path .abspath (os .path .join (root , file )))
25
28
29
+ print ('Paths acquired...' )
30
+
26
31
return pdf_files
27
32
28
def read_pdf_content(self, pdf_files: list) -> list:
    """Extract one row of values from each document in *pdf_files*.

    Every row starts with three values taken from the file's path (the
    grandparent directory, the parent directory, and the file name without
    its last four characters). The remaining values come from the document
    text: each line accepted by ``search_for_headers`` contributes either the
    line itself, the date of birth and gender sliced out of a
    'Date of Birth' line, or the line three below the first
    'Consultant/Agency' header.

    Args:
        pdf_files: Paths of the documents to open with ``fitz``.

    Returns:
        A list holding one list of extracted values per input path.
    """
    rows = []

    for pdf in pdf_files:
        # Normalise Windows separators so the path can be split on '/'.
        parts = str(pdf).replace('\\', '/').split('/')
        row = [
            parts[-3],       # File name
            parts[-2],       # App Type
            parts[-1][:-4],  # Applicant name
        ]

        document = fitz.open(pdf)
        try:
            for page in document:
                lines = page.get_text().splitlines()
                for idx, line in enumerate(lines):
                    if not self.search_for_headers(line, idx):
                        continue

                    if line.startswith('Date of Birth'):
                        row.append(line[15:25])  # DOB
                        row.append(line[33:])    # Gender
                    elif line == 'Consultant/Agency':
                        # Only the first consultant header of a document
                        # contributes a value; repeats are ignored so they
                        # cannot leak into the semester column.
                        if self.consultant_check == 1:
                            answer_idx = idx + 3
                            # Keep the column aligned even when the page ends
                            # before the expected answer line.
                            answer = lines[answer_idx] if answer_idx < len(lines) else ''
                            row.append(answer)  # Consultant Answer
                        # Assignment (not comparison): clear the pending index
                        # so later header lines do not re-append the answer.
                        self.consultant_idx = 0
                    else:
                        row.append(line)  # Semester Entry
        finally:
            # Release the file handle and start the next document with a
            # clean consultant-tracking state, even if extraction failed.
            document.close()
            self.consultant_check = 0
            self.consultant_idx = 0

        rows.append(row)

    print('Lines appended to list...')

    return rows
68
+
69
def search_for_headers(self, line: str, idx: int) -> bool:
    """Report whether *line* is a header line the extractor should handle.

    A line qualifies when it starts with 'Fall', 'Spring' or 'Date of Birth'
    and ``idx`` is at most 20, or when it is exactly 'Consultant/Agency'.
    In the latter case the method also increments ``self.consultant_check``
    and stores ``idx`` in ``self.consultant_idx``.

    Args:
        line: Text of the line being inspected.
        idx: Position of the line, as supplied by the caller.

    Returns:
        True when the line matches one of the headers, False otherwise.
    """
    if str(line).startswith(('Fall', 'Spring', 'Date of Birth')) and idx <= 20:
        return True

    if line == 'Consultant/Agency':
        # Record where the header was found so the caller can locate the
        # answer relative to it.
        self.consultant_check += 1
        self.consultant_idx = idx
        return True

    return False
79
+
80
def generate_xlsx_sheet(self, _list: list) -> None:
    """Write the extracted rows to ``total.xlsx`` in the user's Downloads folder.

    Args:
        _list: One list of values per document, in the column order
            File Name, Type, Name, Semester, DOB, Gender, Info.
    """
    headers = ["File Name", "Type", "Name", "Semester", "DOB", "Gender", "Info"]
    rows = [list(row) for row in _list]

    # to_excel rejects a header list whose length differs from the column
    # count, so give every row the same width: pad short rows and name any
    # surplus columns instead of failing or dropping data.
    width = max([len(headers)] + [len(row) for row in rows])
    headers += [f"Extra {n}" for n in range(1, width - len(headers) + 1)]
    padded = [row + [None] * (width - len(row)) for row in rows]

    df = pd.DataFrame(padded, columns=headers)
    filepath = os.path.join(str(Path.home()), "Downloads", "total.xlsx")
    df.to_excel(filepath, index=False)
38
86
39
87
if __name__ == '__main__':
    # Collect the document paths, extract one row per document, then export
    # the rows to a spreadsheet.
    x = XLSX()
    pdf_files = x.find_pdfs()
    _list = x.read_pdf_content(pdf_files)
    x.generate_xlsx_sheet(_list)
0 commit comments