Skip to content

Commit 441d4cd

Browse files
committed
Updated xlsx generation. Removed unnecessary code.
1 parent 5a56bdd commit 441d4cd

File tree

3 files changed

+62
-94
lines changed

3 files changed

+62
-94
lines changed

core/pdf.py

+1-33
Original file line numberDiff line numberDiff line change
@@ -173,36 +173,4 @@ def generate_pdf_page(self, canvas: object, _list: list, page_num: int) -> None:
173173
canvas.drawString(self.x_start, self.y_start, items)
174174
self.y_start = self.y_start - 10
175175

176-
self.y_start = 750
177-
178-
def find_consultant_agency(self, app_data: list, filename: str) -> list:
179-
180-
_list = []
181-
182-
for idx, items in enumerate(app_data):
183-
if items == 'Spring 2025':
184-
_list.append(items)
185-
elif str(items) == 'Consultant/Agency':
186-
_list.append(app_data[idx+3])
187-
elif str(items).startswith('Student Contact'):
188-
self.student_flag = 1
189-
elif self.student_flag == 1 and str(items).startswith('Date of Birth'):
190-
self.student_flag = 0
191-
_list.append(items[15:25])
192-
_list.append(items[33:])
193-
_list.append(filename)
194-
195-
return _list
196-
197-
def generate_xlsx_sheet(self, _list: list, filename: str) -> None:
198-
_temp = []
199-
200-
for idx, items in enumerate(_list):
201-
if len(items[0]) >= 5:
202-
_temp.append([items[0][0], items[0][1], items[0][2], items[0][3], items[0][4], items[1]])
203-
204-
if len(_temp) != 0:
205-
df = pd.DataFrame(_temp)
206-
download_default = str(os.path.join(Path.home(), "Downloads"))
207-
filepath = f'{download_default}/{filename}.xlsx'
208-
df.to_excel(filepath, index=False, header= ["Semester", "DOB", "Gender", "Info", "Filename", "Name"])
176+
self.y_start = 750

core/xlsx.py

+60-11
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
import os
2+
import fitz
3+
import pandas as pd
24
import tkinter as tk
35
from tkinter.filedialog import askdirectory
46
from pathlib import Path
5-
from pdfrw import PdfReader
67

78
class XLSX:
89

910
def __init__(self):
10-
self.temp = 1
11+
self.consultant_check = 0
12+
self.consultant_idx = 0
1113

12-
def find_pdfs_in_downloads(self):
14+
def find_pdfs(self) -> list:
1315
_default = str(os.path.join(Path.home(), "Downloads"))
1416

1517
tk.Tk().withdraw()
@@ -18,25 +20,72 @@ def find_pdfs_in_downloads(self):
1820
pdf_files = []
1921

2022
for root, dirs, files in os.walk(folder):
21-
sep = str(root).split('\\')
23+
root = str(root).replace('\\', '/')
24+
sep = str(root).split('/')
2225
if str(sep[-1]).startswith('International'):
2326
for file in files:
2427
pdf_files.append(os.path.abspath(os.path.join(root, file)))
2528

29+
print('Paths acquired...')
30+
2631
return pdf_files
2732

28-
def read_pdf_content(self, pdf_files):
33+
def read_pdf_content(self, pdf_files: list) -> list:
    """Build one row of extracted values per PDF file.

    Every row starts with three values taken from the file path (the
    third-from-last component, the parent folder and the file name without
    its last four characters). The lines of every page are then scanned with
    ``search_for_headers`` and, for each line it flags, one of the following
    is appended:

    * the two slices ``[15:25]`` and ``[33:]`` of a 'Date of Birth' line,
    * the line three positions below the 'Consultant/Agency' header (first
      occurrence counted by ``consultant_check``),
    * otherwise the flagged line itself.

    Args:
        pdf_files: paths of the PDF files to read.

    Returns:
        A list containing one list of strings per PDF.
    """
    _list = []

    for pdf in pdf_files:
        sep = str(pdf).replace('\\', '/').split('/')
        content_list = [
            sep[-3],       # File name
            sep[-2],       # App Type
            sep[-1][:-4],  # Applicant name
        ]

        # Consultant tracking is per document: start every PDF from a clean
        # state so a previous file can never influence this one.
        self.consultant_check = 0
        self.consultant_idx = 0

        # The context manager closes the document even if extraction fails.
        with fitz.open(pdf) as document:
            for page_num in range(len(document)):
                page = document.load_page(page_num)
                _lines = page.get_text().splitlines()

                for idx, line in enumerate(_lines):
                    if not self.search_for_headers(line, idx):
                        continue

                    if str(line).startswith('Date of Birth'):
                        content_list.append(line[15:25])  # DOB
                        content_list.append(line[33:])    # Gender
                    elif self.consultant_idx != 0 and self.consultant_check == 1:
                        answer_idx = self.consultant_idx + 3
                        # Keep the column in place even when the page ends
                        # before the expected answer line.
                        answer = _lines[answer_idx] if answer_idx < len(_lines) else ''
                        content_list.append(answer)  # Consultant Answer
                        # Was `== 0` (a no-op comparison): the index must be
                        # cleared so later headers do not take this branch.
                        self.consultant_idx = 0
                    else:
                        content_list.append(line)  # Semester Entry
                        self.consultant_check = 0

        _list.append(content_list)

    print('Lines appended to list...')

    return _list
68+
69+
def search_for_headers(self, line: str, idx: int, header_limit: int = 20) -> bool:
    """Tell whether a line of extracted text is a header worth capturing.

    A line qualifies when it starts with 'Fall', 'Spring' or 'Date of Birth'
    and its index is at most ``header_limit``, or when it is the
    'Consultant/Agency' header (at any index). In the latter case the
    occurrence counter ``consultant_check`` is incremented and the index is
    remembered in ``consultant_idx``.

    Args:
        line: the text line to classify.
        idx: index of the line, as supplied by the caller.
        header_limit: highest index at which the semester and date-of-birth
            headers are still accepted.

    Returns:
        True if the line is one of the recognised headers, False otherwise.
    """
    text = str(line)

    if idx <= header_limit and text.startswith(('Fall', 'Spring', 'Date of Birth')):
        return True

    # Extracted text may carry stray leading/trailing whitespace; ignore it
    # when looking for the exact header.
    if text.strip() == 'Consultant/Agency':
        self.consultant_check += 1
        self.consultant_idx = idx
        return True

    return False
79+
80+
def generate_xlsx_sheet(self, _list: list) -> None:
    """Write the collected rows to ``total.xlsx`` in the user's Downloads folder.

    Args:
        _list: one list of values per row, in the column order
            File Name, Type, Name, Semester, DOB, Gender, Info.

    Returns:
        None. Nothing is written when ``_list`` is empty.
    """
    headers = ["File Name", "Type", "Name", "Semester", "DOB", "Gender", "Info"]

    # An empty frame has no columns, and to_excel rejects a header list whose
    # length differs from the column count, so bail out early.
    if not _list:
        print('No rows to export...')
        return

    df = pd.DataFrame(_list)

    # Rows may be shorter than the header list (e.g. no consultant answer was
    # found in any file); pad the missing trailing columns so lengths match.
    for col in range(df.shape[1], len(headers)):
        df[col] = None

    filepath = Path.home() / "Downloads" / "total.xlsx"
    df.to_excel(filepath, index=False, header=headers)
3886

3987
if __name__ == '__main__':
4088
x = XLSX()
41-
pdf_files = x.find_pdfs_in_downloads()
42-
x.read_pdf_content(pdf_files)
89+
pdf_files = x.find_pdfs()
90+
_list = x.read_pdf_content(pdf_files)
91+
x.generate_xlsx_sheet(_list)

main.py

+1-50
Original file line numberDiff line numberDiff line change
@@ -76,64 +76,15 @@ def run(file_path: str, filename: str) -> None:
7676
r.capture_student_name()
7777
r.capture_app_type()
7878

79-
# create_xlsx(translated_spe, filename)
80-
8179
for idx, item in enumerate(translated_spe):
8280
_list = r.fit_student_data(item)
8381
r.create_page_structure(folder, filename, _list, idx)
8482

8583
except BaseException as b:
8684
tk.messagebox.showerror("run() error", f"{sys.exc_info()[1]}")
8785

88-
def create_xlsx(translated_spe: list, filename: str)-> None:
89-
90-
_xlsx = []
91-
92-
r = core.Report(translated_spe)
93-
r.capture_student_name()
94-
r.capture_app_type()
95-
96-
for idx, item in enumerate(translated_spe):
97-
_list = r.fit_student_data(item)
98-
_xlsx.append(r.find_consultant_agency(_list, filename))
99-
100-
for idx, item in enumerate(r.student_name):
101-
_xlsx[idx] = (_xlsx[idx], item)
102-
103-
r.generate_xlsx_sheet(_xlsx, filename[:-4])
104-
105-
def merge_xlsx()-> None:
106-
# specifying the path to csv files
107-
input_folder = str(os.path.join(Path.home(), "Downloads"))
108-
output_file = str(os.path.join(input_folder, 'Spring 2024 total.xlsx'))
109-
110-
# Create a list to hold the dataframes
111-
dfs = []
112-
113-
# Iterate over all Excel files in the specified folder
114-
for file_name in os.listdir(input_folder):
115-
if file_name.endswith('.xlsx') or file_name.endswith('.xls'):
116-
file_path = os.path.join(input_folder, file_name)
117-
# Read all sheets from the Excel file
118-
xls = pd.ExcelFile(file_path, engine='openpyxl')
119-
for sheet_name in xls.sheet_names:
120-
df = pd.read_excel(file_path, sheet_name=sheet_name)
121-
dfs.append(df)
122-
123-
# Concatenate all dataframes into one
124-
merged_df = pd.concat(dfs, ignore_index=True)
125-
126-
# Drop duplicate rows
127-
merged_df = merged_df.drop_duplicates()
128-
129-
# Save the merged dataframe to a new Excel file
130-
merged_df.to_excel(output_file, index=False, engine='openpyxl')
131-
13286
if __name__ == "__main__":
13387

13488
find_spe_files() # Multiple .spe files
13589
# find_spe_file() # Singular .spe file
136-
print('Done')
137-
138-
# merge_xlsx()
139-
# print('Done Done')
90+
print('Done')

0 commit comments

Comments
 (0)