Updated xlsx generation. Removed unnecessary code.

Nick-prog · Nick-prog · commit 441d4cdb6913 · 2024-07-05T13:39:01.000-05:00
diff --git a/core/pdf.py b/core/pdf.py
@@ -173,36 +173,4 @@ def generate_pdf_page(self, canvas: object, _list: list, page_num: int) -> None:
                 canvas.drawString(self.x_start, self.y_start, items)
                 self.y_start = self.y_start - 10
 
-        self.y_start = 750
-
-    def find_consultant_agency(self, app_data: list, filename: str) -> list:
-
-        _list = []
-
-        for idx, items in enumerate(app_data):
-            if items == 'Spring 2025':
-                _list.append(items)
-            elif str(items) == 'Consultant/Agency':
-                _list.append(app_data[idx+3])
-            elif str(items).startswith('Student Contact'):
-                self.student_flag = 1
-            elif self.student_flag == 1 and str(items).startswith('Date of Birth'):
-                self.student_flag = 0
-                _list.append(items[15:25])
-                _list.append(items[33:])
-        _list.append(filename)
-
-        return _list
-    
-    def generate_xlsx_sheet(self, _list: list, filename: str) -> None:
-        _temp = []
-
-        for idx, items in enumerate(_list):
-            if len(items[0]) >= 5:
-                _temp.append([items[0][0], items[0][1], items[0][2], items[0][3], items[0][4], items[1]])
-
-        if len(_temp) != 0:
-            df = pd.DataFrame(_temp)
-            download_default = str(os.path.join(Path.home(), "Downloads"))
-            filepath = f'{download_default}/{filename}.xlsx'
-            df.to_excel(filepath, index=False, header= ["Semester", "DOB", "Gender", "Info", "Filename", "Name"])
+        self.y_start = 750
diff --git a/core/xlsx.py b/core/xlsx.py
@@ -1,15 +1,17 @@
 import os
+import fitz
+import pandas as pd
 import tkinter as tk
 from tkinter.filedialog import askdirectory
 from pathlib import Path
-from pdfrw import PdfReader
 
 class XLSX:
 
     def __init__(self):
-        self.temp = 1
+        self.consultant_check = 0
+        self.consultant_idx = 0
 
-    def find_pdfs_in_downloads(self):
+    def find_pdfs(self) -> list:
         _default = str(os.path.join(Path.home(), "Downloads"))
 
         tk.Tk().withdraw()
@@ -18,25 +20,72 @@ def find_pdfs_in_downloads(self):
         pdf_files = []
 
         for root, dirs, files in os.walk(folder):
-            sep = str(root).split('\\')
+            root = str(root).replace('\\', '/')
+            sep = str(root).split('/')
             if str(sep[-1]).startswith('International'):
                 for file in files:
                     pdf_files.append(os.path.abspath(os.path.join(root, file)))
 
+        print('Paths acquired...')
+
         return pdf_files
     
-    def read_pdf_content(self, pdf_files):
+    def read_pdf_content(self, pdf_files: list) -> list:
 
-        content_list = []
+        _list = []
         
         for pdf in pdf_files:
-            with open(pdf, 'rb') as file:
-                for line in file:
-                    print(line)
+            document = fitz.open(pdf)
+            content_list = []
+            root = str(pdf).replace('\\', '/')
+            sep = str(root).split('/')
+
+            content_list.append(sep[-3]) # File name
+            content_list.append(sep[-2]) # App Type
+            content_list.append(sep[-1][:-4]) # Applicant name
+
+            for page_num in range(len(document)):
+                page = document.load_page(page_num)
+                content = page.get_text()
+                _lines = content.splitlines()
+                for idx, line in enumerate(_lines):
+                    if self.search_for_headers(line, idx):
+                        if str(line).startswith('Date of Birth'):
+                            content_list.append(line[15:25]) # DOB
+                            content_list.append(line[33:]) # Gender
+                        elif self.consultant_idx != 0 and self.consultant_check == 1:
+                            content_list.append(_lines[self.consultant_idx+3]) # Consultant Answer
+                            self.consultant_idx = 0 # Reset so later headers are not read as the consultant answer
+                        else: 
+                            content_list.append(line) # Semester Entry
+                self.consultant_check = 0
+
+            _list.append(content_list)
 
+        print('Lines appended to list...')
 
+        return _list
+
+    def search_for_headers(self, line: str, idx: int) -> bool:
+
+        if str(line).startswith(('Fall', 'Spring', 'Date of Birth')) and idx <= 20:
+            return True
+        elif line == 'Consultant/Agency':
+            self.consultant_check += 1
+            self.consultant_idx = idx
+            return True
+        
+        return False
+    
+    def generate_xlsx_sheet(self, _list: list) -> None:
+            
+        df = pd.DataFrame(_list)
+        download_default = str(os.path.join(Path.home(), "Downloads"))
+        filepath = f'{download_default}/total.xlsx'
+        df.to_excel(filepath, index=False, header= ["File Name", "Type", "Name", "Semester", "DOB", "Gender", "Info"])
 
 if __name__ == '__main__':
     x = XLSX()
-    pdf_files = x.find_pdfs_in_downloads()
-    x.read_pdf_content(pdf_files)
+    pdf_files = x.find_pdfs()
+    _list = x.read_pdf_content(pdf_files)
+    x.generate_xlsx_sheet(_list)
diff --git a/main.py b/main.py
@@ -76,64 +76,15 @@ def run(file_path: str, filename: str) -> None:
         r.capture_student_name()
         r.capture_app_type()
 
-        # create_xlsx(translated_spe, filename)
-
         for idx, item in enumerate(translated_spe):
             _list = r.fit_student_data(item)
             r.create_page_structure(folder, filename, _list, idx)
 
     except BaseException as b:
         tk.messagebox.showerror("run() error", f"{sys.exc_info()[1]}")
 
-def create_xlsx(translated_spe: list, filename: str)-> None:
-
-    _xlsx = []
-
-    r = core.Report(translated_spe)
-    r.capture_student_name()
-    r.capture_app_type()
-
-    for idx, item in enumerate(translated_spe):
-        _list = r.fit_student_data(item)
-        _xlsx.append(r.find_consultant_agency(_list, filename))
-
-    for idx, item in enumerate(r.student_name):
-            _xlsx[idx] = (_xlsx[idx], item)
-
-    r.generate_xlsx_sheet(_xlsx, filename[:-4])
-
-def merge_xlsx()-> None:
-    # specifying the path to csv files
-    input_folder = str(os.path.join(Path.home(), "Downloads"))
-    output_file = str(os.path.join(input_folder, 'Spring 2024 total.xlsx'))
-    
-    # Create a list to hold the dataframes
-    dfs = []
-
-    # Iterate over all Excel files in the specified folder
-    for file_name in os.listdir(input_folder):
-        if file_name.endswith('.xlsx') or file_name.endswith('.xls'):
-            file_path = os.path.join(input_folder, file_name)
-            # Read all sheets from the Excel file
-            xls = pd.ExcelFile(file_path, engine='openpyxl')
-            for sheet_name in xls.sheet_names:
-                df = pd.read_excel(file_path, sheet_name=sheet_name)
-                dfs.append(df)
-
-    # Concatenate all dataframes into one
-    merged_df = pd.concat(dfs, ignore_index=True)
-
-    # Drop duplicate rows
-    merged_df = merged_df.drop_duplicates()
-
-    # Save the merged dataframe to a new Excel file
-    merged_df.to_excel(output_file, index=False, engine='openpyxl')
-
 if __name__ == "__main__":
 
     find_spe_files() # Multiple .spe files
     # find_spe_file() # Singluar .spe file
-    print('Done')
-    
-    # merge_xlsx()
-    # print('Done Done')
+    print('Done')