diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py index 7ee7e9e..5561a64 100644 --- a/pypdfocr/pypdfocr.py +++ b/pypdfocr/pypdfocr.py @@ -21,27 +21,27 @@ import itertools from functools import wraps -from version import __version__ +from .version import __version__ from PIL import Image import yaml import multiprocessing # Replace the Popen routine to allow win32 pyinstaller to build from multiprocessing import forking -from pypdfocr_multiprocessing import _Popen +from .pypdfocr_multiprocessing import _Popen forking.Popen = _Popen -from pypdfocr_pdf import PyPdf -from pypdfocr_tesseract import PyTesseract -from pypdfocr_gs import PyGs -from pypdfocr_watcher import PyPdfWatcher -from pypdfocr_pdffiler import PyPdfFiler -from pypdfocr_filer_dirs import PyFilerDirs -from pypdfocr_filer_evernote import PyFilerEvernote -from pypdfocr_preprocess import PyPreprocess +from .pypdfocr_pdf import PyPdf +from .pypdfocr_tesseract import PyTesseract +from .pypdfocr_gs import PyGs +from .pypdfocr_watcher import PyPdfWatcher +from .pypdfocr_pdffiler import PyPdfFiler +from .pypdfocr_filer_dirs import PyFilerDirs +from .pypdfocr_filer_evernote import PyFilerEvernote +from .pypdfocr_preprocess import PyPreprocess def error(text): - print("ERROR: %s" % text) + print(("ERROR: %s" % text)) sys.exit(-1) # decorator to retry multiple times @@ -299,7 +299,7 @@ def _setup_filing(self): keyword_count = 0 folder_count = 0 if 'folders' in self.config: - for folder, keywords in self.config['folders'].items(): + for folder, keywords in list(self.config['folders'].items()): folder_count +=1 keyword_count += len(keywords) # Make sure keywords are lower-cased before adding @@ -307,8 +307,8 @@ def _setup_filing(self): self.filer.add_folder_target(folder, keywords) print ("Filing of PDFs is enabled") - print (" - %d target filing folders" % (folder_count)) - print (" - %d keywords" % (keyword_count)) + print((" - %d target filing folders" % (folder_count))) + print((" - %d keywords" % (keyword_count))) def _setup_external_tools(self): @@ -337,7 +337,7 @@ def run_conversion(self, pdf_filename): :returns: OCR'ed PDF :rtype: filename string """ - print ("Starting conversion of %s" % pdf_filename) + print(("Starting conversion of %s" % pdf_filename)) try: # Make the images for Tesseract img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename) @@ -367,11 +367,11 @@ def run_conversion(self, pdf_filename): time.sleep(1) if not self.debug: # Need to clean up the original image files before preprocessing - if locals().has_key("fns"): # Have to check if this was set before exception raised + if "fns" in locals(): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % fns) self._clean_up_files(fns) - if locals().has_key("preprocess_imagefilenames"): # Have to check if this was set before exception raised + if "preprocess_imagefilenames" in locals(): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % preprocess_imagefilenames) self._clean_up_files(preprocess_imagefilenames) # splat the hocr_filenames as it is a list of pairs for ext in [".hocr", ".html", ".txt"]: @@ -384,7 +384,7 @@ def run_conversion(self, pdf_filename): #self._clean_up_files([x[1].replace(".hocr", ".txt") for x in hocr_filenames]) - print ("Completed conversion successfully to %s" % ocr_pdf_filename) + print(("Completed conversion successfully to %s" % ocr_pdf_filename)) return ocr_pdf_filename def file_converted_file(self, ocr_pdffilename, original_pdffilename): @@ -399,11 +399,11 @@ def file_converted_file(self, ocr_pdffilename, original_pdffilename): "rtype: string """ filed_path = self.pdf_filer.move_to_matching_folder(ocr_pdffilename) - print("Filed %s to %s as %s" % (ocr_pdffilename, os.path.dirname(filed_path), os.path.basename(filed_path))) + print(("Filed %s to %s as %s" % (ocr_pdffilename, os.path.dirname(filed_path), os.path.basename(filed_path)))) tgt_path = self.pdf_filer.file_original(original_pdffilename) if tgt_path != original_pdffilename: - print("Filed original file %s to %s as %s" % (original_pdffilename, os.path.dirname(tgt_path), os.path.basename(tgt_path))) + print(("Filed original file %s to %s as %s" % (original_pdffilename, os.path.dirname(tgt_path), os.path.basename(tgt_path)))) return os.path.dirname(filed_path) @@ -467,7 +467,7 @@ def go(self, argv): except KeyboardInterrupt: break except Exception as e: - print traceback.print_exc(e) + print(traceback.print_exc(e)) py_watcher.stop() else: diff --git a/pypdfocr/pypdfocr_filer.py b/pypdfocr/pypdfocr_filer.py index b3b32c7..23ab637 100644 --- a/pypdfocr/pypdfocr_filer.py +++ b/pypdfocr/pypdfocr_filer.py @@ -14,12 +14,11 @@ import abc import os, logging -class PyFiler(object): +class PyFiler(object, metaclass=abc.ABCMeta): """ Abstract base class for defining filing objects, whether you want to save to a file-system/directory structure or to something like Evernote """ - __metaclass__ = abc.ABCMeta @abc.abstractmethod def move_to_matching_folder(self, filename): diff --git a/pypdfocr/pypdfocr_filer_dirs.py b/pypdfocr/pypdfocr_filer_dirs.py index dc19330..c7dc73f 100644 --- a/pypdfocr/pypdfocr_filer_dirs.py +++ b/pypdfocr/pypdfocr_filer_dirs.py @@ -16,7 +16,7 @@ import os import shutil -from pypdfocr_filer import PyFiler +from .pypdfocr_filer import PyFiler """ Implementation of a filer class diff --git a/pypdfocr/pypdfocr_filer_evernote.py b/pypdfocr/pypdfocr_filer_evernote.py index 80ec115..deb9209 100644 --- a/pypdfocr/pypdfocr_filer_evernote.py +++ b/pypdfocr/pypdfocr_filer_evernote.py @@ -19,7 +19,7 @@ import time import sys -from pypdfocr_filer import PyFiler +from .pypdfocr_filer import PyFiler import functools @@ -87,7 +87,7 @@ def get_target_folder(self): return self._target_folder def set_target_folder (self, target_folder): """ Override this to make sure we only have the basename""" - print("Setting target_folder %s" % target_folder) + print(("Setting target_folder %s" % target_folder)) if target_folder: self._target_folder = os.path.basename(target_folder) else: @@ -134,14 +134,14 @@ def _connect_to_evernote(self, dictUserInfo): user = self.user_store.getUser() except EDAMUserException as e: err = e.errorCode - print("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.parameter)) + print(("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.parameter))) except EDAMSystemException as e: err = e.errorCode - print("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.message)) + print(("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.message))) sys.exit(-1) if user: - print("Authenticated to evernote as user %s" % user.username) + print(("Authenticated to evernote as user %s" % user.username)) return True def add_folder_target(self, folder, keywords): @@ -274,9 +274,9 @@ def move_to_matching_folder(self, filename, foldername): logging.info("[MATCH] %s --> %s" % (filename, foldername)) # Check if the evernote notebook exists - print ("Checking for notebook named %s" % foldername) + print(("Checking for notebook named %s" % foldername)) notebook = self._check_and_make_notebook(foldername) - print("Uploading %s to %s" % (filename, foldername)) + print(("Uploading %s to %s" % (filename, foldername))) note = self._create_evernote_note(notebook, filename) diff --git a/pypdfocr/pypdfocr_gs.py b/pypdfocr/pypdfocr_gs.py index 5599082..c8eebdf 100644 --- a/pypdfocr/pypdfocr_gs.py +++ b/pypdfocr/pypdfocr_gs.py @@ -26,7 +26,7 @@ import glob def error(text): - print("ERROR: %s" % text) + print(("ERROR: %s" % text)) exit(-1) class PyGs(object): @@ -112,7 +112,7 @@ def _find_windows_gs(self): error(self.msgs['GS_MISSING_BINARY']) def _warn(self, msg): - print("WARNING: %s" % msg) + print(("WARNING: %s" % msg)) def _get_dpi(self, pdf_filename): if not os.path.exists(pdf_filename): @@ -157,7 +157,7 @@ def _get_dpi(self, pdf_filename): if abs(xdpi-ydpi) > xdpi*.05: # Make sure the two dpi's are within 5% self._warn("X-dpi is %d, Y-dpi is %d, defaulting to %d" % (xdpi, ydpi, self.output_dpi)) else: - print("Using %d DPI" % self.output_dpi) + print(("Using %d DPI" % self.output_dpi)) except Exception as e: @@ -174,7 +174,7 @@ def _run_gs(self, options, output_filename, pdf_filename): out = subprocess.check_output(cmd, shell=True) except subprocess.CalledProcessError as e: - print e.output + print(e.output) if "undefined in .getdeviceparams" in e.output: error(self.msgs['GS_OUTDATED']) else: diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py index bdc1f86..303bd8a 100644 --- a/pypdfocr/pypdfocr_pdf.py +++ b/pypdfocr/pypdfocr_pdf.py @@ -31,7 +31,7 @@ import tempfile import glob -import cStringIO +import io import base64 import zlib import math @@ -52,7 +52,7 @@ from reportlab.lib.enums import TA_LEFT from reportlab.platypus.paragraph import Paragraph -from pypdfocr_util import Retry +from .pypdfocr_util import Retry from functools import partial class RotatedPara(Paragraph): diff --git a/pypdfocr/pypdfocr_pdffiler.py b/pypdfocr/pypdfocr_pdffiler.py index 1bb23f5..0dfa8eb 100644 --- a/pypdfocr/pypdfocr_pdffiler.py +++ b/pypdfocr/pypdfocr_pdffiler.py @@ -1,80 +1,80 @@ - -# Copyright 2013 Virantha Ekanayake All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" - Provides capability to search PDFs and file to a specific folder based - on keywords -""" - -from sets import Set -import sys, os -import re -import logging -import shutil - -from PyPDF2 import PdfFileReader -from pypdfocr_filer import PyFiler -from pypdfocr_filer_dirs import PyFilerDirs - -class PyPdfFiler(object): - def __init__(self, filer): - - assert isinstance(filer, PyFiler) - self.filer = filer # Must be a subclass of PyFiler - - # Whether to fall back on filename for matching keywords against - # if there is no match in the text - self.file_using_filename = False - - def iter_pdf_page_text(self, filename): - self.filename = filename - reader = PdfFileReader(filename) - logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename)) - for pgnum in range(reader.getNumPages()): - text = reader.getPage(pgnum).extractText() - text = text.encode('ascii', 'ignore') - text = text.replace('\n', ' ') - yield text - - def _get_matching_folder(self, pdfText): - searchText = pdfText.lower() - for folder,strings in self.filer.folder_targets.items(): - for s in strings: - logging.debug("Checking string %s" % s) - if s in searchText: - logging.info("Matched keyword '%s'" % s) - return folder - # No match found, so return - return None - - def file_original (self, original_filename): - return self.filer.file_original(original_filename) - - def move_to_matching_folder(self, filename): - for page_text in self.iter_pdf_page_text(filename): - tgt_folder = self._get_matching_folder(page_text) - if tgt_folder: break # Stop searching through pdf pages as soon as we find a match - - if not tgt_folder and self.file_using_filename: - tgt_folder = self._get_matching_folder(filename) - - tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder) - return tgt_file - -if __name__ == '__main__': - p = PyPdfFiler(PyFilerDirs()) - for page_text in p.iter_pdf_page_text("scan_ocr.pdf"): - print (page_text) - + +# Copyright 2013 Virantha Ekanayake All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" + Provides capability to search PDFs and file to a specific folder based + on keywords +""" + +from sets import Set +import sys, os +import re +import logging +import shutil + +from PyPDF2 import PdfFileReader +from .pypdfocr_filer import PyFiler +from .pypdfocr_filer_dirs import PyFilerDirs + +class PyPdfFiler(object): + def __init__(self, filer): + + assert isinstance(filer, PyFiler) + self.filer = filer # Must be a subclass of PyFiler + + # Whether to fall back on filename for matching keywords against + # if there is no match in the text + self.file_using_filename = False + + def iter_pdf_page_text(self, filename): + self.filename = filename + reader = PdfFileReader(filename) + logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename)) + for pgnum in range(reader.getNumPages()): + text = reader.getPage(pgnum).extractText() + text = text.encode('ascii', 'ignore') + text = text.replace('\n', ' ') + yield text + + def _get_matching_folder(self, pdfText): + searchText = pdfText.lower() + for folder,strings in list(self.filer.folder_targets.items()): + for s in strings: + logging.debug("Checking string %s" % s) + if s in searchText: + logging.info("Matched keyword '%s'" % s) + return folder + # No match found, so return + return None + + def file_original (self, original_filename): + return self.filer.file_original(original_filename) + + def move_to_matching_folder(self, filename): + for page_text in self.iter_pdf_page_text(filename): + tgt_folder = self._get_matching_folder(page_text) + if tgt_folder: break # Stop searching through pdf pages as soon as we find a match + + if not tgt_folder and self.file_using_filename: + tgt_folder = self._get_matching_folder(filename) + + tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder) + return tgt_file + +if __name__ == '__main__': + p = PyPdfFiler(PyFilerDirs()) + for page_text in p.iter_pdf_page_text("scan_ocr.pdf"): + print (page_text) + diff --git a/pypdfocr/pypdfocr_preprocess.py b/pypdfocr/pypdfocr_preprocess.py index e942cc3..75bdf74 100644 --- a/pypdfocr/pypdfocr_preprocess.py +++ b/pypdfocr/pypdfocr_preprocess.py @@ -28,7 +28,7 @@ import signal from multiprocessing import Pool -from pypdfocr_interrupts import init_worker +from .pypdfocr_interrupts import init_worker # Ugly hack to pass in object method to the multiprocessing library # From http://www.rueckstiess.net/research/snippets/show/ca1d7d90 @@ -47,7 +47,7 @@ def __init__(self, config): self.threads = config.get('threads', 4) def _warn(self, msg): # pragma: no cover - print("WARNING: %s" % msg) + print(("WARNING: %s" % msg)) def cmd(self, cmd_list): if isinstance(cmd_list, list): @@ -58,7 +58,7 @@ def cmd(self, cmd_list): logging.debug(out) return out except subprocess.CalledProcessError as e: - print e.output + print(e.output) self._warn("Could not run command %s" % cmd_list) @@ -100,7 +100,7 @@ def preprocess(self, in_filenames): pool = Pool(processes=self.threads, initializer=init_worker) try: logging.info("Starting preprocessing parallel execution") - preprocessed_filenames = pool.map(unwrap_self,zip([self]*len(fns),fns)) + preprocessed_filenames = pool.map(unwrap_self,list(zip([self]*len(fns),fns))) pool.close() except KeyboardInterrupt or Exception: print("Caught keyboard interrupt... terminating") diff --git a/pypdfocr/pypdfocr_tesseract.py b/pypdfocr/pypdfocr_tesseract.py index 8f246ee..2e12f40 100644 --- a/pypdfocr/pypdfocr_tesseract.py +++ b/pypdfocr/pypdfocr_tesseract.py @@ -26,10 +26,10 @@ from subprocess import CalledProcessError from multiprocessing import Pool -from pypdfocr_interrupts import init_worker +from .pypdfocr_interrupts import init_worker def error(text): - print("ERROR: %s" % text) + print(("ERROR: %s" % text)) sys.exit(-1) # Ugly hack to pass in object method to the multiprocessing library @@ -125,7 +125,7 @@ def _is_version_uptodate(self): return version_good, ver_str def _warn(self, msg): # pragma: no cover - print("WARNING: %s" % msg) + print(("WARNING: %s" % msg)) def make_hocr_from_pnms(self, fns): @@ -139,7 +139,7 @@ def make_hocr_from_pnms(self, fns): pool = Pool(processes=self.threads, initializer=init_worker) try: - hocr_filenames = pool.map(unwrap_self, zip([self]*len(fns), fns)) + hocr_filenames = pool.map(unwrap_self, list(zip([self]*len(fns), fns))) pool.close() except KeyboardInterrupt or Exception: print("Caught keyboard interrupt... terminating") @@ -148,7 +148,7 @@ def make_hocr_from_pnms(self, fns): finally: pool.join() - return zip(fns,hocr_filenames) + return list(zip(fns,hocr_filenames)) def make_hocr_from_pnm(self, img_filename): @@ -166,7 +166,7 @@ def make_hocr_from_pnm(self, img_filename): ret_output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: # Could not run tesseract - print e.output + print(e.output) self._warn (self.msgs['TS_FAILED']) if os.path.isfile(hocr_filename): diff --git a/pypdfocr/pypdfocr_watcher.py b/pypdfocr/pypdfocr_watcher.py index f7ef556..ab4ebfe 100755 --- a/pypdfocr/pypdfocr_watcher.py +++ b/pypdfocr/pypdfocr_watcher.py @@ -1,152 +1,152 @@ -""" -Something -""" - -import sys, os -import re -import logging -import shutil -import time -import glob - -from threading import Lock - -from watchdog.observers import Observer -from watchdog.events import LoggingEventHandler -from watchdog.events import FileSystemEventHandler - - -class PyPdfWatcher(FileSystemEventHandler): - """ - Watch a folder for new pdf files. - - If new file event, then add it to queue with timestamp. - If file mofified event, then change timestamp in queue. - Every few seconds pop-off queue and if timestamp older than 3 seconds, - process the file else, push it back onto queue. - """ - events = {} - events_lock = Lock() - - def __init__(self, monitor_dir, config): - FileSystemEventHandler.__init__(self) - - self.monitor_dir = monitor_dir - if not config: config = {} - - self.scan_interval = config.get('scan_interval', 3) # If no updates in 3 seconds (or user specified option in config file) process file - - def start(self): - self.observer = Observer() - self.observer.schedule(self, self.monitor_dir) - self.observer.start() - print("Starting to watch for new pdfs in %s" % (self.monitor_dir)) - while True: - logging.info("Sleeping for %d seconds" % self.scan_interval) - time.sleep(self.scan_interval) - newFile = self.check_queue() - if newFile: - yield newFile - self.observer.join() - - - def stop(self): - self.observer.stop() - - def rename_file_with_spaces(self, pdf_filename): - """ - Rename any portion of a filename that has spaces in the basename with underscores. - Does not affect spaces in the directory path. - - :param pdf_filename: Filename to remove spaces - :type pdf_filename: string - :returns: Modified filename - :rtype: string - """ - filepath, filename = os.path.split(pdf_filename) - if ' ' in filename: - newFilename = os.path.join(filepath, filename.replace(' ','_')) - logging.debug("Renaming spaces") - logging.debug("---> %s \n ------> %s" % (pdf_filename, newFilename)) - shutil.move(pdf_filename, newFilename) - return newFilename - else: - return pdf_filename - - def check_for_new_pdf(self,ev_path): - """ - Called by the file watching api on any file creations/modifications. - For any file ending with ".pdf", but not "_ocr.pdf", it adds new files - to the event queue with the current time stamp, or it updates existing files in - the queue with the current timestamp. This queue is used to track files and - keep track of their last "touched" time, so we can start processing a file if - :func:`check_queue` finds a file that hasn't been touched in a while. - - If the file does note exist in the events dict: - - - Add it with the current time - - Otherwise: - - - If the file time is marked as -1, delete it from the dict - - Else, update the time in the dict to the current time - - """ - if ev_path.endswith(".pdf"): - if not ev_path.endswith(("_ocr.pdf", "_test.pdf")): - PyPdfWatcher.events_lock.acquire() - if not ev_path in PyPdfWatcher.events: - PyPdfWatcher.events[ev_path] = time.time() - logging.info ("Adding %s to event queue" % ev_path) - else: - if PyPdfWatcher.events[ev_path] == -1: - logging.info ( "%s removing from event queue" % (ev_path)) - del PyPdfWatcher.events[ev_path] - else: - newTime = time.time() - logging.debug ( "%s already in event queue, updating timestamp to %d" % (ev_path, newTime)) - PyPdfWatcher.events[ev_path] = newTime - PyPdfWatcher.events_lock.release() - - - - def on_created(self, event): - logging.debug ("on_created: %s at time %d" % (event.src_path, time.time())) - self.check_for_new_pdf(event.src_path) - - def on_moved(self, event): - logging.debug ("on_moved: %s" % event.src_path) - self.check_for_new_pdf(event.dest_path) - - def on_modified(self, event): - logging.debug ("on_modified: %s" % event.src_path) - self.check_for_new_pdf(event.src_path) - - def check_queue(self): - """ - This function is called at regular intervals by :func:`start`. - - Iterate through the events, and if there is any with a timestamp - greater than the scan_interval, return it and set its timestamp to -1 - for purging later. - - :returns: Filename if available to process, otherwise None. - """ - now = time.time() - PyPdfWatcher.events_lock.acquire() - for monitored_file, timestamp in PyPdfWatcher.events.items(): - if timestamp == -1: - del PyPdfWatcher.events[monitored_file] - elif now - timestamp > self.scan_interval: - logging.info("Processing new file %s" % (monitored_file)) - # Remove this file from the dict - del PyPdfWatcher.events[monitored_file] - monitored_file = self.rename_file_with_spaces(monitored_file) - PyPdfWatcher.events[monitored_file] = -1 # Add back into queue and mark as not needing further action in the event handler - PyPdfWatcher.events_lock.release() - return monitored_file - PyPdfWatcher.events_lock.release() - return None - - - +""" +Something +""" + +import sys, os +import re +import logging +import shutil +import time +import glob + +from threading import Lock + +from watchdog.observers import Observer +from watchdog.events import LoggingEventHandler +from watchdog.events import FileSystemEventHandler + + +class PyPdfWatcher(FileSystemEventHandler): + """ + Watch a folder for new pdf files. + + If new file event, then add it to queue with timestamp. + If file mofified event, then change timestamp in queue. + Every few seconds pop-off queue and if timestamp older than 3 seconds, + process the file else, push it back onto queue. + """ + events = {} + events_lock = Lock() + + def __init__(self, monitor_dir, config): + FileSystemEventHandler.__init__(self) + + self.monitor_dir = monitor_dir + if not config: config = {} + + self.scan_interval = config.get('scan_interval', 3) # If no updates in 3 seconds (or user specified option in config file) process file + + def start(self): + self.observer = Observer() + self.observer.schedule(self, self.monitor_dir) + self.observer.start() + print(("Starting to watch for new pdfs in %s" % (self.monitor_dir))) + while True: + logging.info("Sleeping for %d seconds" % self.scan_interval) + time.sleep(self.scan_interval) + newFile = self.check_queue() + if newFile: + yield newFile + self.observer.join() + + + def stop(self): + self.observer.stop() + + def rename_file_with_spaces(self, pdf_filename): + """ + Rename any portion of a filename that has spaces in the basename with underscores. + Does not affect spaces in the directory path. + + :param pdf_filename: Filename to remove spaces + :type pdf_filename: string + :returns: Modified filename + :rtype: string + """ + filepath, filename = os.path.split(pdf_filename) + if ' ' in filename: + newFilename = os.path.join(filepath, filename.replace(' ','_')) + logging.debug("Renaming spaces") + logging.debug("---> %s \n ------> %s" % (pdf_filename, newFilename)) + shutil.move(pdf_filename, newFilename) + return newFilename + else: + return pdf_filename + + def check_for_new_pdf(self,ev_path): + """ + Called by the file watching api on any file creations/modifications. + For any file ending with ".pdf", but not "_ocr.pdf", it adds new files + to the event queue with the current time stamp, or it updates existing files in + the queue with the current timestamp. This queue is used to track files and + keep track of their last "touched" time, so we can start processing a file if + :func:`check_queue` finds a file that hasn't been touched in a while. + + If the file does note exist in the events dict: + + - Add it with the current time + + Otherwise: + + - If the file time is marked as -1, delete it from the dict + - Else, update the time in the dict to the current time + + """ + if ev_path.endswith(".pdf"): + if not ev_path.endswith(("_ocr.pdf", "_test.pdf")): + PyPdfWatcher.events_lock.acquire() + if not ev_path in PyPdfWatcher.events: + PyPdfWatcher.events[ev_path] = time.time() + logging.info ("Adding %s to event queue" % ev_path) + else: + if PyPdfWatcher.events[ev_path] == -1: + logging.info ( "%s removing from event queue" % (ev_path)) + del PyPdfWatcher.events[ev_path] + else: + newTime = time.time() + logging.debug ( "%s already in event queue, updating timestamp to %d" % (ev_path, newTime)) + PyPdfWatcher.events[ev_path] = newTime + PyPdfWatcher.events_lock.release() + + + + def on_created(self, event): + logging.debug ("on_created: %s at time %d" % (event.src_path, time.time())) + self.check_for_new_pdf(event.src_path) + + def on_moved(self, event): + logging.debug ("on_moved: %s" % event.src_path) + self.check_for_new_pdf(event.dest_path) + + def on_modified(self, event): + logging.debug ("on_modified: %s" % event.src_path) + self.check_for_new_pdf(event.src_path) + + def check_queue(self): + """ + This function is called at regular intervals by :func:`start`. + + Iterate through the events, and if there is any with a timestamp + greater than the scan_interval, return it and set its timestamp to -1 + for purging later. + + :returns: Filename if available to process, otherwise None. + """ + now = time.time() + PyPdfWatcher.events_lock.acquire() + for monitored_file, timestamp in list(PyPdfWatcher.events.items()): + if timestamp == -1: + del PyPdfWatcher.events[monitored_file] + elif now - timestamp > self.scan_interval: + logging.info("Processing new file %s" % (monitored_file)) + # Remove this file from the dict + del PyPdfWatcher.events[monitored_file] + monitored_file = self.rename_file_with_spaces(monitored_file) + PyPdfWatcher.events[monitored_file] = -1 # Add back into queue and mark as not needing further action in the event handler + PyPdfWatcher.events_lock.release() + return monitored_file + PyPdfWatcher.events_lock.release() + return None + + + diff --git a/setup.py b/setup.py index 585e145..a9ee10d 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from __future__ import print_function + from setuptools import setup, find_packages import pypdfocr