-
Notifications
You must be signed in to change notification settings - Fork 212
Home
tesstrain (formerly ocrd-train) is a collection of scripts and documentation for training Tesseract's LSTM-based recognition engine (supported by Tesseract 4 and newer releases).
Currently it includes a Makefile that allows training from real line images with ground truth (text transcriptions).
Such data is available from a number of sources, see https://github.com/cneud/ocr-gt for a list.
Training from synthetic images is supported by training scripts (Shell, Python) which are still part of the Tesseract code base.
- Training Fraktur with Austrian Newspapers
- Training Fraktur with Neue Zürcher Zeitung
- Training Fraktur with GT4HistOCR
- Training Fraktur and Handwriting with German primers
- Training Arabic Handwriting
- Training Handwritten Text with German Konzilsprotokolle
import io
import re

import fitz  # PyMuPDF
import pandas as pd
import pytesseract
from PIL import Image
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle
# --- Configuration -----------------------------------------------------------
# Each constant sits on its own line: when they share a line, everything after
# the first "#" is a comment and the remaining names are never defined.
PDF_INPUT = "5.pdf"  # input scanned PDF (your CamScanner file)
CSV_OUTPUT = "contacts.csv"
XLSX_OUTPUT = "contacts.xlsx"
PDF_OUTPUT = "clean_contacts.pdf"
DPI = 250  # OCR quality (200–300 is usually good)
# GST number pattern: 2 digits, 5 capital letters, 4 digits, 1 capital letter,
# 1 alphanumeric, a literal "Z", then 1 alphanumeric, delimited by word
# boundaries.
gst_regex = re.compile(r'\b[0-9]{2}[A-Z]{5}[0-9]{4}[A-Z][0-9A-Z]Z[0-9A-Z]\b')

# Phone pattern: optional "+91" or "0" prefix, then 10 digits whose first digit
# is 6-9. The "+" must be escaped; an unescaped "+" directly after "(?:" makes
# re.compile raise "nothing to repeat".
phone_regex = re.compile(r'(?:\+91|0)?[6-9]\d{9}')
def _value_after_label(line):
    """Return the text after the first ":" in *line*, or the whole line if none."""
    if ":" in line:
        return line.partition(":")[2].strip()
    return line


def _extract_fields(text):
    """Pull consignee, description, GST number and phone number out of OCR text.

    Lines containing "CONSIGNEE" or "DESCRIPTION" (case-insensitive) supply the
    first two fields; if several lines match, the last one wins. The GST and
    phone numbers are the first matches of ``gst_regex`` and ``phone_regex``
    anywhere in the text.

    Returns:
        A ``(consignee, description, gst, phone)`` tuple; each item is a string
        or ``None`` when nothing was found.
    """
    consignee = None
    description = None

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        upper = line.upper()
        # Two independent checks (not elif): a single line may carry both labels.
        if "CONSIGNEE" in upper:
            consignee = _value_after_label(line)
        if "DESCRIPTION" in upper:
            description = _value_after_label(line)

    # Regex search for GST and phone runs over the whole page text, not per line.
    gst_match = gst_regex.search(text)
    phone_match = phone_regex.search(text)
    gst = gst_match.group(0) if gst_match else None
    phone = phone_match.group(0) if phone_match else None

    return consignee, description, gst, phone


def ocr_page(page):
    """OCR one PDF page and extract the contact fields found on it.

    Args:
        page: A PyMuPDF page object (anything providing ``get_pixmap``).

    Returns:
        A ``(consignee, description, gst, phone)`` tuple; each item is a string
        or ``None`` when nothing was found.
    """
    # Convert PDF page -> image: render at DPI, then hand the PNG bytes to PIL.
    pix = page.get_pixmap(dpi=DPI)
    img = Image.open(io.BytesIO(pix.tobytes("png")))

    # OCR
    text = pytesseract.image_to_string(img, lang="eng")

    return _extract_fields(text)
def main():
    """OCR every page of PDF_INPUT and export the extracted contacts.

    Writes the de-duplicated contacts to CSV_OUTPUT, XLSX_OUTPUT and, as a
    formatted table, to PDF_OUTPUT, then prints a short summary.
    """
    # The context manager closes the document even if OCR fails part-way.
    with fitz.open(PDF_INPUT) as doc:
        extracted = [ocr_page(page) for page in doc]

    # Keep only pages on which at least one field was found.
    rows = [list(fields) for fields in extracted if any(fields)]

    df = pd.DataFrame(
        rows,
        columns=["Consignee", "Description", "GST Number", "Phone Number"],
    )

    # Remove duplicates (based on Phone + GST). Missing values compare equal
    # when de-duplicating, so rows lacking BOTH keys would all collapse into
    # one; those rows are only dropped when the entire row repeats.
    key_columns = ["Phone Number", "GST Number"]
    has_key = df[key_columns].notna().any(axis=1)
    is_duplicate = (has_key & df.duplicated(subset=key_columns)) | (
        ~has_key & df.duplicated()
    )
    df = df[~is_duplicate]

    # Save Excel/CSV
    df.to_csv(CSV_OUTPUT, index=False)
    df.to_excel(XLSX_OUTPUT, index=False)

    # Save as clean PDF table. Missing values are blanked explicitly so the
    # output does not depend on how the table renderer treats None.
    body = [
        ["" if pd.isna(cell) else cell for cell in row]
        for row in df.values.tolist()
    ]
    data = [list(df.columns)] + body
    pdf = SimpleDocTemplate(PDF_OUTPUT)
    table = Table(data)
    table.setStyle(TableStyle([
        ("BACKGROUND", (0, 0), (-1, 0), colors.lightgrey),  # header row shading
        ("GRID", (0, 0), (-1, -1), 0.5, colors.black),
        ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),  # bold header row
        ("ALIGN", (0, 0), (-1, -1), "LEFT"),
    ]))
    pdf.build([table])

    print(f"✅ Done! Extracted {len(df)} unique contacts.")
    print(f"Saved: {CSV_OUTPUT}, {XLSX_OUTPUT}, {PDF_OUTPUT}")
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()