From 1f99fedb060101c3411966de945456682b6f9c5e Mon Sep 17 00:00:00 2001 From: ststefanov Date: Mon, 27 Apr 2020 18:18:32 +0300 Subject: [PATCH 1/2] fixed 8 and B problem and migrate to python3 --- extract_text | 4 +- extract_text.py3 | 321 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 323 insertions(+), 2 deletions(-) create mode 100755 extract_text.py3 diff --git a/extract_text b/extract_text index f02c611..a12ca90 100755 --- a/extract_text +++ b/extract_text @@ -169,12 +169,12 @@ def include_box(index, h_, contour): count_children(get_parent(index, h_), h_, contour)) + " children" print "\thas " + str(count_children(index, h_, contour)) + " children" - if is_child(index, h_) and count_children(get_parent(index, h_), h_, contour) <= 2: + if is_child(index, h_) and count_children(get_parent(index, h_), h_, contour) <= 4: if DEBUG: print "\t skipping: is an interior to a letter" return False - if count_children(index, h_, contour) > 2: + if count_children(index, h_, contour) > 4: if DEBUG: print "\t skipping, is a container of letters" return False diff --git a/extract_text.py3 b/extract_text.py3 new file mode 100755 index 0000000..b41eadc --- /dev/null +++ b/extract_text.py3 @@ -0,0 +1,321 @@ +#!/usr/bin/python + +# Processes an image to extract the text portions. Primarily +# used for pre-processing for performing OCR. + +# Based on the paper "Font and Background Color Independent Text Binarization" by +# T Kasar, J Kumar and A G Ramakrishnan +# http://www.m.cs.osakafu-u.ac.jp/cbdar2007/proceedings/papers/O1-1.pdf + +# Copyright (c) 2012, Jason Funk +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software +# and associated documentation files (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial +# portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT +# LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import cv2 +import numpy as np +import sys +import os.path + +if len(sys.argv) != 3: + print("%s input_file output_file" % (sys.argv[0])) + sys.exit() +else: + input_file = sys.argv[1] + output_file = sys.argv[2] + +if not os.path.isfile(input_file): + print("No such file '%s'" % input_file) + sys.exit() + +DEBUG = 0 + + +# Determine pixel intensity +# Apparently human eyes register colors differently. +# TVs use this formula to determine +# pixel intensity = 0.30R + 0.59G + 0.11B +def ii(xx, yy): + global img, img_y, img_x + if yy >= img_y or xx >= img_x: + #print "pixel out of bounds ("+str(y)+","+str(x)+")" + return 0 + pixel = img[yy][xx] + return 0.30 * pixel[2] + 0.59 * pixel[1] + 0.11 * pixel[0] + + +# A quick test to check whether the contour is +# a connected shape +def connected(contour): + first = contour[0][0] + last = contour[len(contour) - 1][0] + return abs(first[0] - last[0]) <= 1 and abs(first[1] - last[1]) <= 1 + + +# Helper function to return a given contour +def c(index): + global contours + return contours[index] + + +# Count the number of real children +def count_children(index, h_, contour): + # No children + if h_[index][2] < 0: + return 0 + else: + #If the first child is a contour we care about + # then count it, otherwise don't + if keep(c(h_[index][2])): + count = 1 + else: + count = 0 + + # Also count all of the child's siblings and their children + count += count_siblings(h_[index][2], h_, contour, True) + return count + + +# Quick check to test if the contour is a child +def is_child(index, h_): + return get_parent(index, h_) > 0 + + +# Get the first parent of the contour that we care about +def get_parent(index, h_): + parent = h_[index][3] + while not keep(c(parent)) and parent > 0: + parent = h_[parent][3] + + return parent + + +# Count the number of relevant siblings of a contour +def count_siblings(index, h_, contour, inc_children=False): + # Include the children if necessary + if inc_children: + count = count_children(index, h_, contour) + else: + count = 0 + + # Look ahead + p_ = h_[index][0] + while p_ > 0: + if keep(c(p_)): + count += 1 + if inc_children: + count += count_children(p_, h_, contour) + p_ = h_[p_][0] + + # Look behind + n = h_[index][1] + while n > 0: + if keep(c(n)): + count += 1 + if inc_children: + count += count_children(n, h_, contour) + n = h_[n][1] + return count + + +# Whether we care about this contour +def keep(contour): + return keep_box(contour) and connected(contour) + + +# Whether we should keep the containing box of this +# contour based on it's shape +def keep_box(contour): + xx, yy, w_, h_ = cv2.boundingRect(contour) + + # width and height need to be floats + w_ *= 1.0 + h_ *= 1.0 + + # Test it's shape - if it's too oblong or tall it's + # probably not a real character + if w_ / h_ < 0.1 or w_ / h_ > 10: + if DEBUG: + print("\t Rejected because of shape: (" + str(xx) + "," + str(yy) + "," + str(w_) + "," + str(h_) + ")" + \ + str(w_ / h_)) + return False + + # check size of the box + if ((w_ * h_) > ((img_x * img_y) / 5)) or ((w_ * h_) < 15): + if DEBUG: + print("\t Rejected because of size") + return False + + return True + + +def include_box(index, h_, contour): + if DEBUG: + print(str(index) + ":") + if is_child(index, h_): + print("\tIs a child") + print("\tparent " + str(get_parent(index, h_)) + " has " + str( + count_children(get_parent(index, h_), h_, contour)) + " children") + print("\thas " + str(count_children(index, h_, contour)) + " children") + + if is_child(index, h_) and count_children(get_parent(index, h_), h_, contour) <= 4: + if DEBUG: + print("\t skipping: is an interior to a letter") + return False + + if count_children(index, h_, contour) > 4: + if DEBUG: + print("\t skipping, is a container of letters") + return False + + if DEBUG: + print("\t keeping") + return True + +# Load the image +orig_img = cv2.imread(input_file) + +# Add a border to the image for processing sake +img = cv2.copyMakeBorder(orig_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT) + +# Calculate the width and height of the image +img_y = len(img) +img_x = len(img[0]) + +if DEBUG: + print("Image is " + str(len(img)) + "x" + str(len(img[0]))) + +#Split out each channel +blue, green, red = cv2.split(img) + +# Run canny edge detection on each channel +blue_edges = cv2.Canny(blue, 200, 250) +green_edges = cv2.Canny(green, 200, 250) +red_edges = cv2.Canny(red, 200, 250) + +# Join edges back into image +edges = blue_edges | green_edges | red_edges + +# Find the contours +contours, hierarchy = cv2.findContours(edges.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE) + +hierarchy = hierarchy[0] + +if DEBUG: + processed = edges.copy() + rejected = edges.copy() + +# These are the boxes that we are determining +keepers = [] + +# For each contour, find the bounding rectangle and decide +# if it's one we care about +for index_, contour_ in enumerate(contours): + if DEBUG: + print("Processing #%d" % index_) + + x, y, w, h = cv2.boundingRect(contour_) + + # Check the contour and it's bounding box + if keep(contour_) and include_box(index_, hierarchy, contour_): + # It's a winner! + keepers.append([contour_, [x, y, w, h]]) + if DEBUG: + cv2.rectangle(processed, (x, y), (x + w, y + h), (100, 100, 100), 1) + cv2.putText(processed, str(index_), (x, y - 5), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255)) + else: + if DEBUG: + cv2.rectangle(rejected, (x, y), (x + w, y + h), (100, 100, 100), 1) + cv2.putText(rejected, str(index_), (x, y - 5), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255)) + +# Make a white copy of our image +new_image = edges.copy() +new_image.fill(255) +boxes = [] + +# For each box, find the foreground and background intensities +for index_, (contour_, box) in enumerate(keepers): + + # Find the average intensity of the edge pixels to + # determine the foreground intensity + fg_int = 0.0 + for p in contour_: + fg_int += ii(p[0][0], p[0][1]) + + fg_int /= len(contour_) + if DEBUG: + print("FG Intensity for #%d = %d" % (index_, fg_int)) + + # Find the intensity of three pixels going around the + # outside of each corner of the bounding box to determine + # the background intensity + x_, y_, width, height = box + bg_int = \ + [ + # bottom left corner 3 pixels + ii(x_ - 1, y_ - 1), + ii(x_ - 1, y_), + ii(x_, y_ - 1), + + # bottom right corner 3 pixels + ii(x_ + width + 1, y_ - 1), + ii(x_ + width, y_ - 1), + ii(x_ + width + 1, y_), + + # top left corner 3 pixels + ii(x_ - 1, y_ + height + 1), + ii(x_ - 1, y_ + height), + ii(x_, y_ + height + 1), + + # top right corner 3 pixels + ii(x_ + width + 1, y_ + height + 1), + ii(x_ + width, y_ + height + 1), + ii(x_ + width + 1, y_ + height) + ] + + # Find the median of the background + # pixels determined above + bg_int = np.median(bg_int) + + if DEBUG: + print("BG Intensity for #%d = %s" % (index_, repr(bg_int))) + + # Determine if the box should be inverted + if fg_int >= bg_int: + fg = 255 + bg = 0 + else: + fg = 0 + bg = 255 + + # Loop through every pixel in the box and color the + # pixel accordingly + for x in range(x_, x_ + width): + for y in range(y_, y_ + height): + if y >= img_y or x >= img_x: + if DEBUG: + print("pixel out of bounds (%d,%d)" % (y, x)) + continue + if ii(x, y) > fg_int: + new_image[y][x] = bg + else: + new_image[y][x] = fg + +# blur a bit to improve ocr accuracy +new_image = cv2.blur(new_image, (2, 2)) +cv2.imwrite(output_file, new_image) +if DEBUG: + cv2.imwrite('edges.png', edges) + cv2.imwrite('processed.png', processed) + cv2.imwrite('rejected.png', rejected) From 28a16f9a9cfcd7b958b5735c8a5c0580fc8db219 Mon Sep 17 00:00:00 2001 From: ststefanov Date: Mon, 27 Apr 2020 18:27:12 +0300 Subject: [PATCH 2/2] changed blocksize from 1/5 to user defined --- extract_text | 24 +++++++++++++----------- extract_text.py3 | 25 ++++++++++++++----------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/extract_text b/extract_text index a12ca90..6841e02 100755 --- a/extract_text +++ b/extract_text @@ -41,6 +41,18 @@ if not os.path.isfile(input_file): DEBUG = 0 +# Load the image +orig_img = cv2.imread(input_file) + +# Add a border to the image for processing sake +img = cv2.copyMakeBorder(orig_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT) + +# Calculate the width and height of the image +img_y = len(img) +img_x = len(img[0]) + +#MAXBLOCKSIZE = (img_x * img_y) / 5 +MAXBLOCKSIZE = 2500 # Determine pixel intensity # Apparently human eyes register colors differently. @@ -152,7 +164,7 @@ def keep_box(contour): return False # check size of the box - if ((w_ * h_) > ((img_x * img_y) / 5)) or ((w_ * h_) < 15): + if ((w_ * h_) > (MAXBLOCKSIZE)) or ((w_ * h_) < 15): if DEBUG: print "\t Rejected because of size" return False @@ -183,16 +195,6 @@ def include_box(index, h_, contour): print "\t keeping" return True -# Load the image -orig_img = cv2.imread(input_file) - -# Add a border to the image for processing sake -img = cv2.copyMakeBorder(orig_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT) - -# Calculate the width and height of the image -img_y = len(img) -img_x = len(img[0]) - if DEBUG: print "Image is " + str(len(img)) + "x" + str(len(img[0])) diff --git a/extract_text.py3 b/extract_text.py3 index b41eadc..dd2d30c 100755 --- a/extract_text.py3 +++ b/extract_text.py3 @@ -41,6 +41,19 @@ if not os.path.isfile(input_file): DEBUG = 0 +# Load the image +orig_img = cv2.imread(input_file) + +# Add a border to the image for processing sake +img = cv2.copyMakeBorder(orig_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT) + +# Calculate the width and height of the image +img_y = len(img) +img_x = len(img[0]) + +#MAXBLOCKSIZE = (img_x * img_y) / 5 +MAXBLOCKSIZE = 2500 + # Determine pixel intensity # Apparently human eyes register colors differently. @@ -152,7 +165,7 @@ def keep_box(contour): return False # check size of the box - if ((w_ * h_) > ((img_x * img_y) / 5)) or ((w_ * h_) < 15): + if ((w_ * h_) > (MAXBLOCKSIZE)) or ((w_ * h_) < 15): if DEBUG: print("\t Rejected because of size") return False @@ -183,16 +196,6 @@ def include_box(index, h_, contour): print("\t keeping") return True -# Load the image -orig_img = cv2.imread(input_file) - -# Add a border to the image for processing sake -img = cv2.copyMakeBorder(orig_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT) - -# Calculate the width and height of the image -img_y = len(img) -img_x = len(img[0]) - if DEBUG: print("Image is " + str(len(img)) + "x" + str(len(img[0])))