From 560a3ea9573b21034e76a5b3aff051ecaa7b28de Mon Sep 17 00:00:00 2001
From: Joe Hosteny <jhosteny@gmail.com>
Date: Thu, 7 Nov 2013 16:16:17 -0500
Subject: [PATCH] Enable specifying a config file. If a config file is
 specified, see if hocr output is enabled. If so, we also generate the text
 file from the hocr output, and back annotate the hocr output with word
 positions in HTML data attributes.

---
 docsplit.gemspec               |  4 +-
 lib/docsplit.rb                |  5 ++
 lib/docsplit/command_line.rb   |  5 +-
 lib/docsplit/text_cleaner.rb   | 36 +++++++++----
 lib/docsplit/text_extractor.rb | 93 ++++++++++++++++++++++++++++++----
 test/unit/test_extract_text.rb | 28 ++++++++++
 6 files changed, 151 insertions(+), 20 deletions(-)

diff --git a/docsplit.gemspec b/docsplit.gemspec
index d3526bb..2ffb04c 100755
--- a/docsplit.gemspec
+++ b/docsplit.gemspec
@@ -21,4 +21,6 @@ Gem::Specification.new do |s|
 
   s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
                 'docsplit.gemspec', 'LICENSE', 'README']
-end
\ No newline at end of file
+
+  s.add_dependency "nokogiri"
+end
diff --git a/lib/docsplit.rb b/lib/docsplit.rb
index 5001413..395a40f 100755
--- a/lib/docsplit.rb
+++ b/lib/docsplit.rb
@@ -79,6 +79,11 @@ def self.clean_text(text)
     TextCleaner.new.clean(text)
   end
 
+  # Utility method to clean OCR'd text in hOCR output format.
+  def self.clean_hocr(html)
+    TextCleaner.new.clean_hocr(html)
+  end
+
   private
 
   # Normalize a value in an options hash for the command line.
diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb
index 7c7af08..f079d45 100755
--- a/lib/docsplit/command_line.rb
+++ b/lib/docsplit/command_line.rb
@@ -91,6 +91,9 @@ def parse_options
         opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
           @options[:ocr] = o
         end
+        opts.on('-c', '--config [FILE]', 'use the specified config file') do |c|
+          @options[:config] = c
+        end
         opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
           @options[:clean] = false
         end
@@ -119,4 +122,4 @@ def parse_options
 
   end
 
-end
\ No newline at end of file
+end
diff --git a/lib/docsplit/text_cleaner.rb b/lib/docsplit/text_cleaner.rb
index c4aac01..ceee965 100644
--- a/lib/docsplit/text_cleaner.rb
+++ b/lib/docsplit/text_cleaner.rb
@@ -1,4 +1,5 @@
 require 'strscan'
+require 'nokogiri'
 
 module Docsplit
 
@@ -32,16 +33,8 @@ class TextCleaner
     REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
     SINGLETONS  = /^[AaIi]$/
 
-    # For the time being, `clean` uses the regular StringScanner, and not the
-    # multibyte-aware version, coercing to ASCII first.
     def clean(text)
-      if String.method_defined?(:encode)
-        text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
-      else
-        require 'iconv' unless defined?(Iconv)
-        text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
-      end
-
+      text = get_conversion_method.call(text)
       scanner = StringScanner.new(text)
       cleaned = []
       spaced  = false
@@ -60,6 +53,31 @@ def clean(text)
       end
     end
 
+    # When cleaning hOCR output, we follow a slightly simplified cleaning
+    # heuristic: for each element matching the '.ocrx_word' CSS selector,
+    # take the text beneath that element, coerce it to ASCII, and remove the
+    # element if `garbage` flags the word. Returns the serialized document.
+    def clean_hocr(xhtml)
+      convert = get_conversion_method
+      xml = Nokogiri::XML(xhtml)
+      xml.css('.ocrx_word').each do |elt|
+        word = elt.xpath(".//text()").text
+        elt.remove if garbage(convert.call(word))
+      end
+      xml.to_s
+    end
+
+    # For the time being, `clean` uses the regular StringScanner, and not the
+    # multibyte-aware version, coercing to ASCII first.
+    def get_conversion_method
+      if String.method_defined?(:encode)
+        lambda { |text| text.encode('ascii', :invalid => :replace, :undef => :replace, :replace => '?') }
+      else
+        require 'iconv' unless defined?(Iconv)
+        lambda { |text| Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first }
+      end
+    end
+
     # Is a given word OCR garbage?
     def garbage(w)
       acronym = w =~ ACRONYM
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 0d55f32..b10f6e0 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -1,3 +1,5 @@
+require 'nokogiri'
+
 module Docsplit
 
   # Delegates to **pdftotext** and **tesseract** in order to extract text from
@@ -21,6 +23,10 @@ class TextExtractor
 
     MIN_TEXT_PER_PAGE = 100 # in bytes
 
+    HOCR_SECTIONS = [ [ '.ocr_par',   "\n\n" ],
+                      [ '.ocr_line',  "\n"   ],
+                      [ '.ocrx_word', " "    ] ]
+
     def initialize
       @pages_to_ocr = []
     end
@@ -66,16 +72,20 @@ def extract_from_ocr(pdf, pages)
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
-          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
-          clean_text(file + '.txt') if @clean_ocr
+          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{@config} 2>&1"
+          run "cp #{escaped_tiff} #{base_path}_#{page}.tif" if @gen_hocr
+          clean_ocr(file) if @clean_ocr
+          generate_text_and_annotate(file) if @gen_hocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
-        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
-        clean_text(base_path + '.txt') if @clean_ocr
+        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{@config} 2>&1"
+        run "cp #{escaped_tiff} #{base_path}.tif" if @gen_hocr
+        clean_ocr(base_path) if @clean_ocr
+        generate_text_and_annotate(base_path) if @gen_hocr
       end
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
@@ -84,15 +94,69 @@ def extract_from_ocr(pdf, pages)
 
     private
 
-    def clean_text(file)
-      File.open(file, 'r+') do |f|
-        text = f.read
+    def clean_ocr(basename)
+      ext = @gen_hocr ? "html" : "txt"
+      File.open(basename + ".#{ext}", 'r+') do |f|
+        content = f.read
         f.truncate(0)
         f.rewind
-        f.write(Docsplit.clean_text(text))
+        meth = @gen_hocr ? "hocr" : "text"
+        f.write(Docsplit.send("clean_#{meth}".to_sym, content))
       end
     end
 
+    # When generating hOCR output, tesseract doesn't generate text output.
+    # This method will generate the text output, and also add the corresponding
+    # character position of the words back into the hOCR file as HTML data
+    # attributes.
+    def generate_text_and_annotate(basename)
+      File.open(basename + '.txt', 'w') do |output|
+        File.open(basename + '.html', 'r+') do |input|
+          xml = Nokogiri::XML(input.read)
+          generate_text_position(xml) do |text, pos, elt|
+            # Write the output text file
+            output.write(text)
+
+            # Annotate the hOCR element we are given
+            if elt
+              elt['data-start'] = pos
+              elt['data-stop' ] = pos + text.size
+            end
+          end
+          input.truncate(0)
+          input.rewind
+          input.write(xml.to_xml)
+        end
+      end
+    end
+
+    def generate_text_position(root, index=0, pos=0, &block)
+      raise RuntimeError, "bad section list" if index >= HOCR_SECTIONS.size
+      # Select the sections we want at this level
+      sections = root.css(HOCR_SECTIONS[index][0])
+      sections.each do |section|
+        if index < HOCR_SECTIONS.size - 1
+          # It is not the base section, so recurse.
+          pos = generate_text_position(section, index + 1, pos, &block)
+        else
+          # It is the base section (a word), so emit the
+          # text and the xml element so the caller can
+          # annotate.
+          block.call(section.text, pos, section) if block
+          pos += section.text.size
+        end
+
+        # We 'join' the sections with the specified separator.
+        # Emit the section join text, but without the xml
+        # element, since this is just generated text.
+        if section != sections.last
+          block.call(HOCR_SECTIONS[index][1], pos, nil) if block
+          pos += HOCR_SECTIONS[index][1].size
+        end
+      end
+      pos
+    end
+
     # Run an external process and raise an exception if it fails.
     def run(command)
       result = `#{command}`
@@ -123,8 +187,19 @@ def extract_options(options)
       @forbid_ocr = options[:ocr] == false
       @clean_ocr  = !(options[:clean] == false)
       @language   = options[:language] || 'eng'
+      @gen_hocr   = check_tesseract_config(options[:config])
+      @config     = options[:config] || ''
+    end
+
+    # Does the config file enable hOCR? The last `tessedit_create_hocr` line
+    # wins, and any value other than "0" counts as enabled.
+    def check_tesseract_config(config)
+      return false unless config
+      hocr_configs = File.readlines(config).grep(/tessedit_create_hocr/)
+      return false if hocr_configs.empty?
+      hocr_configs.last.split[1] != "0"
+    end
 
   end
 
-end
\ No newline at end of file
+end
diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb
index 00d24e3..09444dd 100755
--- a/test/unit/test_extract_text.rb
+++ b/test/unit/test_extract_text.rb
@@ -1,5 +1,6 @@
 here = File.expand_path(File.dirname(__FILE__))
 require File.join(here, '..', 'test_helper')
+require 'fileutils'
 require 'tmpdir'
 
 class ExtractTextTest < Test::Unit::TestCase
@@ -38,6 +39,33 @@ def test_ocr_extraction
     end
   end
 
+  def test_hocr_extraction
+    # Create a config that enables hOCR output
+    FileUtils.mkdir_p(OUTPUT)
+    File.write("#{OUTPUT}/config", "tessedit_create_hocr 1")
+
+    Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :config => "#{OUTPUT}/config")
+
+    # Remove the file to avoid polluting the tests below
+    FileUtils.rm("#{OUTPUT}/config")
+
+    files = []
+    4.times do |i|
+      file = "corrosion_#{i + 1}.txt"
+      files.push(file)
+      assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size"
+      # This page does not need OCR, so skip the hOCR and tif checks.
+      next if i == 2
+      file = "corrosion_#{i + 1}.html"
+      files.push(file)
+      assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with annotated html should have reasonable size"
+      file = "corrosion_#{i + 1}.tif"
+      files.push(file)
+      assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that tif file should have reasonable size"
+    end
+    assert_directory_contains(OUTPUT, files)
+  end
+
   def test_ocr_extraction_in_mock_language
     exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
     assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"