From 560a3ea9573b21034e76a5b3aff051ecaa7b28de Mon Sep 17 00:00:00 2001 From: Joe Hosteny Date: Thu, 7 Nov 2013 16:16:17 -0500 Subject: [PATCH] Enable specifying a config file. If a config file is specified, see if hocr output is enabled. If so, we also generate the text file from the hocr output, and back annotate the hocr output with word positions in HTML data attributes. --- docsplit.gemspec | 4 +- lib/docsplit.rb | 5 ++ lib/docsplit/command_line.rb | 5 +- lib/docsplit/text_cleaner.rb | 36 +++++++++---- lib/docsplit/text_extractor.rb | 93 ++++++++++++++++++++++++++++++---- test/unit/test_extract_text.rb | 28 ++++++++++ 6 files changed, 151 insertions(+), 20 deletions(-) diff --git a/docsplit.gemspec b/docsplit.gemspec index d3526bb..2ffb04c 100755 --- a/docsplit.gemspec +++ b/docsplit.gemspec @@ -21,4 +21,6 @@ Gem::Specification.new do |s| s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*', 'docsplit.gemspec', 'LICENSE', 'README'] -end \ No newline at end of file + + s.add_dependency "nokogiri" +end diff --git a/lib/docsplit.rb b/lib/docsplit.rb index 5001413..395a40f 100755 --- a/lib/docsplit.rb +++ b/lib/docsplit.rb @@ -79,6 +79,11 @@ def self.clean_text(text) TextCleaner.new.clean(text) end + # Utility method to clean OCR'd text in hOCR output format. + def self.clean_hocr(html) + TextCleaner.new.clean_hocr(html) + end + private # Normalize a value in an options hash for the command line. 
diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb index 7c7af08..f079d45 100755 --- a/lib/docsplit/command_line.rb +++ b/lib/docsplit/command_line.rb @@ -91,6 +91,9 @@ def parse_options opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o| @options[:ocr] = o end + opts.on('-c', '--config [FILE]', 'use the specified config file') do |c| + @options[:config] = c + end opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c| @options[:clean] = false end @@ -119,4 +122,4 @@ def parse_options end -end \ No newline at end of file +end diff --git a/lib/docsplit/text_cleaner.rb b/lib/docsplit/text_cleaner.rb index c4aac01..ceee965 100644 --- a/lib/docsplit/text_cleaner.rb +++ b/lib/docsplit/text_cleaner.rb @@ -1,4 +1,5 @@ require 'strscan' +require 'nokogiri' module Docsplit @@ -32,16 +33,8 @@ class TextCleaner REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/ SINGLETONS = /^[AaIi]$/ - # For the time being, `clean` uses the regular StringScanner, and not the - # multibyte-aware version, coercing to ASCII first. def clean(text) - if String.method_defined?(:encode) - text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?') - else - require 'iconv' unless defined?(Iconv) - text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first - end - + text = get_conversion_method.call(text) scanner = StringScanner.new(text) cleaned = [] spaced = false @@ -60,6 +53,31 @@ def clean(text) end end + # When cleaning hOCR output, we follow a slightly simplified cleaning + # heuristic. Simply look at the individual word embedded within the + # XML text node that is a child of the XML element with the class + # attribute set to 'ocrx_word'. If it is garbage, delete that node. 
+ def clean_hocr(xhtml) + convert = get_conversion_method + xml = Nokogiri::XML(xhtml) + xml.css('.ocrx_word').each do |elt| + word = elt.xpath(".//text()").text + elt.remove if garbage(convert.call(word)) + end + xml.to_s + end + + # For the time being, `clean` uses the regular StringScanner, and not the + # multibyte-aware version, coercing to ASCII first. + def get_conversion_method + if String.method_defined?(:encode) + lambda { |text| text.encode('ascii', :invalid => :replace, :undef => :replace, :replace => '?') } + else + require 'iconv' unless defined?(Iconv) + lambda { |text| Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first } + end + end + # Is a given word OCR garbage? def garbage(w) acronym = w =~ ACRONYM diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 0d55f32..b10f6e0 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -1,3 +1,5 @@ +require 'nokogiri' + module Docsplit # Delegates to **pdftotext** and **tesseract** in order to extract text from @@ -21,6 +23,10 @@ class TextExtractor MIN_TEXT_PER_PAGE = 100 # in bytes + HOCR_SECTIONS = [ [ '.ocr_par', "\n\n" ], + [ '.ocr_line', "\n" ], + [ '.ocrx_word', " " ] ] + def initialize @pages_to_ocr = [] end @@ -66,16 +72,20 @@ def extract_from_ocr(pdf, pages) escaped_tiff = ESCAPE[tiff] file = "#{base_path}_#{page}" run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" - run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1" - clean_text(file + '.txt') if @clean_ocr + run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{@config} 2>&1" + run "cp #{escaped_tiff} #{ESCAPE[file + '.tif']} 2>&1" if @gen_hocr + clean_ocr(file) if @clean_ocr + generate_text_and_annotate(file) if @gen_hocr FileUtils.remove_entry_secure tiff end else tiff = "#{tempdir}/#{@pdf_name}.tif" escaped_tiff = ESCAPE[tiff] 
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" - run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1" - clean_text(base_path + '.txt') if @clean_ocr + run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{@config} 2>&1" + run "cp #{escaped_tiff} #{ESCAPE[base_path + '.tif']} 2>&1" if @gen_hocr + clean_ocr(base_path) if @clean_ocr + generate_text_and_annotate(base_path) if @gen_hocr end ensure FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) @@ -84,15 +94,69 @@ def extract_from_ocr(pdf, pages) private - def clean_text(file) - File.open(file, 'r+') do |f| - text = f.read + def clean_ocr(basename) + ext = @gen_hocr ? "html" : "txt" + File.open(basename + ".#{ext}", 'r+') do |f| + content = f.read f.truncate(0) f.rewind - f.write(Docsplit.clean_text(text)) + meth = @gen_hocr ? "hocr" : "text" + f.write(Docsplit.send("clean_#{meth}".to_sym, content)) end end + # When generating hOCR output, tesseract doesn't generate text output. + # This method will generate the text output, and also add the corresponding + # character position of the words back into the hOCR file as HTML data + # attributes. 
+ def generate_text_and_annotate(basename) + File.open(basename + '.txt', 'w') do |output| + File.open(basename + '.html', 'r+') do |input| + xml = Nokogiri::XML(input.read) + generate_text_position(xml) do |text, pos, elt| + # Write the output text file + output.write(text) + + # Annotate the hOCR element we are given + if elt + elt['data-start'] = pos + elt['data-stop' ] = pos + text.size + end + end + input.truncate(0) + input.rewind + input.write(xml.to_xml) + end + end + end + + def generate_text_position(root, index=0, pos=0, &block) + raise RuntimeError, "bad section list" if index >= HOCR_SECTIONS.size + # Select the sections we want at this level + sections = root.css(HOCR_SECTIONS[index][0]) + sections.each do |section| + if index < HOCR_SECTIONS.size - 1 + # It is not the base section, so recurse. + pos = generate_text_position(section, index + 1, pos, &block) + else + # It is the base section (a word), so emit the + # text and the xml element so the caller can + # annotate. + block.call(section.text, pos, section) if block + pos += section.text.size + end + + # We 'join' the sections with the specified separator. + # Emit the section join text, but without the xml + # element, since this is just generated text. + if section != sections.last + block.call(HOCR_SECTIONS[index][1], pos, nil) if block + pos += HOCR_SECTIONS[index][1].size + end + end + pos + end + # Run an external process and raise an exception if it fails. 
def run(command) result = `#{command}` @@ -123,8 +187,19 @@ def extract_options(options) @forbid_ocr = options[:ocr] == false @clean_ocr = !(options[:clean] == false) @language = options[:language] || 'eng' + @gen_hocr = check_tesseract_config(options[:config]) + @config = options[:config] ? ESCAPE[options[:config]] : '' + end + + def check_tesseract_config(config) + return false unless config + hocr_configs = File.readlines(config).grep(/tessedit_create_hocr/) + if hocr_configs.size > 0 + return hocr_configs.last.split[1] != "0" + end + false end end -end \ No newline at end of file +end diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb index 00d24e3..09444dd 100755 --- a/test/unit/test_extract_text.rb +++ b/test/unit/test_extract_text.rb @@ -1,5 +1,6 @@ here = File.expand_path(File.dirname(__FILE__)) require File.join(here, '..', 'test_helper') +require 'fileutils' require 'tmpdir' class ExtractTextTest < Test::Unit::TestCase @@ -38,6 +39,33 @@ def test_ocr_extraction end end + def test_hocr_extraction + # Create a config that enables hOCR output + FileUtils.mkdir_p(OUTPUT) + File.write("#{OUTPUT}/config", "tessedit_create_hocr 1") + + Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :config => "#{OUTPUT}/config") + + # Remove the file to avoid polluting the tests below + FileUtils.rm("#{OUTPUT}/config") + + files = [] + 4.times do |i| + file = "corrosion_#{i + 1}.txt" + files.push(file) + assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size" + # This page does not need OCR. 
+ next if i == 2 + file = "corrosion_#{i + 1}.html" + files.push(file) + assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with annotated html should have reasonable size" + file = "corrosion_#{i + 1}.tif" + files.push(file) + assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that tif file should have reasonable size" + end + assert_directory_contains(OUTPUT, files) + end + def test_ocr_extraction_in_mock_language exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")} assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"