From 9967ca797b71e985acc4c7caa292035f7d6a50c6 Mon Sep 17 00:00:00 2001 From: James Mead <3169+floehopper@users.noreply.github.com> Date: Mon, 2 Jul 2018 21:41:56 +0100 Subject: [PATCH] Fix escaping when extracting text using OCR Previously the output filename passed to the tesseract command was not shell-escaped. This meant that the filename was truncated and did not match the filename expected by Docsplit::TextExtractor#clean_text resulting in the following exception: Errno::ENOENT: No such file or directory @ rb_sysopen - test/output/PDF file with spaces 'single' and "double quotes".txt /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:90:in `initialize' /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:90:in `open' /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:90:in `clean_text' /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:80:in `extract_from_ocr' /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:36:in `block in extract' /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:32:in `each' /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:32:in `extract' /Users/jamesmead/Code/freerange/docsplit/lib/docsplit.rb:52:in `extract_text' test/unit/test_extract_text.rb:58:in `test_name_escaping_while_extracting_text_using_ocr' --- lib/docsplit/text_extractor.rb | 2 +- test/unit/test_extract_text.rb | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 93973f6..e78b832 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -76,7 +76,7 @@ def extract_from_ocr(pdf, pages) escaped_tiff = ESCAPE[tiff] run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" #if the user says don't do orientation detection or the plugin is not installed, set psm to 0 - run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1" + run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1" clean_text(base_path + '.txt') if @clean_ocr end ensure diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb index fa46180..eaa9f56 100755 --- a/test/unit/test_extract_text.rb +++ b/test/unit/test_extract_text.rb @@ -49,11 +49,16 @@ def test_password_protected end end - def test_name_escaping_while_extracting_text + def test_name_escaping_while_extracting_text_into_pages Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT) assert Dir["#{OUTPUT}/*.txt"].length == 2 end - + + def test_name_escaping_while_extracting_text_using_ocr + Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :ocr => true, :output => OUTPUT) + assert Dir["#{OUTPUT}/*.txt"].length == 1 + end + def test_orientation_detected_ocr_extraction if Docsplit::DEPENDENCIES[:osd] pages = 1..4