From 9967ca797b71e985acc4c7caa292035f7d6a50c6 Mon Sep 17 00:00:00 2001
From: James Mead <3169+floehopper@users.noreply.github.com>
Date: Mon, 2 Jul 2018 21:41:56 +0100
Subject: [PATCH] Fix escaping when extracting text using OCR

Previously the output base path passed to the tesseract command was not
shell-escaped. For a path containing spaces or quotes, this meant that
the output filename was truncated and did not match the filename
expected by Docsplit::TextExtractor#clean_text, resulting in the
following exception:

    Errno::ENOENT: No such file or directory @ rb_sysopen - test/output/PDF file with spaces 'single' and "double quotes".txt
    /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:90:in `initialize'
    /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:90:in `open'
    /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:90:in `clean_text'
    /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:80:in `extract_from_ocr'
    /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:36:in `block in extract'
    /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:32:in `each'
    /Users/jamesmead/Code/freerange/docsplit/lib/docsplit/text_extractor.rb:32:in `extract'
    /Users/jamesmead/Code/freerange/docsplit/lib/docsplit.rb:52:in `extract_text'
    test/unit/test_extract_text.rb:58:in `test_name_escaping_while_extracting_text_using_ocr'
---
 lib/docsplit/text_extractor.rb | 2 +-
 test/unit/test_extract_text.rb | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 93973f6..e78b832 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -76,7 +76,7 @@ def extract_from_ocr(pdf, pages)
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
         #if the user says don't do orientation detection or the plugin is not installed, set psm to 0
-        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{psm} 2>&1"
+        run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l #{@language} #{psm} 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb
index fa46180..eaa9f56 100755
--- a/test/unit/test_extract_text.rb
+++ b/test/unit/test_extract_text.rb
@@ -49,11 +49,16 @@ def test_password_protected
     end
   end
 
-  def test_name_escaping_while_extracting_text
+  def test_name_escaping_while_extracting_text_into_pages
     Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT)
     assert Dir["#{OUTPUT}/*.txt"].length == 2
   end
-  
+
+  def test_name_escaping_while_extracting_text_using_ocr
+    Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :ocr => true, :output => OUTPUT)
+    assert Dir["#{OUTPUT}/*.txt"].length == 1
+  end
+
   def test_orientation_detected_ocr_extraction
     if Docsplit::DEPENDENCIES[:osd]
       pages = 1..4