🎁 Adding PDF Split Page Checks

Prior to this commit, if we'd already pre-processed a PDF split, we would again re-process that split (as there was no check for existing pages). With this commit, we check for those pre-processed pages. One critical bit of conversation, is that one work might have multiple PDFs uploaded. Therefore, it is important to have those PDFs pages write to different "sub-directories". I'm putting this hear so we can account for that in a test audit of some kind. Related to: - https://github.com/scientist-softserv/adventist-dl/issues/330 - notch8/iiif_print#220 Co-authored-by: Rob Kaufman <[email protected]> Co-authored-by: Kirk Wang <[email protected]>
notch8 · May 25, 2023 · 85f0c37 · 85f0c37
1 parent 046a358
commit 85f0c37
Show file tree

Hide file tree

Showing 9 changed files with 124 additions and 28 deletions.
diff --git a/lib/derivative_rodeo/generators/base_generator.rb b/lib/derivative_rodeo/generators/base_generator.rb
@@ -170,7 +170,7 @@ def destination(input_location)
         return output_location unless preprocessed_location_template
 
         preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template)
-        # We only want
+        # We only want the location if it exists
         return preprocessed_location if preprocessed_location&.exist?
 
         # NOTE: The file does not exist at the output_location; but we pass this information along so

diff --git a/lib/derivative_rodeo/generators/pdf_split_generator.rb b/lib/derivative_rodeo/generators/pdf_split_generator.rb
@@ -25,12 +25,36 @@ class PdfSplitGenerator < BaseGenerator
       # @note This must include "%d" in the returning value, as that is how Ghostscript will assign
       # the page number.
       #
-      # @note I have extracted this function to make it abundantly clear the expected filename of
-      # each split image.
+      # @note I have extracted this function to make it abundantly clear the expected location
+      # each split image.  Further there is an interaction in this
+      #
+      # @see #existing_page_locations
       def image_file_basename_template(basename:)
-        # TODO: Rather urgently we need to decide if this is a reasonable format?  Do we want to
-        # have subfolders instead?  Will that make it easier to find things.
-        "#{basename}-page%d.#{output_extension}"
+        # We can do this because the temp files are always local; and we'll need to modify how we
+        # write these files.
+        "pages/#{basename}-%d.#{output_extension}"
+      end
+
+      ##
+      # We want to check the output location and pre-processed location for the existence of already
+      # split pages.  This method checks both places.
+      #
+      # @param input_location [StorageLocations::BaseLocation]
+      #
+      # @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
+      #         with :tail_glob.
+      #
+      # @note There is relation to {Generators::BaseGenerator#destination} and this method.
+      def existing_page_locations(input_location:)
+        # TODO: Are we adequately accounting for the directory structure necessary to have a work have
+        # more than one PDF and then split each PDF's pages into the correct sub directory?
+        tail_glob = "pages/*.#{output_extension}"
+        output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
+        return output_locations if output_locations.count.positive?
+
+        return [] if preprocessed_location_template.blank?
+
+        input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_glob: tail_glob)
       end
 
       ##
@@ -46,20 +70,29 @@ def image_file_basename_template(basename:)
       # @yieldparam image_path [String] where to find this file in the tmp space
       #
       # @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
+      # rubocop:disable Metrics/MethodLength
       def with_each_requisite_location_and_tmp_file_path
         input_files.each do |input_location|
           input_location.with_existing_tmp_path do |input_tmp_file_path|
-            Services::PdfSplitter.call(
-              input_tmp_file_path,
-              image_extension: output_extension,
-              image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
-            ).each do |image_path|
+            ## We want a single call for a directory listing of the image_file_basename_template
+            generated_files = existing_page_locations(input_location: input_location)
+
+            if generated_files.count.zero?
+              generated_files = Services::PdfSplitter.call(
+                input_tmp_file_path,
+                image_extension: output_extension,
+                image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
+              )
+            end
+
+            generated_files.each do |image_path|
               image_location = StorageLocations::FileLocation.new("file://#{image_path}")
               yield(image_location, image_path)
             end
           end
         end
       end
+      # rubocop:enable Metrics/MethodLength
     end
   end
 end
diff --git a/lib/derivative_rodeo/services/pdf_splitter/base.rb b/lib/derivative_rodeo/services/pdf_splitter/base.rb
@@ -2,8 +2,6 @@
 
 require 'open3'
 require 'securerandom'
-require 'tmpdir'
-
 module DerivativeRodeo
   module Services
     ##
@@ -24,13 +22,12 @@ module PdfSplitter
       #        for an image "split" from the given PDF.  It must include "%d" as part of the
       #        declaration.  For example if the template is "hello-world-%d.png" then the first
       #        split page will be "hello-world-1.png".
-      # @param tmpdir [String] place to perform the "work" of splitting the PDF.
       #
       # @return [Enumerable, Utilities::PdfSplitter::Base, #each] see {Base#each}
-      def self.call(path, image_extension:, image_file_basename_template:, tmpdir: File.dirname(path))
+      def self.call(path, image_extension:, image_file_basename_template:)
         klass_name = "#{image_extension.to_s.classify}_page".classify
         klass = "DerivativeRodeo::Services::PdfSplitter::#{klass_name}".constantize
-        klass.new(path, tmpdir: tmpdir, image_file_basename_template: image_file_basename_template)
+        klass.new(path, image_file_basename_template: image_file_basename_template)
       end
 
       ##
@@ -52,14 +49,15 @@ class Base
 
         def initialize(path,
                        image_file_basename_template:,
-                       # TODO: Do we need to provide the :tmpdir for the application?  Based on
-                       # implementation, no, this can be extracted from the provided path.
-                       tmpdir: Dir.mktmpdir,
                        pdf_pages_summary: PagesSummary.extract_from(path: path))
           @pdfpath = path
           @pdf_pages_summary = pdf_pages_summary
-          @tmpdir = tmpdir
-          @ghost_script_output_file_template = File.join(tmpdir, image_file_basename_template)
+          @ghost_script_output_file_template = File.join(File.dirname(path), image_file_basename_template)
+
+          # We need to ensure that this temporary directory exists so we can write the files to it.
+          # Fortunately, because this file space must be "local" tmp dir, we don't need to work
+          # through any of the location antics of {StorageLocations::BaseLocation}.
+          FileUtils.mkdir_p(File.dirname(@ghost_script_output_file_template))
         end
 
         attr_reader :ghost_script_output_file_template
@@ -82,8 +80,8 @@ def invalid_pdf?
           !pdf_pages_summary.valid?
         end
 
-        attr_reader :pdf_pages_summary, :tmpdir, :basename, :pdfpath
-        private :pdf_pages_summary, :tmpdir, :basename, :pdfpath
+        attr_reader :pdf_pages_summary, :basename, :pdfpath
+        private :pdf_pages_summary, :basename, :pdfpath
 
         # @api private
         def gsdevice

diff --git a/lib/derivative_rodeo/storage_locations/base_location.rb b/lib/derivative_rodeo/storage_locations/base_location.rb
@@ -206,6 +206,22 @@ def derived_file_from(template:)
         klass.build(from_uri: file_path, template: template)
       end
 
+      ##
+      # When you have a known location and want to check for files that are within that location,
+      # use the #globbed_tail_locations method.  In the case of {Generators::PdfSplitGenerator} we
+      # need to know the path to the all of the image files we "split" off of the given PDF.
+      #
+      # We can use the :file_path as the prefix the given :tail_glob as the suffix for a "fully
+      # qualified" Dir.glob type search.
+      #
+      # @param tail_glob [String]
+      #
+      # @return [StorageLocations::BaseLocation] when there is one or more files at the location
+      # @return [NilClass] when there are no files
+      def globbed_tail_locations(tail_glob:)
+        raise NotImplementedError, "#{self.class}#globbed_locations"
+      end
+
       ##
       # @param extension [String, StorageLocations::SAME]
       # @return [String] the path for the new extension; when given {StorageLocations::SAME} re-use

diff --git a/lib/derivative_rodeo/storage_locations/file_location.rb b/lib/derivative_rodeo/storage_locations/file_location.rb
@@ -34,6 +34,10 @@ def write
         FileUtils.cp_r(tmp_file_path, file_path)
         file_uri
       end
+
+      def globbed_tail_locations(tail_glob:)
+        Dir.glob(File.join(file_dir, tail_glob))
+      end
     end
   end
 end
diff --git a/lib/derivative_rodeo/storage_locations/s3_location.rb b/lib/derivative_rodeo/storage_locations/s3_location.rb
@@ -53,6 +53,28 @@ def exist?
         bucket.objects(prefix: file_path).count.positive?
       end
 
+      ##
+      # @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
+      #
+      # @note S3 allows searching on a prefix but does not allow for "wildcard" searches.  We can
+      #       use the components of the file_path to fake that behavior.
+      def globbed_tail_locations(tail_glob:)
+        # file_path = "s3://blah/1234/hello-world/pages/*.tiff"
+        #
+        # NOTE: Should we be storing our files as such?  The pattern we need is
+        # :parent_identifier/:file_set_identifier/files There are probably cases where a work has
+        # more than one PDF (that we intend to split); we don't want to trample on those split files
+        # and miscolate two PDFs.
+        #
+        # file_path = "s3://blah/1234/hello-world/hello-world.pdf
+        # TODO: Should file_path be file_dir?
+        globname = File.join(file_path, tail_glob)
+        regexp = %r{#{File.extname(globname)}$}
+        bucket.objects(prefix: File.dirname(globname)).flat_map do |object|
+          derived_file_from(object.key) if object.key.match(regexp)
+        end
+      end
+
       ##
       # @api public
       # write the tmp file to the file_uri

diff --git a/spec/derivative_rodeo/storage_locations/file_location_spec.rb b/spec/derivative_rodeo/storage_locations/file_location_spec.rb
@@ -68,4 +68,12 @@
     end
   end
   xit "write cases or mark private the rest of the methods"
+
+  context '#globbed_tail_locations' do
+    let(:args) { "file://#{__FILE__}" }
+
+    it "searches for files within the file_dir that match the given glob" do
+      expect(instance.globbed_tail_locations(tail_glob: "*.rb")).to include(__FILE__)
+    end
+  end
 end
diff --git a/spec/derivative_rodeo/storage_locations/s3_location_spec.rb b/spec/derivative_rodeo/storage_locations/s3_location_spec.rb
@@ -66,4 +66,16 @@
 
   xit "write additional write cases"
   xit "write cases or mark private the rest of the methods"
+
+  describe '#globbed_tail_locations' do
+    it 'searched the bucket' do
+      # The subject's bucket is not the same as the above bucket
+      subject.bucket = bucket
+      basename_ish = short_path.split(".").first
+      key = File.join(basename_ish, File.basename(__FILE__))
+      bucket.object(key).upload_file(__FILE__)
+
+      subject.globbed_tail_locations(tail_glob: "*.rb")
+    end
+  end
 end
diff --git a/spec/support/aws_s3_faux_bucket.rb b/spec/support/aws_s3_faux_bucket.rb
@@ -11,20 +11,23 @@ def initialize
   attr_reader :storage
   def object(path)
     # Yup, we've got nested buckets
-    @storage[path] ||= Storage.new
+    @storage[path] ||= Storage.new(key: path)
   end
 
   def objects(prefix:)
-    @storage.select do |path, _file|
-      path.start_with?(prefix)
+    @storage.each_with_object([]) do |(path, obj), accumulator|
+      accumulator << obj if path.start_with?(prefix)
     end
   end
 
   class Storage
-    def initialize
+    # Because we're now coping with the glob tail finder, we need to account for the bucket entry's
+    # key.
+    def initialize(key:)
+      @key = key
       @storage = {}
     end
-    attr_reader :storage
+    attr_reader :storage, :key
 
     def upload_file(path)
       @storage[:upload] = path