diff --git a/lib/derivative_rodeo/generators/base_generator.rb b/lib/derivative_rodeo/generators/base_generator.rb index 85da0b1..4589323 100644 --- a/lib/derivative_rodeo/generators/base_generator.rb +++ b/lib/derivative_rodeo/generators/base_generator.rb @@ -170,7 +170,7 @@ def destination(input_location) return output_location unless preprocessed_location_template preprocessed_location = input_location.derived_file_from(template: preprocessed_location_template) - # We only want + # We only want the location if it exists return preprocessed_location if preprocessed_location&.exist? # NOTE: The file does not exist at the output_location; but we pass this information along so diff --git a/lib/derivative_rodeo/generators/pdf_split_generator.rb b/lib/derivative_rodeo/generators/pdf_split_generator.rb index d644b3a..87d019b 100644 --- a/lib/derivative_rodeo/generators/pdf_split_generator.rb +++ b/lib/derivative_rodeo/generators/pdf_split_generator.rb @@ -25,12 +25,36 @@ class PdfSplitGenerator < BaseGenerator # @note This must include "%d" in the returning value, as that is how Ghostscript will assign # the page number. # - # @note I have extracted this function to make it abundantly clear the expected filename of - # each split image. + # @note I have extracted this function to make it abundantly clear the expected location + # each split image. Further there is an interaction in this + # + # @see #existing_page_locations def image_file_basename_template(basename:) - # TODO: Rather urgently we need to decide if this is a reasonable format? Do we want to - # have subfolders instead? Will that make it easier to find things. - "#{basename}-page%d.#{output_extension}" + # We can do this because the temp files are always local; and we'll need to modify how we + # write these files. + "pages/#{basename}-%d.#{output_extension}" + end + + ## + # We want to check the output location and pre-processed location for the existence of already + # split pages. This method checks both places. + # + # @param input_location [StorageLocations::BaseLocation] + # + # @return [Enumerable] the files at the given :input_location + # with :tail_glob. + # + # @note There is relation to {Generators::BaseGenerator#destination} and this method. + def existing_page_locations(input_location:) + # TODO: Are we adequately accounting for the directory structure necessary to have a work have + # more than one PDF and then split each PDF's pages into the correct sub directory? + tail_glob = "pages/*.#{output_extension}" + output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob) + return output_locations if output_locations.count.positive? + + return [] if preprocessed_location_template.blank? + + input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_glob: tail_glob) end ## @@ -46,20 +70,29 @@ def image_file_basename_template(basename:) # @yieldparam image_path [String] where to find this file in the tmp space # # @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion + # rubocop:disable Metrics/MethodLength def with_each_requisite_location_and_tmp_file_path input_files.each do |input_location| input_location.with_existing_tmp_path do |input_tmp_file_path| - Services::PdfSplitter.call( - input_tmp_file_path, - image_extension: output_extension, - image_file_basename_template: image_file_basename_template(basename: input_location.file_basename) - ).each do |image_path| + ## We want a single call for a directory listing of the image_file_basename_template + generated_files = existing_page_locations(input_location: input_location) + + if generated_files.count.zero? + generated_files = Services::PdfSplitter.call( + input_tmp_file_path, + image_extension: output_extension, + image_file_basename_template: image_file_basename_template(basename: input_location.file_basename) + ) + end + + generated_files.each do |image_path| image_location = StorageLocations::FileLocation.new("file://#{image_path}") yield(image_location, image_path) end end end end + # rubocop:enable Metrics/MethodLength end end end diff --git a/lib/derivative_rodeo/services/pdf_splitter/base.rb b/lib/derivative_rodeo/services/pdf_splitter/base.rb index 054c5f2..c59a22a 100644 --- a/lib/derivative_rodeo/services/pdf_splitter/base.rb +++ b/lib/derivative_rodeo/services/pdf_splitter/base.rb @@ -2,8 +2,6 @@ require 'open3' require 'securerandom' -require 'tmpdir' - module DerivativeRodeo module Services ## @@ -24,13 +22,12 @@ module PdfSplitter # for an image "split" from the given PDF. It must include "%d" as part of the # declaration. For example if the template is "hello-world-%d.png" then the first # split page will be "hello-world-1.png". - # @param tmpdir [String] place to perform the "work" of splitting the PDF. # # @return [Enumerable, Utilities::PdfSplitter::Base, #each] see {Base#each} - def self.call(path, image_extension:, image_file_basename_template:, tmpdir: File.dirname(path)) + def self.call(path, image_extension:, image_file_basename_template:) klass_name = "#{image_extension.to_s.classify}_page".classify klass = "DerivativeRodeo::Services::PdfSplitter::#{klass_name}".constantize - klass.new(path, tmpdir: tmpdir, image_file_basename_template: image_file_basename_template) + klass.new(path, image_file_basename_template: image_file_basename_template) end ## @@ -52,14 +49,15 @@ class Base def initialize(path, image_file_basename_template:, - # TODO: Do we need to provide the :tmpdir for the application? Based on - # implementation, no, this can be extracted from the provided path. - tmpdir: Dir.mktmpdir, pdf_pages_summary: PagesSummary.extract_from(path: path)) @pdfpath = path @pdf_pages_summary = pdf_pages_summary - @tmpdir = tmpdir - @ghost_script_output_file_template = File.join(tmpdir, image_file_basename_template) + @ghost_script_output_file_template = File.join(File.dirname(path), image_file_basename_template) + + # We need to ensure that this temporary directory exists so we can write the files to it. + # Fortunately, because this file space must be "local" tmp dir, we don't need to work + # through any of the location antics of {StorageLocations::BaseLocation}. + FileUtils.mkdir_p(File.dirname(@ghost_script_output_file_template)) end attr_reader :ghost_script_output_file_template @@ -82,8 +80,8 @@ def invalid_pdf? !pdf_pages_summary.valid? end - attr_reader :pdf_pages_summary, :tmpdir, :basename, :pdfpath - private :pdf_pages_summary, :tmpdir, :basename, :pdfpath + attr_reader :pdf_pages_summary, :basename, :pdfpath + private :pdf_pages_summary, :basename, :pdfpath # @api private def gsdevice diff --git a/lib/derivative_rodeo/storage_locations/base_location.rb b/lib/derivative_rodeo/storage_locations/base_location.rb index 1a62e18..c1b6960 100644 --- a/lib/derivative_rodeo/storage_locations/base_location.rb +++ b/lib/derivative_rodeo/storage_locations/base_location.rb @@ -206,6 +206,22 @@ def derived_file_from(template:) klass.build(from_uri: file_path, template: template) end + ## + # When you have a known location and want to check for files that are within that location, + # use the #globbed_tail_locations method. In the case of {Generators::PdfSplitGenerator} we + # need to know the path to the all of the image files we "split" off of the given PDF. + # + # We can use the :file_path as the prefix the given :tail_glob as the suffix for a "fully + # qualified" Dir.glob type search. + # + # @param tail_glob [String] + # + # @return [StorageLocations::BaseLocation] when there is one or more files at the location + # @return [NilClass] when there are no files + def globbed_tail_locations(tail_glob:) + raise NotImplementedError, "#{self.class}#globbed_locations" + end + ## # @param extension [String, StorageLocations::SAME] # @return [String] the path for the new extension; when given {StorageLocations::SAME} re-use diff --git a/lib/derivative_rodeo/storage_locations/file_location.rb b/lib/derivative_rodeo/storage_locations/file_location.rb index d2310b9..eb9b25e 100644 --- a/lib/derivative_rodeo/storage_locations/file_location.rb +++ b/lib/derivative_rodeo/storage_locations/file_location.rb @@ -34,6 +34,10 @@ def write FileUtils.cp_r(tmp_file_path, file_path) file_uri end + + def globbed_tail_locations(tail_glob:) + Dir.glob(File.join(file_dir, tail_glob)) + end end end end diff --git a/lib/derivative_rodeo/storage_locations/s3_location.rb b/lib/derivative_rodeo/storage_locations/s3_location.rb index 26a51b8..51233a7 100644 --- a/lib/derivative_rodeo/storage_locations/s3_location.rb +++ b/lib/derivative_rodeo/storage_locations/s3_location.rb @@ -53,6 +53,28 @@ def exist? bucket.objects(prefix: file_path).count.positive? end + ## + # @return [Enumerable] + # + # @note S3 allows searching on a prefix but does not allow for "wildcard" searches. We can + # use the components of the file_path to fake that behavior. + def globbed_tail_locations(tail_glob:) + # file_path = "s3://blah/1234/hello-world/pages/*.tiff" + # + # NOTE: Should we be storing our files as such? The pattern we need is + # :parent_identifier/:file_set_identifier/files There are probably cases where a work has + # more than one PDF (that we intend to split); we don't want to trample on those split files + # and miscolate two PDFs. + # + # file_path = "s3://blah/1234/hello-world/hello-world.pdf + # TODO: Should file_path be file_dir? + globname = File.join(file_path, tail_glob) + regexp = %r{#{File.extname(globname)}$} + bucket.objects(prefix: File.dirname(globname)).flat_map do |object| + derived_file_from(object.key) if object.key.match(regexp) + end + end + ## # @api public # write the tmp file to the file_uri diff --git a/spec/derivative_rodeo/storage_locations/file_location_spec.rb b/spec/derivative_rodeo/storage_locations/file_location_spec.rb index 5ad25cc..5857c64 100644 --- a/spec/derivative_rodeo/storage_locations/file_location_spec.rb +++ b/spec/derivative_rodeo/storage_locations/file_location_spec.rb @@ -68,4 +68,12 @@ end end xit "write cases or mark private the rest of the methods" + + context '#globbed_tail_locations' do + let(:args) { "file://#{__FILE__}" } + + it "searches for files within the file_dir that match the given glob" do + expect(instance.globbed_tail_locations(tail_glob: "*.rb")).to include(__FILE__) + end + end end diff --git a/spec/derivative_rodeo/storage_locations/s3_location_spec.rb b/spec/derivative_rodeo/storage_locations/s3_location_spec.rb index b6b4355..cd4f902 100644 --- a/spec/derivative_rodeo/storage_locations/s3_location_spec.rb +++ b/spec/derivative_rodeo/storage_locations/s3_location_spec.rb @@ -66,4 +66,16 @@ xit "write additional write cases" xit "write cases or mark private the rest of the methods" + + describe '#globbed_tail_locations' do + it 'searched the bucket' do + # The subject's bucket is not the same as the above bucket + subject.bucket = bucket + basename_ish = short_path.split(".").first + key = File.join(basename_ish, File.basename(__FILE__)) + bucket.object(key).upload_file(__FILE__) + + subject.globbed_tail_locations(tail_glob: "*.rb") + end + end end diff --git a/spec/support/aws_s3_faux_bucket.rb b/spec/support/aws_s3_faux_bucket.rb index c52203f..eb2e560 100644 --- a/spec/support/aws_s3_faux_bucket.rb +++ b/spec/support/aws_s3_faux_bucket.rb @@ -11,20 +11,23 @@ def initialize attr_reader :storage def object(path) # Yup, we've got nested buckets - @storage[path] ||= Storage.new + @storage[path] ||= Storage.new(key: path) end def objects(prefix:) - @storage.select do |path, _file| - path.start_with?(prefix) + @storage.each_with_object([]) do |(path, obj), accumulator| + accumulator << obj if path.start_with?(prefix) end end class Storage - def initialize + # Because we're now coping with the glob tail finder, we need to account for the bucket entry's + # key. + def initialize(key:) + @key = key @storage = {} end - attr_reader :storage + attr_reader :storage, :key def upload_file(path) @storage[:upload] = path