Skip to content

Commit

Permalink
Merge pull request #83 from scientist-softserv/i11-job-split-pdfs-int…
Browse files Browse the repository at this point in the history
…o-child-works

I11 job split pdfs into child works
  • Loading branch information
jeremyf authored Feb 6, 2023
2 parents fd6a001 + 25a43de commit f729884
Show file tree
Hide file tree
Showing 21 changed files with 603 additions and 144 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,4 @@ fcrepo-webapp-*
*.gem
pkg/
*~undo-tree~
.DS_Store
45 changes: 30 additions & 15 deletions app/actors/iiif_print/actors/iiif_print_upload_actor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,20 @@ class IiifPrintUploadActor < Hyrax::Actors::AbstractActor
# An actor which locates all uploaded PDF paths and
# spins off IiifPrint::CreatePagesJob to split them.
def create(env)
# TODO: test what happens when ensure_title is removed.
ensure_title(env)
@pdf_paths = []
hold_upload_paths(env) if responds_to_split?(env.curation_concern)
@prior_pdfs_count = 0
hold_upload_paths(env) if iiif_print?(env)
next_actor.create(env) && after_other_actors(env)
end

def update(env)
# TODO: test what happens when ensure_title is removed.
ensure_title(env)
@pdf_paths = []
hold_upload_paths(env) if responds_to_split?(env.curation_concern)
if iiif_print?(env)
hold_upload_paths(env)
count_existing_pdfs(env)
end
next_actor.update(env) && after_other_actors(env)
end

Expand All @@ -28,16 +30,22 @@ def hold_upload_paths(env)
return if upload_ids.empty?
uploads = Hyrax::UploadedFile.find(upload_ids)
paths = uploads.map(&method(:upload_path))
@pdf_paths = paths.select { |path| path.end_with?('.pdf') }
# TODO: remote routes in bulkrax may not always end in pdf. Consider other
# methods to identify a PDF file.
@pdf_paths = paths.select { |path| path.end_with?('.pdf', '.PDF') }
end

def responds_to_split?(curation_concern)
return true if curation_concern.respond_to?(:split_pdf)
false
def iiif_print?(env)
@iiif_print_defined ||= env.curation_concern.try(:iiif_print_config?)
end

# TODO: find the number of PDFs on the parent work prior to this update, to support adding more PDFs. The env param will then be required in order to read env.curation_concern.
def count_existing_pdfs(_env)
@prior_pdfs_count = 0
end

def after_other_actors(env)
handle_issue_upload(env) if responds_to_split?(env.curation_concern)
handle_issue_upload(env) if iiif_print?(env)
# needs to return true to not break actor stack traversal
true
end
Expand All @@ -47,17 +55,24 @@ def handle_issue_upload(env)
work = env.curation_concern
# must persist work to serialize job using it
work.save!(validate: false)
user = env.current_ability.current_user.user_key
env.attributes[:admin_set_id] ||= default_admin_set
queue_job(work, @pdf_paths, user, env.attributes[:admin_set_id])
user = env.current_ability.current_user
admin_set = env.attributes[:admin_set_id] ||= default_admin_set
queue_job(work, @pdf_paths, user, admin_set, @prior_pdfs_count)
end

def queue_job(work, paths, user, admin_set_id)
IiifPrint::CreatePagesJob.perform_later(
# submit the job
# @param [GenericWork, etc] A valid type of hyrax work
# @param [Array<String>] paths to PDF attachments
# @param [User] user
# @param [String] admin set ID
# @param [Integer] count of PDFs already existing on the parent work
def queue_job(work, paths, user, admin_set_id, prior_pdfs)
work.iiif_print_config.pdf_splitter_job.perform_later(
work,
paths,
user,
admin_set_id
admin_set_id,
prior_pdfs
)
end

Expand Down
4 changes: 0 additions & 4 deletions app/jobs/iiif_print/application_job.rb

This file was deleted.

21 changes: 0 additions & 21 deletions app/jobs/iiif_print/create_pages_job.rb

This file was deleted.

5 changes: 0 additions & 5 deletions app/models/concerns/iiif_print/iiif_print_behavior.rb
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
module IiifPrint
module IiifPrintBehavior
# adds IIIF Print behavior to an object
def split_pdf
true
end

##
# relationship indexing for fileset and works
#
Expand Down
7 changes: 7 additions & 0 deletions app/models/iiif_print/pending_relationship.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
module IiifPrint
  # Database row describing a child work that is waiting to be attached to a
  # parent work. Every row must carry the parent's id, the child's title and
  # the child's ordering key.
  class PendingRelationship < ApplicationRecord
    validates :parent_id, :child_title, :child_order, presence: true
  end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Creates the iiif_print_pending_relationships table: three required string
# columns (child_title, parent_id, child_order), timestamps, and an index on
# parent_id.
class CreateIiifPrintPendingRelationships < ActiveRecord::Migration[5.1]
  def change
    create_table :iiif_print_pending_relationships do |table|
      table.string :child_title, null: false
      table.string :parent_id, null: false
      table.string :child_order, null: false
      table.timestamps
      table.index :parent_id
    end
  end
end
9 changes: 7 additions & 2 deletions lib/iiif_print.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
require "iiif_print/tiff_derivative_service"
require "iiif_print/metadata"
require "iiif_print/works_controller_behavior"
require "iiif_print/jobs/application_job"
require "iiif_print/jobs/child_works_from_pdf_job"
require "iiif_print/jobs/create_relationships_job"
require "iiif_print/split_pdfs/pages_into_images_service"

module IiifPrint
extend ActiveSupport::Autoload
Expand All @@ -40,8 +44,9 @@ def self.config(&block)
end

DEFAULT_MODEL_CONFIGURATION = {
# TODO: This should be a class and not a string; but I don't know what that should just now be.
pdf_splitter_job: "IiifPrint::DefaultPdfSplitterJob",
# Split a PDF into individual page images and create a new child work for each image.
pdf_splitter_job: IiifPrint::Jobs::ChildWorksFromPdfJob,
pdf_splitter_service: IiifPrint::SplitPdfs::PagesIntoImagesService,
derivative_service_plugins: [
IiifPrint::JP2DerivativeService,
IiifPrint::PDFDerivativeService,
Expand Down
6 changes: 6 additions & 0 deletions lib/iiif_print/jobs/application_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
module IiifPrint
module Jobs
# Common ActiveJob superclass for the jobs defined under IiifPrint::Jobs.
# It adds no behavior of its own.
class ApplicationJob < ActiveJob::Base
end
end
end
107 changes: 107 additions & 0 deletions lib/iiif_print/jobs/child_works_from_pdf_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
module IiifPrint
module Jobs
class ChildWorksFromPdfJob < IiifPrint::Jobs::ApplicationJob
# Splits each PDF into pages (queuing child-work creation per PDF), then
# schedules the job that links the new child works to the parent.
#
# @param parent_work [Object] work whose iiif_print_config supplies the child model
# @param pdf_paths [Array<String>] paths to the PDFs
# @param user [User] handed on to the jobs queued from here
# @param admin_set_id [String] remembered for the child works' attributes
# @param prior_pdfs [Integer] count of PDFs already on the parent work
def perform(parent_work, pdf_paths, user, admin_set_id, prior_pdfs)
  @parent_work = parent_work
  @child_admin_set_id = admin_set_id
  page_model = parent_work.iiif_print_config.pdf_split_child_model

  # handle each input pdf
  pdf_paths.each.with_index do |pdf_path, position|
    split_pdf(pdf_path, position, user, prior_pdfs, page_model)
  end

  # TODO: clean up image_files and pdf_paths

  # Link newly created child works to the parent. The job receives the user,
  # the parent work id, and the parent and child model names as strings.
  relationship_args = {
    user: user,
    parent_id: parent_work.id,
    parent_model: parent_work.class.to_s,
    child_model: page_model.to_s
  }
  IiifPrint::Jobs::CreateRelationshipsJob.set(wait: 10.minutes).perform_later(**relationship_args)
end

private

# Splits one PDF into images and queues a BatchCreateJob that creates a
# child work for each image. Nothing is queued when the splitter returns
# no images.
#
# @param path [String] path to the PDF
# @param pdf_idx [Integer] position of this PDF within the current job
# @param user [User]
# @param prior_pdfs_count [Integer] added to pdf_idx to get the PDF's sequence number
# @param child_model [#to_s] model for the child works; passed to the job as a string
def split_pdf(path, pdf_idx, user, prior_pdfs_count, child_model)
  splitter = @parent_work.iiif_print_config.pdf_splitter_service
  page_images = splitter.new(path).to_a
  return if page_images.blank?

  prepare_import_data(pdf_idx + prior_pdfs_count, page_images, user)

  batch_operation = Hyrax::BatchCreateOperation.create!(user: user, operation_type: "PDF Batch Create")
  child_attributes = attributes.merge!(model: child_model.to_s).with_indifferent_access

  # submit the job to create all the child works for one PDF; arguments in order:
  #   user, titles (Hash<String => String>), resource_types (unused here),
  #   uploaded file ids, attributes applied to all works (including :model),
  #   and the batch operation
  BatchCreateJob.perform_later(user, @child_work_titles, {}, @uploaded_files, child_attributes, batch_operation)
end

# Resets and fills @uploaded_files (uploaded file ids as strings) and
# @child_work_titles (file id => title) for one PDF's page images, and
# records a PendingRelationship row for every page.
#
# @param pdf_sequence [Integer] sequence number of the PDF being processed
# @param image_files [Array<String>] paths of the page images
# @param user [User] owner of the uploaded files
def prepare_import_data(pdf_sequence, image_files, user)
  @uploaded_files = []
  @child_work_titles = {}
  image_files.each.with_index do |image_path, page_idx|
    upload_id = create_uploaded_file(user, image_path).to_s
    page_title = set_title(@parent_work.title.first, pdf_sequence, page_idx)
    @uploaded_files.push(upload_id)
    @child_work_titles.store(upload_id, page_title)
    # save child work info to create the member relationships
    PendingRelationship.create!(
      child_title: page_title,
      parent_id: @parent_work.id,
      child_order: sort_order(pdf_sequence, page_idx)
    )
  end
end

# Builds the ordering key stored in PendingRelationship#child_order.
#
# child_order is a string column and the pending rows are read back with
# order('child_order asc'), so the key is compared as text. Both numbers are
# zero-padded so text order matches numeric order; unpadded, "0 10" would
# sort before "0 2" and pages past the tenth would be attached out of order.
#
# @param pdf_sequence [Integer] sequence number of the PDF on the parent work
# @param idx [Integer] zero-based page index within that PDF
# @return [String] e.g. "000000 000010"
def sort_order(pdf_sequence, idx)
  format('%06d %06d', pdf_sequence, idx)
end

# Saves a new Hyrax::UploadedFile that belongs to +user+ and wraps the file
# at +path+.
#
# @param user [User] only its id is read
# @param path [String] path handed to CarrierWave::SanitizedFile
# @return the id of the saved upload record
def create_uploaded_file(user, path)
  Hyrax::UploadedFile.new.tap do |upload|
    upload.user_id = user.id
    upload.file = CarrierWave::SanitizedFile.new(path)
    upload.save!
  end.id
end

# Composes a child work title from the parent title plus one-based PDF and
# page numbers.
#
# @param title [String] parent work title
# @param pdf_sequence [Integer] zero-based PDF sequence
# @param idx [Integer] zero-based page index
# @return [String] e.g. "My Work: Pdf Nbr 1, Page 3"
def set_title(title, pdf_sequence, idx)
  "#{title}: Pdf Nbr #{pdf_sequence + 1}, Page #{idx + 1}"
end

# Attributes applied to every child work: the admin set remembered by
# #perform plus creator, rights statement and visibility copied from the
# parent work. A new Hash is built on every call.
#
# TODO: what attributes do we need to fill in from the parent work? What about AllinsonFlex?
def attributes
  parent = @parent_work
  copied = {}
  copied[:admin_set_id] = @child_admin_set_id.to_s
  copied[:creator] = parent.creator.to_a
  copied[:rights_statement] = parent.rights_statement.to_a
  copied[:visibility] = parent.visibility.to_s
  copied
end
end
end
end
70 changes: 70 additions & 0 deletions lib/iiif_print/jobs/create_relationships_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
module IiifPrint
module Jobs
# Link child works created from a split PDF to their parent work
class CreateRelationshipsJob < IiifPrint::Jobs::ApplicationJob
# Attaches the pending child works to the parent once all of them can be
# found; otherwise queues another run of this job and finishes normally.
#
# @param user [User] used to build the Ability for the parent update
# @param parent_id [String] parent work id
# @param parent_model [String] parent model name
# @param child_model [String] child model name
def perform(user:, parent_id:, parent_model:, child_model:)
  unless completed_child_data_for(parent_id, child_model)
    # some children are not found yet: reschedule the job and end this one normally
    return reschedule(user: user, parent_id: parent_id, parent_model: parent_model, child_model: child_model)
  end

  # add the members, then drop the bookkeeping rows
  parent_work = parent_model.constantize.find(parent_id)
  create_relationships(user: user, parent: parent_work, ordered_child_ids: @child_ids)
  @pending_children.each(&:destroy)
end

private

# Loads @pending_children (the parent's pending rows, ordered by child_order)
# and @child_ids (ids of the works found for them, in that order).
#
# Stops at the first pending row whose title matches no work, leaving
# @child_ids with only the ids collected before it.
#
# @param parent_id [String] parent work id
# @param child_model [String] child model name
# @return [Boolean] true when every pending row matched at least one work
def completed_child_data_for(parent_id, child_model)
  @child_ids = []

  # find and sequence all pending children
  @pending_children = IiifPrint::PendingRelationship.where(parent_id: parent_id).order('child_order asc')

  @pending_children.each do |pending|
    # find by title... if any aren't found, the child works are not yet ready
    matching_ids = find_id_by_title_for(pending.child_title, child_model)
    return false if matching_ids.empty?

    @child_ids.concat(matching_ids)
  end
  true
end

# Looks up records of the given model by title and returns their ids.
#
# @param title [String] title to match
# @param model [String] model name, resolved with constantize
# @return [Array] ids of every match (empty when nothing matches)
def find_id_by_title_for(title, model)
  matches = model.constantize.where(title: title)
  matches.map(&:id)
end

# Queues another CreateRelationshipsJob, with the same arguments, to run
# after a 10 minute wait.
#
# NOTE(review): #perform calls this whenever a child is not found and no
# attempt counter is visible here, so a child that is never created looks
# like it would keep this job rescheduling indefinitely -- confirm.
def reschedule(user:, parent_id:, parent_model:, child_model:)
  retry_args = {
    user: user,
    parent_id: parent_id,
    parent_model: parent_model,
    child_model: child_model
  }
  CreateRelationshipsJob.set(wait: 10.minutes).perform_later(**retry_args)
end

# Adds the child works to the parent by running the Hyrax actor stack's
# update with work_members_attributes keyed by position.
#
# @param user [User] used to build the Ability for the actor environment
# @param parent [Object] parent work receiving the members
# @param ordered_child_ids [Array] child ids in the order they should be attached
# @return whatever the actor stack's update returns
def create_relationships(user:, parent:, ordered_child_ids:)
  members = ordered_child_ids.each_with_index.map { |child_id, position| [position, { id: child_id }] }.to_h
  attrs = { work_members_attributes: members }

  # only set the reindex extent on parents that expose the writer
  parent.try(:reindex_extent=, Hyrax::Adapters::NestingIndexAdapter::LIMITED_REINDEX)

  ability = Ability.new(user)
  env = Hyrax::Actors::Environment.new(parent, ability, attrs)
  Hyrax::CurationConcern.actor.update(env)
end
end
end
end
Loading

0 comments on commit f729884

Please sign in to comment.