Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

I11 job split pdfs into child works #83

Merged
merged 6 commits into from
Feb 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,4 @@ fcrepo-webapp-*
*.gem
pkg/
*~undo-tree~
.DS_Store
45 changes: 30 additions & 15 deletions app/actors/iiif_print/actors/iiif_print_upload_actor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,20 @@ class IiifPrintUploadActor < Hyrax::Actors::AbstractActor
# An actor which locates all uploaded PDF paths and
# spins off IiifPrint::CreatePagesJob to split them.
def create(env)
# TODO: test what happens when ensure_title is removed.
ensure_title(env)
@pdf_paths = []
hold_upload_paths(env) if responds_to_split?(env.curation_concern)
@prior_pdfs_count = 0
hold_upload_paths(env) if iiif_print?(env)
next_actor.create(env) && after_other_actors(env)
end

def update(env)
# TODO: test what happens when ensure_title is removed.
ensure_title(env)
@pdf_paths = []
hold_upload_paths(env) if responds_to_split?(env.curation_concern)
if iiif_print?(env)
hold_upload_paths(env)
count_existing_pdfs(env)
kirkkwang marked this conversation as resolved.
Show resolved Hide resolved
end
next_actor.update(env) && after_other_actors(env)
end

Expand All @@ -28,16 +30,22 @@ def hold_upload_paths(env)
return if upload_ids.empty?
uploads = Hyrax::UploadedFile.find(upload_ids)
paths = uploads.map(&method(:upload_path))
@pdf_paths = paths.select { |path| path.end_with?('.pdf') }
# TODO: remote routes in bulkrax may not always end in pdf. Consider other
# methods to identify a PDF file.
@pdf_paths = paths.select { |path| path.end_with?('.pdf', '.PDF') }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just in case a file has an unusual extension such as a_file.Pdf, consider using a case-insensitive regex:

@pdf_paths = paths.select { |path| path.match? /\.pdf$/i }

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly? I was thinking a TODO may be necessary to find a different way to identify PDFs, which is why I left it this way for now. Remote URLs can point to PDF files but don't always end in .pdf (e.g. Google Docs links).

end

def responds_to_split?(curation_concern)
return true if curation_concern.respond_to?(:split_pdf)
false
def iiif_print?(env)
@iiif_print_defined ||= env.curation_concern.try(:iiif_print_config?)
end

# TODO: find the number of PDFs on the parent work prior to this update, to support adding more PDFs. The env param will then be required to pull out env.curation_concern.
def count_existing_pdfs(_env)
@prior_pdfs_count = 0
end

def after_other_actors(env)
handle_issue_upload(env) if responds_to_split?(env.curation_concern)
handle_issue_upload(env) if iiif_print?(env)
# needs to return true to not break actor stack traversal
true
end
Expand All @@ -47,17 +55,24 @@ def handle_issue_upload(env)
work = env.curation_concern
# must persist work to serialize job using it
work.save!(validate: false)
user = env.current_ability.current_user.user_key
env.attributes[:admin_set_id] ||= default_admin_set
queue_job(work, @pdf_paths, user, env.attributes[:admin_set_id])
user = env.current_ability.current_user
admin_set = env.attributes[:admin_set_id] ||= default_admin_set
queue_job(work, @pdf_paths, user, admin_set, @prior_pdfs_count)
end

def queue_job(work, paths, user, admin_set_id)
IiifPrint::CreatePagesJob.perform_later(
# submit the job
# @param [GenericWork, etc] A valid type of hyrax work
# @param [Array<String>] paths to PDF attachments
# @param [User] user
# @param [String] admin set ID
# @param [Integer] count of PDFs already existing on the parent work
def queue_job(work, paths, user, admin_set_id, prior_pdfs)
work.iiif_print_config.pdf_splitter_job.perform_later(
work,
paths,
user,
admin_set_id
admin_set_id,
prior_pdfs
)
end

Expand Down
4 changes: 0 additions & 4 deletions app/jobs/iiif_print/application_job.rb

This file was deleted.

13 changes: 0 additions & 13 deletions app/jobs/iiif_print/compose_issue_pdf_job.rb

This file was deleted.

21 changes: 0 additions & 21 deletions app/jobs/iiif_print/create_pages_job.rb

This file was deleted.

5 changes: 0 additions & 5 deletions app/models/concerns/iiif_print/iiif_print_behavior.rb
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
module IiifPrint
module IiifPrintBehavior
# adds IIIF Print behavior to an object
def split_pdf
true
end

##
# relationship indexing for fileset and works
#
Expand Down
7 changes: 7 additions & 0 deletions app/models/iiif_print/pending_relationship.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
module IiifPrint
  # Record of a child work that is expected to be attached to a parent work.
  # Every row must carry the parent's id, the child's title and the child's
  # ordering key.
  class PendingRelationship < ApplicationRecord
    validates :parent_id, :child_title, :child_order, presence: true
  end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Creates the iiif_print_pending_relationships table: three required string
# columns (child_title, parent_id, child_order), timestamps, and an index on
# parent_id.
class CreateIiifPrintPendingRelationships < ActiveRecord::Migration[5.1]
  def change
    create_table :iiif_print_pending_relationships do |t|
      t.string :child_title, null: false
      t.string :parent_id, null: false
      t.string :child_order, null: false
      t.timestamps
      # rows are looked up by parent_id
      t.index :parent_id
    end
  end
end
9 changes: 7 additions & 2 deletions lib/iiif_print.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
require "iiif_print/tiff_derivative_service"
require "iiif_print/metadata"
require "iiif_print/works_controller_behavior"
require "iiif_print/jobs/application_job"
require "iiif_print/jobs/child_works_from_pdf_job"
require "iiif_print/jobs/create_relationships_job"
require "iiif_print/split_pdfs/pages_into_images_service"

module IiifPrint
extend ActiveSupport::Autoload
Expand All @@ -41,8 +45,9 @@ def self.config(&block)
end

DEFAULT_MODEL_CONFIGURATION = {
# TODO: This should be a class and not a string; but I don't know what that should just now be.
pdf_splitter_job: "IiifPrint::DefaultPdfSplitterJob",
# Split a PDF into individual page images and create a new child work for each image.
pdf_splitter_job: IiifPrint::Jobs::ChildWorksFromPdfJob,
pdf_splitter_service: IiifPrint::SplitPdfs::PagesIntoImagesService,
derivative_service_plugins: [
IiifPrint::JP2DerivativeService,
IiifPrint::PDFDerivativeService,
Expand Down
6 changes: 6 additions & 0 deletions lib/iiif_print/jobs/application_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
module IiifPrint
module Jobs
# Common superclass for the IiifPrint jobs (ChildWorksFromPdfJob and
# CreateRelationshipsJob inherit from it). It adds nothing of its own on top
# of ActiveJob::Base.
class ApplicationJob < ActiveJob::Base
end
end
end
107 changes: 107 additions & 0 deletions lib/iiif_print/jobs/child_works_from_pdf_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
module IiifPrint
  module Jobs
    # Splits each uploaded PDF into page images, queues a batch job that
    # creates one child work per page image, records a PendingRelationship per
    # page, and schedules CreateRelationshipsJob to link the children to the
    # parent work.
    class ChildWorksFromPdfJob < IiifPrint::Jobs::ApplicationJob
      # child_order is stored as a string and sorted as a string, so both
      # numbers are zero-padded to a fixed width. Without padding "0 10"
      # sorts before "0 2" and pages past the tenth end up out of order.
      # Ordering stays correct up to 9,999 PDFs and 999,999 pages per PDF.
      SORT_ORDER_FORMAT = '%04d %06d'.freeze
      private_constant :SORT_ORDER_FORMAT

      # Break a pdf into individual pages
      # @param parent_work the work the PDFs belong to
      # @param pdf_paths [Array<String>] paths to pdfs
      # @param user [User]
      # @param admin_set_id [String]
      # @param prior_pdfs [Integer] count of pdfs already on parent work
      def perform(parent_work, pdf_paths, user, admin_set_id, prior_pdfs)
        @parent_work = parent_work
        @child_admin_set_id = admin_set_id
        child_model = @parent_work.iiif_print_config.pdf_split_child_model

        # handle each input pdf
        pdf_paths.each_with_index do |path, pdf_idx|
          split_pdf(path, pdf_idx, user, prior_pdfs, child_model)
        end

        # Link newly created child works to the parent. The job is delayed
        # because the child works are created asynchronously by BatchCreateJob.
        # Arguments passed to CreateRelationshipsJob:
        #   user:         [User] user
        #   parent_id:    [String] parent work id
        #   parent_model: [String] parent model
        #   child_model:  [String] child model
        IiifPrint::Jobs::CreateRelationshipsJob.set(wait: 10.minutes).perform_later(
          user: user,
          parent_id: @parent_work.id,
          parent_model: @parent_work.class.to_s,
          child_model: child_model.to_s
        )

        # TODO: clean up image_files and pdf_paths
      end

      private

      # Split one PDF into page images and queue creation of a child work for
      # each page. Does nothing when the splitter yields no images.
      # @param path [String] path to the PDF
      # @param pdf_idx [Integer] zero-based position of this PDF within this job
      # @param user [User]
      # @param prior_pdfs_count [Integer] count of pdfs already on parent work
      # @param child_model the model used for the child works
      def split_pdf(path, pdf_idx, user, prior_pdfs_count, child_model)
        image_files = @parent_work.iiif_print_config.pdf_splitter_service.new(path).to_a
        return if image_files.blank?

        # number this PDF after the ones already on the parent
        pdf_sequence = pdf_idx + prior_pdfs_count
        prepare_import_data(pdf_sequence, image_files, user)

        # submit the job to create all the child works for one PDF
        # Arguments passed to BatchCreateJob, in order:
        #   [User] user
        #   [Hash<String => String>] titles
        #   [Hash<String => String>] resource_types (optional)
        #   [Array<String>] uploaded_files Hyrax::UploadedFile IDs
        #   [Hash] attributes attributes to apply to all works, including :model
        #   [Hyrax::BatchCreateOperation] operation
        operation = Hyrax::BatchCreateOperation.create!(
          user: user,
          operation_type: "PDF Batch Create"
        )
        BatchCreateJob.perform_later(user,
                                     @child_work_titles,
                                     {},
                                     @uploaded_files,
                                     attributes.merge(model: child_model.to_s).with_indifferent_access,
                                     operation)
      end

      # Build the inputs for the batch create job for one PDF.
      # Sets @uploaded_files (uploaded file ids as strings) and
      # @child_work_titles (uploaded file id => child work title), and stores
      # one PendingRelationship per page image.
      # @param pdf_sequence [Integer] zero-based sequence of the PDF on the parent
      # @param image_files [Array<String>] paths to the page images
      # @param user [User]
      def prepare_import_data(pdf_sequence, image_files, user)
        @uploaded_files = []
        @child_work_titles = {}
        image_files.each_with_index do |image_path, idx|
          file_id = create_uploaded_file(user, image_path).to_s
          file_title = set_title(@parent_work.title.first, pdf_sequence, idx)
          @uploaded_files << file_id
          @child_work_titles[file_id] = file_title
          # save child work info to create the member relationships
          PendingRelationship.create!(child_title: file_title,
                                      parent_id: @parent_work.id,
                                      child_order: sort_order(pdf_sequence, idx))
        end
      end

      # Ordering key for a page, safe to sort as a string.
      # @param pdf_sequence [Integer] zero-based sequence of the PDF
      # @param idx [Integer] zero-based page index within the PDF
      # @return [String] e.g. "0000 000010"
      def sort_order(pdf_sequence, idx)
        format(SORT_ORDER_FORMAT, pdf_sequence, idx)
      end

      # Store a page image as a Hyrax::UploadedFile owned by the user.
      # @param user [User]
      # @param path [String] path to the image file
      # @return the id of the saved Hyrax::UploadedFile
      def create_uploaded_file(user, path)
        uf = Hyrax::UploadedFile.new
        uf.user_id = user.id
        uf.file = CarrierWave::SanitizedFile.new(path)
        uf.save!
        uf.id
      end

      # Build the title of a child work. CreateRelationshipsJob later finds
      # the child works by this title.
      # @param title [String] the parent work's title
      # @param pdf_sequence [Integer] zero-based sequence of the PDF
      # @param idx [Integer] zero-based page index within the PDF
      # @return [String] e.g. "My Work: Pdf Nbr 1, Page 3"
      def set_title(title, pdf_sequence, idx)
        pdf_index = "Pdf Nbr #{pdf_sequence + 1}"
        page_number = "Page #{idx + 1}"
        "#{title}: #{pdf_index}, #{page_number}"
      end

      # Attributes applied to every child work: the admin set passed to the
      # job, plus creator, rights statement and visibility from the parent.
      # Returns a new hash on each call.
      # TODO: what attributes do we need to fill in from the parent work? What about AllinsonFlex?
      def attributes
        {
          admin_set_id: @child_admin_set_id.to_s,
          creator: @parent_work.creator.to_a,
          rights_statement: @parent_work.rights_statement.to_a,
          visibility: @parent_work.visibility.to_s
        }
      end
    end
  end
end
70 changes: 70 additions & 0 deletions lib/iiif_print/jobs/create_relationships_job.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
module IiifPrint
  module Jobs
    # Attaches the child works recorded as PendingRelationship rows to their
    # parent work. If any expected child cannot be found yet, the job
    # re-enqueues itself to try again later.
    class CreateRelationshipsJob < IiifPrint::Jobs::ApplicationJob
      # Link newly created child works to the parent
      # @param user: [User] user
      # @param parent_id: [<String>] parent work id
      # @param parent_model: [<String>] parent model
      # @param child_model: [<String>] child model
      def perform(user:, parent_id:, parent_model:, child_model:)
        unless completed_child_data_for(parent_id, child_model)
          # children are not all there yet: reschedule and end this run normally
          return reschedule(user: user, parent_id: parent_id, parent_model: parent_model, child_model: child_model)
        end

        # every child was found: add the members, then drop the pending rows
        parent_work = parent_model.constantize.find(parent_id)
        create_relationships(user: user, parent: parent_work, ordered_child_ids: @child_ids)
        @pending_children.each(&:destroy)
      end

      private

      # Loads @pending_children (ordered by child_order) and @child_ids.
      # Stops at the first pending child whose work cannot be found.
      # @return [Boolean] true when every pending child has been found
      def completed_child_data_for(parent_id, child_model)
        @child_ids = []
        @pending_children = IiifPrint::PendingRelationship.where(parent_id: parent_id).order('child_order asc')

        @pending_children.each do |pending|
          # children are located by title; no match means the work is not created yet
          ids_for_title = find_id_by_title_for(pending.child_title, child_model)
          return false if ids_for_title.empty?

          @child_ids += ids_for_title
        end
        true
      end

      # @return [Array] ids of every record of the given model with this title
      def find_id_by_title_for(title, model)
        klass = model.constantize
        klass.where(title: title).map(&:id)
      end

      # Enqueue this same job again, with the same arguments, after 10 minutes.
      def reschedule(user:, parent_id:, parent_model:, child_model:)
        delayed_job = CreateRelationshipsJob.set(wait: 10.minutes)
        delayed_job.perform_later(
          user: user,
          parent_id: parent_id,
          parent_model: parent_model,
          child_model: child_model
        )
      end

      # Run the Hyrax actor stack update on the parent with the children as
      # work_members_attributes, keyed by their position in ordered_child_ids.
      def create_relationships(user:, parent:, ordered_child_ids:)
        members = ordered_child_ids.each_with_index.map do |child_id, position|
          [position, { id: child_id }]
        end.to_h

        # NOTE(review): presumably restricts how much nested reindexing the update triggers; confirm
        parent.try(:reindex_extent=, Hyrax::Adapters::NestingIndexAdapter::LIMITED_REINDEX)
        ability = Ability.new(user)
        env = Hyrax::Actors::Environment.new(parent, ability, { work_members_attributes: members })

        Hyrax::CurationConcern.actor.update(env)
      end
    end
  end
end
Loading