Skip to content

Commit

Permalink
Merge pull request #128 from scientist-softserv/i98-bulkrax
Browse files Browse the repository at this point in the history
Fix Bulkrax Imports for IiifPrint PDF splitting
  • Loading branch information
laritakr authored Feb 15, 2023
2 parents 781378e + 58bbae0 commit 8d1309f
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 55 deletions.
49 changes: 49 additions & 0 deletions app/actors/iiif_print/actors/file_set_actor_decorator.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# frozen_string_literal: true

# override to add PDF splitting for file sets
module IiifPrint
  module Actors
    # Decorator for a file set actor that adds PDF splitting: after the
    # decorated actor has done its normal work, PDFs attached to a work that is
    # configured for splitting are handed to the PDF splitter job.
    #
    # Two entry points cooperate:
    # * #create_content - when ingesting from a URL the parent work is already
    #   known, so the job is queued immediately; otherwise the PDF paths are
    #   remembered in @pdf_paths.
    # * #attach_to_work - once the work is known, queues the job for the paths
    #   remembered by #create_content.
    module FileSetActorDecorator
      # Runs the decorated #create_content, then queues or prepares PDF splitting.
      #
      # @param file [#path, #id] the file being ingested. Only #path is used when
      #   +from_url+ is true; otherwise #id is used when the object provides it.
      # @param relation [Symbol] passed through unchanged to the decorated method
      # @param from_url [Boolean] true when the content was fetched from a remote URL
      # @return [Object] whatever the decorated #create_content returned, so that
      #   callers which check the result for success keep working
      def create_content(file, relation = :original_file, from_url: false)
        # Spawns asynchronous IngestJob unless ingesting from URL
        result = super

        # NOTE(review): the decorated method is expected to signal a failed save
        # with an explicit false - confirm against the Hyrax version in use.
        # Nothing was persisted in that case, so there is nothing to split.
        return false if result == false

        if from_url
          # we have everything we need... queue the job
          parent = parent_for(file_set: file_set)

          if service.iiif_print_split?(work: parent) && service.pdfs?(paths: [file_set.import_url])
            service.queue_job(
              work: parent,
              file_locations: [file.path],
              user: @user,
              admin_set_id: parent.admin_set_id
            )
          end
        else
          # we don't have the parent yet... save the paths for later use.
          # Objects without an #id (e.g. a plain File) yield a blank id here;
          # presumably the service drops blank ids - verify against pdf_paths.
          @pdf_paths = service.pdf_paths(files: [file.try(:id).to_s])
        end

        # Hand back the decorated method's result rather than the value of the
        # branch above, which would be nil whenever no job was queued.
        result
      end

      # Override to add PDF splitting
      #
      # Runs the decorated #attach_to_work, then queues the splitter job for any
      # PDF paths remembered by #create_content, provided the work is configured
      # for splitting.
      #
      # @param work [Object] the work the file set is being attached to
      # @param file_set_params [Hash] passed through unchanged to the decorated method
      def attach_to_work(work, file_set_params = {})
        # Locks to ensure that only one process is operating on the list at a time.
        super

        return if @pdf_paths.blank?
        return unless service.iiif_print_split?(work: work)

        service.queue_job(
          work: work,
          file_locations: @pdf_paths,
          user: @user,
          admin_set_id: work.admin_set_id
        )
      end

      # @return [Class] the service object that knows how to find PDFs and queue
      #   the splitter job
      def service
        IiifPrint::SplitPdfs::ChildWorkCreationFromPdfService
      end
    end
  end
end
74 changes: 19 additions & 55 deletions app/actors/iiif_print/actors/iiif_print_upload_actor.rb
Original file line number Diff line number Diff line change
@@ -1,78 +1,52 @@
# frozen_string_literal: true

module IiifPrint
module Actors
class IiifPrintUploadActor < Hyrax::Actors::AbstractActor
# An actor which locates all uploaded PDF paths and
# spins off IiifPrint::CreatePagesJob to split them.
# spins off IiifPrint::ChildWorksFromPdfJob to split them.
def create(env)
ensure_title(env)
@pdf_paths = []
@prior_pdfs_count = 0
hold_upload_paths(env) if iiif_print?(env)
@pdf_paths = hold_upload_paths(env)
next_actor.create(env) && after_other_actors(env)
end

def update(env)
ensure_title(env)
@pdf_paths = []
if iiif_print?(env)
hold_upload_paths(env)
count_existing_pdfs(env)
end
@pdf_paths = hold_upload_paths(env)
next_actor.update(env) && after_other_actors(env)
end

private

# fill the array of pdf files' upload paths
def hold_upload_paths(env)
return unless env.attributes.keys.include? 'uploaded_files'
upload_ids = filter_file_ids(env.attributes['uploaded_files'])
return if upload_ids.empty?
uploads = Hyrax::UploadedFile.find(upload_ids)
paths = uploads.map(&method(:upload_path))
# TODO: remote routes in bulkrax may not always end in pdf. Consider other
# methods to identify a PDF file.
@pdf_paths = paths.select { |path| path.end_with?('.pdf', '.PDF') }
end

def iiif_print?(env)
@iiif_print_defined ||= env.curation_concern.try(:iiif_print_config?)
def service
IiifPrint::SplitPdfs::ChildWorkCreationFromPdfService
end

# TODO: find the number of pdfs on the parent work prior to this update, to support addition of more PDFs. Parm env will then be required to pull out env.curation_concern.
def count_existing_pdfs(_env)
@prior_pdfs_count = 0
# fill & save an array of pdf files' upload paths
def hold_upload_paths(env)
return [] unless env.attributes.keys.include? 'uploaded_files'
service.pdf_paths(files: env.attributes['uploaded_files'])
end

def after_other_actors(env)
handle_issue_upload(env) if iiif_print?(env)
handle_issue_upload(env)
# needs to return true to not break actor stack traversal
true
end

def handle_issue_upload(env)
return if @pdf_paths.empty?
work = env.curation_concern
return unless service.iiif_print_split?(work: work)
# must persist work to serialize job using it
work.save!(validate: false)
user = env.current_ability.current_user
admin_set = env.attributes[:admin_set_id] ||= default_admin_set
queue_job(work, @pdf_paths, user, admin_set, @prior_pdfs_count)
end

# submit the job
# @param [GenericWork, etc] A valid type of hyrax work
# @param [Array<String>] paths to PDF attachments
# @param [User] user
# @param [String] admin set ID
# @param [Integer] count of PDFs already existing on the parent work
def queue_job(work, paths, user, admin_set_id, prior_pdfs)
work.iiif_print_config.pdf_splitter_job.perform_later(
work,
paths,
user,
admin_set_id,
prior_pdfs
admin_set_id = env.attributes[:admin_set_id] ||= default_admin_set
service.queue_job(
work: work,
file_locations: @pdf_paths,
user: env.current_ability.current_user,
admin_set_id: admin_set_id
)
end

Expand All @@ -92,16 +66,6 @@ def default_admin_set

Hyrax::AdminSetCreateService.find_or_create_default_admin_set.id.to_s
end

# Given Hyrax::Upload object, return path to file on local filesystem
def upload_path(upload)
# so many layers to this onion:
upload.file.file.file
end

def filter_file_ids(input)
Array.wrap(input).select(&:present?)
end
end
end
end
1 change: 1 addition & 0 deletions lib/iiif_print.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
require "iiif_print/jobs/child_works_from_pdf_job"
require "iiif_print/jobs/create_relationships_job"
require "iiif_print/split_pdfs/pages_into_images_service"
require "iiif_print/split_pdfs/child_work_creation_from_pdf_service"

module IiifPrint
extend ActiveSupport::Autoload
Expand Down
1 change: 1 addition & 0 deletions lib/iiif_print/engine.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class Engine < ::Rails::Engine

::BlacklightIiifSearch::IiifSearchResponse.prepend(IiifPrint::IiifSearchResponseDecorator)
::BlacklightIiifSearch::IiifSearchAnnotation.prepend(IiifPrint::BlacklightIiifSearch::AnnotationDecorator)
Hyrax::Actors::FileSetActor.prepend(IiifPrint::Actors::FileSetActorDecorator)

# Extending the presenter to the base url which includes the protocol.
# We need the base url to render the facet links and normalize the interface.
Expand Down
75 changes: 75 additions & 0 deletions lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# frozen_string_literal: true

# Encapsulates methods used for pdf splitting into child works
module IiifPrint
  module SplitPdfs
    # Encapsulates methods used for pdf splitting into child works.
    # All methods are class methods; the class holds no state.
    class ChildWorkCreationFromPdfService
      # Load an array of paths to pdf files
      #
      # @param files [Array<String, Integer>, String, Integer, nil] ids of
      #   Hyrax::UploadedFile records; blank entries are ignored
      # @return [Array<String>] paths of the uploads that look like PDFs; empty
      #   when no usable ids were given
      def self.pdf_paths(files:)
        upload_ids = filter_file_ids(files)
        return [] if upload_ids.empty?

        uploads = Hyrax::UploadedFile.find(upload_ids)
        pdfs_only_for(uploads.map { |upload| upload_path(upload) })
      end

      # Is child work splitting defined for model?
      #
      # True only when the work responds to +iiif_print_config+ and that
      # configuration has a +pdf_split_child_model+.
      #
      # @param work [Object] A valid type of hyrax work (GenericWork, etc)
      # @return [Boolean]
      def self.iiif_print_split?(work:)
        work.try(:iiif_print_config)&.pdf_split_child_model ? true : false
      end

      # Are there any PDF files?
      #
      # @param paths [Array<String, nil>] candidate paths or URLs
      # @return [Boolean]
      def self.pdfs?(paths:)
        pdfs_only_for(paths).any?
      end

      # Submit the job to split PDF into child works.
      #
      # The job class is read from the work's +iiif_print_config.pdf_splitter_job+.
      # The count of PDFs already on the work is passed as the final job argument
      # (always 0 for now, see .count_existing_pdfs).
      #
      # @param work [Object] A valid type of hyrax work (GenericWork, etc)
      # @param file_locations [Array<String>] paths to PDF attachments
      # @param user [User] user
      # @param admin_set_id [String] admin set ID
      # @return [Object] whatever the job's +perform_later+ returns
      def self.queue_job(work:, file_locations:, user:, admin_set_id:)
        work.iiif_print_config.pdf_splitter_job.perform_later(
          work,
          file_locations,
          user,
          admin_set_id,
          count_existing_pdfs(work)
        )
      end

      # Normalizes the input to an array and drops blank entries.
      #
      # @param input [Array, Object, nil]
      # @return [Array]
      def self.filter_file_ids(input)
        Array.wrap(input).select(&:present?)
      end

      # Given Hyrax::Upload object, return path to file on local filesystem
      #
      # @param upload [Hyrax::UploadedFile]
      # @return [String] presumably a filesystem path - confirm against the
      #   uploader in use
      def self.upload_path(upload)
        # so many layers to this onion:
        upload.file.file.file
      end

      # TODO: implement a method to count existing PDFs on a work to support
      # adding more PDFs to an existing work.
      #
      # @param _work [Object] currently unused
      # @return [Integer] always 0 until implemented
      def self.count_existing_pdfs(_work)
        0
      end

      # Selects the entries whose name ends in ".pdf", ignoring case. nil entries
      # (e.g. a file set without an import url) are skipped rather than raising.
      #
      # TODO: Consider other methods to identify a PDF file.
      # This sub-selection may need to be moved to use mimetype if there
      # is a need to support paths not ending in .pdf (i.e. remote_urls)
      #
      # @param paths [Array<String, nil>, nil]
      # @return [Array<String>] the matching entries, unchanged
      def self.pdfs_only_for(paths)
        Array(paths).select { |path| path.to_s.downcase.end_with?('.pdf') }
      end
    end
  end
end

0 comments on commit 8d1309f

Please sign in to comment.