Merge pull request #128 from scientist-softserv/i98-bulkrax

Fix Bulkrax Imports for IiifPrint PDF splitting
notch8 · Feb 15, 2023 · 8d1309f · 8d1309f
2 parents 781378e + 58bbae0
commit 8d1309f
Show file tree

Hide file tree

Showing 5 changed files with 145 additions and 55 deletions.
diff --git a/app/actors/iiif_print/actors/file_set_actor_decorator.rb b/app/actors/iiif_print/actors/file_set_actor_decorator.rb
@@ -0,0 +1,49 @@
+# frozen_string_literal: true
+
+# override to add PDF splitting for file sets
+module IiifPrint
+  module Actors
+    module FileSetActorDecorator
+      def create_content(file, relation = :original_file, from_url: false)
+        # Spawns asynchronous IngestJob unless ingesting from URL
+        super
+
+        if from_url
+          # we have everything we need... queue the job
+          parent = parent_for(file_set: @file_set)
+
+          if service.iiif_print_split?(work: parent) && service.pdfs?(paths: [file_set.import_url])
+            service.queue_job(
+              work: parent,
+              file_locations: [file.path],
+              user: @user,
+              admin_set_id: parent.admin_set_id
+            )
+          end
+        else
+          # we don't have the parent yet... save the paths for later use
+          @pdf_paths = service.pdf_paths(files: [file.id.to_s])
+        end
+      end
+
+      # Override to add PDF splitting
+      def attach_to_work(work, file_set_params = {})
+        # Locks to ensure that only one process is operating on the list at a time.
+        super
+
+        return if @pdf_paths.blank?
+        return unless service.iiif_print_split?(work: work)
+        service.queue_job(
+          work: work,
+          file_locations: @pdf_paths,
+          user: @user,
+          admin_set_id: work.admin_set_id
+        )
+      end
+
+      def service
+        IiifPrint::SplitPdfs::ChildWorkCreationFromPdfService
+      end
+    end
+  end
+end
diff --git a/app/actors/iiif_print/actors/iiif_print_upload_actor.rb b/app/actors/iiif_print/actors/iiif_print_upload_actor.rb
@@ -1,78 +1,52 @@
+# frozen_string_literal: true
+
 module IiifPrint
   module Actors
     class IiifPrintUploadActor < Hyrax::Actors::AbstractActor
       # An actor which locates all uploaded PDF paths and
-      # spins off IiifPrint::CreatePagesJob to split them.
+      # spins off IiifPrint::ChildWorksFromPdfJob to split them.
       def create(env)
         ensure_title(env)
-        @pdf_paths = []
-        @prior_pdfs_count = 0
-        hold_upload_paths(env) if iiif_print?(env)
+        @pdf_paths = hold_upload_paths(env)
         next_actor.create(env) && after_other_actors(env)
       end
 
       def update(env)
         ensure_title(env)
-        @pdf_paths = []
-        if iiif_print?(env)
-          hold_upload_paths(env)
-          count_existing_pdfs(env)
-        end
+        @pdf_paths = hold_upload_paths(env)
         next_actor.update(env) && after_other_actors(env)
       end
 
       private
 
-      # fill the array of pdf files' upload paths
-      def hold_upload_paths(env)
-        return unless env.attributes.keys.include? 'uploaded_files'
-        upload_ids = filter_file_ids(env.attributes['uploaded_files'])
-        return if upload_ids.empty?
-        uploads = Hyrax::UploadedFile.find(upload_ids)
-        paths = uploads.map(&method(:upload_path))
-        # TODO: remote routes in bulkrax may not always end in pdf. Consider other
-        #       methods to identify a PDF file.
-        @pdf_paths = paths.select { |path| path.end_with?('.pdf', '.PDF') }
-      end
-
-      def iiif_print?(env)
-        @iiif_print_defined ||= env.curation_concern.try(:iiif_print_config?)
+      def service
+        IiifPrint::SplitPdfs::ChildWorkCreationFromPdfService
       end
 
-      # TODO: find the number of pdfs on the parent work prior to this update, to support addition of more PDFs. Parm env will then be required to pull out env.curation_concern.
-      def count_existing_pdfs(_env)
-        @prior_pdfs_count = 0
+      # fill & save an array of pdf files' upload paths
+      def hold_upload_paths(env)
+        return [] unless env.attributes.keys.include? 'uploaded_files'
+        service.pdf_paths(files: env.attributes['uploaded_files'])
       end
 
       def after_other_actors(env)
-        handle_issue_upload(env) if iiif_print?(env)
+        handle_issue_upload(env)
         # needs to return true to not break actor stack traversal
         true
       end
 
       def handle_issue_upload(env)
         return if @pdf_paths.empty?
         work = env.curation_concern
+        return unless service.iiif_print_split?(work: work)
         # must persist work to serialize job using it
         work.save!(validate: false)
-        user = env.current_ability.current_user
-        admin_set = env.attributes[:admin_set_id] ||= default_admin_set
-        queue_job(work, @pdf_paths, user, admin_set, @prior_pdfs_count)
-      end
-
-      # submit the job
-      # @param [GenericWork, etc] A valid type of hyrax work
-      # @param [Array<String>] paths to PDF attachments
-      # @param [User] user
-      # @param [String] admin set ID
-      # @param [Integer] count of PDFs already existing on the parent work
-      def queue_job(work, paths, user, admin_set_id, prior_pdfs)
-        work.iiif_print_config.pdf_splitter_job.perform_later(
-          work,
-          paths,
-          user,
-          admin_set_id,
-          prior_pdfs
+        admin_set_id = env.attributes[:admin_set_id] ||= default_admin_set
+        service.queue_job(
+          work: work,
+          file_locations: @pdf_paths,
+          user: env.current_ability.current_user,
+          admin_set_id: admin_set_id
         )
       end
 
@@ -92,16 +66,6 @@ def default_admin_set
 
         Hyrax::AdminSetCreateService.find_or_create_default_admin_set.id.to_s
       end
-
-      # Given Hyrax::Upload object, return path to file on local filesystem
-      def upload_path(upload)
-        # so many layers to this onion:
-        upload.file.file.file
-      end
-
-      def filter_file_ids(input)
-        Array.wrap(input).select(&:present?)
-      end
     end
   end
 end
diff --git a/lib/iiif_print.rb b/lib/iiif_print.rb
@@ -19,6 +19,7 @@
 require "iiif_print/jobs/child_works_from_pdf_job"
 require "iiif_print/jobs/create_relationships_job"
 require "iiif_print/split_pdfs/pages_into_images_service"
+require "iiif_print/split_pdfs/child_work_creation_from_pdf_service"
 
 module IiifPrint
   extend ActiveSupport::Autoload

diff --git a/lib/iiif_print/engine.rb b/lib/iiif_print/engine.rb
@@ -39,6 +39,7 @@ class Engine < ::Rails::Engine
 
       ::BlacklightIiifSearch::IiifSearchResponse.prepend(IiifPrint::IiifSearchResponseDecorator)
       ::BlacklightIiifSearch::IiifSearchAnnotation.prepend(IiifPrint::BlacklightIiifSearch::AnnotationDecorator)
+      Hyrax::Actors::FileSetActor.prepend(IiifPrint::Actors::FileSetActorDecorator)
 
       # Extending the presenter to the base url which includes the protocol.
       # We need the base url to render the facet links and normalize the interface.

diff --git a/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb b/lib/iiif_print/split_pdfs/child_work_creation_from_pdf_service.rb
@@ -0,0 +1,75 @@
+# frozen_string_literal: true
+
+# Encapsulates methods used for pdf splitting into child works
+module IiifPrint
+  module SplitPdfs
+    class ChildWorkCreationFromPdfService
+      # Load an array of paths to pdf files
+      # @param files [Array] Hyrax::UploadedFile ids (blank entries are ignored)
+      # @return [Array<String>] file paths (in the temp directory) of the uploads ending in .pdf/.PDF
+      def self.pdf_paths(files:)
+        upload_ids = filter_file_ids(files)
+        return [] if upload_ids.empty?
+        uploads = Hyrax::UploadedFile.find(upload_ids)
+        paths = uploads.map(&method(:upload_path))
+        pdfs_only_for(paths)
+      end
+
+      # Is child work splitting defined for model?
+      # @param [GenericWork, etc] A valid type of hyrax work
+      # @return [Boolean]
+      def self.iiif_print_split?(work:)
+        # defined only if the work includes IiifPrint.model_configuration with pdf_split_child_model
+        return true if work.try(:iiif_print_config)&.pdf_split_child_model
+        false
+      end
+
+      # Are there any PDF files?
+      # @param paths [Array<String>] candidate paths; only those ending in .pdf/.PDF count as PDFs
+      # @return [Boolean]
+      def self.pdfs?(paths:)
+        pdf_paths = pdfs_only_for(paths)
+        return false unless pdf_paths.count.positive?
+        true
+      end
+
+      # Submit the job to split PDF into child works
+      # @param work [GenericWork, etc] A valid type of hyrax work
+      # @param file_locations [Array<String>] paths to PDF attachments
+      # @param user [User] user
+      # @param admin_set_id [String] admin set ID; the prior-PDF count sent to the job is computed here (always 0 for now)
+      def self.queue_job(work:, file_locations:, user:, admin_set_id:)
+        work.iiif_print_config.pdf_splitter_job.perform_later(
+          work,
+          file_locations,
+          user,
+          admin_set_id,
+          count_existing_pdfs(work)
+        )
+      end
+
+      def self.filter_file_ids(input)
+        Array.wrap(input).select(&:present?)
+      end
+
+      # Given Hyrax::Upload object, return path to file on local filesystem
+      def self.upload_path(upload)
+        # so many layers to this onion:
+        upload.file.file.file
+      end
+
+      # TODO: implement a method to count existing PDFs on a work to support
+      #       adding more PDFs to an existing work.
+      def self.count_existing_pdfs(_work)
+        0
+      end
+
+      # TODO: Consider other methods to identify a PDF file.
+      #       This sub-selection may need to be moved to use mimetype if there
+      #       is a need to support paths not ending in .pdf (e.g. remote_urls)
+      def self.pdfs_only_for(paths)
+        paths.select { |path| path.end_with?('.pdf', '.PDF') }
+      end
+    end
+  end
+end