From 4b43117077619435ac16b6ccf028e91fa213f0ce Mon Sep 17 00:00:00 2001
From: LaRita Robinson <larita@scientist.com>
Date: Wed, 29 Jan 2025 16:53:55 -0500
Subject: [PATCH] Fix pdf resplitting

There were a few pieces of the pdf resplitting process that were not
fully valkyrized. This commit valkyrizes the resplitting, and fixes
a bug with removing the child works from the initial split.

Ref https://github.com/notch8/palni_palci_knapsack/issues/80
---
 .../iiif_print/split_pdfs_controller.rb       |  7 ++--
 .../iiif_print/jobs/request_split_pdf_job.rb  | 14 +++----
 lib/iiif_print.rb                             |  1 +
 lib/iiif_print/persistence_layer.rb           |  4 ++
 .../active_fedora_adapter.rb                  |  9 +++++
 .../persistence_layer/valkyrie_adapter.rb     | 39 +++++++++++++++----
 6 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/app/controllers/iiif_print/split_pdfs_controller.rb b/app/controllers/iiif_print/split_pdfs_controller.rb
index 531b2b06..82eb1ce4 100644
--- a/app/controllers/iiif_print/split_pdfs_controller.rb
+++ b/app/controllers/iiif_print/split_pdfs_controller.rb
@@ -4,9 +4,10 @@ class SplitPdfsController < ApplicationController
     before_action :authenticate_user!
 
     def create
-      @file_set = FileSet.where(id: params[:file_set_id]).first
+      @file_set = IiifPrint.find_by(id: params[:file_set_id])
       authorize_create_split_request!(@file_set)
-      IiifPrint::Jobs::RequestSplitPdfJob.perform_later(file_set: @file_set, user: current_user)
+
+      IiifPrint::Jobs::RequestSplitPdfJob.perform_later(file_set_id: @file_set.to_param, user: current_user)
       respond_to do |wants|
         wants.html { redirect_to polymorphic_path([main_app, @file_set]), notice: t("iiif_print.file_set.split_submitted", id: @file_set.id) }
         wants.json { render json: { id: @file_set.id, to_param: @file_set.to_param }, status: :ok }
@@ -27,7 +28,7 @@ def authorize_create_split_request!(file_set)
       # Rely on CanCan's authorize! method.  We could add the :split_pdf action to the ability
       # class.  But we're pigging backing on the idea that you can do this if you can edit the work.
       authorize!(:edit, file_set)
-      raise "Expected #{file_set.class} ID=#{file_set.id} #to_param=#{file_set.to_param} to be a PDF.  Instead found mime_type of #{file_set.mime_type}." unless file_set.pdf?
+      raise "Expected #{file_set.class} ID=#{file_set.id} #to_param=#{file_set.to_param} to be a PDF.  Instead found mime_type of #{file_set.mime_type}." unless IiifPrint.pdf?(file_set)
 
       work = IiifPrint.parent_for(file_set)
       raise WorkNotConfiguredToSplitFileSetError.new(file_set: file_set, work: work) unless work&.iiif_print_config&.pdf_splitter_job&.presence
diff --git a/app/jobs/iiif_print/jobs/request_split_pdf_job.rb b/app/jobs/iiif_print/jobs/request_split_pdf_job.rb
index 993c9ea7..19359772 100644
--- a/app/jobs/iiif_print/jobs/request_split_pdf_job.rb
+++ b/app/jobs/iiif_print/jobs/request_split_pdf_job.rb
@@ -4,12 +4,12 @@ module Jobs
     # Encapsulates logic for cleanup when the PDF is destroyed after pdf splitting into child works
     class RequestSplitPdfJob < IiifPrint::Jobs::ApplicationJob
       ##
-      # @param file_set [FileSet]
+      # @param file_set_id [FileSet id]
       # @param user [User]
       # rubocop:disable Metrics/MethodLength
-      def perform(file_set:, user:)
-        return true unless file_set.pdf?
-
+      def perform(file_set_id:, user:)
+        file_set = IiifPrint.find_by(id: file_set_id)
+        return true unless IiifPrint.pdf?(file_set)
         work = IiifPrint.parent_for(file_set)
 
         # Woe is ye who changes the configuration of the model, thus removing the splitting.
@@ -18,11 +18,11 @@ def perform(file_set:, user:)
         # clean up any existing spawned child works of this file_set
         IiifPrint::SplitPdfs::DestroyPdfChildWorksService.conditionally_destroy_spawned_children_of(
           file_set: file_set,
-          work: work
+          work: work,
+          user: user
         )
 
-        location = Hyrax::WorkingDirectory.find_or_retrieve(file_set.files.first.id, file_set.id)
-
+        location = IiifPrint.pdf_path_for(file_set: file_set)
         IiifPrint.conditionally_submit_split_for(work: work, file_set: file_set, locations: [location], user: user)
       end
       # rubocop:enable Metrics/MethodLength
diff --git a/lib/iiif_print.rb b/lib/iiif_print.rb
index b7a5e464..821da170 100644
--- a/lib/iiif_print.rb
+++ b/lib/iiif_print.rb
@@ -66,6 +66,7 @@ class << self
       :solr_construct_query,
       :solr_name,
       :solr_query,
+      :pdf_path_for,
       to: :persistence_adapter
     )
   end
diff --git a/lib/iiif_print/persistence_layer.rb b/lib/iiif_print/persistence_layer.rb
index 5ddd62d0..f13a8996 100644
--- a/lib/iiif_print/persistence_layer.rb
+++ b/lib/iiif_print/persistence_layer.rb
@@ -113,6 +113,10 @@ def self.copy_derivatives_from_data_store(stream:, directives:)
       def self.extract_text_for(file_set:)
         raise NotImplementedError, "#{self}.{__method__}"
       end
+
+      def self.pdf_path_for(file_set:)
+        raise NotImplementedError, "#{self}.{__method__}"
+      end
     end
   end
 end
diff --git a/lib/iiif_print/persistence_layer/active_fedora_adapter.rb b/lib/iiif_print/persistence_layer/active_fedora_adapter.rb
index 6ff9bd07..51e661cc 100644
--- a/lib/iiif_print/persistence_layer/active_fedora_adapter.rb
+++ b/lib/iiif_print/persistence_layer/active_fedora_adapter.rb
@@ -184,6 +184,15 @@ def self.copy_derivatives_from_data_store(*)
       def self.extract_text_for(file_set:)
         IiifPrint.config.all_text_generator_function.call(object: file_set) || ''
       end
+
+      ##
+      # Location of the file for resplitting
+      #
+      # @param [FileSet] an ActiveFedora fileset
+      # @return [String] location of the original file
+      def self.pdf_path_for(file_set:)
+        Hyrax::WorkingDirectory.find_or_retrieve(file_set.files.first.id, file_set.id)
+      end
     end
   end
 end
diff --git a/lib/iiif_print/persistence_layer/valkyrie_adapter.rb b/lib/iiif_print/persistence_layer/valkyrie_adapter.rb
index d98d6648..ad56a84c 100644
--- a/lib/iiif_print/persistence_layer/valkyrie_adapter.rb
+++ b/lib/iiif_print/persistence_layer/valkyrie_adapter.rb
@@ -76,14 +76,28 @@ def self.solr_name(field_name)
         Hyrax.config.index_field_mapper.solr_name(field_name.to_s)
       end
 
-      # rubocop:disable Lint/UnusedMethodArgument
+      # NOTE: this isn't the most efficient method, but it is the most reliable.
+      #  Attribute 'split_from_pdf_id' is saved in Valkyrie as a string rather than as { id: string },
+      #    so we can't use the 'find_inverse_references_by' query.
+      #  Additionally, the attribute does not exist on all child works, as it was added later, so using
+      #    a child work's title allows us to find child works when the attribute isn't present.
+      #  Building a custom query to find these child works directly via the attribute would be more efficient.
+      #    However, it would require more effort for a lesser-used feature, and would not allow for the fallback
+      #    of finding child works by title.
       def self.destroy_children_split_from(file_set:, work:, model:, user:)
-        # rubocop:enable Lint/UnusedMethodArgument
-        # look for child records by the file set id they were split from
-        Hyrax.query_service.find_inverse_references_by(resource: file_set, property: :split_from_pdf_id, model: model).each do |child|
-          Hyrax.persister.delete(resource: child)
-          Hyrax.indexing_service.delete(resource: child)
-          Hyrax.publisher.publish('object.deleted', object: child, user: user)
+        all_child_works = Hyrax.custom_queries.find_child_works(resource: work)
+        return if all_child_works.blank?
+        # look first for children by the file set id they were split from
+        children = all_child_works.select { |m| m.split_from_pdf_id == file_set.id }
+        if children.blank?
+          # find works where file name and work `to_param` are both in the title
+          children = all_child_works.select { |m| m.title.include?(file_set.label) && m.title.include?(work.to_param) }
+        end
+        return if children.blank?
+        children.each do |rcd|
+          Hyrax.persister.delete(resource: rcd)
+          Hyrax.index_adapter.delete(resource: rcd)
+          Hyrax.publisher.publish('object.deleted', object: rcd, user: user)
         end
         true
       end
@@ -176,6 +190,17 @@ def self.extract_text_for(file_set:)
         return if text_fm.nil?
         text_fm.content
       end
+
+      ##
+      # Location of the file for resplitting
+      #
+      # @param [Hyrax::FileSet] a Valkyrie fileset
+      # @return [String] location of the original file
+      def self.pdf_path_for(file_set:)
+       file = file_set.original_file
+       return '' unless file.pdf?
+       file.file.disk_path.to_s
+      end
     end
   end
 end