From 4b43117077619435ac16b6ccf028e91fa213f0ce Mon Sep 17 00:00:00 2001 From: LaRita Robinson Date: Wed, 29 Jan 2025 16:53:55 -0500 Subject: [PATCH] Fix pdf resplitting There were a few pieces of the pdf resplitting process that were not fully valkyrized. This commit valkyrizes the resplitting, and fixes a bug with removing the child works from the initial split. Ref https://github.com/notch8/palni_palci_knapsack/issues/80 --- .../iiif_print/split_pdfs_controller.rb | 7 ++-- .../iiif_print/jobs/request_split_pdf_job.rb | 14 +++---- lib/iiif_print.rb | 1 + lib/iiif_print/persistence_layer.rb | 4 ++ .../active_fedora_adapter.rb | 9 +++++ .../persistence_layer/valkyrie_adapter.rb | 39 +++++++++++++++---- 6 files changed, 57 insertions(+), 17 deletions(-) diff --git a/app/controllers/iiif_print/split_pdfs_controller.rb b/app/controllers/iiif_print/split_pdfs_controller.rb index 531b2b06..82eb1ce4 100644 --- a/app/controllers/iiif_print/split_pdfs_controller.rb +++ b/app/controllers/iiif_print/split_pdfs_controller.rb @@ -4,9 +4,10 @@ class SplitPdfsController < ApplicationController before_action :authenticate_user! def create - @file_set = FileSet.where(id: params[:file_set_id]).first + @file_set = IiifPrint.find_by(id: params[:file_set_id]) authorize_create_split_request!(@file_set) - IiifPrint::Jobs::RequestSplitPdfJob.perform_later(file_set: @file_set, user: current_user) + + IiifPrint::Jobs::RequestSplitPdfJob.perform_later(file_set_id: @file_set.to_param, user: current_user) respond_to do |wants| wants.html { redirect_to polymorphic_path([main_app, @file_set]), notice: t("iiif_print.file_set.split_submitted", id: @file_set.id) } wants.json { render json: { id: @file_set.id, to_param: @file_set.to_param }, status: :ok } @@ -27,7 +28,7 @@ def authorize_create_split_request!(file_set) # Rely on CanCan's authorize! method. We could add the :split_pdf action to the ability # class. But we're pigging backing on the idea that you can do this if you can edit the work. authorize!(:edit, file_set) - raise "Expected #{file_set.class} ID=#{file_set.id} #to_param=#{file_set.to_param} to be a PDF. Instead found mime_type of #{file_set.mime_type}." unless file_set.pdf? + raise "Expected #{file_set.class} ID=#{file_set.id} #to_param=#{file_set.to_param} to be a PDF. Instead found mime_type of #{file_set.mime_type}." unless IiifPrint.pdf?(file_set) work = IiifPrint.parent_for(file_set) raise WorkNotConfiguredToSplitFileSetError.new(file_set: file_set, work: work) unless work&.iiif_print_config&.pdf_splitter_job&.presence diff --git a/app/jobs/iiif_print/jobs/request_split_pdf_job.rb b/app/jobs/iiif_print/jobs/request_split_pdf_job.rb index 993c9ea7..19359772 100644 --- a/app/jobs/iiif_print/jobs/request_split_pdf_job.rb +++ b/app/jobs/iiif_print/jobs/request_split_pdf_job.rb @@ -4,12 +4,12 @@ module Jobs # Encapsulates logic for cleanup when the PDF is destroyed after pdf splitting into child works class RequestSplitPdfJob < IiifPrint::Jobs::ApplicationJob ## - # @param file_set [FileSet] + # @param file_set_id [FileSet id] # @param user [User] # rubocop:disable Metrics/MethodLength - def perform(file_set:, user:) - return true unless file_set.pdf? - + def perform(file_set_id:, user:) + file_set = IiifPrint.find_by(id: file_set_id) + return true unless IiifPrint.pdf?(file_set) work = IiifPrint.parent_for(file_set) # Woe is ye who changes the configuration of the model, thus removing the splitting. @@ -18,11 +18,11 @@ def perform(file_set:, user:) # clean up any existing spawned child works of this file_set IiifPrint::SplitPdfs::DestroyPdfChildWorksService.conditionally_destroy_spawned_children_of( file_set: file_set, - work: work + work: work, + user: user ) - location = Hyrax::WorkingDirectory.find_or_retrieve(file_set.files.first.id, file_set.id) - + location = IiifPrint.pdf_path_for(file_set: file_set) IiifPrint.conditionally_submit_split_for(work: work, file_set: file_set, locations: [location], user: user) end # rubocop:enable Metrics/MethodLength diff --git a/lib/iiif_print.rb b/lib/iiif_print.rb index b7a5e464..821da170 100644 --- a/lib/iiif_print.rb +++ b/lib/iiif_print.rb @@ -66,6 +66,7 @@ class << self :solr_construct_query, :solr_name, :solr_query, + :pdf_path_for, to: :persistence_adapter ) end diff --git a/lib/iiif_print/persistence_layer.rb b/lib/iiif_print/persistence_layer.rb index 5ddd62d0..f13a8996 100644 --- a/lib/iiif_print/persistence_layer.rb +++ b/lib/iiif_print/persistence_layer.rb @@ -113,6 +113,10 @@ def self.copy_derivatives_from_data_store(stream:, directives:) def self.extract_text_for(file_set:) raise NotImplementedError, "#{self}.{__method__}" end + + def self.pdf_path_for(file_set:) + raise NotImplementedError, "#{self}.{__method__}" + end end end end diff --git a/lib/iiif_print/persistence_layer/active_fedora_adapter.rb b/lib/iiif_print/persistence_layer/active_fedora_adapter.rb index 6ff9bd07..51e661cc 100644 --- a/lib/iiif_print/persistence_layer/active_fedora_adapter.rb +++ b/lib/iiif_print/persistence_layer/active_fedora_adapter.rb @@ -184,6 +184,15 @@ def self.copy_derivatives_from_data_store(*) def self.extract_text_for(file_set:) IiifPrint.config.all_text_generator_function.call(object: file_set) || '' end + + ## + # Location of the file for resplitting + # + # @param [FileSet] an ActiveFedora fileset + # @return [String] location of the original file + def self.pdf_path_for(file_set:) + Hyrax::WorkingDirectory.find_or_retrieve(file_set.files.first.id, file_set.id) + end end end end diff --git a/lib/iiif_print/persistence_layer/valkyrie_adapter.rb b/lib/iiif_print/persistence_layer/valkyrie_adapter.rb index d98d6648..ad56a84c 100644 --- a/lib/iiif_print/persistence_layer/valkyrie_adapter.rb +++ b/lib/iiif_print/persistence_layer/valkyrie_adapter.rb @@ -76,14 +76,28 @@ def self.solr_name(field_name) Hyrax.config.index_field_mapper.solr_name(field_name.to_s) end - # rubocop:disable Lint/UnusedMethodArgument + # NOTE: this isn't the most efficient method, but it is the most reliable. + # Attribute 'split_from_pdf_id' is saved in Valkyrie as a string rather than as { id: string }, + # so we can't use the 'find_inverse_references_by' query. + # Additionally, the attribute does not exist on all child works, as it was added later, so using + # a child work's title allows us to find child works when the attribute isn't present. + # Building a custom query to find these child works directly via the attribute would be more efficient. + # However, it would require more effort for a lesser-used feature, and would not allow for the fallback + # of finding child works by title. def self.destroy_children_split_from(file_set:, work:, model:, user:) - # rubocop:enable Lint/UnusedMethodArgument - # look for child records by the file set id they were split from - Hyrax.query_service.find_inverse_references_by(resource: file_set, property: :split_from_pdf_id, model: model).each do |child| - Hyrax.persister.delete(resource: child) - Hyrax.indexing_service.delete(resource: child) - Hyrax.publisher.publish('object.deleted', object: child, user: user) + all_child_works = Hyrax.custom_queries.find_child_works(resource: work) + return if all_child_works.blank? + # look first for children by the file set id they were split from + children = all_child_works.select { |m| m.split_from_pdf_id == file_set.id } + if children.blank? + # find works where file name and work `to_param` are both in the title + children = all_child_works.select { |m| m.title.include?(file_set.label) && m.title.include?(work.to_param) } + end + return if children.blank? + children.each do |rcd| + Hyrax.persister.delete(resource: rcd) + Hyrax.index_adapter.delete(resource: rcd) + Hyrax.publisher.publish('object.deleted', object: rcd, user: user) end true end @@ -176,6 +190,17 @@ def self.extract_text_for(file_set:) return if text_fm.nil? text_fm.content end + + ## + # Location of the file for resplitting + # + # @param [Hyrax::FileSet] a Valkyrie fileset + # @return [String] location of the original file + def self.pdf_path_for(file_set:) + file = file_set.original_file + return '' unless file.pdf? + file.file.disk_path.to_s + end end end end