Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix pdf resplitting #380

Merged
merged 1 commit into from
Jan 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions app/controllers/iiif_print/split_pdfs_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ class SplitPdfsController < ApplicationController
before_action :authenticate_user!

def create
@file_set = FileSet.where(id: params[:file_set_id]).first
@file_set = IiifPrint.find_by(id: params[:file_set_id])
authorize_create_split_request!(@file_set)
IiifPrint::Jobs::RequestSplitPdfJob.perform_later(file_set: @file_set, user: current_user)

IiifPrint::Jobs::RequestSplitPdfJob.perform_later(file_set_id: @file_set.to_param, user: current_user)
respond_to do |wants|
wants.html { redirect_to polymorphic_path([main_app, @file_set]), notice: t("iiif_print.file_set.split_submitted", id: @file_set.id) }
wants.json { render json: { id: @file_set.id, to_param: @file_set.to_param }, status: :ok }
Expand All @@ -27,7 +28,7 @@ def authorize_create_split_request!(file_set)
# Rely on CanCan's authorize! method. We could add the :split_pdf action to the ability
# class. But we're pigging backing on the idea that you can do this if you can edit the work.
authorize!(:edit, file_set)
raise "Expected #{file_set.class} ID=#{file_set.id} #to_param=#{file_set.to_param} to be a PDF. Instead found mime_type of #{file_set.mime_type}." unless file_set.pdf?
raise "Expected #{file_set.class} ID=#{file_set.id} #to_param=#{file_set.to_param} to be a PDF. Instead found mime_type of #{file_set.mime_type}." unless IiifPrint.pdf?(file_set)

work = IiifPrint.parent_for(file_set)
raise WorkNotConfiguredToSplitFileSetError.new(file_set: file_set, work: work) unless work&.iiif_print_config&.pdf_splitter_job&.presence
Expand Down
14 changes: 7 additions & 7 deletions app/jobs/iiif_print/jobs/request_split_pdf_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ module Jobs
# Encapsulates logic for cleanup when the PDF is destroyed after pdf splitting into child works
class RequestSplitPdfJob < IiifPrint::Jobs::ApplicationJob
##
# @param file_set [FileSet]
# @param file_set_id [FileSet id]
# @param user [User]
# rubocop:disable Metrics/MethodLength
def perform(file_set:, user:)
return true unless file_set.pdf?

def perform(file_set_id:, user:)
file_set = IiifPrint.find_by(id: file_set_id)
return true unless IiifPrint.pdf?(file_set)
work = IiifPrint.parent_for(file_set)

# Woe is ye who changes the configuration of the model, thus removing the splitting.
Expand All @@ -18,11 +18,11 @@ def perform(file_set:, user:)
# clean up any existing spawned child works of this file_set
IiifPrint::SplitPdfs::DestroyPdfChildWorksService.conditionally_destroy_spawned_children_of(
file_set: file_set,
work: work
work: work,
user: user
)

location = Hyrax::WorkingDirectory.find_or_retrieve(file_set.files.first.id, file_set.id)

location = IiifPrint.pdf_path_for(file_set: file_set)
IiifPrint.conditionally_submit_split_for(work: work, file_set: file_set, locations: [location], user: user)
end
# rubocop:enable Metrics/MethodLength
Expand Down
1 change: 1 addition & 0 deletions lib/iiif_print.rb
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class << self
:solr_construct_query,
:solr_name,
:solr_query,
:pdf_path_for,
to: :persistence_adapter
)
end
Expand Down
4 changes: 4 additions & 0 deletions lib/iiif_print/persistence_layer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ def self.copy_derivatives_from_data_store(stream:, directives:)
def self.extract_text_for(file_set:)
raise NotImplementedError, "#{self}.{__method__}"
end

def self.pdf_path_for(file_set:)
raise NotImplementedError, "#{self}.{__method__}"
end
end
end
end
9 changes: 9 additions & 0 deletions lib/iiif_print/persistence_layer/active_fedora_adapter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,15 @@ def self.copy_derivatives_from_data_store(*)
def self.extract_text_for(file_set:)
IiifPrint.config.all_text_generator_function.call(object: file_set) || ''
end

##
# Location of the file for resplitting
#
# @param [FileSet] an ActiveFedora fileset
# @return [String] location of the original file
def self.pdf_path_for(file_set:)
Hyrax::WorkingDirectory.find_or_retrieve(file_set.files.first.id, file_set.id)
end
end
end
end
39 changes: 32 additions & 7 deletions lib/iiif_print/persistence_layer/valkyrie_adapter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,28 @@ def self.solr_name(field_name)
Hyrax.config.index_field_mapper.solr_name(field_name.to_s)
end

# rubocop:disable Lint/UnusedMethodArgument
# NOTE: this isn't the most efficient method, but it is the most reliable.
# Attribute 'split_from_pdf_id' is saved in Valkyrie as a string rather than as { id: string },
# so we can't use the 'find_inverse_references_by' query.
# Additionally, the attribute does not exist on all child works, as it was added later, so using
# a child work's title allows us to find child works when the attribute isn't present.
# Building a custom query to find these child works directly via the attribute would be more efficient.
# However, it would require more effort for a lesser-used feature, and would not allow for the fallback
# of finding child works by title.
def self.destroy_children_split_from(file_set:, work:, model:, user:)
# rubocop:enable Lint/UnusedMethodArgument
# look for child records by the file set id they were split from
Hyrax.query_service.find_inverse_references_by(resource: file_set, property: :split_from_pdf_id, model: model).each do |child|
Hyrax.persister.delete(resource: child)
Hyrax.indexing_service.delete(resource: child)
Hyrax.publisher.publish('object.deleted', object: child, user: user)
all_child_works = Hyrax.custom_queries.find_child_works(resource: work)
return if all_child_works.blank?
# look first for children by the file set id they were split from
children = all_child_works.select { |m| m.split_from_pdf_id == file_set.id }
if children.blank?
# find works where file name and work `to_param` are both in the title
children = all_child_works.select { |m| m.title.include?(file_set.label) && m.title.include?(work.to_param) }
end
return if children.blank?
children.each do |rcd|
Hyrax.persister.delete(resource: rcd)
Hyrax.index_adapter.delete(resource: rcd)
Hyrax.publisher.publish('object.deleted', object: rcd, user: user)
end
true
end
Expand Down Expand Up @@ -176,6 +190,17 @@ def self.extract_text_for(file_set:)
return if text_fm.nil?
text_fm.content
end

##
# Location of the file for resplitting
#
# @param [Hyrax::FileSet] a Valkyrie fileset
# @return [String] location of the original file
def self.pdf_path_for(file_set:)
file = file_set.original_file
return '' unless file.pdf?
file.file.disk_path.to_s
end
end
end
end
Loading