diff --git a/.gitignore b/.gitignore index 96d225b7..0557c465 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,4 @@ fcrepo-webapp-* *.gem pkg/ *~undo-tree~ +.DS_Store \ No newline at end of file diff --git a/app/actors/iiif_print/actors/iiif_print_upload_actor.rb b/app/actors/iiif_print/actors/iiif_print_upload_actor.rb index c9cabe28..586c793a 100644 --- a/app/actors/iiif_print/actors/iiif_print_upload_actor.rb +++ b/app/actors/iiif_print/actors/iiif_print_upload_actor.rb @@ -4,18 +4,20 @@ class IiifPrintUploadActor < Hyrax::Actors::AbstractActor # An actor which locates all uploaded PDF paths and # spins off IiifPrint::CreatePagesJob to split them. def create(env) - # TODO: test what happens when ensure_title is removed. ensure_title(env) @pdf_paths = [] - hold_upload_paths(env) if responds_to_split?(env.curation_concern) + @prior_pdfs_count = 0 + hold_upload_paths(env) if iiif_print?(env) next_actor.create(env) && after_other_actors(env) end def update(env) - # TODO: test what happens when ensure_title is removed. ensure_title(env) @pdf_paths = [] - hold_upload_paths(env) if responds_to_split?(env.curation_concern) + if iiif_print?(env) + hold_upload_paths(env) + count_existing_pdfs(env) + end next_actor.update(env) && after_other_actors(env) end @@ -28,16 +30,22 @@ def hold_upload_paths(env) return if upload_ids.empty? uploads = Hyrax::UploadedFile.find(upload_ids) paths = uploads.map(&method(:upload_path)) - @pdf_paths = paths.select { |path| path.end_with?('.pdf') } + # TODO: remote routes in bulkrax may not always end in pdf. Consider other + # methods to identify a PDF file. + @pdf_paths = paths.select { |path| path.end_with?('.pdf', '.PDF') } end - def responds_to_split?(curation_concern) - return true if curation_concern.respond_to?(:split_pdf) - false + def iiif_print?(env) + @iiif_print_defined ||= env.curation_concern.try(:iiif_print_config?) 
+ end + + # TODO: find the number of pdfs on the parent work prior to this update, to support addition of more PDFs. Parm env will then be required to pull out env.curation_concern. + def count_existing_pdfs(_env) + @prior_pdfs_count = 0 end def after_other_actors(env) - handle_issue_upload(env) if responds_to_split?(env.curation_concern) + handle_issue_upload(env) if iiif_print?(env) # needs to return true to not break actor stack traversal true end @@ -47,17 +55,24 @@ def handle_issue_upload(env) work = env.curation_concern # must persist work to serialize job using it work.save!(validate: false) - user = env.current_ability.current_user.user_key - env.attributes[:admin_set_id] ||= default_admin_set - queue_job(work, @pdf_paths, user, env.attributes[:admin_set_id]) + user = env.current_ability.current_user + admin_set = env.attributes[:admin_set_id] ||= default_admin_set + queue_job(work, @pdf_paths, user, admin_set, @prior_pdfs_count) end - def queue_job(work, paths, user, admin_set_id) - IiifPrint::CreatePagesJob.perform_later( + # submit the job + # @param [GenericWork, etc] A valid type of hyrax work + # @param [Array] paths to PDF attachments + # @param [User] user + # @param [String] admin set ID + # @param [Integer] count of PDFs already existing on the parent work + def queue_job(work, paths, user, admin_set_id, prior_pdfs) + work.iiif_print_config.pdf_splitter_job.perform_later( work, paths, user, - admin_set_id + admin_set_id, + prior_pdfs ) end diff --git a/app/jobs/iiif_print/application_job.rb b/app/jobs/iiif_print/application_job.rb deleted file mode 100644 index efeb08bd..00000000 --- a/app/jobs/iiif_print/application_job.rb +++ /dev/null @@ -1,4 +0,0 @@ -module IiifPrint - class ApplicationJob < ActiveJob::Base - end -end diff --git a/app/jobs/iiif_print/compose_issue_pdf_job.rb b/app/jobs/iiif_print/compose_issue_pdf_job.rb deleted file mode 100644 index a917d8d8..00000000 --- a/app/jobs/iiif_print/compose_issue_pdf_job.rb +++ /dev/null @@ -1,13 
+0,0 @@ -module IiifPrint - # Compose and attach a multi-page PDF from constituent pages, if ready - # (if not ready, job retry requires Rails >= 5.1) - class ComposeIssuePDFJob < IiifPrint::ApplicationJob - retry_on IiifPrint::PagesNotReady, - wait: :exponentially_longer, - attempts: 8 - - def perform(issue) - IiifPrint::IssuePDFComposer.new(issue).compose - end - end -end diff --git a/app/jobs/iiif_print/create_pages_job.rb b/app/jobs/iiif_print/create_pages_job.rb deleted file mode 100644 index bf239374..00000000 --- a/app/jobs/iiif_print/create_pages_job.rb +++ /dev/null @@ -1,21 +0,0 @@ -module IiifPrint - # Break a pdf into individual pages - class CreatePagesJob < IiifPrint::ApplicationJob - def perform(work, _pdf_paths, user, admin_set_id) - # we will need depositor set on work, if it is nil - work.depositor ||= user - # if we do not have admin_set_id yet, set it on the issue work: - work.admin_set_id ||= admin_set_id - # create child pages for each page within each PDF uploaded: - # TODO need to reimplement this w/o it being tied up with - # the otherwise un-needed ingest work - # pdf_paths.each do |path| - # adapter = IiifPrint::Ingest::NewspaperIssueIngest.new(work) - # adapter.load(path) - # adapter.create_child_pages - # end - # re-save pages so that parent and sibling relationships are indexed - # work.pages.each(&:save) - end - end -end diff --git a/app/models/concerns/iiif_print/iiif_print_behavior.rb b/app/models/concerns/iiif_print/iiif_print_behavior.rb index f71244ea..95d86145 100644 --- a/app/models/concerns/iiif_print/iiif_print_behavior.rb +++ b/app/models/concerns/iiif_print/iiif_print_behavior.rb @@ -1,10 +1,5 @@ module IiifPrint module IiifPrintBehavior - # adds IIIF Print behavior to an object - def split_pdf - true - end - ## # relationship indexing for fileset and works # diff --git a/app/models/iiif_print/pending_relationship.rb b/app/models/iiif_print/pending_relationship.rb new file mode 100644 index 00000000..8c7fc990 --- /dev/null 
# app/models/iiif_print/pending_relationship.rb
module IiifPrint
  # A child work that has been requested but is not yet attached to its parent.
  #
  # Rows are written by IiifPrint::Jobs::ChildWorksFromPdfJob (one per page
  # image) and read back, ordered by +child_order+, by
  # IiifPrint::Jobs::CreateRelationshipsJob, which destroys them once the
  # children have been linked.
  class PendingRelationship < ApplicationRecord
    # All three columns are NOT NULL in the table, so require them here too.
    validates :parent_id, :child_title, :child_order, presence: true
  end
end

# db/migrate/20230109000000_create_iiif_print_pending_relationships.rb
# Creates the table backing IiifPrint::PendingRelationship.
class CreateIiifPrintPendingRelationships < ActiveRecord::Migration[5.1]
  def change
    create_table :iiif_print_pending_relationships do |t|
      t.string :child_title, :parent_id, :child_order, null: false
      t.timestamps
      # rows are always looked up by parent
      t.index :parent_id
    end
  end
end
module IiifPrint
  module Jobs
    # Common superclass for the background jobs defined by IiifPrint.
    class ApplicationJob < ActiveJob::Base
    end

    # Breaks each PDF into page images, submits one BatchCreateJob per PDF to
    # build a child work for every page, records every expected child as an
    # IiifPrint::PendingRelationship, and finally schedules
    # CreateRelationshipsJob to attach the children to the parent work.
    class ChildWorksFromPdfJob < IiifPrint::Jobs::ApplicationJob
      # @param parent_work [Object] work whose +iiif_print_config+ supplies the
      #   splitter service and the child model
      # @param pdf_paths [Array<String>] paths to the PDFs to split
      # @param user [User] user the uploads and child works are created for
      # @param admin_set_id [String] admin set applied to every child work
      # @param prior_pdfs [Integer] count of PDFs already on the parent work;
      #   offsets the PDF sequence number used in titles and ordering
      def perform(parent_work, pdf_paths, user, admin_set_id, prior_pdfs)
        @parent_work = parent_work
        @child_admin_set_id = admin_set_id
        child_model = @parent_work.iiif_print_config.pdf_split_child_model

        # handle each input pdf
        pdf_paths.each_with_index do |path, pdf_idx|
          split_pdf(path, pdf_idx, user, prior_pdfs, child_model)
        end

        # Link the newly created child works to the parent once they exist.
        IiifPrint::Jobs::CreateRelationshipsJob.set(wait: 10.minutes).perform_later(
          user: user,
          parent_id: @parent_work.id,
          parent_model: @parent_work.class.to_s,
          child_model: child_model.to_s
        )

        # TODO: clean up image_files and pdf_paths
      end

      private

      # Splits one PDF and submits the batch job that creates a child work per
      # page image. Does nothing when the splitter yields no images.
      def split_pdf(path, pdf_idx, user, prior_pdfs_count, child_model)
        image_files = @parent_work.iiif_print_config.pdf_splitter_service.new(path).to_a
        return if image_files.blank?

        pdf_sequence = pdf_idx + prior_pdfs_count
        prepare_import_data(pdf_sequence, image_files, user)

        operation = Hyrax::BatchCreateOperation.create!(
          user: user,
          operation_type: "PDF Batch Create"
        )
        # Arguments: user, titles keyed by uploaded-file id, resource types
        # (none), uploaded-file ids, attributes for every work (including
        # :model), and the tracking operation.
        BatchCreateJob.perform_later(user,
                                     @child_work_titles,
                                     {},
                                     @uploaded_files,
                                     attributes.merge(model: child_model.to_s).with_indifferent_access,
                                     operation)
      end

      # Fills @uploaded_files (ids) and @child_work_titles (id => title) for one
      # PDF, and records a PendingRelationship for every page image.
      def prepare_import_data(pdf_sequence, image_files, user)
        @uploaded_files = []
        @child_work_titles = {}
        image_files.each_with_index do |image_path, idx|
          file_id = create_uploaded_file(user, image_path).to_s
          file_title = set_title(@parent_work.title.first, pdf_sequence, idx)
          @uploaded_files << file_id
          @child_work_titles[file_id] = file_title
          # save child work info to create the member relationships
          IiifPrint::PendingRelationship.create!(child_title: file_title,
                                                 parent_id: @parent_work.id,
                                                 child_order: sort_order(pdf_sequence, idx))
        end
      end

      # Sort key stored in the string column +child_order+. Both numbers are
      # zero-padded because the column is sorted as text: unpadded, "0 10"
      # would sort before "0 2" and pages 10+ would be attached out of order.
      def sort_order(pdf_sequence, idx)
        format('%06d %06d', pdf_sequence, idx)
      end

      # Wraps a page image in a Hyrax::UploadedFile owned by +user+.
      # @return [Object] id of the saved upload
      def create_uploaded_file(user, path)
        uf = Hyrax::UploadedFile.new
        uf.user_id = user.id
        uf.file = CarrierWave::SanitizedFile.new(path)
        uf.save!
        uf.id
      end

      # Builds the child title; both numbers are shown 1-based.
      def set_title(title, pdf_sequence, idx)
        pdf_index = "Pdf Nbr #{pdf_sequence + 1}"
        page_number = "Page #{idx + 1}"
        "#{title}: #{pdf_index}, #{page_number}"
      end

      # Attributes copied onto every child work.
      # TODO: what attributes do we need to fill in from the parent work? What about AllinsonFlex?
      def attributes
        {
          admin_set_id: @child_admin_set_id.to_s,
          creator: @parent_work.creator.to_a,
          rights_statement: @parent_work.rights_statement.to_a,
          visibility: @parent_work.visibility.to_s
        }
      end
    end

    # Attaches child works to their parent once every child recorded in
    # IiifPrint::PendingRelationship can be found; otherwise re-enqueues itself.
    class CreateRelationshipsJob < IiifPrint::Jobs::ApplicationJob
      # @param user [User] user the parent update is performed as
      # @param parent_id [String] parent work id
      # @param parent_model [String] parent class name (constantized here)
      # @param child_model [String] child class name (constantized here)
      def perform(user:, parent_id:, parent_model:, child_model:)
        if completed_child_data_for(parent_id, child_model)
          # add the members
          parent_work = parent_model.constantize.find(parent_id)
          create_relationships(user: user, parent: parent_work, ordered_child_ids: @child_ids)
          @pending_children.each(&:destroy)
        else
          # reschedule the job and end this one normally
          # NOTE(review): nothing bounds the number of reschedules if a child
          # work is never created.
          reschedule(user: user, parent_id: parent_id, parent_model: parent_model, child_model: child_model)
        end
      end

      private

      # Loads @pending_children (ordered) and @child_ids.
      # @return [Boolean] false as soon as one pending child cannot be found
      def completed_child_data_for(parent_id, child_model)
        @child_ids = []
        @pending_children = IiifPrint::PendingRelationship.where(parent_id: parent_id).order('child_order asc')

        @pending_children.each do |pending|
          # children are located by title; a miss means they are not ready yet
          found = find_id_by_title_for(pending.child_title, child_model)
          return false if found.empty?
          @child_ids += found
        end
        true
      end

      # @return [Array] ids of every +model+ record with this title
      def find_id_by_title_for(title, model)
        model.constantize.where(title: title).map(&:id)
      end

      # Re-enqueues this job with the same arguments after a delay.
      def reschedule(user:, parent_id:, parent_model:, child_model:)
        CreateRelationshipsJob.set(wait: 10.minutes).perform_later(
          user: user,
          parent_id: parent_id,
          parent_model: parent_model,
          child_model: child_model
        )
      end

      # Runs the parent through the actor stack with the children as
      # work_members_attributes, keyed by their position.
      def create_relationships(user:, parent:, ordered_child_ids:)
        records_hash = ordered_child_ids.each_with_index.to_h { |child_id, i| [i, { id: child_id }] }
        attrs = { work_members_attributes: records_hash }
        parent.try(:reindex_extent=, Hyrax::Adapters::NestingIndexAdapter::LIMITED_REINDEX)
        env = Hyrax::Actors::Environment.new(parent, Ability.new(user), attrs)

        Hyrax::CurationConcern.actor.update(env)
      end
    end
  end
end
module IiifPrint
  module SplitPdfs
    # Converts every page of a PDF to a TIFF file with Ghostscript and exposes
    # the resulting file paths through Enumerable.
    class PagesIntoImagesService
      include Enumerable

      # @param path [String] path to the PDF to split
      def initialize(path)
        @baseid = SecureRandom.uuid
        @pdfpath = path
        @info = nil
        @entries = nil
        @tmpdir = nil
        @size = nil
        @pagecount = nil
        @pdftext = nil
      end

      # @return [IiifPrint::SplitPdfs::PdfImageExtractionService] memoized
      #   image metadata for the PDF
      def pdfinfo
        @info ||= IiifPrint::SplitPdfs::PdfImageExtractionService.new(@pdfpath)
      end

      # TODO: put this test somewhere to prevent invalid pdfs from crashing the image service.
      # @return [Boolean] true when no usable embedded-image metadata was found
      def invalid_pdf?
        pdfinfo.entries.empty? || pdfinfo.color.include?(nil) || pdfinfo.width.nil? || pdfinfo.height.nil?
      end

      # @return [String] memoized temporary directory that receives the TIFFs
      def tmpdir
        @tmpdir ||= Dir.mktmpdir
      end

      # Ghostscript colour device name for the given depth. Anything other
      # than 24 or 48 bits per pixel (CMYK, or unknown/nil values) falls back
      # to 24-bit colour.
      def colordevice(channels, bpc)
        bits = bpc.to_i * channels.to_i
        bits = 24 unless [24, 48].include?(bits)
        "tiff#{bits}nc"
      end

      # Ghostscript output device matching the PDF's embedded images.
      # A PDF without embedded images reports no bit depth; it is rendered with
      # the 24-bit colour device instead of raising on +nil > 1+.
      def gsdevice
        color, channels, bpc = pdfinfo.color
        depth = bpc.to_i
        # CCITT Group 4 Black and White, if applicable:
        return 'tiffg4' if color == 'gray' && depth == 1
        # 8 Bit Grayscale, if applicable:
        return 'tiffgray' if color == 'gray' && depth > 1
        # otherwise color:
        colordevice(channels, bpc)
      end

      # Page count reported by the +pdfinfo+ command line tool.
      # The command is run from an argument list (no shell), so paths with
      # spaces or shell metacharacters are safe, and both output streams are
      # drained.
      # @return [Integer] 0 when no "Pages:" line is printed
      def pagecount
        stdout, _stderr, _status = Open3.capture3('pdfinfo', @pdfpath.to_s)
        pages_line = stdout.each_line.find { |line| line.start_with?('Pages:') }
        @pagecount = pages_line ? pages_line.split.last.to_i : 0
      end

      # Heuristic: exactly one embedded image per page, each larger than 10
      # megapixels. False when the PDF has no embedded images at all.
      def looks_scanned
        width = pdfinfo.width
        height = pdfinfo.height
        return false if width.nil? || height.nil?

        single_image_per_page = pdfinfo.entries.length == pagecount
        single_image_per_page && width * height > 1024 * 1024 * 10
      end

      # Resolution used for rasterising.
      def ppi
        # 400 dpi for something that does not look like scanned media:
        return 400 unless looks_scanned
        # For scanned media, defer to detected image PPI:
        pdfinfo.ppi
      end

      # Ghostscript-converts all pages to TIFF.
      # @return [Array<String>] expected file name for every page Ghostscript
      #   reported on stdout
      def gsconvert
        output_base = File.join(tmpdir, "#{@baseid}-page%d.tiff")
        stdout, _stderr, _status = Open3.capture3(
          'gs', '-dNOPAUSE', '-dBATCH', "-sDEVICE=#{gsdevice}",
          '-dTextAlphaBits=4',
          "-sOutputFile=#{output_base}", "-r#{ppi}", '-f', @pdfpath.to_s
        )
        @size = stdout.each_line.count { |line| line.start_with?('Page ') }
        (1..@size).map { |n| File.join(tmpdir, "#{@baseid}-page#{n}.tiff") }
      end

      # @return [Array<String>] memoized TIFF paths, one per page
      def entries
        @entries ||= gsconvert
      end

      # Yields each TIFF path; returns an Enumerator when called without a block.
      def each(&block)
        return enum_for(:each) unless block
        entries.each(&block)
      end
    end

    # Uses poppler 0.19+ pdfimages command to extract image
    # listing metadata from PDF files.
    # For dpi extraction, falls back to calculating using MiniMagick,
    # if necessary.
    class PdfImageExtractionService
      # class constant column numbers
      COL_WIDTH = 3
      COL_HEIGHT = 4
      COL_COLOR = 5
      COL_CHANNELS = 6
      COL_BITS = 7
      # only poppler 0.25+ has this column in output:
      COL_XPPI = 12

      # @param path [String] path to the PDF to inspect
      def initialize(path)
        @path = path
        # Argument list rather than a shell string: the path is passed to
        # pdfimages verbatim, whatever characters it contains.
        @cmd = ['pdfimages', '-list', path.to_s]
        @output = nil
        @entries = nil
      end

      # Runs pdfimages once and caches its stdout lines.
      # @return [Array<String>] listing rows without the two header lines
      #   (empty when the command printed fewer than three lines)
      def process
        @output ||= begin
          stdout, _stderr, _status = Open3.capture3(*@cmd)
          stdout.split("\n")
        end
        @output.drop(2)
      end

      # @return [Array<Array<String>>] one whitespace-split row per embedded image
      def entries
        @entries ||= process.map(&:split)
      end

      # @param i [Integer] column index
      # @return [Array] column +i+ of every row, mapped through the block if given
      def selectcolumn(i, &block)
        column = entries.map { |entry| entry[i] }
        block ? column.map(&block) : column
      end

      # @return [Integer, nil] widest embedded image in pixels; nil without images
      def width
        selectcolumn(COL_WIDTH, &:to_i).max
      end

      # @return [Integer, nil] tallest embedded image in pixels; nil without images
      def height
        selectcolumn(COL_HEIGHT, &:to_i).max
      end

      # desc is either 'gray' or 'rgb' (any non-gray image makes it 'rgb'),
      # but 1-bit gray is black/white so the caller may want all of this
      # information; for mixed images the maximum channel count and bit depth
      # are returned.
      # @return [Array] [desc, channels, bits]; channels and bits are nil without images
      def color
        desc = entries.any? { |e| e[COL_COLOR] != 'gray' } ? 'rgb' : 'gray'
        channels = selectcolumn(COL_CHANNELS, &:to_i).max
        bits = selectcolumn(COL_BITS, &:to_i).max
        [desc, channels, bits]
      end

      # @return [Integer, nil] highest horizontal image resolution; nil without images
      def ppi
        return nil if entries.empty?
        # with poppler 0.25+, pdfimages just gives us this:
        return selectcolumn(COL_XPPI, &:to_i).max if entries.first.size > COL_XPPI

        # poppler < 0.25: derive it from the page width in points
        pdf = MiniMagick::Image.open(@path)
        (72 * width / pdf.width).to_i
      end
    end
  end
end
IiifPrint::Actors::IiifPrintUploadActor do + let(:work_with_config) { WorkWithIiifPrintConfig.new(title: ['required title']) } + let(:work_without_config) { WorkWithOutConfig.new(title: ['required title']) } + let(:my_user) { build(:user) } let(:ability) { build(:ability) } let(:uploaded_pdf_file) { create(:uploaded_pdf_file) } let(:uploaded_txt_file) { create(:uploaded_txt_file) } let(:uploaded_file_ids) { [uploaded_pdf_file.id, uploaded_txt_file.id] } + # duplicates logic from actor to find the path the job will expect + let(:pdf_paths) do + uploads = Hyrax::UploadedFile.find(uploaded_file_ids) + upload_paths = uploads.map { |upload| upload.file.file.file } + upload_paths.select { |path| path.end_with?('.pdf', '.PDF') } + end + + # attributes for environments let(:attributes) { { title: ['foo'], uploaded_files: uploaded_file_ids } } let(:no_pdf_attributes) { { title: ['foo'], uploaded_files: [] } } - # environment with uploads: let(:with_pdf_env) { Hyrax::Actors::Environment.new(work, ability, attributes) } # environment with NO uploads: @@ -34,29 +41,27 @@ end end - context 'when work model includes IiifPrintBehavior' do + context 'when work model includes IiifPrint' do + let(:work) { work_with_config } describe ':create' do let(:mode) { :create } - before do - allow(work).to receive(:respond_to?).and_call_original - allow(work).to receive(:respond_to?).with(:split_pdf).and_return true - end context 'when work has a pdf file' do let(:mode_env) { with_pdf_env } - it 'queues a IiifPrint::CreatePagesJob' do - expect(IiifPrint::CreatePagesJob).to receive(:perform_later).with( + it 'queues IiifPrint::Jobs::ChildWorksFromPdfJob' do + expect(IiifPrint::Jobs::ChildWorksFromPdfJob).to receive(:perform_later).with( work, - ["/app/samvera/hyrax-webapp/.internal_test_app/tmp/uploads/hyrax/uploaded_file/file/1/minimal-2-page.pdf"], - "spaceballs@example.com", - "admin_set/default" + pdf_paths, + my_user, + "admin_set/default", + 0 ) expect(middleware.public_send(mode, 
mode_env)).to be true end end context 'when work has no pdf file' do let(:mode_env) { no_pdf_env } - it 'does not queue IiifPrint::CreatePagesJob' do - expect(IiifPrint::CreatePagesJob).not_to receive(:perform_later) + it 'does not queue job' do + expect(IiifPrint::Jobs::ChildWorksFromPdfJob).not_to receive(:perform_later) expect(middleware.public_send(mode, mode_env)).to be true end end @@ -64,38 +69,44 @@ describe ':update' do let(:mode) { :update } - before do - allow(work).to receive(:respond_to?).and_call_original - allow(work).to receive(:respond_to?).with(:split_pdf).and_return true - end - context 'works is updated with no additional uploads' do + context 'work is updated with no additional uploads' do let(:mode_env) { edit_env } - it 'queues a IiifPrint::CreatePagesJob' do - expect(IiifPrint::CreatePagesJob).not_to receive(:perform_later) + it 'does not queue job' do + expect(IiifPrint::Jobs::ChildWorksFromPdfJob).not_to receive(:perform_later) + expect(middleware.public_send(mode, mode_env)).to be true + end + end + context 'work is updated with an additional PDF' do + let(:mode_env) { with_pdf_env } + it 'queues IiifPrint::Jobs::ChildWorksFromPdfJob' do + expect(IiifPrint::Jobs::ChildWorksFromPdfJob).to receive(:perform_later).with( + work, + pdf_paths, + my_user, + "admin_set/default", + 0 + ) expect(middleware.public_send(mode, mode_env)).to be true end end end end - context 'when work model does not IiifPrintBehavior' do + context 'when work model does not use IiifPrint' do + let(:work) { work_without_config } describe ':create' do let(:mode) { :create } - before do - allow(work).to receive(:respond_to?).and_call_original - allow(work).to receive(:respond_to?).with(:split_pdf).and_return false - end context 'when work has a pdf file' do let(:mode_env) { with_pdf_env } - it 'queues a IiifPrint::CreatePagesJob' do - expect(IiifPrint::CreatePagesJob).not_to receive(:perform_later) + it 'does not queue IiifPrint::Jobs::ChildWorksFromPdfJob' do + 
expect(IiifPrint::Jobs::ChildWorksFromPdfJob).not_to receive(:perform_later) expect(middleware.public_send(mode, mode_env)).to be true end end context 'when work has no pdf file' do let(:mode_env) { no_pdf_env } - it 'does not queue IiifPrint::CreatePagesJob' do - expect(IiifPrint::CreatePagesJob).not_to receive(:perform_later) + it 'does not queue IiifPrint::Jobs::ChildWorksFromPdfJob' do + expect(IiifPrint::Jobs::ChildWorksFromPdfJob).not_to receive(:perform_later) expect(middleware.public_send(mode, mode_env)).to be true end end @@ -103,61 +114,13 @@ describe ':update' do let(:mode) { :update } - before do - allow(work).to receive(:respond_to?).and_call_original - allow(work).to receive(:respond_to?).with(:split_pdf).and_return false - end - context 'works is updated with no additional uploads' do + context 'work is updated with no additional uploads' do let(:mode_env) { edit_env } - it 'queues a IiifPrint::CreatePagesJob' do - expect(IiifPrint::CreatePagesJob).not_to receive(:perform_later) + it 'does not queue IiifPrint::Jobs::ChildWorksFromPdfJob' do + expect(IiifPrint::Jobs::ChildWorksFromPdfJob).not_to receive(:perform_later) expect(middleware.public_send(mode, mode_env)).to be true end end end end - - # let(:uploaded_work) do - # middleware.public_send(:create, env) - # # return work, reloaded, because env.curation_concern will be stale after - # # running actor. 
- # NewspaperIssue.find(env.curation_concern.id) - # end - # let(:edited_work) do - # middleware.public_send(:update, edit_env) - # NewspaperIssue.find(edit_env.curation_concern.id) - # end - - # describe "NewspaperIssue upload of PDF" do - # do_now_jobs = [ - # IiifPrint::CreatePagesJob, - # IngestLocalFileJob, - # IngestJob - # ] - - # # we over-burden one example, because sadly RSpec does not do well with - # # shared state across examples (without use of `before(:all)` which is - # # mutually exclusive with `let` in practice, and ruffles rubocop's - # # overzealous sense of moral duty, speaking of which: - # xit "creates child pages for issue", perform_enqueued: do_now_jobs do - # pages = uploaded_issue.ordered_pages - # expect(pages.size).to eq 2 - # page = pages[0] - # # Page needs correct admin set: - # expect(page.admin_set_id).to eq 'admin_set/default' - # file_sets = page.members.select { |v| v.class == FileSet } - # expect(file_sets.size).to eq 1 - # files = file_sets[0].files - # url = files[0].uri.to_s - # # fetch the thing from Fedora Commons: - # response = Faraday.get(url) - # stored_size = response.body.length - # expect(stored_size).to be > 0 - # # expect that subsequent edits of same issue (run though update - # # method of actor stack) do not duplicate pages (verify by count): - # expect(edited_issue.id).to eq uploaded_issue.id - # pages = edited_issue.ordered_pages - # expect(pages.size).to eq 2 # still the same page count - # end - # end end diff --git a/spec/iiif_print_spec.rb b/spec/iiif_print_spec.rb index 818ce45e..2b03a501 100644 --- a/spec/iiif_print_spec.rb +++ b/spec/iiif_print_spec.rb @@ -31,8 +31,20 @@ it { is_expected.to be_iiif_print_config } it "has a #pdf_splitter_job" do - # TODO: This should be a class that is a Job but we don't yet have that. 
require 'spec_helper'
require 'misc_shared'

# Placeholder spec for the job that splits PDFs into child works.
# It describes the job itself (it previously named the upload actor) and the
# subject calls #perform on an instance using the lets defined below.
RSpec.describe IiifPrint::Jobs::ChildWorksFromPdfJob do
  # TODO: add specs
  subject(:perform_job) { described_class.new.perform(work, pdf_paths, my_user, admin_set_id, prior_pdfs) }

  let(:work) { WorkWithIiifPrintConfig.new(title: ['required title']) }
  let(:my_user) { build(:user) }
  let(:uploaded_pdf_file) { create(:uploaded_pdf_file) }
  let(:uploaded_file_ids) { [uploaded_pdf_file.id] }
  # duplicates logic from the upload actor to find the paths the job expects
  let(:pdf_paths) do
    uploads = Hyrax::UploadedFile.find(uploaded_file_ids)
    upload_paths = uploads.map { |upload| upload.file.file.file }
    upload_paths.select { |path| path.end_with?('.pdf', '.PDF') }
  end
  let(:admin_set_id) { "admin_set/default" }
  let(:prior_pdfs) { 0 }

  describe '#perform' do
    xit 'calls pdf splitter service with path' do
    end

    xit 'submits one BatchCreateJob per PDF' do
    end

    xit 'submits IiifPrint::Jobs::CreateRelationshipsJob' do
    end
  end
end
# spec/lib/iiif_print/jobs/create_relationship_job_spec.rb
require 'spec_helper'
require 'misc_shared'

# Placeholder spec for the job that links child works to their parent.
RSpec.describe IiifPrint::Jobs::CreateRelationshipsJob do
  # TODO: add specs
  subject(:perform_job) do
    described_class.new.perform(user: my_user,
                                parent_id: parent.id,
                                parent_model: parent_model,
                                child_model: child_model)
  end

  let(:parent) { WorkWithIiifPrintConfig.new(title: ['required title']) }
  let(:my_user) { build(:user) }
  # the job calls #constantize on both, so they must be class names
  let(:parent_model) { WorkWithIiifPrintConfig.to_s }
  let(:child_model) { WorkWithIiifPrintConfig.to_s }

  describe '#perform' do
    xit 'loads all child work ids into ordered_members' do
    end
  end
end

# spec/lib/iiif_print/split_pdfs/pages_into_images_service_spec.rb
RSpec.describe IiifPrint::SplitPdfs::PagesIntoImagesService do
  # TODO: add specs
end
# frozen_string_literal: true

# spec/support/iiif_print_models.rb
# TODO: merge this in with whatever is needed from misc_shared.rb

# Test-only work model that includes an IiifPrint model configuration and
# names itself as the child model for split PDFs.
class WorkWithIiifPrintConfig < ActiveFedora::Base
  include ::Hyrax::WorkBehavior
  include IiifPrint::SetChildFlag
  include IiifPrint.model_configuration(pdf_split_child_model: self)
  include ::Hyrax::BasicMetadata

  validates_presence_of :title, message: 'Your work must have a title.'

  # self.indexer = GenericWorkIndexer
end

# Test-only work model identical to the one above except that it does not
# include an IiifPrint model configuration.
class WorkWithOutConfig < ActiveFedora::Base
  include ::Hyrax::WorkBehavior
  include IiifPrint::SetChildFlag
  include ::Hyrax::BasicMetadata

  validates_presence_of :title, message: 'Your work must have a title.'

  # self.indexer = GenericWorkIndexer
end