diff --git a/app/indexers/concerns/iiif_print/file_set_indexer.rb b/app/indexers/concerns/iiif_print/file_set_indexer.rb
index c0771d47..a4c23690 100644
--- a/app/indexers/concerns/iiif_print/file_set_indexer.rb
+++ b/app/indexers/concerns/iiif_print/file_set_indexer.rb
@@ -19,10 +19,10 @@ def generate_solr_document
         # only UV viewable images should have is_page_of, it is only used for iiif search
         solr_doc['is_page_of_ssim'] = iiif_print_lineage_service.ancestor_ids_for(object) if object.mime_type&.match(/image/)
         # index for full text search
-        text = IiifPrint::Data::WorkDerivatives.data(from: object, of_type: 'txt')
-        text = text.tr("\n", ' ').squeeze(' ')
+        # resolve once: every call to all_text invokes the configured generator again
+        text = all_text
         solr_doc['all_text_timv'] = text
         solr_doc['all_text_tsimv'] = text
         solr_doc['digest_ssim'] = digest_from_content
       end
     end
@@ -33,5 +33,18 @@ def digest_from_content
       return unless object.original_file
       object.original_file.digest.first.to_s
     end
+
+    ##
+    # Builds the text indexed for this file set by calling the configured
+    # IiifPrint.config.all_text_generator_function with the indexed object. Newlines are
+    # replaced with spaces and runs of spaces are collapsed to a single space.
+    #
+    # @return [String] the flattened text; an empty string when the generator returns nil
+    def all_text
+      text = IiifPrint.config.all_text_generator_function.call(object: object) || ''
+      return text if text.empty?
+
+      text.tr("\n", ' ').squeeze(' ')
+    end
   end
 end
diff --git a/lib/iiif_print/configuration.rb b/lib/iiif_print/configuration.rb
index 30b804ca..ec588cbf 100644
--- a/lib/iiif_print/configuration.rb
+++ b/lib/iiif_print/configuration.rb
@@ -183,6 +183,21 @@ def ocr_coords_from_json_function
         IiifPrint::Data::WorkDerivatives.data(from: file_set_id, of_type: 'json')
       end
     end
+
+    attr_writer :all_text_generator_function
+    ##
+    # This configuration determines where to pull the full text from. By default, it will
+    # pull from the TXT file that is generated by the OCR engine. However, if your
+    # application has its own implementation of generating the full text, then you can
+    # set your own configuration here.
+    #
+    # @return [#call] a callable accepting the keyword argument `object:`; the file set
+    #   indexer treats a nil result as empty text
+    def all_text_generator_function
+      @all_text_generator_function ||= lambda do |object:|
+        IiifPrint::Data::WorkDerivatives.data(from: object, of_type: 'txt')
+      end
+    end
   end
   # rubocop:enable Metrics/ModuleLength
 end
diff --git a/spec/iiif_print/configuration_spec.rb b/spec/iiif_print/configuration_spec.rb
index 85b548b6..c810a9cc 100644
--- a/spec/iiif_print/configuration_spec.rb
+++ b/spec/iiif_print/configuration_spec.rb
@@ -143,4 +143,12 @@
       expect(function.parameters).to eq([[:keyreq, :file_set_id], [:keyrest]])
     end
   end
+
+  describe "#all_text_generator_function" do
+    subject(:function) { config.all_text_generator_function }
+
+    it "is expected to be a lambda with one keyword arg" do
+      expect(function.parameters).to eq([[:keyreq, :object]])
+    end
+  end
 end