From 369f318b98dc4081e1d94db72172e35cbbfe31bd Mon Sep 17 00:00:00 2001 From: Jeremy Friesen Date: Sat, 1 Feb 2020 07:25:27 -0500 Subject: [PATCH 1/2] Refactoring ImportUrlJob to ease Valkyrization The code changes reflect a simple re-structuring that will make future work easier: 1) Extract method from perform into context specific private method 2) Extract additional attr_reader to allow for common means of using instance variables Refactoring related to #4195 --- app/jobs/import_url_job.rb | 63 ++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/app/jobs/import_url_job.rb b/app/jobs/import_url_job.rb index 0e11becce7..4d0ae6a278 100644 --- a/app/jobs/import_url_job.rb +++ b/app/jobs/import_url_job.rb @@ -8,7 +8,7 @@ # and CreateWithRemoteFilesActor when files are located in some other service. class ImportUrlJob < Hyrax::ApplicationJob queue_as Hyrax.config.ingest_queue_name - attr_reader :file_set, :operation + attr_reader :file_set, :operation, :headers before_enqueue do |job| operation = job.arguments[1] @@ -17,35 +17,44 @@ class ImportUrlJob < Hyrax::ApplicationJob # @param [FileSet] file_set # @param [Hyrax::BatchCreateOperation] operation - def perform(file_set, operation, headers = {}) - operation.performing! - user = User.find_by_user_key(file_set.depositor) - uri = URI(file_set.import_url) - name = file_set.label - + # @param [Hash] headers - header data to use in interaction with remote url + def perform(file_set, operation, headers = {}, use_valkyrie: Hyrax.config.use_valkyrie?) @file_set = file_set @operation = operation - - unless BrowseEverything::Retriever.can_retrieve?(uri, headers) - send_error('Expired URL') - return false - end - - # @todo Use Hydra::Works::AddExternalFileToFileSet instead of manually - # copying the file here. This will be gnarly. - copy_remote_file(uri, name, headers) do |f| - # reload the FileSet once the data is copied since this is a long running task - file_set.reload - - # FileSetActor operates synchronously so that this tempfile is available. - # If asynchronous, the job might be invoked on a machine that did not have this temp file on its file system! - # NOTE: The return status may be successful even if the content never attaches. - log_import_status(uri, f, user) + @headers = headers + if use_valkyrie + # TODO + else + perform_af end end private + def perform_af + operation.performing! + user = User.find_by_user_key(file_set.depositor) + uri = URI(file_set.import_url) + name = file_set.label + + unless BrowseEverything::Retriever.can_retrieve?(uri, headers) + send_error('Expired URL') + return false + end + + # @todo Use Hydra::Works::AddExternalFileToFileSet instead of manually + # copying the file here. This will be gnarly. + copy_remote_file(uri, name, headers) do |f| + # reload the FileSet once the data is copied since this is a long running task + file_set.reload + + # FileSetActor operates synchronously so that this tempfile is available. + # If asynchronous, the job might be invoked on a machine that did not have this temp file on its file system! + # NOTE: The return status may be successful even if the content never attaches. + log_import_status(uri, f, user) + end + end + # Download file from uri, yields a block with a file in a temporary directory. # It is important that the file on disk has the same file name as the URL, # because when the file in added into Fedora the file name will get persisted in the @@ -75,9 +84,9 @@ def copy_remote_file(uri, name, headers = {}) # @param error_message [String] the download error message def send_error(error_message) user = User.find_by_user_key(file_set.depositor) - @file_set.errors.add('Error:', error_message) - Hyrax.config.callback.run(:after_import_url_failure, @file_set, user, warn: false) - @operation.fail!(@file_set.errors.full_messages.join(' ')) + file_set.errors.add('Error:', error_message) + Hyrax.config.callback.run(:after_import_url_failure, file_set, user, warn: false) + operation.fail!(file_set.errors.full_messages.join(' ')) end # Write file to the stream @@ -97,7 +106,7 @@ def write_file(uri, f, headers) # @param f [IO] the stream to write to # @param user [User] def log_import_status(uri, f, user) - if Hyrax::Actors::FileSetActor.new(@file_set, user).create_content(f, from_url: true) + if Hyrax::Actors::FileSetActor.new(file_set, user).create_content(f, from_url: true) operation.success! else send_error(uri.path, nil) From bfd316e23b5bfbf8d811e8a8dd582c1a2529887b Mon Sep 17 00:00:00 2001 From: Jeremy Friesen Date: Sat, 1 Feb 2020 07:44:39 -0500 Subject: [PATCH 2/2] Refactoring object to leverage instance vars Prior to this commit, there were parameters passed that could be instead assigned at the beginning of the `#perform` method. In doing, we have less chatter on the method calls. This also involved extract additional instance variables that were commonly used throughout. Refactoring related to #4195 --- app/jobs/import_url_job.rb | 47 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/app/jobs/import_url_job.rb b/app/jobs/import_url_job.rb index 4d0ae6a278..29df0d9229 100644 --- a/app/jobs/import_url_job.rb +++ b/app/jobs/import_url_job.rb @@ -8,7 +8,7 @@ # and CreateWithRemoteFilesActor when files are located in some other service. class ImportUrlJob < Hyrax::ApplicationJob queue_as Hyrax.config.ingest_queue_name - attr_reader :file_set, :operation, :headers + attr_reader :file_set, :operation, :headers, :user, :uri before_enqueue do |job| operation = job.arguments[1] @@ -18,10 +18,19 @@ class ImportUrlJob < Hyrax::ApplicationJob # @param [FileSet] file_set # @param [Hyrax::BatchCreateOperation] operation # @param [Hash] headers - header data to use in interaction with remote url - def perform(file_set, operation, headers = {}, use_valkyrie: Hyrax.config.use_valkyrie?) + # @param [Boolean] use_valkyrie - a switch on whether or not to use Valkyrie processing + # + # @todo At present, this job works for ActiveFedora objects. The use_valkyrie is not complete. + def perform(file_set, operation, headers = {}, use_valkyrie: false) @file_set = file_set @operation = operation @headers = headers + operation.performing! + @user = User.find_by_user_key(file_set.depositor) + @uri = URI(file_set.import_url) + + return false unless can_retrieve_remote? + if use_valkyrie # TODO else @@ -31,27 +40,25 @@ def perform(file_set, operation, headers = {}, use_valkyrie: Hyrax.config.use_va private + def can_retrieve_remote? + return true if BrowseEverything::Retriever.can_retrieve?(uri, headers) + send_error('Expired URL') + false + end + def perform_af - operation.performing! - user = User.find_by_user_key(file_set.depositor) - uri = URI(file_set.import_url) name = file_set.label - unless BrowseEverything::Retriever.can_retrieve?(uri, headers) - send_error('Expired URL') - return false - end - # @todo Use Hydra::Works::AddExternalFileToFileSet instead of manually # copying the file here. This will be gnarly. - copy_remote_file(uri, name, headers) do |f| + copy_remote_file(name) do |f| # reload the FileSet once the data is copied since this is a long running task file_set.reload # FileSetActor operates synchronously so that this tempfile is available. # If asynchronous, the job might be invoked on a machine that did not have this temp file on its file system! # NOTE: The return status may be successful even if the content never attaches. - log_import_status(uri, f, user) + log_import_status(f) end end @@ -59,18 +66,16 @@ def perform_af # It is important that the file on disk has the same file name as the URL, # because when the file in added into Fedora the file name will get persisted in the # metadata. - # @param uri [URI] the uri of the file to download # @param name [String] the human-readable name of the file - # @param headers [Hash] the HTTP headers for the GET request (these may contain an authentication token) # @yield [IO] the stream to write to - def copy_remote_file(uri, name, headers = {}) + def copy_remote_file(name) filename = File.basename(name) dir = Dir.mktmpdir Rails.logger.debug("ImportUrlJob: Copying <#{uri}> to #{dir}") File.open(File.join(dir, filename), 'wb') do |f| begin - write_file(uri, f, headers) + write_file(f) yield f rescue StandardError => e send_error(e.message) @@ -83,16 +88,14 @@ def copy_remote_file(uri, name, headers = {}) # @param filename [String] the filename of the file to download # @param error_message [String] the download error message def send_error(error_message) - user = User.find_by_user_key(file_set.depositor) file_set.errors.add('Error:', error_message) Hyrax.config.callback.run(:after_import_url_failure, file_set, user, warn: false) operation.fail!(file_set.errors.full_messages.join(' ')) end # Write file to the stream - # @param uri [URI] the uri of the file to download # @param f [IO] the stream to write to - def write_file(uri, f, headers) + def write_file(f) retriever = BrowseEverything::Retriever.new uri_spec = ActiveSupport::HashWithIndifferentAccess.new(url: uri, headers: headers) retriever.retrieve(uri_spec) do |chunk| @@ -102,14 +105,12 @@ def write_file(uri, f, headers) end # Set the import operation status - # @param uri [URI] the uri of the file to download # @param f [IO] the stream to write to - # @param user [User] - def log_import_status(uri, f, user) + def log_import_status(f) if Hyrax::Actors::FileSetActor.new(file_set, user).create_content(f, from_url: true) operation.success! else - send_error(uri.path, nil) + send_error(uri.path) end end end