diff --git a/app/jobs/runtime/failed_jobs_cleanup.rb b/app/jobs/runtime/failed_jobs_cleanup.rb index b991f7b7802..69853c93373 100644 --- a/app/jobs/runtime/failed_jobs_cleanup.rb +++ b/app/jobs/runtime/failed_jobs_cleanup.rb @@ -2,10 +2,11 @@ module VCAP::CloudController module Jobs module Runtime class FailedJobsCleanup < VCAP::CloudController::Jobs::CCJob - attr_accessor :cutoff_age_in_days + attr_accessor :cutoff_age_in_days, :max_number_of_failed_delayed_jobs - def initialize(cutoff_age_in_days) + def initialize(cutoff_age_in_days:, max_number_of_failed_delayed_jobs: nil) @cutoff_age_in_days = cutoff_age_in_days + @max_number_of_failed_delayed_jobs = max_number_of_failed_delayed_jobs end def perform @@ -15,9 +16,25 @@ def perform where(Sequel.lit("run_at < CURRENT_TIMESTAMP - INTERVAL '?' DAY", cutoff_age_in_days.to_i)) logger = Steno.logger('cc.background') - logger.info("Cleaning up #{old_delayed_jobs.count} Failed Delayed Jobs") + logger.info("Cleaning up #{old_delayed_jobs.count} old Failed Delayed Jobs and potentially some more + Failed Delayed Jobs (the newer ones will be kept) to not exceed max_number_of_failed_delayed_jobs") old_delayed_jobs.delete + logger.info("max_number_of_failed_delayed_jobs = #{max_number_of_failed_delayed_jobs}") + unless max_number_of_failed_delayed_jobs.nil? + logger.info('Deleting too many failed_jobs') + ids_from_failed_jobs_that_are_too_much = Delayed::Job. + where(Sequel.lit('failed_at is not null')). + order(Sequel.desc(:id)). + offset(max_number_of_failed_delayed_jobs.to_i). + # Mysql handles offset differently, therefore using a large + # number to mimic unlimited offset + limit(1844674407370955161). + select(:id) + + Delayed::Job.where(id: ids_from_failed_jobs_that_are_too_much).delete + end + logger.info('Cleaning up ') end def job_name_in_configuration diff --git a/config/cloud_controller.yml b/config/cloud_controller.yml index 1a7c53ed413..f89f03d79e2 100644 --- a/config/cloud_controller.yml +++ b/config/cloud_controller.yml @@ -46,6 +46,9 @@ audit_events: failed_jobs: cutoff_age_in_days: 31 + max_number_of_failed_delayed_jobs: 100000 + frequency_in_seconds: 144000 #4h + expiration_in_seconds: 144000 completed_tasks: cutoff_age_in_days: 31 diff --git a/lib/cloud_controller/clock/scheduler.rb b/lib/cloud_controller/clock/scheduler.rb index 31bc6c47c65..0b282d468d4 100644 --- a/lib/cloud_controller/clock/scheduler.rb +++ b/lib/cloud_controller/clock/scheduler.rb @@ -7,7 +7,6 @@ class Scheduler CLEANUPS = [ { name: 'app_usage_events', class: Jobs::Runtime::AppUsageEventsCleanup, time: '18:00', arg_from_config: [:app_usage_events, :cutoff_age_in_days] }, { name: 'audit_events', class: Jobs::Runtime::EventsCleanup, time: '20:00', arg_from_config: [:audit_events, :cutoff_age_in_days] }, - { name: 'failed_jobs', class: Jobs::Runtime::FailedJobsCleanup, time: '21:00', arg_from_config: [:failed_jobs, :cutoff_age_in_days] }, { name: 'service_usage_events', class: Jobs::Services::ServiceUsageEventsCleanup, time: '22:00', arg_from_config: [:service_usage_events, :cutoff_age_in_days] }, { name: 'completed_tasks', class: Jobs::Runtime::PruneCompletedTasks, time: '23:00', arg_from_config: [:completed_tasks, :cutoff_age_in_days] }, { name: 'expired_blob_cleanup', class: Jobs::Runtime::ExpiredBlobCleanup, time: '00:00' }, @@ -23,6 +22,10 @@ class Scheduler FREQUENTS = [ { name: 'pending_droplets', class: Jobs::Runtime::PendingDropletCleanup }, { name: 'pending_builds', class: Jobs::Runtime::PendingBuildCleanup }, + { name: 'failed_jobs', class: Jobs::Runtime::FailedJobsCleanup, arg_from_config: [:failed_jobs, + :cutoff_age_in_days, + :max_number_of_failed_delayed_jobs, + :frequency_in_seconds] } ].freeze def initialize(config) @@ -69,7 +72,11 @@ def start_frequent_jobs } @clock.schedule_frequent_worker_job(**clock_opts) do klass = job_config[:class] - klass.new(@config.get(job_config[:name].to_sym, :expiration_in_seconds)) + if @config.get(job_config[:name].to_sym, :expiration_in_seconds).nil? + klass.new(**@config.get(job_config[:name].to_sym).reject { |k| [:frequency_in_seconds, :expiration_in_seconds].include?(k) }) + else + klass.new(@config.get(job_config[:name].to_sym, :expiration_in_seconds)) + end end end end diff --git a/lib/cloud_controller/config_schemas/base/clock_schema.rb b/lib/cloud_controller/config_schemas/base/clock_schema.rb index a3cd5cd83e2..f14881dfa25 100644 --- a/lib/cloud_controller/config_schemas/base/clock_schema.rb +++ b/lib/cloud_controller/config_schemas/base/clock_schema.rb @@ -22,7 +22,10 @@ class ClockSchema < VCAP::Config cutoff_age_in_days: Integer }, failed_jobs: { - cutoff_age_in_days: Integer + cutoff_age_in_days: Integer, + max_number_of_failed_delayed_jobs: Integer, + expiration_in_seconds: Integer, + frequency_in_seconds: Integer, }, completed_tasks: { cutoff_age_in_days: Integer diff --git a/spec/fixtures/config/port_8181_config.yml b/spec/fixtures/config/port_8181_config.yml index 848d057f803..1662d2a5165 100644 --- a/spec/fixtures/config/port_8181_config.yml +++ b/spec/fixtures/config/port_8181_config.yml @@ -36,6 +36,9 @@ audit_events: failed_jobs: cutoff_age_in_days: 31 + max_number_of_failed_delayed_jobs: 100000 + frequency_in_seconds: 144000 #4h + expiration_in_seconds: 144000 completed_tasks: cutoff_age_in_days: 31 diff --git a/spec/unit/jobs/runtime/failed_jobs_cleanup_spec.rb b/spec/unit/jobs/runtime/failed_jobs_cleanup_spec.rb index de1e608ae8f..3b5f552488a 100644 --- a/spec/unit/jobs/runtime/failed_jobs_cleanup_spec.rb +++ b/spec/unit/jobs/runtime/failed_jobs_cleanup_spec.rb @@ -23,10 +23,9 @@ def max_attempts end RSpec.describe FailedJobsCleanup, job_context: :worker do - let(:cutoff_age_in_days) { 2 } let(:worker) { Delayed::Worker.new } - subject(:cleanup_job) { FailedJobsCleanup.new(cutoff_age_in_days) } + subject(:cleanup_job) { FailedJobsCleanup.new(cutoff_age_in_days: 2, max_number_of_failed_delayed_jobs: 2) } it { is_expected.to be_a_valid_job } @@ -75,6 +74,29 @@ def max_attempts }.from(@delayed_job).to(nil) end end + context 'when the number of delayed jobs exceeds max_number_of_failed_delayed_jobs' do + let(:run_at) { Time.now.utc } + let(:the_job) { FailingJob.new } + let(:the_job2) { SuccessJob.new } + + it 'removes the exceeding jobs' do + Delayed::Worker.destroy_failed_jobs = false + @delayed_job2 = Delayed::Job.enqueue(the_job, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day)) + @delayed_job3 = Delayed::Job.enqueue(the_job, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day)) + @delayed_job4 = Delayed::Job.enqueue(the_job, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day)) + @delayed_job5 = Delayed::Job.enqueue(the_job2, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day)) + @delayed_job6 = Delayed::Job.enqueue(the_job2, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day)) + worker.work_off 5 + + expect { + cleanup_job.perform + }.to change { + Delayed::Job.count + }.by(-2) + expect(Delayed::Job.find(id: @delayed_job.id)).to be_nil + expect(Delayed::Job.find(id: @delayed_job2.id)).to be_nil + end + end end end end diff --git a/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb b/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb index 7d4459ed157..f908a52f040 100644 --- a/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb +++ b/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb @@ -18,7 +18,7 @@ module VCAP::CloudController }, app_usage_events: { cutoff_age_in_days: 1, }, audit_events: { cutoff_age_in_days: 3, }, - failed_jobs: { cutoff_age_in_days: 4, }, + failed_jobs: { frequency_in_seconds: 400, cutoff_age_in_days: 4, max_number_of_failed_delayed_jobs: 10 }, service_usage_events: { cutoff_age_in_days: 5, }, completed_tasks: { cutoff_age_in_days: 6, }, pending_droplets: { frequency_in_seconds: 300, expiration_in_seconds: 600 }, @@ -69,12 +69,6 @@ module VCAP::CloudController expect(block.call).to be_instance_of(Jobs::Runtime::EventsCleanup) end - expect(clock).to receive(:schedule_daily_job) do |args, &block| - expect(args).to eql(name: 'failed_jobs', at: '21:00', priority: 0) - expect(Jobs::Runtime::FailedJobsCleanup).to receive(:new).with(4).and_call_original - expect(block.call).to be_instance_of(Jobs::Runtime::FailedJobsCleanup) - end - expect(clock).to receive(:schedule_daily_job) do |args, &block| expect(args).to eql(name: 'service_usage_events', at: '22:00', priority: 0) expect(Jobs::Services::ServiceUsageEventsCleanup).to receive(:new).with(5).and_call_original @@ -153,6 +147,12 @@ module VCAP::CloudController expect(block.call).to be_instance_of(Jobs::Runtime::PendingBuildCleanup) end + expect(clock).to receive(:schedule_frequent_worker_job) do |args, &block| + expect(args).to eql(name: 'failed_jobs', interval: 400) + expect(Jobs::Runtime::FailedJobsCleanup).to receive(:new).with(cutoff_age_in_days: 4, max_number_of_failed_delayed_jobs: 10).and_call_original + expect(block.call).to be_instance_of(Jobs::Runtime::FailedJobsCleanup) + end + schedule.start end