Skip to content

Commit

Permalink
More aggressive cleanup of failed delayed jobs
Browse files Browse the repository at this point in the history
Currently, failed delayed_jobs are deleted after 14d (configurable) to keep some info about failed jobs that helps debugging: https://github.com/cloudfoundry/cloud_controller_ng/blob/main/app/jobs/runtime/failed_jobs_cleanup.rb

This can still lead to very large number of delayed_jobs records that slow down DB queries working on delayed_jobs (also addressed by an index, ccng cloudfoundry#3324). Idea was to have an additional absolute limit of failed jobs so that they get deleted even before the 14d failed_jobs.cutoff_age_in_days. The limit should be configurable.
  • Loading branch information
kathap committed Jul 20, 2023
1 parent 1ae3490 commit fa6fd3f
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 15 deletions.
23 changes: 20 additions & 3 deletions app/jobs/runtime/failed_jobs_cleanup.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@ module VCAP::CloudController
module Jobs
module Runtime
class FailedJobsCleanup < VCAP::CloudController::Jobs::CCJob
attr_accessor :cutoff_age_in_days
attr_accessor :cutoff_age_in_days, :max_number_of_failed_delayed_jobs

def initialize(cutoff_age_in_days)
def initialize(cutoff_age_in_days:, max_number_of_failed_delayed_jobs: nil)
@cutoff_age_in_days = cutoff_age_in_days
@max_number_of_failed_delayed_jobs = max_number_of_failed_delayed_jobs
end

def perform
Expand All @@ -15,9 +16,25 @@ def perform
where(Sequel.lit("run_at < CURRENT_TIMESTAMP - INTERVAL '?' DAY", cutoff_age_in_days.to_i))

logger = Steno.logger('cc.background')
logger.info("Cleaning up #{old_delayed_jobs.count} Failed Delayed Jobs")
logger.info("Cleaning up #{old_delayed_jobs.count} old Failed Delayed Jobs and potentially some more
Failed Delayed Jobs (the newer ones will be kept) to not exceed max_number_of_failed_delayed_jobs")

old_delayed_jobs.delete
logger.info("max_number_of_failed_delayed_jobs = #{max_number_of_failed_delayed_jobs}")
unless max_number_of_failed_delayed_jobs.nil?
logger.info("Deleting too many failed_jobs")
ids_from_failed_jobs_that_are_too_much = Delayed::Job.
where(Sequel.lit('failed_at is not null')).
order(Sequel.desc(:id)).
offset(max_number_of_failed_delayed_jobs.to_i).
# Mysql handles offset differently, therefore using a large
# number to mimic unlimited offset
limit(1844674407370955161).
select(:id)

Delayed::Job.where(id: ids_from_failed_jobs_that_are_too_much).delete
end
logger.info('Cleaning up ')
end

def job_name_in_configuration
Expand Down
3 changes: 3 additions & 0 deletions config/cloud_controller.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ audit_events:

failed_jobs:
cutoff_age_in_days: 31
max_number_of_failed_delayed_jobs: 100000
frequency_in_seconds: 144000 #4h
expiration_in_seconds: 144000

completed_tasks:
cutoff_age_in_days: 31
Expand Down
11 changes: 9 additions & 2 deletions lib/cloud_controller/clock/scheduler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ class Scheduler
CLEANUPS = [
{ name: 'app_usage_events', class: Jobs::Runtime::AppUsageEventsCleanup, time: '18:00', arg_from_config: [:app_usage_events, :cutoff_age_in_days] },
{ name: 'audit_events', class: Jobs::Runtime::EventsCleanup, time: '20:00', arg_from_config: [:audit_events, :cutoff_age_in_days] },
{ name: 'failed_jobs', class: Jobs::Runtime::FailedJobsCleanup, time: '21:00', arg_from_config: [:failed_jobs, :cutoff_age_in_days] },
{ name: 'service_usage_events', class: Jobs::Services::ServiceUsageEventsCleanup, time: '22:00', arg_from_config: [:service_usage_events, :cutoff_age_in_days] },
{ name: 'completed_tasks', class: Jobs::Runtime::PruneCompletedTasks, time: '23:00', arg_from_config: [:completed_tasks, :cutoff_age_in_days] },
{ name: 'expired_blob_cleanup', class: Jobs::Runtime::ExpiredBlobCleanup, time: '00:00' },
Expand All @@ -23,6 +22,10 @@ class Scheduler
FREQUENTS = [
{ name: 'pending_droplets', class: Jobs::Runtime::PendingDropletCleanup },
{ name: 'pending_builds', class: Jobs::Runtime::PendingBuildCleanup },
{ name: 'failed_jobs', class: Jobs::Runtime::FailedJobsCleanup, arg_from_config: [:failed_jobs,
:cutoff_age_in_days,
:max_number_of_failed_delayed_jobs,
:frequency_in_seconds] }
].freeze

def initialize(config)
Expand Down Expand Up @@ -69,7 +72,11 @@ def start_frequent_jobs
}
@clock.schedule_frequent_worker_job(**clock_opts) do
klass = job_config[:class]
klass.new(@config.get(job_config[:name].to_sym, :expiration_in_seconds))
if @config.get(job_config[:name].to_sym, :expiration_in_seconds).nil?
klass.new(**@config.get(job_config[:name].to_sym).reject { |k| [:frequency_in_seconds, :expiration_in_seconds].include?(k) })
else
klass.new(@config.get(job_config[:name].to_sym, :expiration_in_seconds))
end
end
end
end
Expand Down
5 changes: 4 additions & 1 deletion lib/cloud_controller/config_schemas/base/clock_schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ class ClockSchema < VCAP::Config
cutoff_age_in_days: Integer
},
failed_jobs: {
cutoff_age_in_days: Integer
cutoff_age_in_days: Integer,
max_number_of_failed_delayed_jobs: Integer,
expiration_in_seconds: Integer,
frequency_in_seconds: Integer,
},
completed_tasks: {
cutoff_age_in_days: Integer
Expand Down
3 changes: 3 additions & 0 deletions spec/fixtures/config/port_8181_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ audit_events:

failed_jobs:
cutoff_age_in_days: 31
max_number_of_failed_delayed_jobs: 100000
frequency_in_seconds: 144000 #4h
expiration_in_seconds: 144000

completed_tasks:
cutoff_age_in_days: 31
Expand Down
26 changes: 24 additions & 2 deletions spec/unit/jobs/runtime/failed_jobs_cleanup_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@ def max_attempts
end

RSpec.describe FailedJobsCleanup, job_context: :worker do
let(:cutoff_age_in_days) { 2 }
let(:worker) { Delayed::Worker.new }

subject(:cleanup_job) { FailedJobsCleanup.new(cutoff_age_in_days) }
subject(:cleanup_job) { FailedJobsCleanup.new(cutoff_age_in_days: 2, max_number_of_failed_delayed_jobs: 2) }

it { is_expected.to be_a_valid_job }

Expand Down Expand Up @@ -75,6 +74,29 @@ def max_attempts
}.from(@delayed_job).to(nil)
end
end
context 'when the number of delayed jobs exceeds max_number_of_failed_delayed_jobs' do
let(:run_at) { Time.now.utc }
let(:the_job) { FailingJob.new }
let(:the_job2) { SuccessJob.new }

it 'removes the exceeding jobs' do
Delayed::Worker.destroy_failed_jobs = false
@delayed_job2 = Delayed::Job.enqueue(the_job, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day))
@delayed_job3 = Delayed::Job.enqueue(the_job, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day))
@delayed_job4 = Delayed::Job.enqueue(the_job, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day))
@delayed_job5 = Delayed::Job.enqueue(the_job2, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day))
@delayed_job6 = Delayed::Job.enqueue(the_job2, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day))
worker.work_off 5

expect {
cleanup_job.perform
}.to change {
Delayed::Job.count
}.by(-2)
expect(Delayed::Job.find(id: @delayed_job.id)).to be_nil
expect(Delayed::Job.find(id: @delayed_job2.id)).to be_nil
end
end
end
end
end
Expand Down
14 changes: 7 additions & 7 deletions spec/unit/lib/cloud_controller/clock/scheduler_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ module VCAP::CloudController
},
app_usage_events: { cutoff_age_in_days: 1, },
audit_events: { cutoff_age_in_days: 3, },
failed_jobs: { cutoff_age_in_days: 4, },
failed_jobs: { frequency_in_seconds: 400, cutoff_age_in_days: 4, max_number_of_failed_delayed_jobs: 10 },
service_usage_events: { cutoff_age_in_days: 5, },
completed_tasks: { cutoff_age_in_days: 6, },
pending_droplets: { frequency_in_seconds: 300, expiration_in_seconds: 600 },
Expand Down Expand Up @@ -69,12 +69,6 @@ module VCAP::CloudController
expect(block.call).to be_instance_of(Jobs::Runtime::EventsCleanup)
end

expect(clock).to receive(:schedule_daily_job) do |args, &block|
expect(args).to eql(name: 'failed_jobs', at: '21:00', priority: 0)
expect(Jobs::Runtime::FailedJobsCleanup).to receive(:new).with(4).and_call_original
expect(block.call).to be_instance_of(Jobs::Runtime::FailedJobsCleanup)
end

expect(clock).to receive(:schedule_daily_job) do |args, &block|
expect(args).to eql(name: 'service_usage_events', at: '22:00', priority: 0)
expect(Jobs::Services::ServiceUsageEventsCleanup).to receive(:new).with(5).and_call_original
Expand Down Expand Up @@ -153,6 +147,12 @@ module VCAP::CloudController
expect(block.call).to be_instance_of(Jobs::Runtime::PendingBuildCleanup)
end

expect(clock).to receive(:schedule_frequent_worker_job) do |args, &block|
expect(args).to eql(name: 'failed_jobs', interval: 400)
expect(Jobs::Runtime::FailedJobsCleanup).to receive(:new).with(cutoff_age_in_days: 4, max_number_of_failed_delayed_jobs: 10).and_call_original
expect(block.call).to be_instance_of(Jobs::Runtime::FailedJobsCleanup)
end

schedule.start
end

Expand Down

0 comments on commit fa6fd3f

Please sign in to comment.