Skip to content

Commit

Permalink
More aggressive cleanup of failed delayed jobs
Browse files Browse the repository at this point in the history
Currently, failed delayed_jobs are deleted after 14d (configurable) to keep some info about failed jobs that helps debugging: https://github.com/cloudfoundry/cloud_controller_ng/blob/main/app/jobs/runtime/failed_jobs_cleanup.rb

This can still lead to very large number of delayed_jobs records that slow down DB queries working on delayed_jobs (also addressed by an index, ccng cloudfoundry#3324). Idea was to have an additional absolute limit of failed jobs so that they get deleted even before the 14d failed_jobs.cutoff_age_in_days. The limit should be configurable.
  • Loading branch information
kathap committed Jul 18, 2023
1 parent 1ae3490 commit 30f1975
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 13 deletions.
22 changes: 19 additions & 3 deletions app/jobs/runtime/failed_jobs_cleanup.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@ module VCAP::CloudController
module Jobs
module Runtime
class FailedJobsCleanup < VCAP::CloudController::Jobs::CCJob
attr_accessor :cutoff_age_in_days
attr_accessor :cutoff_age_in_days, :max_number_of_failed_delayed_jobs

def initialize(cutoff_age_in_days)
def initialize(cutoff_age_in_days, max_number_of_failed_delayed_jobs=nil)
@cutoff_age_in_days = cutoff_age_in_days
@max_number_of_failed_delayed_jobs = max_number_of_failed_delayed_jobs
end

def perform
Expand All @@ -15,9 +16,24 @@ def perform
where(Sequel.lit("run_at < CURRENT_TIMESTAMP - INTERVAL '?' DAY", cutoff_age_in_days.to_i))

logger = Steno.logger('cc.background')
logger.info("Cleaning up #{old_delayed_jobs.count} Failed Delayed Jobs")
logger.info("Cleaning up #{old_delayed_jobs.count} old Failed Delayed Jobs and potentially some more
Failed Delayed Jobs (the newer ones will be kept) to not exceed max_number_of_failed_delayed_jobs")

old_delayed_jobs.delete

unless max_number_of_failed_delayed_jobs.nil?
ids_from_failed_jobs_that_are_too_much = Delayed::Job.
where(Sequel.lit('failed_at is not null')).
order(Sequel.desc(:id)).
offset(max_number_of_failed_delayed_jobs.to_i).
# Mysql handles offset differently, therefore using a large
# number to mimic unlimited offset
limit(1844674407370955161).
select(:id)

Delayed::Job.where(id: ids_from_failed_jobs_that_are_too_much).delete
end
logger.info('Cleaning up ')
end

def job_name_in_configuration
Expand Down
3 changes: 3 additions & 0 deletions config/cloud_controller.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ audit_events:

failed_jobs:
cutoff_age_in_days: 31
max_number_of_failed_delayed_jobs: 100000
frequency_in_seconds: 144000 #4h
expiration_in_seconds: 144000

completed_tasks:
cutoff_age_in_days: 31
Expand Down
2 changes: 1 addition & 1 deletion lib/cloud_controller/clock/scheduler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ class Scheduler
CLEANUPS = [
{ name: 'app_usage_events', class: Jobs::Runtime::AppUsageEventsCleanup, time: '18:00', arg_from_config: [:app_usage_events, :cutoff_age_in_days] },
{ name: 'audit_events', class: Jobs::Runtime::EventsCleanup, time: '20:00', arg_from_config: [:audit_events, :cutoff_age_in_days] },
{ name: 'failed_jobs', class: Jobs::Runtime::FailedJobsCleanup, time: '21:00', arg_from_config: [:failed_jobs, :cutoff_age_in_days] },
{ name: 'service_usage_events', class: Jobs::Services::ServiceUsageEventsCleanup, time: '22:00', arg_from_config: [:service_usage_events, :cutoff_age_in_days] },
{ name: 'completed_tasks', class: Jobs::Runtime::PruneCompletedTasks, time: '23:00', arg_from_config: [:completed_tasks, :cutoff_age_in_days] },
{ name: 'expired_blob_cleanup', class: Jobs::Runtime::ExpiredBlobCleanup, time: '00:00' },
Expand All @@ -23,6 +22,7 @@ class Scheduler
FREQUENTS = [
{ name: 'pending_droplets', class: Jobs::Runtime::PendingDropletCleanup },
{ name: 'pending_builds', class: Jobs::Runtime::PendingBuildCleanup },
{ name: 'failed_jobs', class: Jobs::Runtime::FailedJobsCleanup, arg_from_config: [:failed_jobs, :cutoff_age_in_days, :max_number_of_failed_delayed_jobs, :frequency_in_seconds] }
].freeze

def initialize(config)
Expand Down
5 changes: 4 additions & 1 deletion lib/cloud_controller/config_schemas/base/clock_schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ class ClockSchema < VCAP::Config
cutoff_age_in_days: Integer
},
failed_jobs: {
cutoff_age_in_days: Integer
cutoff_age_in_days: Integer,
max_number_of_failed_delayed_jobs: Integer,
expiration_in_seconds: Integer,
frequency_in_seconds: Integer,
},
completed_tasks: {
cutoff_age_in_days: Integer
Expand Down
3 changes: 3 additions & 0 deletions spec/fixtures/config/port_8181_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ audit_events:

failed_jobs:
cutoff_age_in_days: 31
max_number_of_failed_delayed_jobs: 100000
frequency_in_seconds: 144000 #4h
expiration_in_seconds: 144000

completed_tasks:
cutoff_age_in_days: 31
Expand Down
26 changes: 25 additions & 1 deletion spec/unit/jobs/runtime/failed_jobs_cleanup_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@ def max_attempts

RSpec.describe FailedJobsCleanup, job_context: :worker do
let(:cutoff_age_in_days) { 2 }
let(:max_number_of_failed_delayed_jobs) { 2 }
let(:worker) { Delayed::Worker.new }

subject(:cleanup_job) { FailedJobsCleanup.new(cutoff_age_in_days) }
subject(:cleanup_job) { FailedJobsCleanup.new(cutoff_age_in_days, max_number_of_failed_delayed_jobs) }

it { is_expected.to be_a_valid_job }

Expand Down Expand Up @@ -75,6 +76,29 @@ def max_attempts
}.from(@delayed_job).to(nil)
end
end
context 'when the number of delayed jobs exceeds max_number_of_failed_delayed_jobs' do
let(:run_at) { Time.now.utc }
let(:the_job) { FailingJob.new }
let(:the_job2) { SuccessJob.new }

it 'removes the exceeding jobs' do
Delayed::Worker.destroy_failed_jobs = false
@delayed_job2 = Delayed::Job.enqueue(the_job, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day))
@delayed_job3 = Delayed::Job.enqueue(the_job, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day))
@delayed_job4 = Delayed::Job.enqueue(the_job, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day))
@delayed_job5 = Delayed::Job.enqueue(the_job2, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day))
@delayed_job6 = Delayed::Job.enqueue(the_job2, run_at: run_at, queue: worker.name, created_at: (Time.now.utc - 1.day))
worker.work_off 5

expect {
cleanup_job.perform
}.to change {
Delayed::Job.count
}.by(-2)
expect(Delayed::Job.find(id: @delayed_job.id)).to be_nil
expect(Delayed::Job.find(id: @delayed_job2.id)).to be_nil
end
end
end
end
end
Expand Down
14 changes: 7 additions & 7 deletions spec/unit/lib/cloud_controller/clock/scheduler_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ module VCAP::CloudController
},
app_usage_events: { cutoff_age_in_days: 1, },
audit_events: { cutoff_age_in_days: 3, },
failed_jobs: { cutoff_age_in_days: 4, },
failed_jobs: { cutoff_age_in_days: 4, max_number_of_failed_delayed_jobs: 10, frequency_in_seconds: 400, expiration_in_seconds: 700, },
service_usage_events: { cutoff_age_in_days: 5, },
completed_tasks: { cutoff_age_in_days: 6, },
pending_droplets: { frequency_in_seconds: 300, expiration_in_seconds: 600 },
Expand Down Expand Up @@ -69,12 +69,6 @@ module VCAP::CloudController
expect(block.call).to be_instance_of(Jobs::Runtime::EventsCleanup)
end

expect(clock).to receive(:schedule_daily_job) do |args, &block|
expect(args).to eql(name: 'failed_jobs', at: '21:00', priority: 0)
expect(Jobs::Runtime::FailedJobsCleanup).to receive(:new).with(4).and_call_original
expect(block.call).to be_instance_of(Jobs::Runtime::FailedJobsCleanup)
end

expect(clock).to receive(:schedule_daily_job) do |args, &block|
expect(args).to eql(name: 'service_usage_events', at: '22:00', priority: 0)
expect(Jobs::Services::ServiceUsageEventsCleanup).to receive(:new).with(5).and_call_original
Expand Down Expand Up @@ -153,6 +147,12 @@ module VCAP::CloudController
expect(block.call).to be_instance_of(Jobs::Runtime::PendingBuildCleanup)
end

expect(clock).to receive(:schedule_frequent_worker_job) do |args, &block|
expect(args).to eql(name: 'failed_jobs', interval: 400)
expect(Jobs::Runtime::FailedJobsCleanup).to receive(:new).with(700).and_call_original
expect(block.call).to be_instance_of(Jobs::Runtime::FailedJobsCleanup)
end

schedule.start
end

Expand Down

0 comments on commit 30f1975

Please sign in to comment.