Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement exponential backoff for SB operations (CC v3 API) #2667

Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions app/jobs/reoccurring_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
module VCAP::CloudController
module Jobs
class ReoccurringJob < VCAP::CloudController::Jobs::CCJob
attr_reader :finished, :start_time
attr_reader :finished, :start_time, :retry_number

def success(current_delayed_job)
pollable_job = PollableJobModel.find_by_delayed_job(current_delayed_job)
Expand Down Expand Up @@ -49,6 +49,7 @@ def polling_interval_seconds=(interval)
# Sets up the bookkeeping state of a freshly created job: no retries have
# happened yet, the job is not finished, and the start time is "now".
def initialize
  @retry_number = 0
  @finished = false
  @start_time = Time.now
end

def default_maximum_duration_seconds
Expand All @@ -59,8 +60,16 @@ def default_polling_interval_seconds
Config.config.get(:broker_client_default_async_poll_interval_seconds)
end

# Returns the exponential backoff rate for asynchronous broker polling as
# stored under :broker_client_async_poll_exponential_backoff_rate in the
# configuration.
def default_polling_exponential_backoff
  backoff_rate_key = :broker_client_async_poll_exponential_backoff_rate
  Config.config.get(backoff_rate_key)
end

# Delay until the next poll: the polling interval multiplied by the
# backoff rate raised to the number of retries so far
# (interval * rate**retry_number).
def next_execution_in
  base_interval = polling_interval_seconds
  backoff_multiplier = default_polling_exponential_backoff**retry_number
  base_interval * backoff_multiplier
end

# Returns true when waiting for the next poll would push the job past its
# deadline, i.e. when now + delay is later than
# start_time + maximum_duration_seconds.
# NOTE(review): two comparisons are shown here, one using
# polling_interval_seconds and one using next_execution_in; they look like
# the before/after lines of this change. Only the last expression is the
# method's return value.
def next_enqueue_would_exceed_maximum_duration?
Time.now + polling_interval_seconds > start_time + maximum_duration_seconds
Time.now + next_execution_in > start_time + maximum_duration_seconds
end

def finish
Expand All @@ -75,9 +84,10 @@ def expire!
# Re-enqueues this job on the generic queue for its next poll, reusing the
# guid of the given pollable job.
#
# pollable_job - the record whose guid is passed as existing_guid.
#
# NOTE(review): two run_at entries are shown, one based on
# polling_interval_seconds and one on next_execution_in; they look like the
# before/after lines of this change — confirm which one is in effect.
def enqueue_next_job(pollable_job)
opts = {
queue: Jobs::Queues.generic,
run_at: Delayed::Job.db_time_now + polling_interval_seconds
run_at: Delayed::Job.db_time_now + next_execution_in
}

# The counter is bumped after run_at has been computed and before the job
# is handed to the enqueuer, so the delay above uses the pre-increment value.
@retry_number += 1
Jobs::Enqueuer.new(self, opts).enqueue_pollable(existing_guid: pollable_job.guid)
end
end
Expand Down
1 change: 1 addition & 0 deletions config/cloud_controller.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ max_retained_revisions_per_app: 100

broker_client_default_async_poll_interval_seconds: 60
broker_client_max_async_poll_duration_minutes: 10080
broker_client_async_poll_exponential_backoff_rate: 1.0

shared_isolation_segment_name: 'shared'

Expand Down
1 change: 1 addition & 0 deletions lib/cloud_controller/config_schemas/base/api_schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ class ApiSchema < VCAP::Config
broker_client_timeout_seconds: Integer,
broker_client_default_async_poll_interval_seconds: Integer,
broker_client_max_async_poll_duration_minutes: Integer,
broker_client_async_poll_exponential_backoff_rate: Float,
optional(:uaa_client_name) => String,
optional(:uaa_client_secret) => String,
optional(:uaa_client_scope) => String,
Expand Down
1 change: 1 addition & 0 deletions lib/cloud_controller/config_schemas/base/worker_schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ class WorkerSchema < VCAP::Config
broker_client_timeout_seconds: Integer,
broker_client_default_async_poll_interval_seconds: Integer,
broker_client_max_async_poll_duration_minutes: Integer,
broker_client_async_poll_exponential_backoff_rate: Float,
optional(:uaa_client_name) => String,
optional(:uaa_client_secret) => String,
optional(:uaa_client_scope) => String,
Expand Down
142 changes: 142 additions & 0 deletions spec/unit/jobs/reoccurring_job_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,148 @@ def perform
expect(job.polling_interval_seconds).to eq(24.hours)
end

context 'exponential backoff rate' do
context 'updates the polling interval' do
# With a backoff rate of 2.0 the wait between polls is expected to double
# after each run.
# NOTE(review): the 59s/61s checks assume the test config's default poll
# interval is 60s — confirm.
it 'when changing exponential backoff rate only' do
TestConfig.config[:broker_client_async_poll_exponential_backoff_rate] = 2.0
job = FakeJob.new

enqueued_time = Time.now

# initial run right after enqueueing
Jobs::Enqueuer.new(job, queue: Jobs::Queues.generic).enqueue_pollable
execute_all_jobs(expected_successes: 1, expected_failures: 0)

# first re-poll (interval * 2.0^0): nothing is due yet at 59s ...
Timecop.freeze(59.seconds.after(enqueued_time)) do
execute_all_jobs(expected_successes: 0, expected_failures: 0)
end

# ... but the job runs at 61s; the reference time is reset to this run
Timecop.freeze(61.seconds.after(enqueued_time)) do
enqueued_time = Time.now
execute_all_jobs(expected_successes: 1, expected_failures: 0)
end

# second re-poll (interval * 2.0^1): nothing is due yet at 119s ...
Timecop.freeze(119.seconds.after(enqueued_time)) do
execute_all_jobs(expected_successes: 0, expected_failures: 0)
end

# ... but the job runs at 121s
Timecop.freeze(121.seconds.after(enqueued_time)) do
execute_all_jobs(expected_successes: 1, expected_failures: 0)
end
end
# Combines a backoff rate of 1.3 with a default poll interval of 10s:
# the expected waits are 10s * 1.3^0 = 10s and then 10s * 1.3^1 = 13s.
it 'when changing exponential backoff rate and default polling interval' do
TestConfig.config[:broker_client_async_poll_exponential_backoff_rate] = 1.3
TestConfig.config[:broker_client_default_async_poll_interval_seconds] = 10

job = FakeJob.new

enqueued_time = Time.now

# initial run right after enqueueing
Jobs::Enqueuer.new(job, queue: Jobs::Queues.generic).enqueue_pollable
execute_all_jobs(expected_successes: 1, expected_failures: 0)

# first re-poll is due after 10s: nothing runs at 9s ...
Timecop.freeze(9.seconds.after(enqueued_time)) do
execute_all_jobs(expected_successes: 0, expected_failures: 0)
end

# ... but the job runs at 11s; the reference time is reset to this run
Timecop.freeze(11.seconds.after(enqueued_time)) do
enqueued_time = Time.now
execute_all_jobs(expected_successes: 1, expected_failures: 0)
end

# second re-poll is due after 13s: nothing runs at 12s ...
Timecop.freeze(12.seconds.after(enqueued_time)) do
execute_all_jobs(expected_successes: 0, expected_failures: 0)
end

# ... but the job runs at 14s
Timecop.freeze(14.seconds.after(enqueued_time)) do
execute_all_jobs(expected_successes: 1, expected_failures: 0)
end
end
# Checks that the backoff rate is applied on top of the job's own
# retry_after values rather than the configured default interval.
# NOTE(review): FakeJob presumably uses the retry_after entries ('20', then
# '30') as its polling interval on successive runs — verify against FakeJob.
it 'when changing exponential backoff rate and retry_after from the job' do
job = FakeJob.new(retry_after: ['20', '30'])
TestConfig.config[:broker_client_async_poll_exponential_backoff_rate] = 1.3
TestConfig.config[:broker_client_default_async_poll_interval_seconds] = 10

enqueued_time = Time.now

# initial run right after enqueueing
Jobs::Enqueuer.new(job, queue: Jobs::Queues.generic).enqueue_pollable
execute_all_jobs(expected_successes: 1, expected_failures: 0)

# the job should run after 20s * 1.3^0 = 20 seconds, so not yet at 19s
Timecop.freeze(19.seconds.after(enqueued_time)) do
execute_all_jobs(expected_successes: 0, expected_failures: 0)
end

# it runs at 22s; the reference time is reset to this run
Timecop.freeze(22.seconds.after(enqueued_time)) do
enqueued_time = Time.now
execute_all_jobs(expected_successes: 1, expected_failures: 0)
end

# the job should run after 30s * 1.3^1 = 39 seconds
Timecop.freeze(38.seconds.after(enqueued_time)) do
execute_all_jobs(expected_successes: 0, expected_failures: 0)
end

Timecop.freeze(40.seconds.after(enqueued_time)) do
execute_all_jobs(expected_successes: 1, expected_failures: 0)
end
end
end

# Regression guard: with the backoff rate left at its configured default,
# the number of polls that fit into a 60 minute maximum duration should be
# unchanged, and the job should then fail with a timeout error.
it 'does not reduce the retries with the default exponential backoff rate' do
TestConfig.config[:broker_client_max_async_poll_duration_minutes] = 60
Jobs::Enqueuer.new(FakeJob.new(iterations: 100), queue: Jobs::Queues.generic).enqueue_pollable

# it is expected that the job runs 58 times within 60 minutes with the
# default exponential backoff rate of 1.0 and the default poll interval of 60s
58.times do |count|
# expected delay for this attempt: interval * rate**count, read from the config
retry_after = VCAP::CloudController::Config.config.get(:broker_client_default_async_poll_interval_seconds) *
VCAP::CloudController::Config.config.get(:broker_client_async_poll_exponential_backoff_rate)**count
# move one second past that delay before running the job
Timecop.travel(retry_after.seconds + 1)
execute_all_jobs(expected_successes: 1, expected_failures: 0)
end

# the 59th run will then time out
retry_after = VCAP::CloudController::Config.config.get(:broker_client_default_async_poll_interval_seconds) *
VCAP::CloudController::Config.config.get(:broker_client_async_poll_exponential_backoff_rate)**58 + 1
Timecop.freeze(Time.now + retry_after) do
execute_all_jobs(expected_successes: 0, expected_failures: 1, jobs_to_execute: 1)
# the pollable job must be marked FAILED and carry the timeout API error
expect(PollableJobModel.first.state).to eq('FAILED')
expect(PollableJobModel.first.cf_api_error).not_to be_nil
error = YAML.safe_load(PollableJobModel.first.cf_api_error)
expect(error['errors'].first['code']).to eq(290006)
expect(error['errors'].first['detail']).
to eq('The job execution has timed out.')
end
end

# With a backoff rate above 1.0 the delays grow on every attempt, so fewer
# polls fit into the 60 minute maximum duration before the job times out.
it 'reduces the retries when the exponential backoff rate is set higher than 1.0' do
TestConfig.config[:broker_client_async_poll_exponential_backoff_rate] = 1.3
TestConfig.config[:broker_client_max_async_poll_duration_minutes] = 60
Jobs::Enqueuer.new(FakeJob.new(iterations: 100), queue: Jobs::Queues.generic).enqueue_pollable

# it is expected that the job runs 10 times within 60 minutes with a configured
# exponential backoff rate of 1.3 and a default poll interval of 60s
10.times do |count|
# expected delay for this attempt: interval * rate**count, read from the config
retry_after = VCAP::CloudController::Config.config.get(:broker_client_default_async_poll_interval_seconds) *
VCAP::CloudController::Config.config.get(:broker_client_async_poll_exponential_backoff_rate)**count
# move one second past that delay before running the job
Timecop.travel(retry_after.seconds + 1)
execute_all_jobs(expected_successes: 1, expected_failures: 0)
end

# the 11th run will then time out
retry_after = VCAP::CloudController::Config.config.get(:broker_client_default_async_poll_interval_seconds) *
VCAP::CloudController::Config.config.get(:broker_client_async_poll_exponential_backoff_rate)**10 + 1
Timecop.freeze(Time.now + retry_after) do
execute_all_jobs(expected_successes: 0, expected_failures: 1, jobs_to_execute: 1)
# the pollable job must be marked FAILED and carry the timeout API error
expect(PollableJobModel.first.state).to eq('FAILED')
expect(PollableJobModel.first.cf_api_error).not_to be_nil
error = YAML.safe_load(PollableJobModel.first.cf_api_error)
expect(error['errors'].first['code']).to eq(290006)
expect(error['errors'].first['detail']).
to eq('The job execution has timed out.')
end
end
end

context 'updates the polling interval if config changes' do
it 'when changed from the job only' do
job = FakeJob.new(retry_after: ['20', '30'])
Expand Down