Skip to content

Commit

Permalink
Revert "Add metrics for other unhealthy VM states"
Browse files Browse the repository at this point in the history
  • Loading branch information
jpalermo authored Feb 3, 2025
1 parent d114816 commit fd05a98
Show file tree
Hide file tree
Showing 12 changed files with 61 additions and 429 deletions.
28 changes: 10 additions & 18 deletions src/bosh-director/lib/bosh/director/metrics_collector.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,6 @@ def initialize(config)
labels: %i[name],
docstring: 'Number of unresponsive agents per deployment',
)

@unhealthy_instances = Prometheus::Client.registry.gauge(
:bosh_unhealthy_instances,
labels: %i[name],
docstring: 'Number of unhealthy instances per deployment',
)

@scheduler = Rufus::Scheduler.new
end

Expand Down Expand Up @@ -116,33 +109,32 @@ def populate_metrics

populate_network_metrics

populate_vm_metrics('/unresponsive_agents', @unresponsive_agents)
populate_vm_metrics('/unhealthy_instances', @unhealthy_instances)
populate_vm_metrics

@logger.info('populated metrics')
end

def populate_vm_metrics(metric, metric_call)
response = Net::HTTP.get_response('127.0.0.1', "#{metric}", @config.health_monitor_port)
def populate_vm_metrics
response = Net::HTTP.get_response('127.0.0.1', '/unresponsive_agents', @config.health_monitor_port)
return unless response.is_a?(Net::HTTPSuccess)

metric_counts = JSON.parse(response.body)
return unless metric_counts.is_a?(Hash)
unresponsive_agent_counts = JSON.parse(response.body)
return unless unresponsive_agent_counts.is_a?(Hash)

existing_deployment_names = metric_call.values.map do |key, _|
existing_deployment_names = @unresponsive_agents.values.map do |key, _|
# The keys within the Prometheus::Client::Metric#values method are actually hashes. So the
# data returned from values looks like:
# { { name: "deployment_a"} => 10, { name: "deployment_b "} => 0, ... }
key[:name]
end

metric_counts.each do |deployment, count|
metric_call.set(count, labels: { name: deployment })
unresponsive_agent_counts.each do |deployment, count|
@unresponsive_agents.set(count, labels: { name: deployment })
end

removed_deployments = existing_deployment_names - metric_counts.keys
removed_deployments = existing_deployment_names - unresponsive_agent_counts.keys
removed_deployments.each do |deployment|
metric_call.set(0, labels: { name: deployment })
@unresponsive_agents.set(0, labels: { name: deployment })
end
end

Expand Down
125 changes: 34 additions & 91 deletions src/bosh-director/spec/unit/metrics_collector_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@ def tick
allow(Api::ConfigManager).to receive(:deploy_config_enabled?).and_return(true, false)
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
stub_request(:get, /unhealthy_instances/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
end

after do
Expand All @@ -46,7 +44,6 @@ def tick
Prometheus::Client.registry.unregister(:bosh_networks_dynamic_ips_total)
Prometheus::Client.registry.unregister(:bosh_networks_dynamic_free_ips_total)
Prometheus::Client.registry.unregister(:bosh_unresponsive_agents)
Prometheus::Client.registry.unregister(:bosh_unhealthy_instances)
end

describe '#prep' do
Expand Down Expand Up @@ -289,107 +286,53 @@ def tick
end

describe 'vm metrics' do
context 'unresponsive agents' do
it 'emits the number of unresponsive agents for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
end

context 'when the health monitor returns a non 200 response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 404)
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
end
end

context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 200, body: JSON.dump('bad response'))
end
it 'emits the number of unresponsive agents for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
end
context 'when the health monitor returns a non 200 response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 404)
end

context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
metrics_collector.start

stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
scheduler.tick
end

it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
end
it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
end
end

context 'unhealthy instances' do
it 'emits the number of unhealthy instances for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 200, body: JSON.dump('bad response'))
end

context 'when the health monitor returns a non 200 response' do
before do
stub_request(:get, '127.0.0.1:12345/unhealthy_instances')
.to_return(status: 404)
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.values).to be_empty
end
it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
end
end

context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unhealthy_instances')
.to_return(status: 200, body: JSON.dump('bad response'))
end
context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
metrics_collector.start

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.values).to be_empty
end
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
scheduler.tick
end

context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unhealthy_instances/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
metrics_collector.start

stub_request(:get, /unhealthy_instances/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
scheduler.tick
end

it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
end
it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
end
end
end
Expand Down
10 changes: 1 addition & 9 deletions src/bosh-monitor/lib/bosh/monitor/agent.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ class Agent
attr_reader :discovered_at
attr_accessor :updated_at

ATTRIBUTES = %i[deployment job index instance_id cid job_state has_processes].freeze
ATTRIBUTES = %i[deployment job index instance_id cid].freeze

ATTRIBUTES.each do |attribute|
attr_accessor attribute
Expand All @@ -24,8 +24,6 @@ def initialize(id, opts = {})
@index = opts[:index]
@cid = opts[:cid]
@instance_id = opts[:instance_id]
@job_state = opts[:job_state]
@has_processes = opts[:has_processes]
end

def name
Expand Down Expand Up @@ -53,17 +51,11 @@ def rogue?
(Time.now - @discovered_at) > @intervals.rogue_agent_alert && @deployment.nil?
end

# True when the agent's last reported job state is anything other than
# 'running', or when it explicitly reported having no running processes.
# Note: a nil @has_processes (never reported) does NOT count as unhealthy;
# only an explicit false does.
def is_not_running?
  missing_processes = @has_processes == false
  missing_processes || @job_state.to_s != 'running'
end

def update_instance(instance)
@job = instance.job
@index = instance.index
@cid = instance.cid
@instance_id = instance.id
@job_state = instance.job_state
@has_processes = instance.has_processes
end
end
end
8 changes: 0 additions & 8 deletions src/bosh-monitor/lib/bosh/monitor/api_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,5 @@ def initialize(heartbeat_interval = 1)
status(503)
end
end

get '/unhealthy_instances' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.unhealthy_instances)
else
status(503)
end
end
end
end
56 changes: 4 additions & 52 deletions src/bosh-monitor/lib/bosh/monitor/director.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,52 +31,6 @@ def get_deployment_instances(name)
parse_json(body, Array)
end

def get_deployment_instances_full(name, recursive_counter = 0)
  # Kicks off an asynchronous "full format" instances fetch for a deployment,
  # polls the resulting director task until it reaches a terminal state (or the
  # retry budget is spent), then downloads and parses the task result.
  #
  # @param name [String] deployment name
  # @param recursive_counter [Integer] internal retry depth; 0 on the first call
  # @return [Array] parsed instance details when the task finishes successfully;
  #   on repeated failure returns [nil, state] (legacy multi-value return,
  #   preserved for existing callers)
  # @raise [DirectorError] when the 'location' response header is missing or the
  #   task result cannot be fetched
  _body, _status, headers = perform_request(:get, "/deployments/#{name}/instances?format=full")
  location = headers['location']
  if location.nil? || !location.include?('task')
    raise DirectorError, "Can not find 'location' response header to retrieve the task location"
  end

  # Task states are documented here: https://bosh.io/docs/director-api-v1/#list-tasks
  # Terminal states mean the task will make no further progress; 'queued' and
  # 'processing' mean we should keep polling.
  terminal_states = %w(cancelled cancelling done error timeout)

  # BUGFIX: the original `body = nil, status = nil, state = nil` was a single
  # multiple-assignment that set body to [nil, nil, nil]; declare each local
  # separately instead.
  sleep_amount_seconds = 1
  counter = 0
  state = nil
  loop do
    counter += 1
    body, status = perform_request(:get, location)
    if status == 206 || body.nil? || body.empty?
      # Partial/empty response: back off with a linearly growing delay.
      sleep_amount_seconds += counter
      sleep(sleep_amount_seconds)
      next
    end
    state = parse_json(body, Hash)['state']
    # BUGFIX: only log the "retries exceeded" warning when the budget is
    # actually spent — the original also logged it on every normal terminal
    # state (including 'done').
    break if terminal_states.include?(state)
    if counter > 5
      @logger.warn("The number of retries to fetch instance details for deployment '#{name}' has exceeded. Could not get the expected response from '#{location}'")
      break
    end
    sleep_amount_seconds += counter
    sleep(sleep_amount_seconds)
  end

  updated_body = nil
  if state == 'done'
    body, status = perform_request(:get, "#{location}/output?type=result")
    if status != 200 || body.nil? || body.empty?
      raise DirectorError, "Fetching full instance details for deployment '#{name}' failed"
    end
    # The task result is newline-delimited JSON; join the records into a
    # single JSON array before parsing.
    updated_body = "[#{body.chomp.gsub(/\R+/, ',')}]"
    parse_json(updated_body, Array)
  elsif recursive_counter > 0
    # Already retried once; give up and surface the terminal state to the caller.
    return updated_body, state
  else
    @logger.warn("Could not fetch instance details for deployment '#{name}' in the first attempt, retrying once more ...")
    get_deployment_instances_full(name, recursive_counter + 1)
  end
end

private

def endpoint
Expand All @@ -96,12 +50,10 @@ def parse_json(json, expected_type = nil)
end

def perform_request(method, request_path, options = {})
if !request_path.nil?
request_path = request_path.sub(endpoint, '')
end
parsed_endpoint = URI.parse(endpoint + (request_path || ''))
headers = options['headers'] || {}
parsed_endpoint = URI.parse(endpoint + request_path)
headers = {}
headers['authorization'] = auth_provider.auth_header unless options.fetch(:no_login, false)

ssl_context = OpenSSL::SSL::SSLContext.new
ssl_context.set_params(verify_mode: OpenSSL::SSL::VERIFY_NONE)
async_endpoint = Async::HTTP::Endpoint.parse(parsed_endpoint.to_s, ssl_context: ssl_context)
Expand All @@ -110,7 +62,7 @@ def perform_request(method, request_path, options = {})
body = response.read
status = response.status

[body, status, response.headers]
[body, status]
rescue URI::Error
raise DirectorError, "Invalid URI: #{endpoint + request_path}"
rescue => e
Expand Down
10 changes: 4 additions & 6 deletions src/bosh-monitor/lib/bosh/monitor/instance.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module Bosh::Monitor
class Instance
attr_reader :id, :agent_id, :job, :index, :cid, :expects_vm, :job_state, :has_processes
attr_accessor :deployment, :job_state
attr_reader :id, :agent_id, :job, :index, :cid, :expects_vm
attr_accessor :deployment

def initialize(instance_data)
@logger = Bosh::Monitor.logger
Expand All @@ -11,8 +11,6 @@ def initialize(instance_data)
@index = instance_data['index']
@cid = instance_data['cid']
@expects_vm = instance_data['expects_vm']
@job_state = instance_data['job_state']
@has_processes = instance_data['has_processes']
end

def self.create(instance_data)
Expand All @@ -32,11 +30,11 @@ def self.create(instance_data)
def name
if @job
identifier = "#{@job}(#{@id})"
attributes = create_optional_attributes(%i[agent_id index job_state has_processes])
attributes = create_optional_attributes(%i[agent_id index])
attributes += create_mandatory_attributes([:cid])
else
identifier = "instance #{@id}"
attributes = create_optional_attributes(%i[agent_id job index cid expects_vm job_state has_processes])
attributes = create_optional_attributes(%i[agent_id job index cid expects_vm])
end

"#{@deployment}: #{identifier} [#{attributes.join(', ')}]"
Expand Down
Loading

0 comments on commit fd05a98

Please sign in to comment.