Skip to content

Commit

Permalink
Revert "Add metrics for other unhealthy VM states"
Browse files Browse the repository at this point in the history
  • Loading branch information
jpalermo authored Feb 3, 2025
1 parent d114816 commit fd05a98
Show file tree
Hide file tree
Showing 12 changed files with 61 additions and 429 deletions.
28 changes: 10 additions & 18 deletions src/bosh-director/lib/bosh/director/metrics_collector.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,6 @@ def initialize(config)
labels: %i[name],
docstring: 'Number of unresponsive agents per deployment',
)

@unhealthy_instances = Prometheus::Client.registry.gauge(
:bosh_unhealthy_instances,
labels: %i[name],
docstring: 'Number of unhealthy instances per deployment',
)

@scheduler = Rufus::Scheduler.new
end

Expand Down Expand Up @@ -116,33 +109,32 @@ def populate_metrics

populate_network_metrics

populate_vm_metrics('/unresponsive_agents', @unresponsive_agents)
populate_vm_metrics('/unhealthy_instances', @unhealthy_instances)
populate_vm_metrics

@logger.info('populated metrics')
end

def populate_vm_metrics(metric, metric_call)
response = Net::HTTP.get_response('127.0.0.1', "#{metric}", @config.health_monitor_port)
def populate_vm_metrics
response = Net::HTTP.get_response('127.0.0.1', '/unresponsive_agents', @config.health_monitor_port)
return unless response.is_a?(Net::HTTPSuccess)

metric_counts = JSON.parse(response.body)
return unless metric_counts.is_a?(Hash)
unresponsive_agent_counts = JSON.parse(response.body)
return unless unresponsive_agent_counts.is_a?(Hash)

existing_deployment_names = metric_call.values.map do |key, _|
existing_deployment_names = @unresponsive_agents.values.map do |key, _|
# The keys within the Prometheus::Client::Metric#values method are actually hashes. So the
# data returned from values looks like:
# { { name: "deployment_a"} => 10, { name: "deployment_b "} => 0, ... }
key[:name]
end

metric_counts.each do |deployment, count|
metric_call.set(count, labels: { name: deployment })
unresponsive_agent_counts.each do |deployment, count|
@unresponsive_agents.set(count, labels: { name: deployment })
end

removed_deployments = existing_deployment_names - metric_counts.keys
removed_deployments = existing_deployment_names - unresponsive_agent_counts.keys
removed_deployments.each do |deployment|
metric_call.set(0, labels: { name: deployment })
@unresponsive_agents.set(0, labels: { name: deployment })
end
end

Expand Down
125 changes: 34 additions & 91 deletions src/bosh-director/spec/unit/metrics_collector_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@ def tick
allow(Api::ConfigManager).to receive(:deploy_config_enabled?).and_return(true, false)
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
stub_request(:get, /unhealthy_instances/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
end

after do
Expand All @@ -46,7 +44,6 @@ def tick
Prometheus::Client.registry.unregister(:bosh_networks_dynamic_ips_total)
Prometheus::Client.registry.unregister(:bosh_networks_dynamic_free_ips_total)
Prometheus::Client.registry.unregister(:bosh_unresponsive_agents)
Prometheus::Client.registry.unregister(:bosh_unhealthy_instances)
end

describe '#prep' do
Expand Down Expand Up @@ -289,107 +286,53 @@ def tick
end

describe 'vm metrics' do
context 'unresponsive agents' do
it 'emits the number of unresponsive agents for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
end

context 'when the health monitor returns a non 200 response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 404)
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
end
end

context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 200, body: JSON.dump('bad response'))
end
it 'emits the number of unresponsive agents for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
end
context 'when the health monitor returns a non 200 response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 404)
end

context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
metrics_collector.start

stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
scheduler.tick
end

it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
end
it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
end
end

context 'unhealthy instances' do
it 'emits the number of unhealthy instances for each deployment' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(1)
expect(metric.get(labels: { name: 'good_deployment' })).to eq(0)
context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unresponsive_agents')
.to_return(status: 200, body: JSON.dump('bad response'))
end

context 'when the health monitor returns a non 200 response' do
before do
stub_request(:get, '127.0.0.1:12345/unhealthy_instances')
.to_return(status: 404)
end

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.values).to be_empty
end
it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.values).to be_empty
end
end

context 'when the health monitor returns a non-json response' do
before do
stub_request(:get, '127.0.0.1:12345/unhealthy_instances')
.to_return(status: 200, body: JSON.dump('bad response'))
end
context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
metrics_collector.start

it 'does not emit the vm metrics' do
metrics_collector.start
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.values).to be_empty
end
stub_request(:get, /unresponsive_agents/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
scheduler.tick
end

context 'when a deployment is deleted after metrics are gathered' do
before do
stub_request(:get, /unhealthy_instances/)
.to_return(status: 200, body: JSON.dump('flaky_deployment' => 1, 'good_deployment' => 0))
metrics_collector.start

stub_request(:get, /unhealthy_instances/)
.to_return(status: 200, body: JSON.dump('good_deployment' => 0))
scheduler.tick
end

it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unhealthy_instances)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
end
it 'resets the metrics for the deleted deployment' do
metric = Prometheus::Client.registry.get(:bosh_unresponsive_agents)
expect(metric.get(labels: { name: 'flaky_deployment' })).to eq(0)
end
end
end
Expand Down
10 changes: 1 addition & 9 deletions src/bosh-monitor/lib/bosh/monitor/agent.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ class Agent
attr_reader :discovered_at
attr_accessor :updated_at

ATTRIBUTES = %i[deployment job index instance_id cid job_state has_processes].freeze
ATTRIBUTES = %i[deployment job index instance_id cid].freeze

ATTRIBUTES.each do |attribute|
attr_accessor attribute
Expand All @@ -24,8 +24,6 @@ def initialize(id, opts = {})
@index = opts[:index]
@cid = opts[:cid]
@instance_id = opts[:instance_id]
@job_state = opts[:job_state]
@has_processes = opts[:has_processes]
end

def name
Expand Down Expand Up @@ -53,17 +51,11 @@ def rogue?
(Time.now - @discovered_at) > @intervals.rogue_agent_alert && @deployment.nil?
end

# True when the agent's last reported job state is anything other than
# 'running', or when it explicitly reported having no running processes.
# Note: a nil @has_processes (never reported) does NOT count as unhealthy;
# only an explicit false does.
def is_not_running?
  missing_processes = @has_processes == false
  missing_processes || @job_state.to_s != 'running'
end

def update_instance(instance)
@job = instance.job
@index = instance.index
@cid = instance.cid
@instance_id = instance.id
@job_state = instance.job_state
@has_processes = instance.has_processes
end
end
end
8 changes: 0 additions & 8 deletions src/bosh-monitor/lib/bosh/monitor/api_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,5 @@ def initialize(heartbeat_interval = 1)
status(503)
end
end

get '/unhealthy_instances' do
if @instance_manager.director_initial_deployment_sync_done
JSON.generate(@instance_manager.unhealthy_instances)
else
status(503)
end
end
end
end
56 changes: 4 additions & 52 deletions src/bosh-monitor/lib/bosh/monitor/director.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,52 +31,6 @@ def get_deployment_instances(name)
parse_json(body, Array)
end

def get_deployment_instances_full(name, recursive_counter = 0)
  # Kicks off an asynchronous "full format" instances fetch for a deployment,
  # polls the resulting director task until it reaches a terminal state (or the
  # retry budget is spent), then downloads and parses the task result.
  #
  # @param name [String] deployment name
  # @param recursive_counter [Integer] internal retry depth; 0 on the first call
  # @return [Array] parsed instance details when the task finishes successfully;
  #   on repeated failure returns [nil, state] (legacy multi-value return,
  #   preserved for existing callers)
  # @raise [DirectorError] when the 'location' response header is missing or the
  #   task result cannot be fetched
  _body, _status, headers = perform_request(:get, "/deployments/#{name}/instances?format=full")
  location = headers['location']
  if location.nil? || !location.include?('task')
    raise DirectorError, "Can not find 'location' response header to retrieve the task location"
  end

  # Task states are documented here: https://bosh.io/docs/director-api-v1/#list-tasks
  # Terminal states mean the task will make no further progress; 'queued' and
  # 'processing' mean we should keep polling.
  terminal_states = %w(cancelled cancelling done error timeout)

  # BUGFIX: the original `body = nil, status = nil, state = nil` was a single
  # multiple-assignment that set body to [nil, nil, nil]; declare each local
  # separately instead.
  sleep_amount_seconds = 1
  counter = 0
  state = nil
  loop do
    counter += 1
    body, status = perform_request(:get, location)
    if status == 206 || body.nil? || body.empty?
      # Partial/empty response: back off with a linearly growing delay.
      sleep_amount_seconds += counter
      sleep(sleep_amount_seconds)
      next
    end
    state = parse_json(body, Hash)['state']
    # BUGFIX: only log the "retries exceeded" warning when the budget is
    # actually spent — the original also logged it on every normal terminal
    # state (including 'done').
    break if terminal_states.include?(state)
    if counter > 5
      @logger.warn("The number of retries to fetch instance details for deployment '#{name}' has exceeded. Could not get the expected response from '#{location}'")
      break
    end
    sleep_amount_seconds += counter
    sleep(sleep_amount_seconds)
  end

  updated_body = nil
  if state == 'done'
    body, status = perform_request(:get, "#{location}/output?type=result")
    if status != 200 || body.nil? || body.empty?
      raise DirectorError, "Fetching full instance details for deployment '#{name}' failed"
    end
    # The task result is newline-delimited JSON; join the records into a
    # single JSON array before parsing.
    updated_body = "[#{body.chomp.gsub(/\R+/, ',')}]"
    parse_json(updated_body, Array)
  elsif recursive_counter > 0
    # Already retried once; give up and surface the terminal state to the caller.
    return updated_body, state
  else
    @logger.warn("Could not fetch instance details for deployment '#{name}' in the first attempt, retrying once more ...")
    get_deployment_instances_full(name, recursive_counter + 1)
  end
end

private

def endpoint
Expand All @@ -96,12 +50,10 @@ def parse_json(json, expected_type = nil)
end

def perform_request(method, request_path, options = {})
if !request_path.nil?
request_path = request_path.sub(endpoint, '')
end
parsed_endpoint = URI.parse(endpoint + (request_path || ''))
headers = options['headers'] || {}
parsed_endpoint = URI.parse(endpoint + request_path)
headers = {}
headers['authorization'] = auth_provider.auth_header unless options.fetch(:no_login, false)

ssl_context = OpenSSL::SSL::SSLContext.new
ssl_context.set_params(verify_mode: OpenSSL::SSL::VERIFY_NONE)
async_endpoint = Async::HTTP::Endpoint.parse(parsed_endpoint.to_s, ssl_context: ssl_context)
Expand All @@ -110,7 +62,7 @@ def perform_request(method, request_path, options = {})
body = response.read
status = response.status

[body, status, response.headers]
[body, status]
rescue URI::Error
raise DirectorError, "Invalid URI: #{endpoint + request_path}"
rescue => e
Expand Down
10 changes: 4 additions & 6 deletions src/bosh-monitor/lib/bosh/monitor/instance.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module Bosh::Monitor
class Instance
attr_reader :id, :agent_id, :job, :index, :cid, :expects_vm, :job_state, :has_processes
attr_accessor :deployment, :job_state
attr_reader :id, :agent_id, :job, :index, :cid, :expects_vm
attr_accessor :deployment

def initialize(instance_data)
@logger = Bosh::Monitor.logger
Expand All @@ -11,8 +11,6 @@ def initialize(instance_data)
@index = instance_data['index']
@cid = instance_data['cid']
@expects_vm = instance_data['expects_vm']
@job_state = instance_data['job_state']
@has_processes = instance_data['has_processes']
end

def self.create(instance_data)
Expand All @@ -32,11 +30,11 @@ def self.create(instance_data)
def name
if @job
identifier = "#{@job}(#{@id})"
attributes = create_optional_attributes(%i[agent_id index job_state has_processes])
attributes = create_optional_attributes(%i[agent_id index])
attributes += create_mandatory_attributes([:cid])
else
identifier = "instance #{@id}"
attributes = create_optional_attributes(%i[agent_id job index cid expects_vm job_state has_processes])
attributes = create_optional_attributes(%i[agent_id job index cid expects_vm])
end

"#{@deployment}: #{identifier} [#{attributes.join(', ')}]"
Expand Down
Loading

0 comments on commit fd05a98

Please sign in to comment.