From 37e100329594d8fa25859f0275c8688fa98513e3 Mon Sep 17 00:00:00 2001 From: Tim Downey Date: Tue, 27 Apr 2021 20:41:25 -0700 Subject: [PATCH 1/4] support partial Process Stats responses - this change rescues errors from log-cache and metric-proxy and passes them up as warnings to callers instead of exceptions so that the stats endpoint can continue to function on a best-effort basis - on /v3/processes/:guid/stats these warnings are included in the 'X-Cf-Warnings' header - addresses https://github.com/cloudfoundry/cloud_controller_ng/issues/2227 Authored-by: Tim Downey --- app/controllers/runtime/stats_controller.rb | 2 +- app/controllers/v3/processes_controller.rb | 3 +- app/presenters/v3/process_stats_presenter.rb | 26 +++-- .../reporters/instances_stats_reporter.rb | 97 ++++++++++++------- lib/logcache/client.rb | 6 -- spec/api/documentation/apps_api_spec.rb | 2 +- spec/logcache/client_spec.rb | 41 ++------ spec/request/processes_spec.rb | 64 ++++++------ spec/request/v2/apps_spec.rb | 2 +- .../runtime/stats_controller_spec.rb | 2 +- .../v3/processes_controller_spec.rb | 16 ++- .../instances_stats_reporter_spec.rb | 85 +++++++++++++--- .../v3/process_stats_presenter_spec.rb | 38 ++++++++ 13 files changed, 253 insertions(+), 131 deletions(-) diff --git a/app/controllers/runtime/stats_controller.rb b/app/controllers/runtime/stats_controller.rb index 3e4956e9c5d..ce2d12734d6 100644 --- a/app/controllers/runtime/stats_controller.rb +++ b/app/controllers/runtime/stats_controller.rb @@ -22,7 +22,7 @@ def stats(guid, opts={}) end begin - stats = instances_reporters.stats_for_app(process) + stats, _ = instances_reporters.stats_for_app(process) stats.each_value do |stats_hash| if stats_hash[:stats] stats_hash[:stats].delete_if { |key, _| key == :net_info } diff --git a/app/controllers/v3/processes_controller.rb b/app/controllers/v3/processes_controller.rb index 42617328cd4..a4b76a72b35 100644 --- a/app/controllers/v3/processes_controller.rb +++ b/app/controllers/v3/processes_controller.rb @@ -98,7 +98,8 @@ def scale end def stats - process_stats = instances_reporters.stats_for_app(@process) + process_stats, warnings = instances_reporters.stats_for_app(@process) + add_warning_headers(warnings) render status: :ok, json: Presenters::V3::ProcessStatsPresenter.new(@process.type, process_stats) end diff --git a/app/presenters/v3/process_stats_presenter.rb b/app/presenters/v3/process_stats_presenter.rb index f558762419e..47fbc14802d 100644 --- a/app/presenters/v3/process_stats_presenter.rb +++ b/app/presenters/v3/process_stats_presenter.rb @@ -9,7 +9,7 @@ def initialize(type, process_stats) def to_hash { - resources: present_stats_hash + resources: present_stats_hash, } end @@ -35,12 +35,6 @@ def found_instance_stats_hash(index, stats) type: @type, index: index, state: stats[:state], - usage: { - time: stats[:stats][:usage][:time], - cpu: stats[:stats][:usage][:cpu], - mem: stats[:stats][:usage][:mem], - disk: stats[:stats][:usage][:disk], - }, host: stats[:stats][:host], uptime: stats[:stats][:uptime], mem_quota: stats[:stats][:mem_quota], @@ -48,7 +42,10 @@ def found_instance_stats_hash(index, stats) fds_quota: stats[:stats][:fds_quota], isolation_segment: stats[:isolation_segment], details: stats[:details] - }.tap { |presented_stats| add_port_info(presented_stats, stats) } + }.tap do |presented_stats| + add_port_info(presented_stats, stats) + add_usage_info(presented_stats, stats) + end end def down_instance_stats_hash(index, stats) @@ -70,6 +67,19 @@ def add_port_info(presented_stats, stats) end end + def add_usage_info(presented_stats, stats) + presented_stats[:usage] = if stats[:stats][:usage].present? + { + time: stats[:stats][:usage][:time], + cpu: stats[:stats][:usage][:cpu], + mem: stats[:stats][:usage][:mem], + disk: stats[:stats][:usage][:disk], + } + else + {} + end + end + def net_info_to_instance_ports(net_info_ports) return [] if net_info_ports.nil? diff --git a/lib/cloud_controller/diego/reporters/instances_stats_reporter.rb b/lib/cloud_controller/diego/reporters/instances_stats_reporter.rb index 264dab5fef6..a37352776b6 100644 --- a/lib/cloud_controller/diego/reporters/instances_stats_reporter.rb +++ b/lib/cloud_controller/diego/reporters/instances_stats_reporter.rb @@ -20,22 +20,9 @@ def stats_for_app(process) logger.debug('stats_for_app.fetching_container_metrics', process_guid: process.guid) desired_lrp = bbs_instances_client.desired_lrp_instance(process) - log_cache_data = envelopes(desired_lrp, process) - stats = log_cache_data. - map { |e| - [ - e.containerMetric.instanceIndex, - converted_container_metrics(e.containerMetric, formatted_current_time), - ] - }.to_h - - quota_stats = log_cache_data. - map { |e| - [ - e.containerMetric.instanceIndex, - e.containerMetric, - ] - }.to_h + log_cache_data, log_cache_errors = envelopes(desired_lrp, process) + stats = formatted_process_stats(log_cache_data, formatted_current_time) + quota_stats = formatted_quota_stats(log_cache_data) bbs_instances_client.lrp_instances(process).each do |actual_lrp| index = actual_lrp.actual_lrp_key.index @@ -51,25 +38,20 @@ def stats_for_app(process) port: get_default_port(actual_lrp.actual_lrp_net_info), net_info: actual_lrp.actual_lrp_net_info.to_h, uptime: nanoseconds_to_seconds(current_time * 1e9 - actual_lrp.since), - mem_quota: quota_stats[index]&.memoryBytesQuota, - disk_quota: quota_stats[index]&.diskBytesQuota, fds_quota: process.file_descriptors, - usage: stats[index] || { - time: formatted_current_time, - cpu: 0, - mem: 0, - disk: 0, - }, - } + }.merge(metrics_data_for_instance(stats, quota_stats, log_cache_errors, formatted_current_time, index)) } info[:details] = actual_lrp.placement_error if actual_lrp.placement_error.present? result[actual_lrp.actual_lrp_key.index] = info end fill_unreported_instances_with_down_instances(result, process) + + warnings = [log_cache_errors].compact + return result, warnings rescue CloudController::Errors::NoRunningInstances => e logger.info('stats_for_app.error', error: e.to_s) - fill_unreported_instances_with_down_instances({}, process) + return fill_unreported_instances_with_down_instances({}, process), [] rescue StandardError => e logger.error('stats_for_app.error', error: e.to_s) if e.is_a?(CloudController::Errors::ApiError) && e.name == 'ServiceUnavailable' @@ -83,6 +65,53 @@ def stats_for_app(process) private + attr_reader :bbs_instances_client + + def metrics_data_for_instance(stats, quota_stats, log_cache_errors, formatted_current_time, index) + if log_cache_errors.blank? + { + mem_quota: quota_stats[index]&.memoryBytesQuota, + disk_quota: quota_stats[index]&.diskBytesQuota, + usage: stats[index] || missing_process_stats(formatted_current_time) + } + else + { + mem_quota: nil, + disk_quota: nil, + usage: {} + } + end + end + + def missing_process_stats(formatted_current_time) + { + time: formatted_current_time, + cpu: 0, + mem: 0, + disk: 0, + } + end + + def formatted_process_stats(log_cache_data, formatted_current_time) + log_cache_data. + map { |e| + [ + e.containerMetric.instanceIndex, + converted_container_metrics(e.containerMetric, formatted_current_time), + ] + }.to_h + end + + def formatted_quota_stats(log_cache_data) + log_cache_data. + map { |e| + [ + e.containerMetric.instanceIndex, + e.containerMetric, + ] + }.to_h + end + def envelopes(desired_lrp, process) if desired_lrp.metric_tags['process_id'] filter = ->(envelope) { envelope.tags.any? { |key, value| key == 'process_id' && value == process.guid } } @@ -92,15 +121,16 @@ def envelopes(desired_lrp, process) source_guid = process.guid end - @logstats_client.container_metrics( + return @logstats_client.container_metrics( source_guid: source_guid, auth_token: VCAP::CloudController::SecurityContext.auth_token, logcache_filter: filter - ) + ), nil + rescue GRPC::BadStatus, CloudController::Errors::ApiError => e + logger.error('stats_for_app.error', error: e.message, backtrace: e.backtrace.join("\n")) + return [], 'Stats server temporarily unavailable.' end - attr_reader :bbs_instances_client - def logger @logger ||= Steno.logger('cc.diego.instances_reporter') end @@ -111,12 +141,7 @@ def converted_container_metrics(container_metrics, formatted_current_time) disk = container_metrics.diskBytes if cpu.nil? || mem.nil? || disk.nil? - { - time: formatted_current_time, - cpu: 0, - mem: 0, - disk: 0 - } + missing_process_stats(formatted_current_time) else { time: formatted_current_time, diff --git a/lib/logcache/client.rb b/lib/logcache/client.rb index 5c9424d5e3c..ce472ed9e18 100644 --- a/lib/logcache/client.rb +++ b/lib/logcache/client.rb @@ -59,12 +59,6 @@ def with_request_error_handling(source_guid, &blk) retry end - if @temporary_ignore_server_unavailable_errors && e.is_a?(GRPC::BadStatus) && e.to_status.code == 14 - logger.warn("rescuing GRPC Unavailable error: #{e.to_status}") - - return EmptyEnvelope.new(source_guid) - end - raise e end diff --git a/spec/api/documentation/apps_api_spec.rb b/spec/api/documentation/apps_api_spec.rb index 5adfa6e7b3c..6555306f97b 100644 --- a/spec/api/documentation/apps_api_spec.rb +++ b/spec/api/documentation/apps_api_spec.rb @@ -374,7 +374,7 @@ def after_standard_model_delete(guid) instances_reporters = double(:instances_reporters) allow(CloudController::DependencyLocator.instance).to receive(:instances_reporters).and_return(instances_reporters) - allow(instances_reporters).to receive(:stats_for_app).and_return(stats) + allow(instances_reporters).to receive(:stats_for_app).and_return([stats, []]) client.get "/v2/apps/#{process.guid}/stats", {}, headers expect(status).to eq(200) diff --git a/spec/logcache/client_spec.rb b/spec/logcache/client_spec.rb index 74d9ffe3b1d..e7db9347576 100644 --- a/spec/logcache/client_spec.rb +++ b/spec/logcache/client_spec.rb @@ -76,41 +76,18 @@ module Logcache allow(logcache_service).to receive(:read).and_raise(bad_status) end - context 'and operator has enabled temporary_ignore_server_unavailable_errors' do - let(:client) do - Logcache::Client.new(host: host, port: port, client_ca_path: client_ca_path, - client_cert_path: client_cert_path, client_key_path: client_key_path, tls_subject_name: tls_subject_name, - temporary_ignore_server_unavailable_errors: true) - end - - it 'returns an empty envelope' do - expect( - client.container_metrics(source_guid: process.guid, envelope_limit: 1000, start_time: 100, end_time: 101) - ).to be_a(Logcache::EmptyEnvelope) - end - - # TODO: fix calling the function under test separately - it 'retries the request three times' do - client.container_metrics(source_guid: process.guid, envelope_limit: 1000, start_time: 100, end_time: 101) - - expect(logcache_service).to have_received(:read).with(logcache_request).exactly(3).times - end + let(:client) do + Logcache::Client.new(host: host, port: port, client_ca_path: client_ca_path, + client_cert_path: client_cert_path, client_key_path: client_key_path, tls_subject_name: tls_subject_name, + temporary_ignore_server_unavailable_errors: false) end - context 'and operator has disabled temporary_ignore_server_unavailable_errors' do - let(:client) do - Logcache::Client.new(host: host, port: port, client_ca_path: client_ca_path, - client_cert_path: client_cert_path, client_key_path: client_key_path, tls_subject_name: tls_subject_name, - temporary_ignore_server_unavailable_errors: false) - end - - it 'retries the request three times and raises an exception' do - expect { - client.container_metrics(source_guid: process.guid, envelope_limit: 1000, start_time: 100, end_time: 101) - }.to raise_error(bad_status) + it 'retries the request three times and raises an exception' do + expect { + client.container_metrics(source_guid: process.guid, envelope_limit: 1000, start_time: 100, end_time: 101) + }.to raise_error(bad_status) - expect(logcache_service).to have_received(:read).with(logcache_request).exactly(3).times - end + expect(logcache_service).to have_received(:read).with(logcache_request).exactly(3).times end end diff --git a/spec/request/processes_spec.rb b/spec/request/processes_spec.rb index 4e78a455754..fd021383c8b 100644 --- a/spec/request/processes_spec.rb +++ b/spec/request/processes_spec.rb @@ -480,44 +480,44 @@ let(:expected_response) do { - 'resources' => [{ - 'type' => 'worker', - 'index' => 0, - 'state' => 'RUNNING', - 'isolation_segment' => 'very-isolated', - 'details' => 'some-details', - 'usage' => { - 'time' => usage_time, - 'cpu' => 80, - 'mem' => 128, - 'disk' => 1024, - }, - 'host' => 'toast', - 'instance_ports' => [ - { - 'external' => 8080, - 'internal' => 1234, - 'external_tls_proxy_port' => 61002, - 'internal_tls_proxy_port' => 61003 + 'resources' => [{ + 'type' => 'worker', + 'index' => 0, + 'state' => 'RUNNING', + 'isolation_segment' => 'very-isolated', + 'details' => 'some-details', + 'usage' => { + 'time' => usage_time, + 'cpu' => 80, + 'mem' => 128, + 'disk' => 1024, }, - { - 'external' => 3000, - 'internal' => 4000, - 'external_tls_proxy_port' => 61006, - 'internal_tls_proxy_port' => 61007 - } - ], - 'uptime' => 12345, - 'mem_quota' => 1073741824, - 'disk_quota' => 1073741824, - 'fds_quota' => 16384 - }] + 'host' => 'toast', + 'instance_ports' => [ + { + 'external' => 8080, + 'internal' => 1234, + 'external_tls_proxy_port' => 61002, + 'internal_tls_proxy_port' => 61003 + }, + { + 'external' => 3000, + 'internal' => 4000, + 'external_tls_proxy_port' => 61006, + 'internal_tls_proxy_port' => 61007 + } + ], + 'uptime' => 12345, + 'mem_quota' => 1073741824, + 'disk_quota' => 1073741824, + 'fds_quota' => 16384 + }] } end before do CloudController::DependencyLocator.instance.register(:instances_reporters, instances_reporters) - allow(instances_reporters).to receive(:stats_for_app).and_return(stats_for_process) + allow(instances_reporters).to receive(:stats_for_app).and_return([stats_for_process, []]) end describe 'GET /v3/processes/:guid/stats' do diff --git a/spec/request/v2/apps_spec.rb b/spec/request/v2/apps_spec.rb index d9b09a37dc6..4233bd67008 100644 --- a/spec/request/v2/apps_spec.rb +++ b/spec/request/v2/apps_spec.rb @@ -1365,7 +1365,7 @@ before do allow(CloudController::DependencyLocator.instance).to receive(:instances_reporters).and_return(instances_reporters) - allow(instances_reporters).to receive(:stats_for_app).and_return(instances_reporter_response) + allow(instances_reporters).to receive(:stats_for_app).and_return([instances_reporter_response, []]) end it 'displays the stats' do diff --git a/spec/unit/controllers/runtime/stats_controller_spec.rb b/spec/unit/controllers/runtime/stats_controller_spec.rb index 58780bc5dde..b7816ad2379 100644 --- a/spec/unit/controllers/runtime/stats_controller_spec.rb +++ b/spec/unit/controllers/runtime/stats_controller_spec.rb @@ -26,7 +26,7 @@ module VCAP::CloudController before do CloudController::DependencyLocator.instance.register(:instances_reporters, instances_reporters) - allow(instances_reporters).to receive(:stats_for_app).and_return(stats) + allow(instances_reporters).to receive(:stats_for_app).and_return([stats, []]) end context 'because they are a developer' do diff --git a/spec/unit/controllers/v3/processes_controller_spec.rb b/spec/unit/controllers/v3/processes_controller_spec.rb index 05cc39f37cb..a5bbf82617a 100644 --- a/spec/unit/controllers/v3/processes_controller_spec.rb +++ b/spec/unit/controllers/v3/processes_controller_spec.rb @@ -808,11 +808,12 @@ let(:stats) { { 0 => { stats: { usage: {}, net_info: { ports: [] } } } } } let(:instances_reporters) { double(:instances_reporters) } let(:user) { set_current_user(VCAP::CloudController::User.make) } + let(:warnings) { [] } before do allow_user_read_access_for(user, spaces: [space]) CloudController::DependencyLocator.instance.register(:instances_reporters, instances_reporters) - allow(instances_reporters).to receive(:stats_for_app).and_return(stats) + allow(instances_reporters).to receive(:stats_for_app).and_return([stats, warnings]) end it 'returns the stats for all instances for the process' do @@ -820,6 +821,19 @@ expect(response.status).to eq(200) expect(parsed_body['resources'][0]['type']).to eq('potato') + expect(response.headers['X-Cf-Warnings']).to be_nil + end + + context 'when the the instances reporter returns warnings' do + let(:warnings) { ['s0mjgnbha'] } + + it 'returns the stats and a header with the warnings' do + get :stats, params: { process_guid: process_type.guid } + + expect(response.status).to eq(200) + expect(parsed_body['resources'][0]['type']).to eq('potato') + expect(response.headers['X-Cf-Warnings']).to eq('s0mjgnbha') + end end context 'accessed as app subresource' do diff --git a/spec/unit/lib/cloud_controller/diego/reporters/instances_stats_reporter_spec.rb b/spec/unit/lib/cloud_controller/diego/reporters/instances_stats_reporter_spec.rb index 1f23499d1a0..5fe14ae4baa 100644 --- a/spec/unit/lib/cloud_controller/diego/reporters/instances_stats_reporter_spec.rb +++ b/spec/unit/lib/cloud_controller/diego/reporters/instances_stats_reporter_spec.rb @@ -116,7 +116,7 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:) end it 'returns a map of stats & states per index in the correct units' do - expect(instances_reporter.stats_for_app(process)).to eq(expected_stats_response) + expect(instances_reporter.stats_for_app(process)).to eq([expected_stats_response, []]) end it 'passes a process_id filter' do @@ -185,7 +185,7 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:) with(auth_token: 'my-token', source_guid: process.guid, logcache_filter: anything). and_return(traffic_controller_response) - expect(instances_reporter.stats_for_app(process)).to eq(expected_stats_response) + expect(instances_reporter.stats_for_app(process)).to eq([expected_stats_response, []]) end end @@ -209,7 +209,7 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:) end it 'sets "port" to 0' do - result = instances_reporter.stats_for_app(process) + result, _ = instances_reporter.stats_for_app(process) expect(result[0][:stats][:port]).to eq(0) end @@ -231,7 +231,8 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:) end it 'sets all the stats to zero' do - expect(instances_reporter.stats_for_app(process)[0][:stats][:usage]).to eq({ + result, _ = instances_reporter.stats_for_app(process) + expect(result[0][:stats][:usage]).to eq({ time: formatted_current_time, cpu: 0, mem: 0, @@ -241,7 +242,7 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:) end context 'when a NoRunningInstances error is thrown' do - let(:error) { CloudController::Errors::NoRunningInstances.new('ruh roh') } + let(:error) { CloudController::Errors::NoRunningInstances.new('No running instances ruh roh') } let(:expected_stats_response) do { 0 => { @@ -256,20 +257,42 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:) end it 'shows all instances as "DOWN"' do - expect(instances_reporter.stats_for_app(process)).to eq(expected_stats_response) + expect(instances_reporter.stats_for_app(process)).to eq([expected_stats_response, []]) end end context 'when a LogcacheTimeoutReached is thrown' do let(:error) { CloudController::Errors::ApiError.new_from_details('ServiceUnavailable', 'Connection to Log Cache timed out') } let(:mock_logger) { double(:logger, error: nil, debug: nil) } + let(:expected_stats_response) do + { + 0 => { + state: 'RUNNING', + isolation_segment: 'isolation-segment-name', + stats: { + name: process.name, + uris: process.uris, + host: 'lrp-host', + port: 2222, + net_info: lrp_1_net_info.to_h, + uptime: two_days_in_seconds, + mem_quota: nil, + disk_quota: nil, + fds_quota: process.file_descriptors, + usage: {} + }, + details: 'some-details', + }, + } + end + before do allow(traffic_controller_client).to receive(:container_metrics).and_raise(error) allow(instances_reporter).to receive(:logger).and_return(mock_logger) end - it 'raises a timeout error' do - expect { instances_reporter.stats_for_app(process) }.to raise_error(error) + it 'returns a partial response and a warning' do + expect(instances_reporter.stats_for_app(process)).to eq([expected_stats_response, ['Stats server temporarily unavailable.']]) end end @@ -287,7 +310,7 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:) end it 'provides defaults for unreported instances' do - expect(instances_reporter.stats_for_app(process)).to eq(expected_stats_response) + expect(instances_reporter.stats_for_app(process)).to eq([expected_stats_response, []]) end end @@ -296,7 +319,7 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:) let(:traffic_controller_response) { [] } it 'ignores superfluous instances' do - expect(instances_reporter.stats_for_app(process)).to eq({}) + expect(instances_reporter.stats_for_app(process)).to eq([{}, []]) end end @@ -304,7 +327,7 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:) let(:traffic_controller_response) { [] } it 'provides defaults for unreported instances' do - result = instances_reporter.stats_for_app(process) + result, _ = instances_reporter.stats_for_app(process) expect(result[0][:stats][:usage]).to eq({ time: formatted_current_time, @@ -361,6 +384,46 @@ def make_actual_lrp(instance_guid:, index:, state:, error:, since:) end end end + + context 'when there is an error fetching metrics envelopes' do + let(:error) { CloudController::Errors::ApiError.new_from_details('ServiceUnavailable', 'no metrics for you') } + let(:mock_logger) { double(:logger, error: nil, debug: nil) } + let(:expected_stats_response) do + { + 0 => { + state: 'RUNNING', + isolation_segment: 'isolation-segment-name', + stats: { + name: process.name, + uris: process.uris, + host: 'lrp-host', + port: 2222, + net_info: lrp_1_net_info.to_h, + uptime: two_days_in_seconds, + mem_quota: nil, + disk_quota: nil, + fds_quota: process.file_descriptors, + usage: {} + }, + details: 'some-details', + }, + } + end + + before do + allow(traffic_controller_client).to receive(:container_metrics). + with(auth_token: 'my-token', source_guid: process.app.guid, logcache_filter: anything). + and_raise(error) + allow(instances_reporter).to receive(:logger).and_return(mock_logger) + end + + it 'logs, omits metrics-driven fields, and provides a warning' do + expect(instances_reporter.stats_for_app(process)).to eq([expected_stats_response, ['Stats server temporarily unavailable.']]) + expect(mock_logger).to have_received(:error).with( + 'stats_for_app.error', { error: 'no metrics for you', backtrace: error.backtrace.join($INPUT_RECORD_SEPARATOR) } + ).once + end + end end end end diff --git a/spec/unit/presenters/v3/process_stats_presenter_spec.rb b/spec/unit/presenters/v3/process_stats_presenter_spec.rb index ec841e256ed..53df8e43f12 100644 --- a/spec/unit/presenters/v3/process_stats_presenter_spec.rb +++ b/spec/unit/presenters/v3/process_stats_presenter_spec.rb @@ -5,6 +5,7 @@ module VCAP::CloudController::Presenters::V3 RSpec.describe ProcessStatsPresenter do subject(:presenter) { ProcessStatsPresenter.new(process.type, stats_for_process) } let(:process) { VCAP::CloudController::ProcessModelFactory.make } + let(:warnings) { [] } describe '#present_stats_hash' do let(:process_usage) { process.type.usage } @@ -196,6 +197,43 @@ module VCAP::CloudController::Presenters::V3 disk: 1024 }) end end + + context 'the process stats are missing metrics data' do + let(:stats_for_process) do + { + 0 => { + state: 'RUNNING', + isolation_segment: 'hecka-compliant', + stats: { + name: process.name, + uris: process.uris, + host: 'myhost', + net_info: net_info_1, + uptime: 12345, + fds_quota: process.file_descriptors, + usage: {} + } + } + } + end + + it 'presents the process stats as a hash' do + result = presenter.present_stats_hash + + expect(result[0][:type]).to eq(process.type) + expect(result[0][:index]).to eq(0) + expect(result[0][:state]).to eq('RUNNING') + expect(result[0][:details]).to eq(nil) + expect(result[0][:isolation_segment]).to eq('hecka-compliant') + expect(result[0][:host]).to eq('myhost') + expect(result[0][:instance_ports]).to eq(instance_ports_1) + expect(result[0][:uptime]).to eq(12345) + expect(result[0][:mem_quota]).to be_nil + expect(result[0][:disk_quota]).to be_nil + expect(result[0][:fds_quota]).to eq(process.file_descriptors) + expect(result[0][:usage]).to eq({}) + end + end end describe '#to_hash' do From 2abeb94be2c1d1bf056cc482507e12c2acf4faa0 Mon Sep 17 00:00:00 2001 From: Tim Downey Date: Thu, 29 Apr 2021 09:57:00 -0700 Subject: [PATCH 2/4] Remove temporary_ignore_server_unavailable_errors config - this temporary config flag was used to ignore server unavailable errors coming back from log-cache (or metric-proxy in cf-for-k8s) - given the change proposed in https://github.com/cloudfoundry/cloud_controller_ng/issues/2227 to rescue more errors and serve partial stats responses this config is no longer necessary Authored-by: Tim Downey --- config/cloud_controller.yml | 1 - .../config_schemas/base/api_schema.rb | 1 - lib/cloud_controller/dependency_locator.rb | 1 - lib/logcache/client.rb | 4 +--- spec/logcache/client_spec.rb | 19 +++++++++++-------- .../dependency_locator_spec.rb | 3 --- 6 files changed, 12 insertions(+), 17 deletions(-) diff --git a/config/cloud_controller.yml b/config/cloud_controller.yml index 176bfccd57c..f8aa079e6e0 100644 --- a/config/cloud_controller.yml +++ b/config/cloud_controller.yml @@ -102,7 +102,6 @@ loggregator: logcache: host: 'http://doppler.service.cf.internal' port: 8080 - temporary_ignore_server_unavailable_errors: false logcache_tls: key_file: 'spec/fixtures/certs/log_cache.key' diff --git a/lib/cloud_controller/config_schemas/base/api_schema.rb b/lib/cloud_controller/config_schemas/base/api_schema.rb index 9d5846c3bc0..96995e88121 100644 --- a/lib/cloud_controller/config_schemas/base/api_schema.rb +++ b/lib/cloud_controller/config_schemas/base/api_schema.rb @@ -205,7 +205,6 @@ class ApiSchema < VCAP::Config logcache: { host: String, port: Integer, - temporary_ignore_server_unavailable_errors: bool, }, optional(:logcache_tls) => { diff --git a/lib/cloud_controller/dependency_locator.rb b/lib/cloud_controller/dependency_locator.rb index 0cf748c074e..59c8a16ac9c 100644 --- a/lib/cloud_controller/dependency_locator.rb +++ b/lib/cloud_controller/dependency_locator.rb @@ -526,7 +526,6 @@ def build_logcache_client client_cert_path: config.get(:logcache_tls, :cert_file), client_key_path: config.get(:logcache_tls, :key_file), tls_subject_name: config.get(:logcache_tls, :subject_name), - temporary_ignore_server_unavailable_errors: config.get(:logcache, :temporary_ignore_server_unavailable_errors) ) end diff --git a/lib/logcache/client.rb b/lib/logcache/client.rb index ce472ed9e18..0ab0ca7f40e 100644 --- a/lib/logcache/client.rb +++ b/lib/logcache/client.rb @@ -6,7 +6,7 @@ class Client MAX_LIMIT = 1000 DEFAULT_LIMIT = 100 - def initialize(host:, port:, client_ca_path:, client_cert_path:, client_key_path:, tls_subject_name:, temporary_ignore_server_unavailable_errors:) + def initialize(host:, port:, client_ca_path:, client_cert_path:, client_key_path:, tls_subject_name:) if client_ca_path client_ca = IO.read(client_ca_path) client_key = IO.read(client_key_path) @@ -25,8 +25,6 @@ def initialize(host:, port:, client_ca_path:, client_cert_path:, client_key_path timeout: 250 ) end - - @temporary_ignore_server_unavailable_errors = temporary_ignore_server_unavailable_errors end def container_metrics(source_guid:, envelope_limit: DEFAULT_LIMIT, start_time:, end_time:) diff --git a/spec/logcache/client_spec.rb b/spec/logcache/client_spec.rb index e7db9347576..24c7aaf29b7 100644 --- a/spec/logcache/client_spec.rb +++ b/spec/logcache/client_spec.rb @@ -21,8 +21,7 @@ module Logcache let(:channel_arg_hash) { { GRPC::Core::Channel::SSL_TARGET => tls_subject_name } } let(:client) do Logcache::Client.new(host: host, port: port, client_ca_path: client_ca_path, - client_cert_path: client_cert_path, client_key_path: client_key_path, tls_subject_name: tls_subject_name, - temporary_ignore_server_unavailable_errors: false) + client_cert_path: client_cert_path, client_key_path: client_key_path, tls_subject_name: tls_subject_name) end let(:client_ca) { File.open(client_ca_path).read } let(:client_key) { File.open(client_key_path).read } @@ -78,8 +77,7 @@ module Logcache let(:client) do Logcache::Client.new(host: host, port: port, client_ca_path: client_ca_path, - client_cert_path: client_cert_path, client_key_path: client_key_path, tls_subject_name: tls_subject_name, - temporary_ignore_server_unavailable_errors: false) + client_cert_path: client_cert_path, client_key_path: client_key_path, tls_subject_name: tls_subject_name) end it 'retries the request three times and raises an exception' do @@ -110,8 +108,7 @@ module Logcache let(:client) do Logcache::Client.new(host: host, port: port, client_ca_path: client_ca_path, - client_cert_path: client_cert_path, client_key_path: client_key_path, tls_subject_name: tls_subject_name, - temporary_ignore_server_unavailable_errors: false) + client_cert_path: client_cert_path, client_key_path: client_key_path, tls_subject_name: tls_subject_name) end it 'raises an exception' do @@ -150,8 +147,14 @@ module Logcache describe 'without TLS' do let(:client) do - Logcache::Client.new(host: host, port: port, temporary_ignore_server_unavailable_errors: false, - client_ca_path: nil, client_cert_path: nil, client_key_path: nil, tls_subject_name: nil) + Logcache::Client.new( + host: host, + port: port, + client_ca_path: nil, + client_cert_path: nil, + client_key_path: nil, + tls_subject_name: nil + ) end describe '#container_metrics' do diff --git a/spec/unit/lib/cloud_controller/dependency_locator_spec.rb b/spec/unit/lib/cloud_controller/dependency_locator_spec.rb index f05b25de066..2e45b473004 100644 --- a/spec/unit/lib/cloud_controller/dependency_locator_spec.rb +++ b/spec/unit/lib/cloud_controller/dependency_locator_spec.rb @@ -503,7 +503,6 @@ client_cert_path: nil, client_key_path: nil, tls_subject_name: nil, - temporary_ignore_server_unavailable_errors: false ) end @@ -513,7 +512,6 @@ logcache: { host: 'some-logcache-host', port: 1234, - temporary_ignore_server_unavailable_errors: false, }, logcache_tls: { ca_file: 'logcache-ca', @@ -531,7 +529,6 @@ client_cert_path: 'logcache-client-ca', client_key_path: 'logcache-client-key', tls_subject_name: 'some-tls-cert-san', - temporary_ignore_server_unavailable_errors: false ) end end From ed75cf64b5fdbb871ea02fc84c2f4ed4f6be248d Mon Sep 17 00:00:00 2001 From: Tim Downey Date: Thu, 29 Apr 2021 10:14:09 -0700 Subject: [PATCH 3/4] Document `null` and empty values for Process Stats In https://github.com/cloudfoundry/cloud_controller_ng/issues/2227 we updated the Process Stats endpoint to be able to return partial responses (with a warning) when the stats server is unavailable. Authored-by: Tim Downey --- .../source/includes/resources/processes/_stats_object.md.erb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/v3/source/includes/resources/processes/_stats_object.md.erb b/docs/v3/source/includes/resources/processes/_stats_object.md.erb index 4c85024ce79..c72a8729954 100644 --- a/docs/v3/source/includes/resources/processes/_stats_object.md.erb +++ b/docs/v3/source/includes/resources/processes/_stats_object.md.erb @@ -12,6 +12,7 @@ Name | Type | Description **type** | _string_ | Process type; a unique identifier for processes belonging to an app **index** | _integer_ | The zero-based index of running instances **state** | _string_ | The state of the instance; valid values are `RUNNING`, `CRASHED`, `STARTING`, `DOWN` +**usage** | _object_ | Object containing actual usage data for the instance; the value is `{}` when usage data is unavailable **usage.time** | _[timestamp](#timestamps)_ | The time when the usage was requested **usage.cpu** | _number_ | The current cpu usage of the instance **usage.mem** | _integer_ | The current memory usage of the instance @@ -19,8 +20,8 @@ Name | Type | Description **host** | _string_ | The host the instance is running on **instance_ports** | _object_ | JSON array of port mappings between the network-exposed port used to communicate with the app (`external`) and port opened to the running process that it can listen on (`internal`) **uptime** | _integer_ | The uptime in seconds for the instance -**mem_quota** | _integer_ | The current maximum memory allocated for the instance -**disk_quota** | _integer_ | The current maximum disk allocated for the instance +**mem_quota** | _integer_ | The current maximum memory allocated for the instance; the value is `null` when memory quota data is unavailable +**disk_quota** | _integer_ | The current maximum disk allocated for the instance; the value is `null` when disk quota data is unavailable **fds_quota** | _integer_ | The maximum file descriptors the instance is allowed to use **isolation_segment** | _string_ | The current isolation segment that the instance is running on; the value is `null` when the instance is not placed on a particular isolation segment **details** | _string_ | Information about errors placing the instance; the value is `null` if there are no placement errors From 932e8ecac467ae656432a38f49a68b240c380a74 Mon Sep 17 00:00:00 2001 From: Tim Downey Date: Thu, 29 Apr 2021 19:05:35 -0700 Subject: [PATCH 4/4] Set warning header on /v2/apps/:guid/stats for stats server errors Authored-by: Tim Downey --- app/controllers/runtime/stats_controller.rb | 7 +++- .../runtime/stats_controller_spec.rb | 38 ++++++++++++++++++- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/app/controllers/runtime/stats_controller.rb b/app/controllers/runtime/stats_controller.rb index ce2d12734d6..e27ba2200d0 100644 --- a/app/controllers/runtime/stats_controller.rb +++ b/app/controllers/runtime/stats_controller.rb @@ -22,7 +22,12 @@ def stats(guid, opts={}) end begin - stats, _ = instances_reporters.stats_for_app(process) + stats, warnings = instances_reporters.stats_for_app(process) + + warnings.each do |warning_message| + add_warning(warning_message) + end + stats.each_value do |stats_hash| if stats_hash[:stats] stats_hash[:stats].delete_if { |key, _| key == :net_info } diff --git a/spec/unit/controllers/runtime/stats_controller_spec.rb b/spec/unit/controllers/runtime/stats_controller_spec.rb index b7816ad2379..40d35152d4a 100644 --- a/spec/unit/controllers/runtime/stats_controller_spec.rb +++ b/spec/unit/controllers/runtime/stats_controller_spec.rb @@ -22,11 +22,12 @@ module VCAP::CloudController } } end + let(:warnings) { [] } let(:instances_reporters) { double(:instances_reporters) } before do CloudController::DependencyLocator.instance.register(:instances_reporters, instances_reporters) - allow(instances_reporters).to receive(:stats_for_app).and_return([stats, []]) + allow(instances_reporters).to receive(:stats_for_app).and_return([stats, warnings]) end context 'because they are a developer' do @@ -55,6 +56,7 @@ module VCAP::CloudController expect(last_response.status).to eq(200) expect(MultiJson.load(last_response.body)).to eq(expected) + expect(last_response.headers['X-Cf-Warnings']).to be_nil expect(instances_reporters).to have_received(:stats_for_app).with( satisfy { |requested_app| requested_app.guid == process.app.guid }) end @@ -91,6 +93,40 @@ module VCAP::CloudController end end + context 'when the instances reporter returns warnings' do + let(:warnings) { ['s0mjgnbha', 'full_moon_with_s0mjgnbha'] } + + it 'should return the stats with an X-Cf-Warnings header' do + set_current_user(developer) + + process.state = 'STARTED' + process.instances = 1 + process.save + + process.refresh + + expected = { + '0' => { + 'state' => 'RUNNING', + 'stats' => {}, + }, + '1' => { + 'state' => 'DOWN', + 'details' => 'start-me', + 'since' => 1, + } + } + + get "/v2/apps/#{process.app.guid}/stats" + + expect(last_response.status).to eq(200) + expect(MultiJson.load(last_response.body)).to eq(expected) + expect(last_response.headers['X-Cf-Warnings']).to eq('s0mjgnbha,full_moon_with_s0mjgnbha') + expect(instances_reporters).to have_received(:stats_for_app).with( + satisfy { |requested_app| requested_app.guid == process.app.guid }) + end + end + context 'when instance reporter raises an ApiError' do before do @error = CloudController::Errors::ApiError.new_from_details('ServerError')