Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional telemetry report counters #2128

Merged
merged 4 commits into from
Feb 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions include/istio/mixerclient/client.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,17 @@ struct Statistics {
//

// Total number of report calls.
uint64_t total_report_calls_{0};
uint64_t total_report_calls_{0}; // 1.0
// Total number of remote report calls.
uint64_t total_remote_report_calls_{0};
uint64_t total_remote_report_calls_{0}; // 1.0
// Remote report calls that succeeed
uint64_t total_remote_report_successes_{0}; // 1.1
// Remote report calls that fail due to timeout waiting for the response
uint64_t total_remote_report_timeouts_{0}; // 1.1
// Remote report calls that fail sending the request (socket connect or write)
uint64_t total_remote_report_send_errors_{0}; // 1.1
// Remote report calls that fail do to some other error
uint64_t total_remote_report_other_errors_{0}; // 1.1
};

class MixerClient {
Expand Down
4 changes: 4 additions & 0 deletions src/envoy/utils/stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ void MixerStatsObject::CheckAndUpdateStats(

CHECK_AND_UPDATE_STATS(total_report_calls_);
CHECK_AND_UPDATE_STATS(total_remote_report_calls_);
CHECK_AND_UPDATE_STATS(total_remote_report_successes_);
CHECK_AND_UPDATE_STATS(total_remote_report_timeouts_);
CHECK_AND_UPDATE_STATS(total_remote_report_send_errors_);
CHECK_AND_UPDATE_STATS(total_remote_report_other_errors_);

// Copy new_stats to old_stats_ for next stats update.
old_stats_ = new_stats;
Expand Down
6 changes: 5 additions & 1 deletion src/envoy/utils/stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,11 @@ namespace Utils {
COUNTER(total_remote_call_retries) \
COUNTER(total_remote_call_cancellations) \
COUNTER(total_report_calls) \
COUNTER(total_remote_report_calls)
COUNTER(total_remote_report_calls) \
COUNTER(total_remote_report_successes) \
COUNTER(total_remote_report_timeouts) \
COUNTER(total_remote_report_send_errors) \
COUNTER(total_remote_report_other_errors)
// clang-format on

/**
Expand Down
6 changes: 4 additions & 2 deletions src/istio/mixerclient/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ cc_library(
"attribute_compressor.h",
"check_cache.cc",
"check_cache.h",
"check_context.h",
"check_context.h",
"client_impl.cc",
"client_impl.h",
"global_dictionary.cc",
Expand All @@ -51,7 +51,9 @@ cc_library(
"referenced.h",
"report_batch.cc",
"report_batch.h",
"shared_attributes.h",
"shared_attributes.h",
"status_util.cc",
"status_util.h"
],
visibility = ["//visibility:public"],
deps = [
Expand Down
39 changes: 9 additions & 30 deletions src/istio/mixerclient/client_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <cmath>
#include "include/istio/mixerclient/check_response.h"
#include "include/istio/utils/protobuf.h"
#include "src/istio/mixerclient/status_util.h"
#include "src/istio/utils/logger.h"

using ::google::protobuf::util::Status;
Expand All @@ -33,36 +34,6 @@ using ::istio::mixerclient::SharedAttributesSharedPtr;
namespace istio {
namespace mixerclient {

static ::google::protobuf::StringPiece TIMEOUT_MESSAGE(
"upstream request timeout");
static ::google::protobuf::StringPiece SEND_ERROR_MESSAGE(
"upstream connect error or disconnect/reset before headers");

enum class TransportResult {
SUCCESS, // Response received
SEND_ERROR, // Cannot connect to peer or send request to peer.
RESPONSE_TIMEOUT, // Connected to peer and sent request, but didn't receive a
// response in time.
OTHER // Something else went wrong
};

TransportResult TransportStatus(const Status &status) {
if (status.ok()) {
return TransportResult::SUCCESS;
}

if (Code::UNAVAILABLE == status.error_code()) {
if (TIMEOUT_MESSAGE == status.error_message()) {
return TransportResult::RESPONSE_TIMEOUT;
}
if (SEND_ERROR_MESSAGE == status.error_message()) {
return TransportResult::SEND_ERROR;
}
}

return TransportResult::OTHER;
}

MixerClientImpl::MixerClientImpl(const MixerClientOptions &options)
: options_(options) {
timer_create_ = options.env.timer_create_func;
Expand Down Expand Up @@ -359,6 +330,14 @@ void MixerClientImpl::GetStatistics(Statistics *stat) const {

stat->total_report_calls_ = report_batch_->total_report_calls();
stat->total_remote_report_calls_ = report_batch_->total_remote_report_calls();
stat->total_remote_report_successes_ =
report_batch_->total_remote_report_successes();
stat->total_remote_report_timeouts_ =
report_batch_->total_remote_report_timeouts();
stat->total_remote_report_send_errors_ =
report_batch_->total_remote_report_send_errors();
stat->total_remote_report_other_errors_ =
report_batch_->total_remote_report_other_errors();
}

// Creates a MixerClient object.
Expand Down
33 changes: 28 additions & 5 deletions src/istio/mixerclient/report_batch.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include "src/istio/mixerclient/report_batch.h"
#include "include/istio/utils/protobuf.h"
#include "src/istio/mixerclient/status_util.h"
#include "src/istio/utils/logger.h"

using ::google::protobuf::util::Status;
Expand Down Expand Up @@ -71,12 +72,34 @@ void ReportBatch::FlushWithLock() {

++total_remote_report_calls_;
auto request = batch_compressor_->Finish();
ReportResponse* response = new ReportResponse;
std::shared_ptr<ReportResponse> response{new ReportResponse()};
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note to reviewer, i slipped this in while I was at it. let me know if you want me to revert

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am fine with this change. Thanks!


// TODO(jblatt) I replaced a ReportResponse raw pointer with a shared
// pointer so at least the memory will be freed if this lambda is deleted
// without being called, but really this should be a unique_ptr that is
// moved into the transport_ and then moved into the lambda if invoked.
transport_(request, &*response, [this, response](const Status& status) {
//
// Classify and track transport errors
//

TransportResult result = TransportStatus(status);

switch (result) {
case TransportResult::SUCCESS:
++total_remote_report_successes_;
break;
case TransportResult::RESPONSE_TIMEOUT:
++total_remote_report_timeouts_;
break;
case TransportResult::SEND_ERROR:
++total_remote_report_send_errors_;
break;
case TransportResult::OTHER:
++total_remote_report_other_errors_;
break;
}

// TODO(jblatt) should an async call be made while this lock is held? Can the
// request send block()?
transport_(request, response, [this, response](const Status& status) {
delete response;
if (!status.ok()) {
if (MIXER_WARN_ENABLED &&
0 == REPORT_FAIL_LOG_MESSAGES++ % REPORT_FAIL_LOG_MODULUS) {
Expand Down
24 changes: 22 additions & 2 deletions src/istio/mixerclient/report_batch.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,22 @@ class ReportBatch {
return total_remote_report_calls_;
}

uint64_t total_remote_report_successes() const {
return total_remote_report_successes_;
}

uint64_t total_remote_report_timeouts() const {
return total_remote_report_timeouts_;
}

uint64_t total_remote_report_send_errors() const {
return total_remote_report_send_errors_;
}

uint64_t total_remote_report_other_errors() const {
return total_remote_report_other_errors_;
}

private:
void FlushWithLock();

Expand All @@ -68,8 +84,12 @@ class ReportBatch {
// batched report compressor
std::unique_ptr<BatchCompressor> batch_compressor_;

std::atomic_int_fast64_t total_report_calls_;
std::atomic_int_fast64_t total_remote_report_calls_;
std::atomic<uint64_t> total_report_calls_{0}; // 1.0
std::atomic<uint64_t> total_remote_report_calls_{0}; // 1.0
std::atomic<uint64_t> total_remote_report_successes_{0}; // 1.1
std::atomic<uint64_t> total_remote_report_timeouts_{0}; // 1.1
std::atomic<uint64_t> total_remote_report_send_errors_{0}; // 1.1
std::atomic<uint64_t> total_remote_report_other_errors_{0}; // 1.1

GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ReportBatch);
};
Expand Down
45 changes: 45 additions & 0 deletions src/istio/mixerclient/status_util.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/* Copyright 2019 Istio Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/istio/mixerclient/status_util.h"

namespace istio {
namespace mixerclient {

static ::google::protobuf::StringPiece TIMEOUT_MESSAGE(
"upstream request timeout");
static ::google::protobuf::StringPiece SEND_ERROR_MESSAGE(
"upstream connect error or disconnect/reset before headers");

TransportResult TransportStatus(
const ::google::protobuf::util::Status &status) {
if (status.ok()) {
return TransportResult::SUCCESS;
}

if (::google::protobuf::util::error::Code::UNAVAILABLE ==
status.error_code()) {
if (TIMEOUT_MESSAGE == status.error_message()) {
return TransportResult::RESPONSE_TIMEOUT;
}
if (SEND_ERROR_MESSAGE == status.error_message()) {
return TransportResult::SEND_ERROR;
}
}

return TransportResult::OTHER;
}
} // namespace mixerclient
} // namespace istio
35 changes: 35 additions & 0 deletions src/istio/mixerclient/status_util.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/* Copyright 2019 Istio Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "google/protobuf/stubs/status.h"

namespace istio {
namespace mixerclient {

enum class TransportResult {
SUCCESS, // Response received
SEND_ERROR, // Cannot connect to peer or send request to peer.
RESPONSE_TIMEOUT, // Connected to peer and sent request, but didn't receive a
// response in time.
OTHER // Something else went wrong
};

extern TransportResult TransportStatus(
const ::google::protobuf::util::Status &status);

} // namespace mixerclient
} // namespace istio
14 changes: 14 additions & 0 deletions test/integration/mixer_fault_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,9 @@ TEST_F(MixerFaultTest, TolerateTelemetryBlackhole) {
client->run(connections_to_initiate, requests_to_send, std::move(request),
std::chrono::milliseconds(10'000));

std::unordered_map<std::string, double> counters;
extractCounters("http_mixer_filter", counters);

// shutdown envoy by destroying it
test_server_ = nullptr;
// wait until the upstreams have closed all connections they accepted.
Expand Down Expand Up @@ -674,6 +677,17 @@ TEST_F(MixerFaultTest, TolerateTelemetryBlackhole) {
// Telemetry server accept callback is called at least once (h2 socket reuse
// means may only be called once)
EXPECT_GE(telemetry_cluster.connectionsAccepted(), 1);

// Assertions against the mixer filter's internal counters.
EXPECT_EQ(counters["http_mixer_filter.total_report_calls"], requests_to_send);
EXPECT_IN_RANGE(counters["http_mixer_filter.total_remote_report_calls_"], 0,
requests_to_send * 0.12);
// All remote reports should time out
EXPECT_EQ(counters["http_mixer_filter.total_remote_report_timeouts_"],
counters["http_mixer_filter.total_remote_report_calls_"]);
EXPECT_EQ(counters["http_mixer_filter.total_remote_report_successes_"], 0);
EXPECT_EQ(counters["http_mixer_filter.total_remote_report_send_errors_"], 0);
EXPECT_EQ(counters["http_mixer_filter.total_remote_report_other_errors_"], 0);
}

TEST_F(MixerFaultTest, FailOpenAndSendPolicyResponseSlowly) {
Expand Down