diff --git a/include/envoy/stats/BUILD b/include/envoy/stats/BUILD index cabb7d6d4fe..46928ce6ff6 100644 --- a/include/envoy/stats/BUILD +++ b/include/envoy/stats/BUILD @@ -27,6 +27,11 @@ envoy_cc_library( deps = ["//include/envoy/common:interval_set_interface"], ) +envoy_cc_library( + name = "symbol_table_interface", + hdrs = ["symbol_table.h"], +) + envoy_cc_library( name = "timespan", hdrs = ["timespan.h"], diff --git a/include/envoy/stats/symbol_table.h b/include/envoy/stats/symbol_table.h new file mode 100644 index 00000000000..8764402069e --- /dev/null +++ b/include/envoy/stats/symbol_table.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include + +#include "envoy/common/pure.h" + +#include "absl/strings/string_view.h" + +namespace Envoy { +namespace Stats { + +/** + * Interface for storing a stat name. + */ +class StatName { +public: + virtual ~StatName(){}; + virtual std::string toString() const PURE; +}; + +using StatNamePtr = std::unique_ptr; + +/** + * Interface for shortening and retrieving stat names. + * + * Guarantees that x = encode(x).toString() for any x. + */ +class SymbolTable { +public: + virtual ~SymbolTable() {} + + /** + * Encodes a stat name into a StatNamePtr. Expects the name to be period-delimited. + * + * @param name the stat name to encode. + * @return StatNamePtr a unique_ptr to the StatName class encapsulating the symbol vector. + */ + virtual StatNamePtr encode(absl::string_view name) PURE; + + /** + * Returns the size of a SymbolTable, as measured in number of symbols stored. + * @return size_t the size of the table. + */ + virtual size_t size() const PURE; +}; + +} // namespace Stats +} // namespace Envoy diff --git a/source/common/stats/BUILD b/source/common/stats/BUILD index cde9e618511..456562cf16a 100644 --- a/source/common/stats/BUILD +++ b/source/common/stats/BUILD @@ -100,6 +100,7 @@ envoy_cc_library( ":raw_stat_data_lib", ":source_impl_lib", ":stats_options_lib", + ":symbol_table_lib", ":tag_extractor_lib", ":utility_lib", "//include/envoy/common:time_interface", @@ -117,6 +118,18 @@ envoy_cc_library( ], ) +envoy_cc_library( + name = "symbol_table_lib", + srcs = ["symbol_table_impl.cc"], + hdrs = ["symbol_table_impl.h"], + external_deps = ["abseil_base"], + deps = [ + "//include/envoy/stats:symbol_table_interface", + "//source/common/common:assert_lib", + "//source/common/common:utility_lib", + ], +) + envoy_cc_library( name = "stats_options_lib", hdrs = ["stats_options_impl.h"], diff --git a/source/common/stats/symbol_table_impl.cc b/source/common/stats/symbol_table_impl.cc new file mode 100644 index 00000000000..2ee7269d2cb --- /dev/null +++ b/source/common/stats/symbol_table_impl.cc @@ -0,0 +1,86 @@ +#include "common/stats/symbol_table_impl.h" + +#include +#include +#include + +#include "common/common/assert.h" + +namespace Envoy { +namespace Stats { + +// TODO(ambuc): There is a possible performance optimization here for avoiding the encoding of IPs, +// if they appear in stat names. We don't want to waste time symbolizing an integer as an integer, +// if we can help it. +StatNamePtr SymbolTableImpl::encode(const absl::string_view name) { + SymbolVec symbol_vec; + std::vector name_vec = absl::StrSplit(name, '.'); + symbol_vec.reserve(name_vec.size()); + std::transform(name_vec.begin(), name_vec.end(), std::back_inserter(symbol_vec), + [this](absl::string_view x) { return toSymbol(x); }); + + return std::make_unique(symbol_vec, *this); +} + +std::string SymbolTableImpl::decode(const SymbolVec& symbol_vec) const { + std::vector name; + name.reserve(symbol_vec.size()); + std::transform(symbol_vec.begin(), symbol_vec.end(), std::back_inserter(name), + [this](Symbol x) { return fromSymbol(x); }); + return absl::StrJoin(name, "."); +} + +void SymbolTableImpl::free(const SymbolVec& symbol_vec) { + for (const Symbol symbol : symbol_vec) { + auto decode_search = decode_map_.find(symbol); + ASSERT(decode_search != decode_map_.end()); + + auto encode_search = encode_map_.find(decode_search->second); + ASSERT(encode_search != encode_map_.end()); + + encode_search->second.ref_count_--; + // If that was the last remaining client usage of the symbol, erase the the current + // mappings and add the now-unused symbol to the reuse pool. + if (encode_search->second.ref_count_ == 0) { + decode_map_.erase(decode_search); + encode_map_.erase(encode_search); + pool_.push(symbol); + } + } +} + +Symbol SymbolTableImpl::toSymbol(absl::string_view sv) { + Symbol result; + auto encode_find = encode_map_.find(sv); + // If the string segment doesn't already exist, + if (encode_find == encode_map_.end()) { + // We create the actual string, place it in the decode_map_, and then insert a string_view + // pointing to it in the encode_map_. This allows us to only store the string once. + std::string str = std::string(sv); + + auto decode_insert = decode_map_.insert({next_symbol_, std::move(str)}); + ASSERT(decode_insert.second); + + auto encode_insert = encode_map_.insert( + {decode_insert.first->second, {.symbol_ = next_symbol_, .ref_count_ = 1}}); + ASSERT(encode_insert.second); + + result = next_symbol_; + newSymbol(); + } else { + // If the insertion didn't take place, return the actual value at that location and up the + // refcount at that location + result = encode_find->second.symbol_; + ++(encode_find->second.ref_count_); + } + return result; +} + +absl::string_view SymbolTableImpl::fromSymbol(const Symbol symbol) const { + auto search = decode_map_.find(symbol); + ASSERT(search != decode_map_.end()); + return search->second; +} + +} // namespace Stats +} // namespace Envoy diff --git a/source/common/stats/symbol_table_impl.h b/source/common/stats/symbol_table_impl.h new file mode 100644 index 00000000000..ab9ea86d38a --- /dev/null +++ b/source/common/stats/symbol_table_impl.h @@ -0,0 +1,139 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "envoy/common/exception.h" +#include "envoy/stats/symbol_table.h" + +#include "common/common/assert.h" +#include "common/common/utility.h" + +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" + +namespace Envoy { +namespace Stats { + +using Symbol = uint32_t; +using SymbolVec = std::vector; + +/** + * Underlying SymbolTableImpl implementation which manages per-symbol reference counting. + * + * The underlying Symbol / SymbolVec data structures are private to the impl. One side + * effect of the non-monotonically-increasing symbol counter is that if a string is encoded, the + * resulting stat is destroyed, and then that same string is re-encoded, it may or may not encode to + * the same underlying symbol. + */ +class SymbolTableImpl : public SymbolTable { +public: + StatNamePtr encode(absl::string_view name) override; + + // For testing purposes only. + size_t size() const override { + ASSERT(encode_map_.size() == decode_map_.size()); + return encode_map_.size(); + } + +private: + friend class StatNameImpl; + friend class StatNameTest; + + struct SharedSymbol { + Symbol symbol_; + uint32_t ref_count_; + }; + + /** + * Decodes a vector of symbols back into its period-delimited stat name. + * If decoding fails on any part of the symbol_vec, we release_assert and crash hard, since this + * should never happen, and we don't want to continue running with a corrupt stats set. + * + * @param symbol_vec the vector of symbols to decode. + * @return std::string the retrieved stat name. + */ + std::string decode(const SymbolVec& symbol_vec) const; + + /** + * Since SymbolTableImpl does manual reference counting, a client of SymbolTable (such as + * StatName) must manually call free(symbol_vec) when it is freeing the stat it represents. This + * way, the symbol table will grow and shrink dynamically, instead of being write-only. + * + * @param symbol_vec the vector of symbols to be freed. + */ + void free(const SymbolVec& symbol_vec); + + /** + * Convenience function for encode(), symbolizing one string segment at a time. + * + * @param sv the individual string to be encoded as a symbol. + * @return Symbol the encoded string. + */ + Symbol toSymbol(absl::string_view sv); + + /** + * Convenience function for decode(), decoding one symbol at a time. + * + * @param symbol the individual symbol to be decoded. + * @return absl::string_view the decoded string. + */ + absl::string_view fromSymbol(Symbol symbol) const; + + // Stages a new symbol for use. To be called after a successful insertion. + void newSymbol() { + if (pool_.empty()) { + next_symbol_ = ++monotonic_counter_; + } else { + next_symbol_ = pool_.top(); + pool_.pop(); + } + // This should catch integer overflow for the new symbol. + ASSERT(monotonic_counter_ != 0); + } + + Symbol monotonicCounter() { return monotonic_counter_; } + + // Stores the symbol to be used at next insertion. This should exist ahead of insertion time so + // that if insertion succeeds, the value written is the correct one. + Symbol next_symbol_ = 0; + + // If the free pool is exhausted, we monotonically increase this counter. + Symbol monotonic_counter_ = 0; + + // Bimap implementation. + // The encode map stores both the symbol and the ref count of that symbol. + // Using absl::string_view lets us only store the complete string once, in the decode map. + std::unordered_map encode_map_; + std::unordered_map decode_map_; + + // Free pool of symbols for re-use. + // TODO(ambuc): There might be an optimization here relating to storing ranges of freed symbols + // using an Envoy::IntervalSet. + std::stack pool_; +}; + +/** + * Implements RAII for Symbols, since the StatName destructor does the work of freeing its component + * symbols. + */ +class StatNameImpl : public StatName { +public: + StatNameImpl(SymbolVec symbol_vec, SymbolTableImpl& symbol_table) + : symbol_vec_(symbol_vec), symbol_table_(symbol_table) {} + ~StatNameImpl() override { symbol_table_.free(symbol_vec_); } + std::string toString() const override { return symbol_table_.decode(symbol_vec_); } + +private: + friend class StatNameTest; + SymbolVec symbolVec() { return symbol_vec_; } + SymbolVec symbol_vec_; + SymbolTableImpl& symbol_table_; +}; + +} // namespace Stats +} // namespace Envoy diff --git a/test/common/stats/BUILD b/test/common/stats/BUILD index 86eaa700b75..27d8daf51b4 100644 --- a/test/common/stats/BUILD +++ b/test/common/stats/BUILD @@ -54,10 +54,22 @@ envoy_cc_test( ], ) +envoy_cc_test( + name = "symbol_table_test", + srcs = ["symbol_table_test.cc"], + deps = [ + "//source/common/stats:symbol_table_lib", + "//test/mocks/stats:stats_mocks", + "//test/test_common:logging_lib", + "//test/test_common:utility_lib", + ], +) + envoy_cc_test( name = "thread_local_store_test", srcs = ["thread_local_store_test.cc"], deps = [ + "//source/common/stats:symbol_table_lib", "//source/common/stats:thread_local_store_lib", "//test/mocks/event:event_mocks", "//test/mocks/server:server_mocks", diff --git a/test/common/stats/symbol_table_test.cc b/test/common/stats/symbol_table_test.cc new file mode 100644 index 00000000000..c66a57b6eba --- /dev/null +++ b/test/common/stats/symbol_table_test.cc @@ -0,0 +1,184 @@ +#include + +#include "common/stats/symbol_table_impl.h" + +#include "test/test_common/logging.h" +#include "test/test_common/utility.h" + +#include "gtest/gtest.h" + +namespace Envoy { +namespace Stats { + +class StatNameTest : public testing::Test { +protected: + SymbolVec getSymbols(StatName* stat_name_ptr) { + StatNameImpl& impl = dynamic_cast(*stat_name_ptr); + return impl.symbolVec(); + } + std::string decodeSymbolVec(const SymbolVec& symbol_vec) { return table_.decode(symbol_vec); } + Symbol monotonicCounter() { return table_.monotonicCounter(); } + + SymbolTableImpl table_; +}; + +TEST_F(StatNameTest, TestArbitrarySymbolRoundtrip) { + const std::vector stat_names = {"", " ", " ", ",", "\t", "$", "%", "`", "."}; + for (auto stat_name : stat_names) { + EXPECT_EQ(stat_name, table_.encode(stat_name)->toString()); + } +} + +TEST_F(StatNameTest, TestUnusualDelimitersRoundtrip) { + const std::vector stat_names = {".", "..", "...", "foo", "foo.", + ".foo", ".foo.", ".foo..", "..foo.", "..foo.."}; + for (auto stat_name : stat_names) { + EXPECT_EQ(stat_name, table_.encode(stat_name)->toString()); + } +} + +TEST_F(StatNameTest, TestSuccessfulDoubleLookup) { + StatNamePtr stat_name_1 = table_.encode("foo.bar.baz"); + StatNamePtr stat_name_2 = table_.encode("foo.bar.baz"); + EXPECT_EQ(getSymbols(stat_name_1.get()), getSymbols(stat_name_2.get())); +} + +TEST_F(StatNameTest, TestSuccessfulDecode) { + std::string stat_name = "foo.bar.baz"; + auto stat_name_1 = table_.encode(stat_name); + auto stat_name_2 = table_.encode(stat_name); + EXPECT_EQ(stat_name_1->toString(), stat_name_2->toString()); + EXPECT_EQ(stat_name_1->toString(), stat_name); +} + +TEST_F(StatNameTest, TestBadDecodes) { + { + // If a symbol doesn't exist, decoding it should trigger an ASSERT() and crash. + SymbolVec bad_symbol_vec = {0}; + EXPECT_DEATH(decodeSymbolVec(bad_symbol_vec), ""); + } + + { + StatNamePtr stat_name_1 = table_.encode("foo"); + SymbolVec vec_1 = getSymbols(stat_name_1.get()); + // Decoding a symbol vec that exists is perfectly normal... + EXPECT_NO_THROW(decodeSymbolVec(vec_1)); + stat_name_1.reset(); + // But when the StatNamePtr is destroyed, its symbols are as well. + EXPECT_DEATH(decodeSymbolVec(vec_1), ""); + } +} + +TEST_F(StatNameTest, TestDifferentStats) { + auto stat_name_1 = table_.encode("foo.bar"); + auto stat_name_2 = table_.encode("bar.foo"); + EXPECT_NE(stat_name_1->toString(), stat_name_2->toString()); + EXPECT_NE(getSymbols(stat_name_1.get()), getSymbols(stat_name_2.get())); +} + +TEST_F(StatNameTest, TestSymbolConsistency) { + auto stat_name_1 = table_.encode("foo.bar"); + auto stat_name_2 = table_.encode("bar.foo"); + // We expect the encoding of "foo" in one context to be the same as another. + SymbolVec vec_1 = getSymbols(stat_name_1.get()); + SymbolVec vec_2 = getSymbols(stat_name_2.get()); + EXPECT_EQ(vec_1[0], vec_2[1]); + EXPECT_EQ(vec_2[0], vec_1[1]); +} + +TEST_F(StatNameTest, TestSameValueOnPartialFree) { + // This should hold true for components as well. Since "foo" persists even when "foo.bar" is + // freed, we expect both instances of "foo" to have the same symbol. + StatNamePtr stat_foo = table_.encode("foo"); + StatNamePtr stat_foobar_1 = table_.encode("foo.bar"); + SymbolVec stat_foobar_1_symbols = getSymbols(stat_foobar_1.get()); + stat_foobar_1.reset(); + + StatNamePtr stat_foobar_2 = table_.encode("foo.bar"); + SymbolVec stat_foobar_2_symbols = getSymbols(stat_foobar_2.get()); + + EXPECT_EQ(stat_foobar_1_symbols[0], + stat_foobar_2_symbols[0]); // Both "foo" components have the same symbol, + // And we have no expectation for the "bar" components, because of the free pool. +} + +TEST_F(StatNameTest, FreePoolTest) { + // To ensure that the free pool is being used, we should be able to cycle through a large number + // of stats while validating that: + // a) the size of the table has not increased, and + // b) the monotonically increasing counter has not risen to more than the maximum number of + // coexisting symbols during the life of the table. + + { + StatNamePtr stat_1 = table_.encode("1a"); + StatNamePtr stat_2 = table_.encode("2a"); + StatNamePtr stat_3 = table_.encode("3a"); + StatNamePtr stat_4 = table_.encode("4a"); + StatNamePtr stat_5 = table_.encode("5a"); + EXPECT_EQ(monotonicCounter(), 5); + EXPECT_EQ(table_.size(), 5); + } + EXPECT_EQ(monotonicCounter(), 5); + EXPECT_EQ(table_.size(), 0); + + // These are different strings being encoded, but they should recycle through the same symbols as + // the stats above. + StatNamePtr stat_1 = table_.encode("1b"); + StatNamePtr stat_2 = table_.encode("2b"); + StatNamePtr stat_3 = table_.encode("3b"); + StatNamePtr stat_4 = table_.encode("4b"); + StatNamePtr stat_5 = table_.encode("5b"); + EXPECT_EQ(monotonicCounter(), 5); + EXPECT_EQ(table_.size(), 5); + + StatNamePtr stat_6 = table_.encode("6"); + EXPECT_EQ(monotonicCounter(), 6); + EXPECT_EQ(table_.size(), 6); +} + +TEST_F(StatNameTest, TestShrinkingExpectation) { + // We expect that as we free stat names, the memory used to store those underlying symbols will + // be freed. + // ::size() is a public function, but should only be used for testing. + size_t table_size_0 = table_.size(); + + StatNamePtr stat_a = table_.encode("a"); + size_t table_size_1 = table_.size(); + + StatNamePtr stat_aa = table_.encode("a.a"); + EXPECT_EQ(table_size_1, table_.size()); + + StatNamePtr stat_ab = table_.encode("a.b"); + size_t table_size_2 = table_.size(); + + StatNamePtr stat_ac = table_.encode("a.c"); + size_t table_size_3 = table_.size(); + + StatNamePtr stat_acd = table_.encode("a.c.d"); + size_t table_size_4 = table_.size(); + + StatNamePtr stat_ace = table_.encode("a.c.e"); + size_t table_size_5 = table_.size(); + EXPECT_GE(table_size_5, table_size_4); + + stat_ace.reset(); + EXPECT_EQ(table_size_4, table_.size()); + + stat_acd.reset(); + EXPECT_EQ(table_size_3, table_.size()); + + stat_ac.reset(); + EXPECT_EQ(table_size_2, table_.size()); + + stat_ab.reset(); + EXPECT_EQ(table_size_1, table_.size()); + + stat_aa.reset(); + EXPECT_EQ(table_size_1, table_.size()); + + stat_a.reset(); + EXPECT_EQ(table_size_0, table_.size()); +} + +} // namespace Stats +} // namespace Envoy