From 5ed2131d4971aa90dfeb27cb0428200c60ad85d7 Mon Sep 17 00:00:00 2001
From: arista-hpandya
Date: Mon, 10 Feb 2025 18:53:51 +0000
Subject: [PATCH] Add support for configurable debug drop monitoring feature

- Adds logic to read configuration from DEBUG_DROP_MONITOR
- Adds logic to generate syslogs if thresholds are met
- Adds logic to toggle the feature off if desired
---
 orchagent/debugcounterorch.cpp | 248 +++++++++++++++++++++++++++++++++
 orchagent/debugcounterorch.h   |  23 ++++
 orchagent/orchdaemon.cpp       |   3 +-
 3 files changed, 273 insertions(+), 1 deletion(-)

diff --git a/orchagent/debugcounterorch.cpp b/orchagent/debugcounterorch.cpp
index 882eaaf39b..da44347cfd 100644
--- a/orchagent/debugcounterorch.cpp
+++ b/orchagent/debugcounterorch.cpp
@@ -7,6 +7,8 @@
 #include
 #include "observer.h"
 
+#define DEBUG_DROP_MONITOR_POLL_TIMEOUT_SEC (60)
+
 using std::string;
 using std::unordered_map;
 using std::unordered_set;
@@ -30,6 +32,8 @@ DebugCounterOrch::DebugCounterOrch(DBConnector *db, const vector& table_
         m_stateDb(new DBConnector("STATE_DB", 0)),
         m_debugCapabilitiesTable(new Table(m_stateDb.get(), STATE_DEBUG_COUNTER_CAPABILITIES_NAME)),
         m_countersDb(new DBConnector("COUNTERS_DB", 0)),
+        m_counterTable(new Table(m_countersDb.get(), COUNTERS_TABLE)),
+        m_counterPortNameMap(new Table(m_countersDb.get(), COUNTERS_PORT_NAME_MAP)),
         m_counterNameToPortStatMap(new Table(m_countersDb.get(), COUNTERS_DEBUG_NAME_PORT_STAT_MAP)),
         m_counterNameToSwitchStatMap(new Table(m_countersDb.get(), COUNTERS_DEBUG_NAME_SWITCH_STAT_MAP))
 {
@@ -37,6 +41,114 @@ DebugCounterOrch::DebugCounterOrch(DBConnector *db, const vector& table_
     publishDropCounterCapabilities();
 
     gPortsOrch->attach(this);
+
+    // Setup timer for monitoring persistent drops in drop counters
+    auto interv = timespec { .tv_sec = DEBUG_DROP_MONITOR_POLL_TIMEOUT_SEC, .tv_nsec = 0 };
+    m_dropCountMonitorTimer = new SelectableTimer(interv);
+    auto executor = new ExecutableTimer(m_dropCountMonitorTimer, this, "DEBUG_DROP_MONITOR_POLL");
+    Orch::addExecutor(executor);
+}
+
+// Timer task of the drop monitor, run every DEBUG_DROP_MONITOR_POLL_TIMEOUT_SEC seconds once the
+// timer is started. For each port level debug counter, the drops seen on every port since the last
+// poll are compared against m_drop_count_threshold and polls exceeding it are stored as violations.
+// A syslog is generated when more than m_incident_count_threshold violations are within m_window.
+void DebugCounterOrch::doTask(SelectableTimer &timer)
+{
+    SWSS_LOG_ENTER();
+
+    // The port name map does not depend on the debug counter, so read it once per poll
+    std::vector<FieldValueTuple> ports;
+    m_counterPortNameMap->get("", ports);
+
+    // Get current timestamp
+    const auto current_timestamp = std::chrono::system_clock::now();
+
+    // Get the configured debug drop counters
+    for (const auto& debug_counter: debug_counters)
+    {
+        DebugCounter *counter = debug_counter.second.get();
+        string counter_stat = counter->getDebugCounterSAIStat();
+        string counter_type = counter->getCounterType();
+
+        // We only monitor port level debug counters
+        if (counter_type != PORT_INGRESS_DROPS && counter_type != PORT_EGRESS_DROPS)
+            continue;
+
+        auto& prev_drop_counts = m_prevDropCountMap[counter_stat];
+        auto& port_violations = m_violationsMap[counter_stat];
+
+        // Iterate over all port counters
+        for (const auto& port: ports)
+        {
+            // Get the drop value associated with the debug counter
+            const string& port_counter_oid = port.second;
+            string stat_drop_count;
+            if (!m_counterTable->hget(port_counter_oid, counter_stat, stat_drop_count))
+            {
+                // No value is available for this port, so there is nothing to compare
+                continue;
+            }
+
+            // Parse into the full uint64_t range. An unparsable value is logged and skipped so
+            // that no exception escapes the timer task
+            uint64_t curr_drop_count = 0;
+            try
+            {
+                curr_drop_count = std::stoull(stat_drop_count);
+            }
+            catch (const std::exception& e)
+            {
+                SWSS_LOG_ERROR("Invalid drop count '%s' for %s in debug counter %s", stat_drop_count.c_str(), counter_stat.c_str(), port_counter_oid.c_str());
+                continue;
+            }
+
+            // Get the previous drop count, a port polled for the first time starts from 0
+            uint64_t& prev_drop_count = prev_drop_counts[port_counter_oid];
+
+            // Sanity check to verify drop counter content
+            if (curr_drop_count < prev_drop_count)
+            {
+                SWSS_LOG_ERROR("%s cannot decrease over time. Anamoly detected in debug counter %s", counter_stat.c_str(), port_counter_oid.c_str());
+
+                // Restart from the current value and move on to the next port. Returning here
+                // would skip the remaining ports and report the same port again on every poll
+                prev_drop_count = curr_drop_count;
+                continue;
+            }
+
+            // Get the delta between drop counts since last poll and update prev_drop_count map
+            const uint64_t delta_drop_count = curr_drop_count - prev_drop_count;
+            prev_drop_count = curr_drop_count;
+
+            // Check if drop count is greater than drop count threshold
+            auto& violations = port_violations[port_counter_oid];
+            if (delta_drop_count > m_drop_count_threshold)
+            {
+                violations.push(current_timestamp);
+            }
+
+            // Remove violations outside time window. Violations are pushed in chronological
+            // order, so the first one inside the window ends the cleanup
+            while (!violations.empty())
+            {
+                auto time_delta = std::chrono::duration_cast<std::chrono::seconds>(current_timestamp - violations.front()).count();
+                if (time_delta <= m_window)
+                    break;
+
+                violations.pop();
+            }
+
+            // Generate syslog for persistent drops
+            if (violations.size() > m_incident_count_threshold)
+            {
+                SWSS_LOG_ERROR("%s: Persistent packet drops detected on %s", debug_counter.first.c_str(), port.first.c_str());
+
+                // Clear all the current violations for this counter
+                violations = std::queue<Timestamp>();
+            }
+        }
+    }
 }
 
 DebugCounterOrch::~DebugCounterOrch(void)
@@ -90,6 +202,7 @@ void DebugCounterOrch::update(SubjectType type, void *cntx)
     }
 }
 
+
 // doTask processes updates from the consumer and modifies the state of the
 // following components:
 // 1) The ASIC, by creating, modifying, and deleting debug counters
@@ -193,6 +306,25 @@ void DebugCounterOrch::doTask(Consumer& consumer)
                 SWSS_LOG_ERROR("Unknown operation type %s\n", op.c_str());
             }
         }
+        else if (table_name == CFG_DEBUG_DROP_MONITOR_TABLE_NAME)
+        {
+            if (op == SET_COMMAND)
+            {
+                try
+                {
+                    if (key == "CONFIG")
+                        task_status = updateDropMonitorConfig(values);
+                }
+                catch (const std::runtime_error& e)
+                {
+                    task_status = task_process_status::task_failed;
+                }
+            }
+            else
+            {
+                SWSS_LOG_ERROR("Unknown operation type %s\n", op.c_str());
+            }
+        }
         else
         {
             SWSS_LOG_ERROR("Received update from unknown table '%s'", table_name.c_str());
@@ -309,6 +441,23 @@ task_process_status DebugCounterOrch::installDebugCounter(const string& counter_
     addFreeCounter(counter_name, counter_type);
     reconcileFreeDropCounters(counter_name);
 
+    // Add port counters to drop counter monitor
+    if (counter_type == PORT_INGRESS_DROPS || counter_type == PORT_EGRESS_DROPS)
+    {
+        auto counter = debug_counters[counter_name].get();
+        string counter_stat = counter->getDebugCounterSAIStat();
+        bool counter_in_violations_map = (m_violationsMap.find(counter_stat) != m_violationsMap.end());
+        bool counter_in_prev_drop_count_map = (m_prevDropCountMap.find(counter_stat) != m_prevDropCountMap.end());
+        if (!counter_in_violations_map)
+            m_violationsMap[counter_stat] = {};
+        else
+            SWSS_LOG_WARN("%s already exists in drop counter violations map, and cannot be added", counter_stat.c_str());
+        if (!counter_in_prev_drop_count_map)
+            m_prevDropCountMap[counter_stat] = {};
+        else
+            SWSS_LOG_WARN("%s already exists in drop counter m_prevDropCountMap, and cannot be added", counter_stat.c_str());
+    }
+
     SWSS_LOG_NOTICE("Successfully created drop counter %s", counter_name.c_str());
     return task_process_status::task_success;
 }
@@ -336,6 +485,22 @@ task_process_status DebugCounterOrch::uninstallDebugCounter(const string& counte
     string counter_type = counter->getCounterType();
     string counter_stat = counter->getDebugCounterSAIStat();
 
+    // Delete port counters from drop counter monitor
+    if (counter_type == PORT_INGRESS_DROPS || counter_type == PORT_EGRESS_DROPS)
+    {
+        bool counter_in_violations_map = (m_violationsMap.find(counter_stat) != m_violationsMap.end());
+        bool counter_in_prev_drop_count_map = (m_prevDropCountMap.find(counter_stat) != m_prevDropCountMap.end());
+        if (counter_in_violations_map)
+            m_violationsMap.erase(m_violationsMap.find(counter_stat));
+        else
+            SWSS_LOG_WARN("%s does not exist in drop counter violations map, and cannot be deleted", counter_stat.c_str());
+
+        if (counter_in_prev_drop_count_map)
+            m_prevDropCountMap.erase(m_prevDropCountMap.find(counter_stat));
+        else
+            SWSS_LOG_WARN("%s does not exist in drop counter m_prevDropCountMap, and cannot be deleted", counter_stat.c_str());
+    }
+
     uninstallDebugFlexCounters(counter_type, counter_stat);
 
     if (counter_type == PORT_INGRESS_DROPS || counter_type == PORT_EGRESS_DROPS)
@@ -422,6 +587,89 @@ task_process_status DebugCounterOrch::removeDropReason(const string& counter_nam
     return task_success;
 }
 
+// This function is used to configure the drop counter monitor parameters from DEBUG_DROP_MONITOR table
+// in CONFIG_DB. The recognized fields are window, drop_count_threshold, incident_count_threshold and
+// status. Returns task_failed as soon as a field is unknown or carries an invalid value, in which case
+// the fields processed before it have already been applied.
+task_process_status DebugCounterOrch::updateDropMonitorConfig(const std::vector<FieldValueTuple>& configs)
+{
+    SWSS_LOG_ENTER();
+
+    for (const auto& config : configs)
+    {
+        string config_name = config.first;
+        string config_value = config.second;
+
+        // Update the appropriate config
+        try
+        {
+            if (config_name == "window")
+            {
+                int value = stoi(config_value);
+                if (value <= 0)
+                {
+                    SWSS_LOG_ERROR("The window size should be positive. Window size supplied: %s", config_value.c_str());
+                    return task_process_status::task_failed;
+                }
+                m_window = uint32_t(value);
+            }
+            else if (config_name == "drop_count_threshold")
+            {
+                int value = stoi(config_value);
+                if (value < 0)
+                {
+                    SWSS_LOG_ERROR("The drop count threshold should be non-negative. Drop count threshold supplied: %s", config_value.c_str());
+                    return task_process_status::task_failed;
+                }
+                m_drop_count_threshold = uint32_t(value);
+            }
+            else if (config_name == "incident_count_threshold")
+            {
+                int value = stoi(config_value);
+                if (value < 0)
+                {
+                    SWSS_LOG_ERROR("The incident count threshold should be non-negative. Incident count threshold supplied: %s", config_value.c_str());
+                    return task_process_status::task_failed;
+                }
+                m_incident_count_threshold = uint32_t(value);
+            }
+            else if (config_name == "status")
+            {
+                // Status is responsible for controlling the dropCountMonitor timer
+                if (config_value == "enabled")
+                {
+                    m_dropCountMonitorTimer->start();
+                    SWSS_LOG_INFO("The drop counter monitor feature has been enabled");
+                }
+                else if (config_value == "disabled")
+                {
+                    m_dropCountMonitorTimer->stop();
+                    SWSS_LOG_INFO("The drop counter monitor feature has been disabled");
+                }
+                else
+                {
+                    SWSS_LOG_ERROR("The status of drop counter monitor was not recognized: %s. Accepted values are enabled/disabled.", config_value.c_str());
+                    return task_process_status::task_failed;
+                }
+            }
+            else
+            {
+                SWSS_LOG_ERROR("Invalid debug drop counter configuration applied: config_name: %s, config_value: %s", config_name.c_str(), config_value.c_str());
+                return task_process_status::task_failed;
+            }
+        }
+        // stoi reports bad input through std::invalid_argument and std::out_of_range, which are
+        // not derived from std::runtime_error, so catch std::exception to cover them
+        catch (const std::exception& e)
+        {
+            SWSS_LOG_ERROR("Encountered an error when updating DEBUG_DROP_MONITOR. config_name: %s, config_value: %s", config_name.c_str(), config_value.c_str());
+            return task_process_status::task_failed;
+        }
+    }
+
+    SWSS_LOG_NOTICE("Successfully updated debug drop counter configuration");
+    return task_process_status::task_success;
+}
 
 // Free Table Management Functions START HERE ------------------------------------------------------
 // Note that entries will remain in the table until at least one drop reason is added to the counter.
diff --git a/orchagent/debugcounterorch.h b/orchagent/debugcounterorch.h
index edfb5d98e0..c7ec77810e 100644
--- a/orchagent/debugcounterorch.h
+++ b/orchagent/debugcounterorch.h
@@ -3,6 +3,7 @@
 
 #include
 #include
+#include
 #include
 #include
 
@@ -11,6 +12,7 @@
 #include "debug_counter.h"
 #include "drop_counter.h"
 #include "observer.h"
+#include "timer.h"
 
 extern "C" {
 #include "sai.h"
@@ -19,6 +21,7 @@ extern "C" {
 #define DEBUG_COUNTER_FLEX_COUNTER_GROUP "DEBUG_COUNTER"
 
 using DebugCounterMap = std::unordered_map>;
+using Timestamp = std::chrono::system_clock::time_point; // Time at which a drop monitor violation was recorded
 
 // DebugCounterOrch is an orchestrator for managing debug counters. It handles
 // the creation, deletion, and modification of debug counters.
@@ -28,6 +31,7 @@ class DebugCounterOrch: public Orch, public Observer
     DebugCounterOrch(swss::DBConnector *db, const std::vector& table_names, int poll_interval);
     virtual ~DebugCounterOrch(void);
 
+    virtual void doTask(swss::SelectableTimer &timer); // Periodic poll of the persistent drop monitor
     void doTask(Consumer& consumer);
     void update(SubjectType, void *cntx);
 
@@ -40,6 +44,7 @@ class DebugCounterOrch: public Orch, public Observer
     task_process_status uninstallDebugCounter(const std::string& counter_name);
     task_process_status addDropReason(const std::string& counter_name, const std::string& drop_reason);
     task_process_status removeDropReason(const std::string& counter_name, const std::string& drop_reason);
+    task_process_status updateDropMonitorConfig(const std::vector& configs); // Applies the DEBUG_DROP_MONITOR fields
 
     // Free Table Management Functions
     void addFreeCounter(const std::string& counter_name, const std::string& counter_type);
@@ -80,6 +85,8 @@ class DebugCounterOrch: public Orch, public Observer
     std::shared_ptr m_debugCapabilitiesTable = nullptr;
 
     std::shared_ptr m_countersDb = nullptr;
+    std::shared_ptr m_counterTable = nullptr; // COUNTERS_TABLE, read by the drop monitor for the drop counts
+    std::shared_ptr m_counterPortNameMap = nullptr; // COUNTERS_PORT_NAME_MAP, read by the drop monitor for the ports
     std::shared_ptr m_counterNameToPortStatMap = nullptr;
     std::shared_ptr m_counterNameToSwitchStatMap = nullptr;
 
@@ -87,6 +94,22 @@ class DebugCounterOrch: public Orch, public Observer
     std::unordered_set supported_ingress_drop_reasons;
     std::unordered_set supported_egress_drop_reasons;
 
+    // Data members for persistent drop monitoring (defaults apply until DEBUG_DROP_MONITOR overrides them)
+    uint32_t m_window = 900; /* Window size in seconds */
+    uint32_t m_drop_count_threshold = 100; /* Drops above this threshold are classified as incidents */
+    uint32_t m_incident_count_threshold = 2; /* Incidents above this threshold will trigger an entry to syslog */
+    swss::SelectableTimer *m_dropCountMonitorTimer = nullptr; /* Poll timer, started/stopped by the "status" field */
+
+    // Incidents/violations, stored as queues of timestamps. An incident is a poll in which the
+    // drops since the previous poll exceed the drop_count_threshold. The first key is the drop
+    // counter stat being tracked, the inner key is the entry of the port which experienced the
+    // drops (the value stored for the port in COUNTERS_PORT_NAME_MAP).
+    std::unordered_map>> m_violationsMap;
+
+    // Drop counts seen at the previous poll. The first key is the drop counter stat being
+    // tracked and the inner key is the port entry, as in m_violationsMap.
+    std::unordered_map> m_prevDropCountMap;
+
     FlexCounterStatManager flex_counter_manager;
 
     DebugCounterMap debug_counters;
diff --git a/orchagent/orchdaemon.cpp b/orchagent/orchdaemon.cpp
index eef7a32fc8..fbe208a1d1 100644
--- a/orchagent/orchdaemon.cpp
+++ b/orchagent/orchdaemon.cpp
@@ -377,7 +377,8 @@ bool OrchDaemon::init()
 
     vector debug_counter_tables = {
         CFG_DEBUG_COUNTER_TABLE_NAME,
-        CFG_DEBUG_COUNTER_DROP_REASON_TABLE_NAME
+        CFG_DEBUG_COUNTER_DROP_REASON_TABLE_NAME,
+        CFG_DEBUG_DROP_MONITOR_TABLE_NAME
     };
 
     gDebugCounterOrch = new DebugCounterOrch(m_configDb, debug_counter_tables, 1000);