[ML] Anomaly detection for multiple bucket features (#175)

elastic · Aug 17, 2018 · 548222b · 548222b
1 parent 88095c1
commit 548222b
Show file tree

Hide file tree

Showing 59 changed files with 2,685 additions and 1,413 deletions.
diff --git a/bin/autodetect/CCmdLineParser.cc b/bin/autodetect/CCmdLineParser.cc
@@ -52,7 +52,6 @@ bool CCmdLineParser::parse(int argc,
                            bool& memoryUsage,
                            std::size_t& bucketResultsDelay,
                            bool& multivariateByFields,
-                           std::string& multipleBucketspans,
                            bool& perPartitionNormalization,
                            TStrVec& clauseTokens) {
     try {
@@ -117,8 +116,6 @@ bool CCmdLineParser::parse(int argc,
                         "The numer of half buckets to store before choosing which overlapping bucket has the biggest anomaly")
             ("multivariateByFields",
                         "Optional flag to enable multi-variate analysis of correlated by fields")
-            ("multipleBucketspans",  boost::program_options::value<std::string>(),
-                        "Optional comma-separated list of additional bucketspans - must be direct multiples of the main bucketspan")
             ("perPartitionNormalization",
                         "Optional flag to enable per partition normalization")
         ;
@@ -234,9 +231,6 @@ bool CCmdLineParser::parse(int argc,
         if (vm.count("multivariateByFields") > 0) {
             multivariateByFields = true;
         }
-        if (vm.count("multipleBucketspans") > 0) {
-            multipleBucketspans = vm["multipleBucketspans"].as<std::string>();
-        }
         if (vm.count("perPartitionNormalization") > 0) {
             perPartitionNormalization = true;
         }

diff --git a/bin/autodetect/CCmdLineParser.h b/bin/autodetect/CCmdLineParser.h
@@ -64,7 +64,6 @@ class CCmdLineParser {
                       bool& memoryUsage,
                       std::size_t& bucketResultsDelay,
                       bool& multivariateByFields,
-                      std::string& multipleBucketspans,
                       bool& perPartitionNormalization,
                       TStrVec& clauseTokens);
 

diff --git a/bin/autodetect/Main.cc b/bin/autodetect/Main.cc
@@ -88,7 +88,6 @@ int main(int argc, char** argv) {
     bool memoryUsage(false);
     std::size_t bucketResultsDelay(0);
     bool multivariateByFields(false);
-    std::string multipleBucketspans;
     bool perPartitionNormalization(false);
     TStrVec clauseTokens;
     if (ml::autodetect::CCmdLineParser::parse(
@@ -97,10 +96,9 @@ int main(int argc, char** argv) {
             summaryCountFieldName, delimiter, lengthEncodedInput, timeField,
             timeFormat, quantilesStateFile, deleteStateFiles, persistInterval,
             maxQuantileInterval, inputFileName, isInputFileNamedPipe, outputFileName,
-            isOutputFileNamedPipe, restoreFileName, isRestoreFileNamedPipe,
-            persistFileName, isPersistFileNamedPipe, maxAnomalyRecords, memoryUsage,
-            bucketResultsDelay, multivariateByFields, multipleBucketspans,
-            perPartitionNormalization, clauseTokens) == false) {
+            isOutputFileNamedPipe, restoreFileName, isRestoreFileNamedPipe, persistFileName,
+            isPersistFileNamedPipe, maxAnomalyRecords, memoryUsage, bucketResultsDelay,
+            multivariateByFields, perPartitionNormalization, clauseTokens) == false) {
         return EXIT_FAILURE;
     }
 
@@ -147,7 +145,7 @@ int main(int argc, char** argv) {
     ml::model::CAnomalyDetectorModelConfig modelConfig =
         ml::model::CAnomalyDetectorModelConfig::defaultConfig(
             bucketSpan, summaryMode, summaryCountFieldName, latency,
-            bucketResultsDelay, multivariateByFields, multipleBucketspans);
+            bucketResultsDelay, multivariateByFields);
     modelConfig.perPartitionNormalization(perPartitionNormalization);
     modelConfig.detectionRules(ml::model::CAnomalyDetectorModelConfig::TIntDetectionRuleVecUMapCRef(
         fieldConfig.detectionRules()));

diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
@@ -28,5 +28,20 @@
 
 //=== Regressions
 
-//=== Known Issues
-////
+== {es} version 6.5.0
+
+//=== Breaking Changes
+
+//=== Deprecations
+
+//=== New Features
+
+=== Enhancements
+
+Perform anomaly detection on features derived from multiple bucket values to improve the robustness
+of detection with respect to misconfigured bucket lengths and to improve detection of long-lasting
+anomalies. (See {pull}175[#175].)
+
+//=== Bug Fixes
+
+//=== Regressions
diff --git a/include/maths/CBasicStatistics.h b/include/maths/CBasicStatistics.h
@@ -149,6 +149,7 @@ class MATHS_EXPORT CBasicStatistics {
     template<typename T, unsigned int ORDER>
     struct SSampleCentralMoments : public std::unary_function<T, void> {
         using TCoordinate = typename SCoordinate<T>::Type;
+        using TValue = T;
 
         //! See core::CMemory.
         static bool dynamicSizeAlwaysZero() {
@@ -1480,17 +1481,6 @@ class MATHS_EXPORT CBasicStatistics {
         //! The set maximum.
         COrderStatisticsStack<T, 1, GREATER> m_Max;
     };
-
-    // Friends
-    template<typename T>
-    friend std::ostream&
-    operator<<(std::ostream& o, const CBasicStatistics::SSampleCentralMoments<T, 1u>&);
-    template<typename T>
-    friend std::ostream&
-    operator<<(std::ostream& o, const CBasicStatistics::SSampleCentralMoments<T, 2u>&);
-    template<typename T>
-    friend std::ostream&
-    operator<<(std::ostream& o, const CBasicStatistics::SSampleCentralMoments<T, 3u>&);
 };
 
 template<typename T>
@@ -1596,6 +1586,23 @@ template<typename U>
 void CBasicStatistics::SSampleCentralMoments<T, ORDER>::add(const U& x, const TCoordinate& n) {
     basic_statistics_detail::SCentralMomentsCustomAdd<U>::add(x, n, *this);
 }
+
+//! \brief Defines a promoted type for a SSampleCentralMoments.
+//!
+//! \see CTypeConversions.h for details.
+template<typename T, unsigned int N>
+struct SPromoted<CBasicStatistics::SSampleCentralMoments<T, N>> {
+    using Type = CBasicStatistics::SSampleCentralMoments<typename SPromoted<T>::Type, N>;
+};
+
+//! \brief Defines SSampleCentralMoments on a suitable floating point type.
+//!
+//! \see CTypeConversions.h for details.
+template<typename T, unsigned int N, typename U>
+struct SFloatingPoint<CBasicStatistics::SSampleCentralMoments<T, N>, U> {
+    using Type =
+        CBasicStatistics::SSampleCentralMoments<typename SFloatingPoint<T, U>::Type, N>;
+};
 }
 }
 

diff --git a/include/maths/CBasicStatisticsPersist.h b/include/maths/CBasicStatisticsPersist.h
@@ -48,6 +48,16 @@ template<typename T, std::size_t N>
 bool stringToType(const std::string& str, CSymmetricMatrixNxN<T, N>& value) {
     return value.fromDelimited(str);
 }
+//! Function to do conversion from string to a vector.
+template<typename T>
+bool stringToType(const std::string& str, CVector<T>& value) {
+    return value.fromDelimited(str);
+}
+//! Function to do conversion from string to a symmetric matrix.
+template<typename T>
+bool stringToType(const std::string& str, CSymmetricMatrix<T>& value) {
+    return value.fromDelimited(str);
+}
 
 //! Function to do conversion to a string.
 template<typename T>
@@ -72,6 +82,16 @@ template<typename T, std::size_t N>
 inline std::string typeToString(const CSymmetricMatrixNxN<T, N>& value) {
     return value.toDelimited();
 }
+//! Function to do conversion to a string from a vector.
+template<typename T>
+inline std::string typeToString(const CVector<T>& value) {
+    return value.toDelimited();
+}
+//! Function to do conversion to a string from a symmetric matrix.
+template<typename T>
+inline std::string typeToString(const CSymmetricMatrix<T>& value) {
+    return value.toDelimited();
+}
 }
 
 template<typename T, unsigned int ORDER>

diff --git a/include/maths/CLinearAlgebra.h b/include/maths/CLinearAlgebra.h
@@ -928,7 +928,6 @@ class CVectorNx1 : private boost::equality_comparable< CVectorNx1<T, N>,
 
 public:
     using TArray = T[N];
-    using TVec = std::vector<T>;
     using TBoostArray = boost::array<T, N>;
     using TConstIterator = typename TBoostArray::const_iterator;
 
@@ -950,21 +949,24 @@ class CVectorNx1 : private boost::equality_comparable< CVectorNx1<T, N>,
     }
 
     //! Construct from a boost array.
-    explicit CVectorNx1(const boost::array<T, N>& a) {
+    template<typename U>
+    explicit CVectorNx1(const boost::array<U, N>& a) {
         for (std::size_t i = 0u; i < N; ++i) {
             TBase::m_X[i] = a[i];
         }
     }
 
     //! Construct from a vector.
-    explicit CVectorNx1(const TVec& v) {
+    template<typename U>
+    explicit CVectorNx1(const std::vector<U>& v) {
         for (std::size_t i = 0u; i < N; ++i) {
             TBase::m_X[i] = v[i];
         }
     }
 
     //! Construct from a vector.
-    explicit CVectorNx1(const core::CSmallVectorBase<T>& v) {
+    template<typename U>
+    explicit CVectorNx1(const core::CSmallVectorBase<U>& v) {
         for (std::size_t i = 0u; i < N; ++i) {
             TBase::m_X[i] = v[i];
         }
@@ -1244,10 +1246,14 @@ class CVector : private boost::equality_comparable< CVector<T>,
     }
 
     //! Construct from a vector.
-    explicit CVector(const TArray& v) { TBase::m_X = v; }
+    template<typename U>
+    explicit CVector(const std::vector<U>& v) {
+        TBase::m_X.assign(v.begin(), v.end()); // "m_X = v" cannot compile when U != T
+    }
 
     //! Construct from a vector.
-    explicit CVector(const core::CSmallVectorBase<T>& v) {
+    template<typename U>
+    explicit CVector(const core::CSmallVectorBase<U>& v) {
         TBase::m_X.assign(v.begin(), v.end());
     }
 

diff --git a/include/maths/CModel.h b/include/maths/CModel.h
@@ -104,8 +104,6 @@ class MATHS_EXPORT CModelAddSamplesParams {
     using TDouble2VecWeightsAryVec = std::vector<maths_t::TDouble2VecWeightsAry>;
 
 public:
-    CModelAddSamplesParams();
-
     //! Set whether or not the data are integer valued.
     CModelAddSamplesParams& integer(bool integer);
     //! Get the data type.
@@ -133,15 +131,15 @@ class MATHS_EXPORT CModelAddSamplesParams {
 
 private:
     //! The data type.
-    maths_t::EDataType m_Type;
+    maths_t::EDataType m_Type = maths_t::E_MixedData;
     //! True if the data are non-negative false otherwise.
-    bool m_IsNonNegative;
+    bool m_IsNonNegative = false;
     //! The propagation interval.
-    double m_PropagationInterval;
+    double m_PropagationInterval = 1.0;
     //! The trend sample weights.
-    const TDouble2VecWeightsAryVec* m_TrendWeights;
+    const TDouble2VecWeightsAryVec* m_TrendWeights = nullptr;
     //! The prior sample weights.
-    const TDouble2VecWeightsAryVec* m_PriorWeights;
+    const TDouble2VecWeightsAryVec* m_PriorWeights = nullptr;
 };
 
 //! \brief The extra parameters needed by CModel::probability.
@@ -178,6 +176,8 @@ class MATHS_EXPORT CModelProbabilityParams {
 
     //! Add whether a value's bucket is empty.
     CModelProbabilityParams& addBucketEmpty(const TBool2Vec& empty);
+    //! Set whether or not the values' bucket is empty.
+    CModelProbabilityParams& bucketEmpty(const TBool2Vec1Vec& empty);
     //! Get whether the values' bucket is empty.
     const TBool2Vec1Vec& bucketEmpty() const;
 
@@ -200,14 +200,19 @@ class MATHS_EXPORT CModelProbabilityParams {
     //! Get the most anomalous correlate if there is one.
     TOptionalSize mostAnomalousCorrelate() const;
 
-    //! Set whether or not to update the anomaly model.
-    CModelProbabilityParams& updateAnomalyModel(bool update);
-    //! Get whether or not to update the anomaly model.
-    bool updateAnomalyModel() const;
+    //! Set whether or not to use multibucket features.
+    CModelProbabilityParams& useMultibucketFeatures(bool use);
+    //! Get whether or not to use multibucket features.
+    bool useMultibucketFeatures() const;
+
+    //! Set whether or not to use the anomaly model.
+    CModelProbabilityParams& useAnomalyModel(bool use);
+    //! Get whether or not to use the anomaly model.
+    bool useAnomalyModel() const;
 
 private:
     //! The entity tag (if relevant otherwise 0).
-    std::size_t m_Tag;
+    std::size_t m_Tag = 0;
     //! The coordinates' probability calculations.
     TProbabilityCalculation2Vec m_Calculations;
     //! The confidence interval to use when detrending.
@@ -220,8 +225,41 @@ class MATHS_EXPORT CModelProbabilityParams {
     TSize2Vec m_Coordinates;
     //! The most anomalous coordinate (if there is one).
     TOptionalSize m_MostAnomalousCorrelate;
-    //! Whether or not to update the anomaly model.
-    bool m_UpdateAnomalyModel;
+    //! Whether or not to use multibucket features.
+    bool m_UseMultibucketFeatures = true;
+    //! Whether or not to use the anomaly model.
+    bool m_UseAnomalyModel = true;
+};
+
+//! \brief Describes the result of the model probability calculation.
+struct MATHS_EXPORT SModelProbabilityResult {
+    using TDouble4Vec = core::CSmallVector<double, 4>;
+    using TSize1Vec = core::CSmallVector<std::size_t, 1>;
+    using TTail2Vec = core::CSmallVector<maths_t::ETail, 2>;
+
+    //! \brief Wraps up a feature label and probability.
+    struct MATHS_EXPORT SFeatureProbability {
+        using TStrCRef = boost::reference_wrapper<const std::string>;
+        SFeatureProbability();
+        SFeatureProbability(const std::string& label, double probability);
+        TStrCRef s_Label;
+        double s_Probability = 1.0;
+    };
+    using TFeatureProbability4Vec = core::CSmallVector<SFeatureProbability, 4>;
+
+    //! The overall result probability.
+    double s_Probability = 1.0;
+    //! True if the probability depends on the correlation between two
+    //! time series and false otherwise.
+    bool s_Conditional = false;
+    //! The probabilities for each individual feature.
+    TFeatureProbability4Vec s_FeatureProbabilities;
+    //! The tail of the current bucket probability.
+    TTail2Vec s_Tail;
+    //! The identifier of the time series correlated with this one which
+    //! has the smallest probability in the current bucket (if and only
+    //! if the result depends on the correlation structure).
+    TSize1Vec s_MostAnomalousCorrelate;
 };
 
 //! \brief The model interface.
@@ -355,10 +393,7 @@ class MATHS_EXPORT CModel {
     virtual bool probability(const CModelProbabilityParams& params,
                              const TTime2Vec1Vec& time,
                              const TDouble2Vec1Vec& value,
-                             double& probability,
-                             TTail2Vec& tail,
-                             bool& conditional,
-                             TSize1Vec& mostAnomalousCorrelate) const = 0;
+                             SModelProbabilityResult& result) const = 0;
 
     //! Get the Winsorisation weight to apply to \p value,
     //! if appropriate.
@@ -499,14 +534,11 @@ class MATHS_EXPORT CModelStub : public CModel {
                           const TForecastPushDatapointFunc& forecastPushDataPointFunc,
                           std::string& messageOut);
 
-    //! Returns 1.0.
+    //! Returns true.
     virtual bool probability(const CModelProbabilityParams& params,
                              const TTime2Vec1Vec& time,
                              const TDouble2Vec1Vec& value,
-                             double& probability,
-                             TTail2Vec& tail,
-                             bool& conditional,
-                             TSize1Vec& mostAnomalousCorrelate) const;
+                             SModelProbabilityResult& result) const;
 
     //! Returns empty.
     virtual TDouble2Vec