[ML] Decrease the memory used by distribution models (elastic#146)
tveasey committed Jul 23, 2018
1 parent 5cf2326 commit eb8f1b7
Showing 22 changed files with 160 additions and 152 deletions.
1 change: 1 addition & 0 deletions docs/CHANGELOG.asciidoc
@@ -26,6 +26,7 @@ Improve partition analysis memory usage ({pull}97[#97])
Reduce model memory by storing state for periodicity testing in a compressed format ({pull}100[#100])
Improve the accuracy of model memory control ({pull}122[#122])
Improve adaption of the modelling of cyclic components to very localised features ({pull}134[#134])
+Reduce the memory consumed by distribution models ({pull}146[#146])

Forecasting of Machine Learning job time series is now supported for large jobs by temporarily storing
model state on disk ({pull}89[#89])
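
The change is mechanical throughout the diff below: member variables holding distribution-model parameters move from double to CFloatStorage, the maths library's float-backed storage type, which roughly halves each converted member's footprint. A minimal sketch of the idea, assuming a simplified stand-in class (FloatStorage and the Gamma* struct names below are illustrative, not the library's types):

    #include <iostream>

    // Simplified stand-in for CFloatStorage: keep a float internally, convert
    // to and from double at the boundary so arithmetic call sites are unchanged.
    class FloatStorage {
    public:
        FloatStorage(double value = 0.0) : m_Value(static_cast<float>(value)) {}
        operator double() const { return m_Value; }

    private:
        float m_Value;
    };

    // Four hyperparameters stored as double versus float-backed storage.
    struct GammaParamsDouble {
        double offset, offsetMargin, priorShape, priorRate;
    };
    struct GammaParamsFloat {
        FloatStorage offset, offsetMargin, priorShape, priorRate;
    };

    int main() {
        // Typically prints "32 vs 16" on a 64-bit platform.
        std::cout << sizeof(GammaParamsDouble) << " vs " << sizeof(GammaParamsFloat) << '\n';
        return 0;
    }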
8 changes: 4 additions & 4 deletions include/maths/CGammaRateConjugate.h
@@ -371,10 +371,10 @@ class MATHS_EXPORT CGammaRateConjugate : public CPrior {
//! We assume that the data are described by \f$X = Y - u\f$, where
//! \f$u\f$ is a constant and \f$Y\f$ is gamma distributed. This allows
//! us to model data with negative values greater than \f$-u\f$.
-double m_Offset;
+CFloatStorage m_Offset;

//! The margin between the smallest value and the support left end.
-double m_OffsetMargin;
+CFloatStorage m_OffsetMargin;

//! The maximum likelihood estimate of the shape parameter.
double m_LikelihoodShape;
@@ -386,10 +386,10 @@ class MATHS_EXPORT CGammaRateConjugate : public CPrior {
TMeanVarAccumulator m_SampleMoments;

//! The initial shape parameter of the prior gamma distribution.
-double m_PriorShape;
+CFloatStorage m_PriorShape;

//! The initial rate parameter of the prior gamma distribution.
-double m_PriorRate;
+CFloatStorage m_PriorRate;
};
}
}
10 changes: 5 additions & 5 deletions include/maths/CLogNormalMeanPrecConjugate.h
@@ -390,22 +390,22 @@ class MATHS_EXPORT CLogNormalMeanPrecConjugate : public CPrior {
//! We assume that the data are described by \f$X = e^Y - u\f$, where
//! \f$u\f$ is a constant and \f$Y\f$ is normally distributed. This
//! allows us to model data with negative values greater than \f$-u\f$.
-double m_Offset;
+CFloatStorage m_Offset;

//! The margin between the smallest value and the support left end.
-double m_OffsetMargin;
+CFloatStorage m_OffsetMargin;

//! The mean of the prior conditional distribution for the mean of the
//! exponentiated normal (conditioned on its precision).
-double m_GaussianMean;
+CFloatStorage m_GaussianMean;

//! The precision of the prior conditional distribution for the mean
//! of the exponentiated normal (conditioned on its precision).
-double m_GaussianPrecision;
+CFloatStorage m_GaussianPrecision;

//! The shape of the marginal gamma distribution for the precision of the
//! exponentiated normal.
-double m_GammaShape;
+CFloatStorage m_GammaShape;

//! The rate of the marginal gamma distribution for the precision of the
//! exponentiated normal.
29 changes: 14 additions & 15 deletions include/maths/CNaturalBreaksClassifier.h
@@ -98,8 +98,8 @@ class MATHS_EXPORT CNaturalBreaksClassifier {
public:
using TSizeVec = std::vector<std::size_t>;
using TDoubleVec = std::vector<double>;
-using TDoubleDoublePr = std::pair<double, double>;
-using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
+using TFloatFloatPr = std::pair<CFloatStorage, CFloatStorage>;
+using TFloatFloatPrVec = std::vector<TFloatFloatPr>;
using TDoubleTuple = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using TDoubleTupleVec = std::vector<TDoubleTuple>;
using TTuple = CBasicStatistics::SSampleMeanVar<CFloatStorage>::TAccumulator;
@@ -270,16 +270,6 @@ class MATHS_EXPORT CNaturalBreaksClassifier {
private:
using TSizeSizePr = std::pair<std::size_t, std::size_t>;

-private:
-//! Implementation called by naturalBreaks with explicit
-//! tuple types.
-template<typename TUPLE>
-static bool naturalBreaksImpl(const std::vector<TUPLE>& categories,
-std::size_t n,
-std::size_t p,
-EObjective target,
-TSizeVec& result);
-
private:
//! The minimum permitted size for the classifier.
static const std::size_t MINIMUM_SPACE;
@@ -295,6 +285,15 @@ class MATHS_EXPORT CNaturalBreaksClassifier {
double minimumCategoryCount,
TTupleVec& categories);

+//! Implementation called by naturalBreaks with explicit
+//! tuple types.
+template<typename TUPLE>
+static bool naturalBreaksImpl(const std::vector<TUPLE>& categories,
+std::size_t n,
+std::size_t p,
+EObjective target,
+TSizeVec& result);
+
//! Reduce the number of tuples until we satisfy the space constraint.
void reduce();

@@ -323,16 +322,16 @@ class MATHS_EXPORT CNaturalBreaksClassifier {
std::size_t m_Space;

//! The rate at which the categories lose information.
-double m_DecayRate;
+CFloatStorage m_DecayRate;

//! The minimum permitted count for a category.
-double m_MinimumCategoryCount;
+CFloatStorage m_MinimumCategoryCount;

//! The categories we are maintaining.
TTupleVec m_Categories;

//! A buffer of the points added while the space constraint is satisfied.
-TDoubleDoublePrVec m_PointsBuffer;
+TFloatFloatPrVec m_PointsBuffer;
};
}
}
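
Besides the scalar members, the classifier's buffer of pending points switches from pairs of doubles to pairs of CFloatStorage, so every buffered (value, weight) pair shrinks as well. A rough sketch of the per-point saving, using plain float as a stand-in for CFloatStorage (sizes are typical for a 64-bit platform, not guaranteed):

    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        using TDoubleDoublePr = std::pair<double, double>; // old buffered point
        using TFloatFloatPr = std::pair<float, float>;     // stand-in for the new pair type

        // Per buffered point: typically 16 bytes before, 8 bytes after.
        std::cout << sizeof(TDoubleDoublePr) << " -> " << sizeof(TFloatFloatPr) << " bytes\n";

        // Across a buffer of a few thousand points the saving adds up per classifier.
        std::vector<TFloatFloatPr> buffer(5000);
        std::cout << buffer.size() * sizeof(TFloatFloatPr) << " bytes of payload\n";
        return 0;
    }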
6 changes: 3 additions & 3 deletions include/maths/CNormalMeanPrecConjugate.h
@@ -335,15 +335,15 @@ class MATHS_EXPORT CNormalMeanPrecConjugate : public CPrior {
private:
//! The mean of the prior conditional distribution for the mean of the
//! normal variable (conditioned on its precision).
-double m_GaussianMean;
+CFloatStorage m_GaussianMean;

//! The precision of the prior conditional distribution for the mean
//! of the normal variable (conditioned on its precision).
-double m_GaussianPrecision;
+CFloatStorage m_GaussianPrecision;

//! The shape of the marginal gamma distribution for the precision of the
//! normal variable.
-double m_GammaShape;
+CFloatStorage m_GammaShape;

//! The rate of the marginal gamma distribution for the precision of the
//! normal variable.
6 changes: 3 additions & 3 deletions include/maths/CPoissonMeanConjugate.h
@@ -287,14 +287,14 @@ class MATHS_EXPORT CPoissonMeanConjugate : public CPrior {
//! We assume that the data are described by \f$X = Y - u\f$, where
//! \f$u\f$ is a constant and \f$Y\f$ is Poisson distributed. This
//! allows us to model data with negative values greater than \f$-u\f$.
-double m_Offset;
+CFloatStorage m_Offset;

//! The shape parameter for the gamma distribution.
-double m_Shape;
+CFloatStorage m_Shape;

//! The rate parameter for the gamma distribution. We work with the inverse
//! scale parameter because it makes defining the non-informative prior easy.
-double m_Rate;
+CFloatStorage m_Rate;
};
}
}
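
Storing conjugate-prior hyperparameters such as m_Shape and m_Rate in single precision costs very little accuracy: rounding a double to float introduces a relative error of at most about 6e-8, well below the statistical uncertainty in these estimates. A quick illustrative check (the value is made up):

    #include <cmath>
    #include <iostream>

    int main() {
        double shape = 1234.56789012345;          // made-up hyperparameter value
        float stored = static_cast<float>(shape); // what float-backed storage keeps
        double relativeError = std::fabs(static_cast<double>(stored) - shape) / shape;
        std::cout << "relative error from float storage: " << relativeError << '\n';
        return 0;
    }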
67 changes: 32 additions & 35 deletions include/maths/CXMeansOnline.h
@@ -588,8 +588,8 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
const CClustererTypes::TSplitFunc& splitFunc = CClustererTypes::CDoNothing(),
const CClustererTypes::TMergeFunc& mergeFunc = CClustererTypes::CDoNothing())
: CClusterer<TPoint>(splitFunc, mergeFunc), m_DataType(dataType),
-m_InitialDecayRate(decayRate), m_DecayRate(decayRate),
-m_HistoryLength(0.0), m_WeightCalc(weightCalc),
+m_WeightCalc(weightCalc), m_InitialDecayRate(decayRate),
+m_DecayRate(decayRate), m_HistoryLength(0.0),
m_MinimumClusterFraction(minimumClusterFraction),
m_MinimumClusterCount(minimumClusterCount),
m_MinimumCategoryCount(minimumCategoryCount),
@@ -598,10 +598,9 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
//! Construct by traversing a state document.
CXMeansOnline(const SDistributionRestoreParams& params, core::CStateRestoreTraverser& traverser)
: CClusterer<TPoint>(CClustererTypes::CDoNothing(), CClustererTypes::CDoNothing()),
-m_DataType(params.s_DataType), m_InitialDecayRate(params.s_DecayRate),
-m_DecayRate(params.s_DecayRate), m_HistoryLength(),
-m_WeightCalc(maths_t::E_ClustersEqualWeight),
-m_MinimumClusterFraction(), m_MinimumClusterCount(),
+m_DataType(params.s_DataType), m_WeightCalc(maths_t::E_ClustersEqualWeight),
+m_InitialDecayRate(params.s_DecayRate), m_DecayRate(params.s_DecayRate),
+m_HistoryLength(), m_MinimumClusterFraction(), m_MinimumClusterCount(),
m_MinimumCategoryCount(params.s_MinimumCategoryCount) {
traverser.traverseSubLevel(boost::bind(&CXMeansOnline::acceptRestoreTraverser,
this, boost::cref(params), _1));
@@ -613,27 +612,28 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
const CClustererTypes::TMergeFunc& mergeFunc,
core::CStateRestoreTraverser& traverser)
: CClusterer<TPoint>(splitFunc, mergeFunc), m_DataType(params.s_DataType),
+m_WeightCalc(maths_t::E_ClustersEqualWeight),
m_InitialDecayRate(params.s_DecayRate), m_DecayRate(params.s_DecayRate),
-m_HistoryLength(), m_WeightCalc(maths_t::E_ClustersEqualWeight),
-m_MinimumClusterFraction(), m_MinimumClusterCount(),
+m_HistoryLength(), m_MinimumClusterFraction(), m_MinimumClusterCount(),
m_MinimumCategoryCount(params.s_MinimumCategoryCount) {
traverser.traverseSubLevel(boost::bind(&CXMeansOnline::acceptRestoreTraverser,
this, boost::cref(params), _1));
}

//! The x-means clusterer has value semantics.
CXMeansOnline(const CXMeansOnline& other)
-: CClusterer<TPoint>(other.splitFunc(), other.mergeFunc()),
-m_Rng(other.m_Rng), m_DataType(other.m_DataType),
+: CClusterer<TPoint>(other.splitFunc(), other.mergeFunc()), m_Rng(other.m_Rng),
+m_DataType(other.m_DataType), m_WeightCalc(other.m_WeightCalc),
m_InitialDecayRate(other.m_InitialDecayRate),
m_DecayRate(other.m_DecayRate), m_HistoryLength(other.m_HistoryLength),
-m_WeightCalc(other.m_WeightCalc),
m_MinimumClusterFraction(other.m_MinimumClusterFraction),
m_MinimumClusterCount(other.m_MinimumClusterCount),
m_MinimumCategoryCount(other.m_MinimumCategoryCount),
m_ClusterIndexGenerator(other.m_ClusterIndexGenerator.deepCopy()),
m_Clusters(other.m_Clusters) {}

+~CXMeansOnline() = default;

//! The x-means clusterer has value semantics.
CXMeansOnline& operator=(const CXMeansOnline& other) {
if (this != &other) {
@@ -644,17 +644,15 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
}
//@}

-virtual ~CXMeansOnline() {}

//! Efficiently swap the contents of two k-means objects.
void swap(CXMeansOnline& other) {
this->CClusterer<TPoint>::swap(other);
std::swap(m_Rng, other.m_Rng);
std::swap(m_DataType, other.m_DataType);
+std::swap(m_WeightCalc, other.m_WeightCalc);
std::swap(m_InitialDecayRate, other.m_InitialDecayRate);
std::swap(m_DecayRate, other.m_DecayRate);
std::swap(m_HistoryLength, other.m_HistoryLength);
-std::swap(m_WeightCalc, other.m_WeightCalc);
std::swap(m_MinimumClusterFraction, other.m_MinimumClusterFraction);
std::swap(m_MinimumClusterCount, other.m_MinimumClusterCount);
std::swap(m_MinimumCategoryCount, other.m_MinimumCategoryCount);
@@ -675,13 +673,13 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
inserter.insertLevel(CLUSTER_TAG, boost::bind(&CCluster::acceptPersistInserter,
&cluster, _1));
}
-inserter.insertValue(DECAY_RATE_TAG, m_DecayRate, core::CIEEE754::E_SinglePrecision);
-inserter.insertValue(HISTORY_LENGTH_TAG, m_HistoryLength,
-core::CIEEE754::E_SinglePrecision);
+inserter.insertValue(DECAY_RATE_TAG, m_DecayRate.toString());
+inserter.insertValue(HISTORY_LENGTH_TAG, m_HistoryLength.toString());
inserter.insertValue(RNG_TAG, m_Rng.toString());
inserter.insertValue(WEIGHT_CALC_TAG, static_cast<int>(m_WeightCalc));
-inserter.insertValue(MINIMUM_CLUSTER_FRACTION_TAG, m_MinimumClusterFraction);
-inserter.insertValue(MINIMUM_CLUSTER_COUNT_TAG, m_MinimumClusterCount);
+inserter.insertValue(MINIMUM_CLUSTER_FRACTION_TAG,
+m_MinimumClusterFraction.toString());
+inserter.insertValue(MINIMUM_CLUSTER_COUNT_TAG, m_MinimumClusterCount.toString());
inserter.insertLevel(CLUSTER_INDEX_GENERATOR_TAG,
boost::bind(&CClustererTypes::CIndexGenerator::acceptPersistInserter,
&m_ClusterIndexGenerator, _1));
@@ -999,18 +997,17 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
RESTORE_SETUP_TEARDOWN(DECAY_RATE_TAG, double decayRate,
core::CStringUtils::stringToType(traverser.value(), decayRate),
this->decayRate(decayRate))
-RESTORE_BUILT_IN(HISTORY_LENGTH_TAG, m_HistoryLength)
+RESTORE(HISTORY_LENGTH_TAG, m_HistoryLength.fromString(traverser.value()))
RESTORE(RNG_TAG, m_Rng.fromString(traverser.value()));
RESTORE(CLUSTER_INDEX_GENERATOR_TAG,
traverser.traverseSubLevel(boost::bind(
&CClustererTypes::CIndexGenerator::acceptRestoreTraverser,
&m_ClusterIndexGenerator, _1)))
-RESTORE_SETUP_TEARDOWN(
-WEIGHT_CALC_TAG, int weightCalc,
-core::CStringUtils::stringToType(traverser.value(), weightCalc),
-m_WeightCalc = static_cast<maths_t::EClusterWeightCalc>(weightCalc))
-RESTORE_BUILT_IN(MINIMUM_CLUSTER_FRACTION_TAG, m_MinimumClusterFraction)
-RESTORE_BUILT_IN(MINIMUM_CLUSTER_COUNT_TAG, m_MinimumClusterCount)
+RESTORE_ENUM(WEIGHT_CALC_TAG, m_WeightCalc, maths_t::EClusterWeightCalc)
+RESTORE(MINIMUM_CLUSTER_FRACTION_TAG,
+m_MinimumClusterFraction.fromString(traverser.value()))
+RESTORE(MINIMUM_CLUSTER_COUNT_TAG,
+m_MinimumClusterCount.fromString(traverser.value()))
} while (traverser.next());

return true;
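
With these members now float-backed, persistence switches from formatting built-in doubles (previously via core::CIEEE754::E_SinglePrecision) to the storage type's own toString()/fromString() round trip. A minimal sketch of that round trip, assuming a stand-in class rather than the library's CFloatStorage:

    #include <iostream>
    #include <sstream>
    #include <string>

    // Stand-in for a float-backed storage type with its own string round trip.
    class FloatStorage {
    public:
        FloatStorage(double value = 0.0) : m_Value(static_cast<float>(value)) {}
        operator double() const { return m_Value; }

        std::string toString() const {
            std::ostringstream out;
            out.precision(9); // nine significant digits round-trip a float exactly
            out << m_Value;
            return out.str();
        }

        bool fromString(const std::string& token) {
            std::istringstream in(token);
            float value;
            if (!(in >> value)) {
                return false;
            }
            m_Value = value;
            return true;
        }

    private:
        float m_Value;
    };

    int main() {
        FloatStorage decayRate{0.0005};
        FloatStorage restored;
        if (restored.fromString(decayRate.toString())) {
            std::cout << "restored decay rate " << static_cast<double>(restored) << '\n';
        }
        return 0;
    }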
@@ -1223,26 +1220,26 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
//! The type of data being clustered.
maths_t::EDataType m_DataType;

+//! The style of the cluster weight calculation (see maths_t::EClusterWeightCalc).
+maths_t::EClusterWeightCalc m_WeightCalc;
+
//! The initial rate at which information is lost.
-double m_InitialDecayRate;
+CFloatStorage m_InitialDecayRate;

//! The rate at which information is lost.
-double m_DecayRate;
+CFloatStorage m_DecayRate;

//! A measure of the length of history of the data clustered.
-double m_HistoryLength;
-
-//! The style of the cluster weight calculation (see maths_t::EClusterWeightCalc).
-maths_t::EClusterWeightCalc m_WeightCalc;
+CFloatStorage m_HistoryLength;

//! The minimum cluster fractional count.
-double m_MinimumClusterFraction;
+CFloatStorage m_MinimumClusterFraction;

//! The minimum cluster count.
-double m_MinimumClusterCount;
+CFloatStorage m_MinimumClusterCount;

//! The minimum count for a category in the sketch to cluster.
-double m_MinimumCategoryCount;
+CFloatStorage m_MinimumCategoryCount;

//! A generator of unique cluster indices.
CClustererTypes::CIndexGenerator m_ClusterIndexGenerator;
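
Note that m_WeightCalc also moves up next to m_DataType. A plausible reason, not stated in the commit: grouping the two enums ahead of the run of 4-byte float members keeps the layout dense, whereas an enum interleaved among 8-byte doubles forces alignment padding. An illustrative comparison (type and field names are made up; exact sizes depend on the ABI):

    #include <iostream>

    enum class EDataType { E_ContinuousData };
    enum class EClusterWeightCalc { E_ClustersEqualWeight };

    // Enum members interleaved with 8-byte doubles force alignment padding.
    struct Interleaved {
        EDataType dataType;
        double initialDecayRate;
        double decayRate;
        double historyLength;
        EClusterWeightCalc weightCalc;
    };

    // Enums grouped first, then a dense run of 4-byte values.
    struct Grouped {
        EDataType dataType;
        EClusterWeightCalc weightCalc;
        float initialDecayRate;
        float decayRate;
        float historyLength;
    };

    int main() {
        // Typically prints "40 vs 20" on a 64-bit platform.
        std::cout << sizeof(Interleaved) << " vs " << sizeof(Grouped) << '\n';
        return 0;
    }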
27 changes: 13 additions & 14 deletions include/maths/CXMeansOnline1d.h
@@ -103,7 +103,7 @@ class MATHS_EXPORT CAvailableModeDistributions {
//! is expected to give largely order (of points processed) invariant
//! unsupervised clustering of the data which identifies reasonably
//! well separated clusters.
-class MATHS_EXPORT CXMeansOnline1d : public CClusterer1d {
+class MATHS_EXPORT CXMeansOnline1d final : public CClusterer1d {
public:
class CCluster;
using TDoubleVec = CClusterer1d::TPointPreciseVec;
@@ -374,9 +374,8 @@ class MATHS_EXPORT CXMeansOnline1d : public CClusterer1d {
CIndexGenerator& indexGenerator();

private:
-using TMinAccumulator = CBasicStatistics::COrderStatisticsStack<double, 1>;
-using TMaxAccumulator =
-CBasicStatistics::COrderStatisticsStack<double, 1, std::greater<double>>;
+using TMinAccumulator = CBasicStatistics::SMin<double>::TAccumulator;
+using TMaxAccumulator = CBasicStatistics::SMax<double>::TAccumulator;

private:
//! The minimum Kullback-Leibler divergence at which we'll
@@ -425,32 +424,32 @@ class MATHS_EXPORT CXMeansOnline1d : public CClusterer1d {
//! The type of data being clustered.
maths_t::EDataType m_DataType;

+//! The style of the cluster weight calculation (see maths_t::EClusterWeightCalc).
+maths_t::EClusterWeightCalc m_WeightCalc;
+
//! The distributions available to model the clusters.
CAvailableModeDistributions m_AvailableDistributions;

//! The initial rate at which information is lost.
-double m_InitialDecayRate;
+CFloatStorage m_InitialDecayRate;

//! The rate at which information is lost.
-double m_DecayRate;
+CFloatStorage m_DecayRate;

//! A measure of the length of history of the data clustered.
-double m_HistoryLength;
-
-//! The style of the cluster weight calculation (see maths_t::EClusterWeightCalc).
-maths_t::EClusterWeightCalc m_WeightCalc;
+CFloatStorage m_HistoryLength;

//! The minimum cluster fractional count.
-double m_MinimumClusterFraction;
+CFloatStorage m_MinimumClusterFraction;

//! The minimum cluster count.
-double m_MinimumClusterCount;
+CFloatStorage m_MinimumClusterCount;

//! The minimum count for a category in the sketch to cluster.
-double m_MinimumCategoryCount;
+CFloatStorage m_MinimumCategoryCount;

//! The data central confidence interval on which to Winsorise.
-double m_WinsorisationConfidenceInterval;
+CFloatStorage m_WinsorisationConfidenceInterval;

//! A generator of unique cluster indices.
CIndexGenerator m_ClusterIndexGenerator;
