[ML] Decrease the memory used by distribution models (elastic#146)
tveasey committed Jul 23, 2018
1 parent 5cf2326 commit eb8f1b7
Showing 22 changed files with 160 additions and 152 deletions.
1 change: 1 addition & 0 deletions docs/CHANGELOG.asciidoc
@@ -26,6 +26,7 @@ Improve partition analysis memory usage ({pull}97[#97])
Reduce model memory by storing state for periodicity testing in a compressed format ({pull}100[#100])
Improve the accuracy of model memory control ({pull}122[#122])
Improve adaption of the modelling of cyclic components to very localised features ({pull}134[#134])
+Reduce the memory consumed by distribution models ({pull}146[#146])

Forecasting of Machine Learning job time series is now supported for large jobs by temporarily storing
model state on disk ({pull}89[#89])
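
The change is mechanical throughout the diff below: member variables holding distribution-model parameters move from double to CFloatStorage, the maths library's float-backed storage type, which roughly halves each converted member's footprint. A minimal sketch of the idea, assuming a simplified stand-in class (FloatStorage and the Gamma* struct names below are illustrative, not the library's types):

    #include <iostream>

    // Simplified stand-in for CFloatStorage: keep a float internally, convert
    // to and from double at the boundary so arithmetic call sites are unchanged.
    class FloatStorage {
    public:
        FloatStorage(double value = 0.0) : m_Value(static_cast<float>(value)) {}
        operator double() const { return m_Value; }

    private:
        float m_Value;
    };

    // Four hyperparameters stored as double versus float-backed storage.
    struct GammaParamsDouble {
        double offset, offsetMargin, priorShape, priorRate;
    };
    struct GammaParamsFloat {
        FloatStorage offset, offsetMargin, priorShape, priorRate;
    };

    int main() {
        // Typically prints "32 vs 16" on a 64-bit platform.
        std::cout << sizeof(GammaParamsDouble) << " vs " << sizeof(GammaParamsFloat) << '\n';
        return 0;
    }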
8 changes: 4 additions & 4 deletions include/maths/CGammaRateConjugate.h
@@ -371,10 +371,10 @@ class MATHS_EXPORT CGammaRateConjugate : public CPrior {
//! We assume that the data are described by \f$X = Y - u\f$, where
//! \f$u\f$ is a constant and \f$Y\f$ is gamma distributed. This allows
//! us to model data with negative values greater than \f$-u\f$.
-double m_Offset;
+CFloatStorage m_Offset;

//! The margin between the smallest value and the support left end.
-double m_OffsetMargin;
+CFloatStorage m_OffsetMargin;

//! The maximum likelihood estimate of the shape parameter.
double m_LikelihoodShape;
@@ -386,10 +386,10 @@ class MATHS_EXPORT CGammaRateConjugate : public CPrior {
TMeanVarAccumulator m_SampleMoments;

//! The initial shape parameter of the prior gamma distribution.
-double m_PriorShape;
+CFloatStorage m_PriorShape;

//! The initial rate parameter of the prior gamma distribution.
-double m_PriorRate;
+CFloatStorage m_PriorRate;
};
}
}
10 changes: 5 additions & 5 deletions include/maths/CLogNormalMeanPrecConjugate.h
@@ -390,22 +390,22 @@ class MATHS_EXPORT CLogNormalMeanPrecConjugate : public CPrior {
//! We assume that the data are described by \f$X = e^Y - u\f$, where
//! \f$u\f$ is a constant and \f$Y\f$ is normally distributed. This
//! allows us to model data with negative values greater than \f$-u\f$.
-double m_Offset;
+CFloatStorage m_Offset;

//! The margin between the smallest value and the support left end.
-double m_OffsetMargin;
+CFloatStorage m_OffsetMargin;

//! The mean of the prior conditional distribution for the mean of the
//! exponentiated normal (conditioned on its precision).
-double m_GaussianMean;
+CFloatStorage m_GaussianMean;

//! The precision of the prior conditional distribution for the mean
//! of the exponentiated normal (conditioned on its precision).
-double m_GaussianPrecision;
+CFloatStorage m_GaussianPrecision;

//! The shape of the marginal gamma distribution for the precision of the
//! exponentiated normal.
-double m_GammaShape;
+CFloatStorage m_GammaShape;

//! The rate of the marginal gamma distribution for the precision of the
//! exponentiated normal.
29 changes: 14 additions & 15 deletions include/maths/CNaturalBreaksClassifier.h
@@ -98,8 +98,8 @@ class MATHS_EXPORT CNaturalBreaksClassifier {
public:
using TSizeVec = std::vector<std::size_t>;
using TDoubleVec = std::vector<double>;
-using TDoubleDoublePr = std::pair<double, double>;
-using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
+using TFloatFloatPr = std::pair<CFloatStorage, CFloatStorage>;
+using TFloatFloatPrVec = std::vector<TFloatFloatPr>;
using TDoubleTuple = CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using TDoubleTupleVec = std::vector<TDoubleTuple>;
using TTuple = CBasicStatistics::SSampleMeanVar<CFloatStorage>::TAccumulator;
@@ -270,16 +270,6 @@ class MATHS_EXPORT CNaturalBreaksClassifier {
private:
using TSizeSizePr = std::pair<std::size_t, std::size_t>;

-private:
-//! Implementation called by naturalBreaks with explicit
-//! tuple types.
-template<typename TUPLE>
-static bool naturalBreaksImpl(const std::vector<TUPLE>& categories,
-std::size_t n,
-std::size_t p,
-EObjective target,
-TSizeVec& result);
-
private:
//! The minimum permitted size for the classifier.
static const std::size_t MINIMUM_SPACE;
@@ -295,6 +285,15 @@ class MATHS_EXPORT CNaturalBreaksClassifier {
double minimumCategoryCount,
TTupleVec& categories);

+//! Implementation called by naturalBreaks with explicit
+//! tuple types.
+template<typename TUPLE>
+static bool naturalBreaksImpl(const std::vector<TUPLE>& categories,
+std::size_t n,
+std::size_t p,
+EObjective target,
+TSizeVec& result);
+
//! Reduce the number of tuples until we satisfy the space constraint.
void reduce();

@@ -323,16 +322,16 @@ class MATHS_EXPORT CNaturalBreaksClassifier {
std::size_t m_Space;

//! The rate at which the categories lose information.
-double m_DecayRate;
+CFloatStorage m_DecayRate;

//! The minimum permitted count for a category.
-double m_MinimumCategoryCount;
+CFloatStorage m_MinimumCategoryCount;

//! The categories we are maintaining.
TTupleVec m_Categories;

//! A buffer of the points added while the space constraint is satisfied.
-TDoubleDoublePrVec m_PointsBuffer;
+TFloatFloatPrVec m_PointsBuffer;
};
}
}
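
Besides the scalar members, the classifier's buffer of pending points switches from pairs of doubles to pairs of CFloatStorage, so every buffered (value, weight) pair shrinks as well. A rough sketch of the per-point saving, using plain float as a stand-in for CFloatStorage (sizes are typical for a 64-bit platform, not guaranteed):

    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        using TDoubleDoublePr = std::pair<double, double>; // old buffered point
        using TFloatFloatPr = std::pair<float, float>;     // stand-in for the new pair type

        // Per buffered point: typically 16 bytes before, 8 bytes after.
        std::cout << sizeof(TDoubleDoublePr) << " -> " << sizeof(TFloatFloatPr) << " bytes\n";

        // Across a buffer of a few thousand points the saving adds up per classifier.
        std::vector<TFloatFloatPr> buffer(5000);
        std::cout << buffer.size() * sizeof(TFloatFloatPr) << " bytes of payload\n";
        return 0;
    }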
6 changes: 3 additions & 3 deletions include/maths/CNormalMeanPrecConjugate.h
@@ -335,15 +335,15 @@ class MATHS_EXPORT CNormalMeanPrecConjugate : public CPrior {
private:
//! The mean of the prior conditional distribution for the mean of the
//! normal variable (conditioned on its precision).
-double m_GaussianMean;
+CFloatStorage m_GaussianMean;

//! The precision of the prior conditional distribution for the mean
//! of the normal variable (conditioned on its precision).
-double m_GaussianPrecision;
+CFloatStorage m_GaussianPrecision;

//! The shape of the marginal gamma distribution for the precision of the
//! normal variable.
-double m_GammaShape;
+CFloatStorage m_GammaShape;

//! The rate of the marginal gamma distribution for the precision of the
//! normal variable.
6 changes: 3 additions & 3 deletions include/maths/CPoissonMeanConjugate.h
@@ -287,14 +287,14 @@ class MATHS_EXPORT CPoissonMeanConjugate : public CPrior {
//! We assume that the data are described by \f$X = Y - u\f$, where
//! \f$u\f$ is a constant and \f$Y\f$ is Poisson distributed. This
//! allows us to model data with negative values greater than \f$-u\f$.
-double m_Offset;
+CFloatStorage m_Offset;

//! The shape parameter for the gamma distribution.
-double m_Shape;
+CFloatStorage m_Shape;

//! The rate parameter for the gamma distribution. We work with the inverse
//! scale parameter because it makes defining the non-informative prior easy.
-double m_Rate;
+CFloatStorage m_Rate;
};
}
}
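
Storing conjugate-prior hyperparameters such as m_Shape and m_Rate in single precision costs very little accuracy: rounding a double to float introduces a relative error of at most about 6e-8, well below the statistical uncertainty in these estimates. A quick illustrative check (the value is made up):

    #include <cmath>
    #include <iostream>

    int main() {
        double shape = 1234.56789012345;          // made-up hyperparameter value
        float stored = static_cast<float>(shape); // what float-backed storage keeps
        double relativeError = std::fabs(static_cast<double>(stored) - shape) / shape;
        std::cout << "relative error from float storage: " << relativeError << '\n';
        return 0;
    }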
67 changes: 32 additions & 35 deletions include/maths/CXMeansOnline.h
@@ -588,8 +588,8 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
const CClustererTypes::TSplitFunc& splitFunc = CClustererTypes::CDoNothing(),
const CClustererTypes::TMergeFunc& mergeFunc = CClustererTypes::CDoNothing())
: CClusterer<TPoint>(splitFunc, mergeFunc), m_DataType(dataType),
-m_InitialDecayRate(decayRate), m_DecayRate(decayRate),
-m_HistoryLength(0.0), m_WeightCalc(weightCalc),
+m_WeightCalc(weightCalc), m_InitialDecayRate(decayRate),
+m_DecayRate(decayRate), m_HistoryLength(0.0),
m_MinimumClusterFraction(minimumClusterFraction),
m_MinimumClusterCount(minimumClusterCount),
m_MinimumCategoryCount(minimumCategoryCount),
@@ -598,10 +598,9 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
//! Construct by traversing a state document.
CXMeansOnline(const SDistributionRestoreParams& params, core::CStateRestoreTraverser& traverser)
: CClusterer<TPoint>(CClustererTypes::CDoNothing(), CClustererTypes::CDoNothing()),
-m_DataType(params.s_DataType), m_InitialDecayRate(params.s_DecayRate),
-m_DecayRate(params.s_DecayRate), m_HistoryLength(),
-m_WeightCalc(maths_t::E_ClustersEqualWeight),
-m_MinimumClusterFraction(), m_MinimumClusterCount(),
+m_DataType(params.s_DataType), m_WeightCalc(maths_t::E_ClustersEqualWeight),
+m_InitialDecayRate(params.s_DecayRate), m_DecayRate(params.s_DecayRate),
+m_HistoryLength(), m_MinimumClusterFraction(), m_MinimumClusterCount(),
m_MinimumCategoryCount(params.s_MinimumCategoryCount) {
traverser.traverseSubLevel(boost::bind(&CXMeansOnline::acceptRestoreTraverser,
this, boost::cref(params), _1));
@@ -613,27 +612,28 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
const CClustererTypes::TMergeFunc& mergeFunc,
core::CStateRestoreTraverser& traverser)
: CClusterer<TPoint>(splitFunc, mergeFunc), m_DataType(params.s_DataType),
+m_WeightCalc(maths_t::E_ClustersEqualWeight),
m_InitialDecayRate(params.s_DecayRate), m_DecayRate(params.s_DecayRate),
-m_HistoryLength(), m_WeightCalc(maths_t::E_ClustersEqualWeight),
-m_MinimumClusterFraction(), m_MinimumClusterCount(),
+m_HistoryLength(), m_MinimumClusterFraction(), m_MinimumClusterCount(),
m_MinimumCategoryCount(params.s_MinimumCategoryCount) {
traverser.traverseSubLevel(boost::bind(&CXMeansOnline::acceptRestoreTraverser,
this, boost::cref(params), _1));
}

//! The x-means clusterer has value semantics.
CXMeansOnline(const CXMeansOnline& other)
-: CClusterer<TPoint>(other.splitFunc(), other.mergeFunc()),
-m_Rng(other.m_Rng), m_DataType(other.m_DataType),
+: CClusterer<TPoint>(other.splitFunc(), other.mergeFunc()), m_Rng(other.m_Rng),
+m_DataType(other.m_DataType), m_WeightCalc(other.m_WeightCalc),
m_InitialDecayRate(other.m_InitialDecayRate),
m_DecayRate(other.m_DecayRate), m_HistoryLength(other.m_HistoryLength),
-m_WeightCalc(other.m_WeightCalc),
m_MinimumClusterFraction(other.m_MinimumClusterFraction),
m_MinimumClusterCount(other.m_MinimumClusterCount),
m_MinimumCategoryCount(other.m_MinimumCategoryCount),
m_ClusterIndexGenerator(other.m_ClusterIndexGenerator.deepCopy()),
m_Clusters(other.m_Clusters) {}

+~CXMeansOnline() = default;

//! The x-means clusterer has value semantics.
CXMeansOnline& operator=(const CXMeansOnline& other) {
if (this != &other) {
@@ -644,17 +644,15 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
}
//@}

-virtual ~CXMeansOnline() {}

//! Efficiently swap the contents of two k-means objects.
void swap(CXMeansOnline& other) {
this->CClusterer<TPoint>::swap(other);
std::swap(m_Rng, other.m_Rng);
std::swap(m_DataType, other.m_DataType);
+std::swap(m_WeightCalc, other.m_WeightCalc);
std::swap(m_InitialDecayRate, other.m_InitialDecayRate);
std::swap(m_DecayRate, other.m_DecayRate);
std::swap(m_HistoryLength, other.m_HistoryLength);
-std::swap(m_WeightCalc, other.m_WeightCalc);
std::swap(m_MinimumClusterFraction, other.m_MinimumClusterFraction);
std::swap(m_MinimumClusterCount, other.m_MinimumClusterCount);
std::swap(m_MinimumCategoryCount, other.m_MinimumCategoryCount);
@@ -675,13 +673,13 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
inserter.insertLevel(CLUSTER_TAG, boost::bind(&CCluster::acceptPersistInserter,
&cluster, _1));
}
-inserter.insertValue(DECAY_RATE_TAG, m_DecayRate, core::CIEEE754::E_SinglePrecision);
-inserter.insertValue(HISTORY_LENGTH_TAG, m_HistoryLength,
-core::CIEEE754::E_SinglePrecision);
+inserter.insertValue(DECAY_RATE_TAG, m_DecayRate.toString());
+inserter.insertValue(HISTORY_LENGTH_TAG, m_HistoryLength.toString());
inserter.insertValue(RNG_TAG, m_Rng.toString());
inserter.insertValue(WEIGHT_CALC_TAG, static_cast<int>(m_WeightCalc));
-inserter.insertValue(MINIMUM_CLUSTER_FRACTION_TAG, m_MinimumClusterFraction);
-inserter.insertValue(MINIMUM_CLUSTER_COUNT_TAG, m_MinimumClusterCount);
+inserter.insertValue(MINIMUM_CLUSTER_FRACTION_TAG,
+m_MinimumClusterFraction.toString());
+inserter.insertValue(MINIMUM_CLUSTER_COUNT_TAG, m_MinimumClusterCount.toString());
inserter.insertLevel(CLUSTER_INDEX_GENERATOR_TAG,
boost::bind(&CClustererTypes::CIndexGenerator::acceptPersistInserter,
&m_ClusterIndexGenerator, _1));
@@ -999,18 +997,17 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
RESTORE_SETUP_TEARDOWN(DECAY_RATE_TAG, double decayRate,
core::CStringUtils::stringToType(traverser.value(), decayRate),
this->decayRate(decayRate))
-RESTORE_BUILT_IN(HISTORY_LENGTH_TAG, m_HistoryLength)
+RESTORE(HISTORY_LENGTH_TAG, m_HistoryLength.fromString(traverser.value()))
RESTORE(RNG_TAG, m_Rng.fromString(traverser.value()));
RESTORE(CLUSTER_INDEX_GENERATOR_TAG,
traverser.traverseSubLevel(boost::bind(
&CClustererTypes::CIndexGenerator::acceptRestoreTraverser,
&m_ClusterIndexGenerator, _1)))
-RESTORE_SETUP_TEARDOWN(
-WEIGHT_CALC_TAG, int weightCalc,
-core::CStringUtils::stringToType(traverser.value(), weightCalc),
-m_WeightCalc = static_cast<maths_t::EClusterWeightCalc>(weightCalc))
-RESTORE_BUILT_IN(MINIMUM_CLUSTER_FRACTION_TAG, m_MinimumClusterFraction)
-RESTORE_BUILT_IN(MINIMUM_CLUSTER_COUNT_TAG, m_MinimumClusterCount)
+RESTORE_ENUM(WEIGHT_CALC_TAG, m_WeightCalc, maths_t::EClusterWeightCalc)
+RESTORE(MINIMUM_CLUSTER_FRACTION_TAG,
+m_MinimumClusterFraction.fromString(traverser.value()))
+RESTORE(MINIMUM_CLUSTER_COUNT_TAG,
+m_MinimumClusterCount.fromString(traverser.value()))
} while (traverser.next());

return true;
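
With these members now float-backed, persistence switches from formatting built-in doubles (previously via core::CIEEE754::E_SinglePrecision) to the storage type's own toString()/fromString() round trip. A minimal sketch of that round trip, assuming a stand-in class rather than the library's CFloatStorage:

    #include <iostream>
    #include <sstream>
    #include <string>

    // Stand-in for a float-backed storage type with its own string round trip.
    class FloatStorage {
    public:
        FloatStorage(double value = 0.0) : m_Value(static_cast<float>(value)) {}
        operator double() const { return m_Value; }

        std::string toString() const {
            std::ostringstream out;
            out.precision(9); // nine significant digits round-trip a float exactly
            out << m_Value;
            return out.str();
        }

        bool fromString(const std::string& token) {
            std::istringstream in(token);
            float value;
            if (!(in >> value)) {
                return false;
            }
            m_Value = value;
            return true;
        }

    private:
        float m_Value;
    };

    int main() {
        FloatStorage decayRate{0.0005};
        FloatStorage restored;
        if (restored.fromString(decayRate.toString())) {
            std::cout << "restored decay rate " << static_cast<double>(restored) << '\n';
        }
        return 0;
    }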
@@ -1223,26 +1220,26 @@ class CXMeansOnline : public CClusterer<CVectorNx1<T, N>> {
//! The type of data being clustered.
maths_t::EDataType m_DataType;

+//! The style of the cluster weight calculation (see maths_t::EClusterWeightCalc).
+maths_t::EClusterWeightCalc m_WeightCalc;
+
//! The initial rate at which information is lost.
-double m_InitialDecayRate;
+CFloatStorage m_InitialDecayRate;

//! The rate at which information is lost.
-double m_DecayRate;
+CFloatStorage m_DecayRate;

//! A measure of the length of history of the data clustered.
-double m_HistoryLength;
-
-//! The style of the cluster weight calculation (see maths_t::EClusterWeightCalc).
-maths_t::EClusterWeightCalc m_WeightCalc;
+CFloatStorage m_HistoryLength;

//! The minimum cluster fractional count.
-double m_MinimumClusterFraction;
+CFloatStorage m_MinimumClusterFraction;

//! The minimum cluster count.
-double m_MinimumClusterCount;
+CFloatStorage m_MinimumClusterCount;

//! The minimum count for a category in the sketch to cluster.
-double m_MinimumCategoryCount;
+CFloatStorage m_MinimumCategoryCount;

//! A generator of unique cluster indices.
CClustererTypes::CIndexGenerator m_ClusterIndexGenerator;
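
Note that m_WeightCalc also moves up next to m_DataType. A plausible reason, not stated in the commit: grouping the two enums ahead of the run of 4-byte float members keeps the layout dense, whereas an enum interleaved among 8-byte doubles forces alignment padding. An illustrative comparison (type and field names are made up; exact sizes depend on the ABI):

    #include <iostream>

    enum class EDataType { E_ContinuousData };
    enum class EClusterWeightCalc { E_ClustersEqualWeight };

    // Enum members interleaved with 8-byte doubles force alignment padding.
    struct Interleaved {
        EDataType dataType;
        double initialDecayRate;
        double decayRate;
        double historyLength;
        EClusterWeightCalc weightCalc;
    };

    // Enums grouped first, then a dense run of 4-byte values.
    struct Grouped {
        EDataType dataType;
        EClusterWeightCalc weightCalc;
        float initialDecayRate;
        float decayRate;
        float historyLength;
    };

    int main() {
        // Typically prints "40 vs 20" on a 64-bit platform.
        std::cout << sizeof(Interleaved) << " vs " << sizeof(Grouped) << '\n';
        return 0;
    }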
27 changes: 13 additions & 14 deletions include/maths/CXMeansOnline1d.h
@@ -103,7 +103,7 @@ class MATHS_EXPORT CAvailableModeDistributions {
//! is expected to give largely order (of points processed) invariant
//! unsupervised clustering of the data which identifies reasonably
//! well separated clusters.
-class MATHS_EXPORT CXMeansOnline1d : public CClusterer1d {
+class MATHS_EXPORT CXMeansOnline1d final : public CClusterer1d {
public:
class CCluster;
using TDoubleVec = CClusterer1d::TPointPreciseVec;
@@ -374,9 +374,8 @@ class MATHS_EXPORT CXMeansOnline1d : public CClusterer1d {
CIndexGenerator& indexGenerator();

private:
-using TMinAccumulator = CBasicStatistics::COrderStatisticsStack<double, 1>;
-using TMaxAccumulator =
-CBasicStatistics::COrderStatisticsStack<double, 1, std::greater<double>>;
+using TMinAccumulator = CBasicStatistics::SMin<double>::TAccumulator;
+using TMaxAccumulator = CBasicStatistics::SMax<double>::TAccumulator;

private:
//! The minimum Kullback-Leibler divergence at which we'll
@@ -425,32 +424,32 @@ class MATHS_EXPORT CXMeansOnline1d : public CClusterer1d {
//! The type of data being clustered.
maths_t::EDataType m_DataType;

+//! The style of the cluster weight calculation (see maths_t::EClusterWeightCalc).
+maths_t::EClusterWeightCalc m_WeightCalc;
+
//! The distributions available to model the clusters.
CAvailableModeDistributions m_AvailableDistributions;

//! The initial rate at which information is lost.
-double m_InitialDecayRate;
+CFloatStorage m_InitialDecayRate;

//! The rate at which information is lost.
-double m_DecayRate;
+CFloatStorage m_DecayRate;

//! A measure of the length of history of the data clustered.
-double m_HistoryLength;
-
-//! The style of the cluster weight calculation (see maths_t::EClusterWeightCalc).
-maths_t::EClusterWeightCalc m_WeightCalc;
+CFloatStorage m_HistoryLength;

//! The minimum cluster fractional count.
-double m_MinimumClusterFraction;
+CFloatStorage m_MinimumClusterFraction;

//! The minimum cluster count.
-double m_MinimumClusterCount;
+CFloatStorage m_MinimumClusterCount;

//! The minimum count for a category in the sketch to cluster.
-double m_MinimumCategoryCount;
+CFloatStorage m_MinimumCategoryCount;

//! The data central confidence interval on which to Winsorise.
-double m_WinsorisationConfidenceInterval;
+CFloatStorage m_WinsorisationConfidenceInterval;

//! A generator of unique cluster indices.
CIndexGenerator m_ClusterIndexGenerator;
