Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Improvements to trend modelling and periodicity testing for forecasting #7

Merged
merged 4 commits into from
Feb 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 18 additions & 20 deletions include/maths/CAdaptiveBucketing.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,10 @@ namespace maths
class MATHS_EXPORT CAdaptiveBucketing
{
public:
typedef std::vector<double> TDoubleVec;
typedef std::vector<CFloatStorage> TFloatVec;
typedef std::pair<core_t::TTime, core_t::TTime> TTimeTimePr;
typedef CBasicStatistics::SSampleMeanVar<double>::TAccumulator TDoubleMeanVarAccumulator;
typedef std::pair<TTimeTimePr, TDoubleMeanVarAccumulator> TTimeTimePrMeanVarPr;
typedef std::vector<TTimeTimePrMeanVarPr> TTimeTimePrMeanVarPrVec;
using TDoubleVec = std::vector<double>;
using TFloatVec = std::vector<CFloatStorage>;
using TFloatMeanAccumulator = CBasicStatistics::SSampleMean<CFloatStorage>::TAccumulator;
using TFloatMeanAccumulatorVec = std::vector<TFloatMeanAccumulator>;

public:
//! Restore by traversing a state document
Expand Down Expand Up @@ -116,14 +114,17 @@ class MATHS_EXPORT CAdaptiveBucketing
//! \param[in] n The number of buckets.
bool initialize(double a, double b, std::size_t n);

//! Add the function moments \f$([a_i,b_i], S_i)\f$ where
//! \f$S_i\f$ are the means and variances of the function
//! in the time intervals \f$([a_i,b_i])\f$.
//! Add the function mean values \f$([a_i,b_i], m_i)\f$ where
//! \f$m_i\f$ are the means of the function in the time intervals
//! \f$[a_i,b_i]=[a+(i-1)l,a+il]\f$, \f$i\in[n]\f$ and \f$l=(b-a)/n\f$.
//!
//! \param[in] time The start of the period including \p values.
//! \param[in] values Time ranges and the corresponding function
//! value moments.
void initialValues(core_t::TTime time, const TTimeTimePrMeanVarPrVec &values);
//! \param[in] startTime The start of the period.
//! \param[in] endTime The end of the period.
//! \param[in] values The mean values in a regular subdivision
//! of [\p startTime,\p endTime].
void initialValues(core_t::TTime startTime,
core_t::TTime endTime,
const TFloatMeanAccumulatorVec &values);

//! Get the number of buckets.
std::size_t size(void) const;
Expand Down Expand Up @@ -204,21 +205,18 @@ class MATHS_EXPORT CAdaptiveBucketing
//! Get the memory used by this component
std::size_t memoryUsage(void) const;

private:
typedef CBasicStatistics::SSampleMean<CFloatStorage>::TAccumulator TFloatMeanAccumulator;

private:
//! Compute the values corresponding to the change in end
//! points from \p endpoints. The values are assigned based
//! on their intersection with each bucket in the previous
//! bucket configuration.
virtual void refresh(const TFloatVec &endpoints) = 0;

//! Check if \p time is in this component's window.
virtual bool inWindow(core_t::TTime time) const = 0;

//! Add the function value at \p time.
virtual void add(std::size_t bucket,
core_t::TTime time,
double offset,
const TDoubleMeanVarAccumulator &value) = 0;
virtual void add(std::size_t bucket, core_t::TTime time, double value, double weight) = 0;

//! Get the offset w.r.t. the start of the bucketing of \p time.
virtual double offset(core_t::TTime time) const = 0;
Expand Down
23 changes: 23 additions & 0 deletions include/maths/CBasicStatistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,23 @@ class MATHS_EXPORT CBasicStatistics
//! Compute the sample median.
static double median(const TDoubleVec &dataIn);

//! Get the largest of the three values \p first, \p second and \p third.
//!
//! Ties are resolved in favour of \p third, and then in favour of
//! \p first over \p second.
template<typename T>
static T max(T first, T second, T third)
{
    // Reduce to a two way comparison: find the larger of the first
    // two arguments and then compare the third against it.
    const T &largerOfFirstTwo = (first >= second) ? first : second;
    return (third >= largerOfFirstTwo) ? third : largerOfFirstTwo;
}

//! Get the smallest of the three values \p first, \p second and \p third.
//!
//! Ties are resolved in favour of \p third, and then in favour of
//! \p first over \p second.
template<typename T>
static T min(T first, T second, T third)
{
    // Reduce to a two way comparison: find the smaller of the first
    // two arguments and then compare the third against it.
    const T &smallerOfFirstTwo = (first <= second) ? first : second;
    return (third <= smallerOfFirstTwo) ? third : smallerOfFirstTwo;
}

/////////////////////////// ACCUMULATORS ///////////////////////////

Expand Down Expand Up @@ -1620,6 +1637,12 @@ class MATHS_EXPORT CBasicStatistics
return m_Max[0];
}

//! Get the range, i.e. the difference m_Max[0] - m_Min[0].
T range(void) const
{
    T spread = m_Max[0] - m_Min[0];
    return spread;
}

//! Get the margin by which all the values have the same sign.
T signMargin(void) const
{
Expand Down
13 changes: 6 additions & 7 deletions include/maths/CCalendarComponentAdaptiveBucketing.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,7 @@ class CSeasonalTime;
class MATHS_EXPORT CCalendarComponentAdaptiveBucketing : private CAdaptiveBucketing
{
public:
typedef CAdaptiveBucketing::TTimeTimePrMeanVarPrVec TTimeTimePrMeanVarPrVec;
typedef CBasicStatistics::SSampleMeanVar<CFloatStorage>::TAccumulator TFloatMeanVarAccumulator;
using TFloatMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<CFloatStorage>::TAccumulator;

public:
CCalendarComponentAdaptiveBucketing(void);
Expand Down Expand Up @@ -160,7 +159,7 @@ class MATHS_EXPORT CCalendarComponentAdaptiveBucketing : private CAdaptiveBucket
//@}

private:
typedef std::vector<TFloatMeanVarAccumulator> TFloatMeanVarVec;
using TFloatMeanVarVec = std::vector<TFloatMeanVarAccumulator>;

private:
//! Restore by traversing a state document
Expand All @@ -174,11 +173,11 @@ class MATHS_EXPORT CCalendarComponentAdaptiveBucketing : private CAdaptiveBucket
//! \param[in] endpoints The old end points.
void refresh(const TFloatVec &endpoints);

//! Check if \p time is in this component's window.
virtual bool inWindow(core_t::TTime time) const;

//! Add the function value to \p bucket.
virtual void add(std::size_t bucket,
core_t::TTime time,
double offset,
const TDoubleMeanVarAccumulator &value);
virtual void add(std::size_t bucket, core_t::TTime time, double value, double weight);

//! Get the offset w.r.t. the start of the bucketing of \p time.
virtual double offset(core_t::TTime time) const;
Expand Down
24 changes: 12 additions & 12 deletions include/maths/CDecompositionComponent.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@ namespace maths
class MATHS_EXPORT CDecompositionComponent
{
public:
typedef maths_t::TDoubleDoublePr TDoubleDoublePr;
typedef std::vector<double> TDoubleVec;
typedef std::vector<CFloatStorage> TFloatVec;
typedef CSpline<boost::reference_wrapper<const TFloatVec>,
boost::reference_wrapper<const TFloatVec>,
boost::reference_wrapper<const TDoubleVec> > TSplineCRef;
typedef CSpline<boost::reference_wrapper<TFloatVec>,
boost::reference_wrapper<TFloatVec>,
boost::reference_wrapper<TDoubleVec> > TSplineRef;
using TDoubleDoublePr = maths_t::TDoubleDoublePr;
using TDoubleVec = std::vector<double>;
using TFloatVec = std::vector<CFloatStorage>;
using TSplineCRef = CSpline<boost::reference_wrapper<const TFloatVec>,
boost::reference_wrapper<const TFloatVec>,
boost::reference_wrapper<const TDoubleVec>>;
using TSplineRef = CSpline<boost::reference_wrapper<TFloatVec>,
boost::reference_wrapper<TFloatVec>,
boost::reference_wrapper<TDoubleVec>>;

public:
//! Persist state by passing information to \p inserter.
Expand All @@ -72,9 +72,9 @@ class MATHS_EXPORT CDecompositionComponent
};

public:
typedef boost::array<CSplineTypes::EType, 2> TTypeArray;
typedef boost::array<TFloatVec, 2> TFloatVecArray;
typedef boost::array<TDoubleVec, 2> TDoubleVecArray;
using TTypeArray = boost::array<CSplineTypes::EType, 2>;
using TFloatVecArray = boost::array<TFloatVec, 2>;
using TDoubleVecArray = boost::array<TDoubleVec, 2>;

public:
CPackedSplines(CSplineTypes::EType valueInterpolationType,
Expand Down
135 changes: 135 additions & 0 deletions include/maths/CExpandingWindow.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/*
* ELASTICSEARCH CONFIDENTIAL
*
* Copyright (c) 2018 Elasticsearch BV. All Rights Reserved.
*
* Notice: this software, and all information contained
* therein, is the exclusive property of Elasticsearch BV
* and its licensors, if any, and is protected under applicable
* domestic and foreign law, and international treaties.
*
* Reproduction, republication or distribution without the
* express written consent of Elasticsearch BV is
* strictly prohibited.
*/

#ifndef INCLUDED_ml_maths_CExpandingWindow_h
#define INCLUDED_ml_maths_CExpandingWindow_h

#include <core/CFloatStorage.h>
#include <core/CoreTypes.h>
#include <core/CVectorRange.h>

#include <maths/CBasicStatistics.h>
#include <maths/ImportExport.h>

#include <cstddef>
#include <functional>
#include <vector>

namespace ml
{
namespace core
{
class CStatePersistInserter;
class CStateRestoreTraverser;
}

namespace maths
{

//! \brief Implements a fixed memory expanding time window.
//!
//! DESCRIPTION:\n
//! As the window expands it compresses by merging adjacent values
//! and maintaining means of merged values. It cycles through a
//! sequence of increasing compression factors, which are determined
//! by a sequence of increasing bucketing lengths supplied to the
//! constructor. At the point it overflows, i.e. time since the
//! beginning of the window exceeds "size" x "maximum bucket length",
//! it will re-initialize the bucketing and update the start time.
class MATHS_EXPORT CExpandingWindow
{
public:
using TDoubleVec = std::vector<double>;
using TTimeVec = std::vector<core_t::TTime>;
//! A range over a constant vector of times.
using TTimeCRng = core::CVectorRange<const TTimeVec>;
using TFloatMeanAccumulator = CBasicStatistics::SSampleMean<CFloatStorage>::TAccumulator;
using TFloatMeanAccumulatorVec = std::vector<TFloatMeanAccumulator>;
//! A function which maps a time to a predicted value.
using TPredictor = std::function<double (core_t::TTime)>;

public:
//! \param[in] bucketLength The data bucketing length.
//! \param[in] bucketLengths The increasing sequence of bucket lengths
//! which the window cycles through as it compresses.
//! \param[in] size The "size" referred to in the class description,
//! presumably the number of buckets (TODO confirm).
//! \param[in] decayRate The rate at which the bucket values are aged.
CExpandingWindow(core_t::TTime bucketLength,
TTimeCRng bucketLengths,
std::size_t size,
double decayRate = 0.0);

//! Initialize by reading state from \p traverser.
bool acceptRestoreTraverser(core::CStateRestoreTraverser &traverser);

//! Persist state by passing information to \p inserter.
void acceptPersistInserter(core::CStatePersistInserter &inserter) const;

//! Get the start time of the window.
core_t::TTime startTime() const;

//! Get the end time of the window.
core_t::TTime endTime() const;

//! Get the current bucket length.
core_t::TTime bucketLength() const;

//! Get the bucket values.
const TFloatMeanAccumulatorVec &values() const;

//! Get the bucket values minus the values from \p predictor.
TFloatMeanAccumulatorVec valuesMinusPrediction(const TPredictor &predictor) const;

//! Set the start time to \p time.
void initialize(core_t::TTime time);

//! Age the bucket values to account for \p time elapsed time.
void propagateForwardsByTime(double time);

//! Add \p value at \p time.
//!
//! \param[in] time The time of \p value.
//! \param[in] value The value to add.
//! \param[in] weight The weight given to \p value, defaults to one.
void add(core_t::TTime time, double value, double weight = 1.0);

//! Check if we need to compress by increasing the bucket span.
bool needToCompress(core_t::TTime time) const;

//! Get a checksum for this object.
uint64_t checksum(uint64_t seed = 0) const;

//! Debug the memory used by this object.
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;

//! Get the memory used by this object.
std::size_t memoryUsage() const;

private:
//! The rate at which the bucket values are aged.
double m_DecayRate;

//! The data bucketing length.
core_t::TTime m_BucketLength;

//! The bucket lengths to test.
//!
//! NOTE(review): this is stored as a range over a vector supplied
//! to the constructor; if core::CVectorRange is a non-owning view
//! then that vector must outlive this object - confirm.
TTimeCRng m_BucketLengths;

//! The index in m_BucketLengths of the current bucketing interval.
std::size_t m_BucketLengthIndex;

//! The time of the first data point.
core_t::TTime m_StartTime;

//! The bucket values.
TFloatMeanAccumulatorVec m_BucketValues;

//! The mean value time modulo the data bucketing length.
TFloatMeanAccumulator m_MeanOffset;
};

}
}

#endif // INCLUDED_ml_maths_CExpandingWindow_h
23 changes: 19 additions & 4 deletions include/maths/CGammaRateConjugate.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,19 @@ class MATHS_EXPORT CGammaRateConjugate : public CPrior
//! \param[in] priorShape The shape parameter of the gamma prior.
//! \param[in] priorRate The rate parameter of the gamma prior.
//! \param[in] decayRate The rate at which to revert to non-informative.
//! \param[in] offsetMargin The margin between the smallest value and the support
//! left end.
CGammaRateConjugate(maths_t::EDataType dataType,
double offset,
double priorShape,
double priorRate,
double decayRate = 0.0);
double decayRate = 0.0,
double offsetMargin = GAMMA_OFFSET_MARGIN);

//! Construct by traversing a state document.
CGammaRateConjugate(const SDistributionRestoreParams &params,
core::CStateRestoreTraverser &traverser);
core::CStateRestoreTraverser &traverser,
double offsetMargin = GAMMA_OFFSET_MARGIN);

// Default copy constructor and assignment operator work.

Expand All @@ -103,10 +107,13 @@ class MATHS_EXPORT CGammaRateConjugate : public CPrior
//! for details).
//! \param[in] offset The offset to apply to the data.
//! \param[in] decayRate The rate at which to revert to the non-informative prior.
//! \param[in] offsetMargin The margin between the smallest value and the support
//! left end.
//! \return A non-informative prior.
static CGammaRateConjugate nonInformativePrior(maths_t::EDataType dataType,
double offset = 0.0,
double decayRate = 0.0);
double decayRate = 0.0,
double offsetMargin = GAMMA_OFFSET_MARGIN);
//@}

//! \name Prior Contract
Expand All @@ -123,7 +130,12 @@ class MATHS_EXPORT CGammaRateConjugate : public CPrior
//! Reset the prior to non-informative.
virtual void setToNonInformative(double offset = 0.0, double decayRate = 0.0);

//! Returns false.
//! Get the margin between the smallest value and the support left
//! end. Priors with non-negative support automatically adjust the
//! offset if a value is seen which is smaller than offset + margin.
virtual double offsetMargin(void) const;

//! Returns true.
virtual bool needsOffset(void) const;

//! Reset m_Offset so the smallest sample is not within some minimum
Expand Down Expand Up @@ -399,6 +411,9 @@ class MATHS_EXPORT CGammaRateConjugate : public CPrior
//! us to model data with negative values greater than \f$-u\f$.
double m_Offset;

//! The margin between the smallest value and the support left end.
double m_OffsetMargin;

//! The maximum likelihood estimate of the shape parameter.
double m_LikelihoodShape;

Expand Down
Loading