Skip to content

Commit 10d4912

Browse files
authored
[ML] Wire in change detection/modelling to our univariate time series model (#11)
This wires in change detection and starts some unit testing of CTimeSeriesModel with change points. There is some more work to be done to avoid using level shifts to try and fit other types of change points, such as scaling.
1 parent 4052390 commit 10d4912

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+2535
-1556
lines changed

include/maths/CBasicStatistics.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,16 @@ class MATHS_EXPORT CBasicStatistics
221221
}
222222
}
223223

224+
//! Update the moments with the collection \p x.
225+
template<typename U, std::size_t N>
226+
void add(const core::CSmallVector<U, N> &x)
227+
{
228+
for (const auto &xi : x)
229+
{
230+
this->add(xi);
231+
}
232+
}
233+
224234
//! Update the moments with the collection \p x.
225235
template<typename U>
226236
void add(const std::vector<SSampleCentralMoments<U, ORDER>> &x)

include/maths/CModel.h

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,11 @@ class MATHS_EXPORT CModelParams
4949
{
5050
public:
5151
CModelParams(core_t::TTime bucketLength,
52-
const double &learnRate,
53-
const double &decayRate,
54-
double minimumSeasonalVarianceScale);
52+
double learnRate,
53+
double decayRate,
54+
double minimumSeasonalVarianceScale,
55+
core_t::TTime minimumTimeToDetectChange,
56+
core_t::TTime maximumTimeToTestForChange);
5557

5658
//! Get the bucket length.
5759
core_t::TTime bucketLength(void) const;
@@ -68,6 +70,15 @@ class MATHS_EXPORT CModelParams
6870
//! Get the minimum seasonal variance scale.
6971
double minimumSeasonalVarianceScale(void) const;
7072

73+
//! Check if we should start testing for a change point in the model.
74+
bool testForChange(core_t::TTime changeInterval) const;
75+
76+
//! Get the minimum time to detect a change point in the model.
77+
core_t::TTime minimumTimeToDetectChange(core_t::TTime timeSinceLastChangePoint) const;
78+
79+
//! Get the maximum time to test for a change point in the model.
80+
core_t::TTime maximumTimeToTestForChange(void) const;
81+
7182
//! Set the probability that the bucket will be empty for the model.
7283
void probabilityBucketEmpty(double probability);
7384

@@ -83,6 +94,10 @@ class MATHS_EXPORT CModelParams
8394
double m_DecayRate;
8495
//! The minimum seasonal variance scale.
8596
double m_MinimumSeasonalVarianceScale;
97+
//! The minimum time permitted to detect a change in the model.
98+
core_t::TTime m_MinimumTimeToDetectChange;
99+
//! The maximum time permitted to test for a change in the model.
100+
core_t::TTime m_MaximumTimeToTestForChange;
86101
//! The probability that a bucket will be empty for the model.
87102
double m_ProbabilityBucketEmpty;
88103
};
@@ -97,8 +112,6 @@ class MATHS_EXPORT CModelAddSamplesParams
97112

98113
public:
99114
CModelAddSamplesParams(void);
100-
CModelAddSamplesParams(const CModelAddSamplesParams &) = delete;
101-
const CModelAddSamplesParams &operator=(const CModelAddSamplesParams &) = delete;
102115

103116
//! Set whether or not the data are integer valued.
104117
CModelAddSamplesParams &integer(bool integer);
@@ -160,8 +173,6 @@ class MATHS_EXPORT CModelProbabilityParams
160173

161174
public:
162175
CModelProbabilityParams(void);
163-
CModelProbabilityParams(const CModelAddSamplesParams &) = delete;
164-
const CModelProbabilityParams &operator=(const CModelAddSamplesParams &) = delete;
165176

166177
//! Set the tag for the entity for which to compute the probability.
167178
CModelProbabilityParams &tag(std::size_t tag);
@@ -278,6 +289,9 @@ class MATHS_EXPORT CModel
278289
E_Reset //!< Model reset.
279290
};
280291

292+
//! Combine the results \p lhs and \p rhs.
293+
static EUpdateResult combine(EUpdateResult lhs, EUpdateResult rhs);
294+
281295
public:
282296
CModel(const CModelParams &params);
283297
virtual ~CModel(void) = default;

include/maths/CNaiveBayes.h

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111

1212
#include <maths/CPrior.h>
1313

14+
#include <boost/optional.hpp>
1415
#include <boost/unordered_map.hpp>
1516

1617
#include <cstddef>
18+
#include <string>
1719
#include <vector>
1820

1921
namespace ml
@@ -49,12 +51,18 @@ class MATHS_EXPORT CNaiveBayesFeatureDensity
4951
//! Persist state by passing information to \p inserter.
5052
virtual void acceptPersistInserter(core::CStatePersistInserter &inserter) const = 0;
5153

54+
//! Set the data type.
55+
virtual void dataType(maths_t::EDataType dataType) = 0;
56+
5257
//! Add the value \p x.
5358
virtual void add(const TDouble1Vec &x) = 0;
5459

5560
//! Compute the log value of the density function at \p x.
5661
virtual double logValue(const TDouble1Vec &x) const = 0;
5762

63+
//! Compute the log value of the density function at its mode.
64+
virtual double logMaximumValue() const = 0;
65+
5866
//! Age out old values from the density to account for \p time passing.
5967
virtual void propagateForwardsByTime(double time) = 0;
6068

@@ -69,6 +77,9 @@ class MATHS_EXPORT CNaiveBayesFeatureDensity
6977

7078
//! Get a checksum for this object.
7179
virtual uint64_t checksum(uint64_t seed) const = 0;
80+
81+
//! Get a human readable description of the class density function.
82+
virtual std::string print() const = 0;
7283
};
7384

7485
//! \brief An implementation of the class conditional density function
@@ -77,7 +88,7 @@ class MATHS_EXPORT CNaiveBayesFeatureDensityFromPrior final : public CNaiveBayes
7788
{
7889
public:
7990
CNaiveBayesFeatureDensityFromPrior() = default;
80-
CNaiveBayesFeatureDensityFromPrior(CPrior &prior);
91+
CNaiveBayesFeatureDensityFromPrior(const CPrior &prior);
8192

8293
//! Create and return a clone.
8394
//!
@@ -97,6 +108,12 @@ class MATHS_EXPORT CNaiveBayesFeatureDensityFromPrior final : public CNaiveBayes
97108
//! Compute the log value of the density function at \p x.
98109
virtual double logValue(const TDouble1Vec &x) const;
99110

111+
//! Compute the log value of the density function at its mode.
112+
virtual double logMaximumValue() const;
113+
114+
//! Set the data type.
115+
virtual void dataType(maths_t::EDataType dataType);
116+
100117
//! Age out old values from the density to account for \p time passing.
101118
virtual void propagateForwardsByTime(double time);
102119

@@ -112,6 +129,9 @@ class MATHS_EXPORT CNaiveBayesFeatureDensityFromPrior final : public CNaiveBayes
112129
//! Get a checksum for this object.
113130
virtual uint64_t checksum(uint64_t seed) const;
114131

132+
//! Get a human readable description of the class density function.
133+
virtual std::string print() const;
134+
115135
private:
116136
using TPriorPtr = boost::shared_ptr<CPrior>;
117137

@@ -128,16 +148,24 @@ class MATHS_EXPORT CNaiveBayes
128148
using TDoubleSizePrVec = std::vector<TDoubleSizePr>;
129149
using TDouble1Vec = core::CSmallVector<double, 1>;
130150
using TDouble1VecVec = std::vector<TDouble1Vec>;
151+
using TOptionalDouble = boost::optional<double>;
131152

132153
public:
133154
explicit CNaiveBayes(const CNaiveBayesFeatureDensity &exemplar,
134-
double decayRate = 0.0);
155+
double decayRate = 0.0,
156+
TOptionalDouble minMaxLogLikelihoodToUseFeature = TOptionalDouble());
135157
CNaiveBayes(const SDistributionRestoreParams &params,
136158
core::CStateRestoreTraverser &traverser);
137159

138160
//! Persist state by passing information to \p inserter.
139161
void acceptPersistInserter(core::CStatePersistInserter &inserter) const;
140162

163+
//! Efficiently swap the contents of this and \p other.
164+
void swap(CNaiveBayes &other);
165+
166+
//! Check if any training data has been added.
167+
bool initialized() const;
168+
141169
//! This can be used to optionally seed the class counts
142170
//! with \p counts. These are added on to data class counts
143171
//! to compute the class posterior probabilities.
@@ -153,11 +181,14 @@ class MATHS_EXPORT CNaiveBayes
153181
//! for that feature.
154182
void addTrainingDataPoint(std::size_t label, const TDouble1VecVec &x);
155183

184+
//! Set the data type.
185+
void dataType(maths_t::EDataType dataType);
186+
156187
//! Age out old values from the class conditional densities
157188
//! to account for \p time passing.
158189
void propagateForwardsByTime(double time);
159190

160-
//! Get the top \p n class probabilities for \p features.
191+
//! Get the top \p n class probabilities for \p x.
161192
//!
162193
//! \param[in] n The number of class probabilities to estimate.
163194
//! \param[in] x The feature values.
@@ -167,6 +198,23 @@ class MATHS_EXPORT CNaiveBayes
167198
TDoubleSizePrVec highestClassProbabilities(std::size_t n,
168199
const TDouble1VecVec &x) const;
169200

201+
//! Get the probability of the class labeled \p label for \p x.
202+
//!
203+
//! \param[in] label The label of the class of interest.
204+
//! \param[in] x The feature values.
205+
//! \note \p x size should be equal to the number of features.
206+
//! A missing feature is indicated by passing an empty vector
207+
//! for that feature.
208+
double classProbability(std::size_t label, const TDouble1VecVec &x) const;
209+
210+
//! Get the probabilities of all the classes for \p x.
211+
//!
212+
//! \param[in] x The feature values.
213+
//! \note \p x size should be equal to the number of features.
214+
//! A missing feature is indicated by passing an empty vector
215+
//! for that feature.
216+
TDoubleSizePrVec classProbabilities(const TDouble1VecVec &x) const;
217+
170218
//! Debug the memory used by this object.
171219
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
172220

@@ -176,6 +224,9 @@ class MATHS_EXPORT CNaiveBayes
176224
//! Get a checksum for this object.
177225
uint64_t checksum(uint64_t seed = 0) const;
178226

227+
//! Get a human readable description of the classifier.
228+
std::string print() const;
229+
179230
private:
180231
using TFeatureDensityPtr = boost::shared_ptr<CNaiveBayesFeatureDensity>;
181232
using TFeatureDensityPtrVec = std::vector<TFeatureDensityPtr>;
@@ -212,6 +263,13 @@ class MATHS_EXPORT CNaiveBayes
212263
bool validate(const TDouble1VecVec &x) const;
213264

214265
private:
266+
//! It is not always appropriate to use features with very low
267+
//! probability in all classes to discriminate: the class choice
268+
//! will be very sensitive to the underlying conditional density
269+
//! model. This is a cutoff (for the minimum maximum class log
270+
//! likelihood) in order to use a feature.
271+
TOptionalDouble m_MinMaxLogLikelihoodToUseFeature;
272+
215273
//! Controls the rate at which data are aged out.
216274
double m_DecayRate;
217275

include/maths/CRestoreParams.h

Lines changed: 30 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
#include <core/CoreTypes.h>
1111

12+
#include <maths/Constants.h>
1213
#include <maths/ImportExport.h>
1314
#include <maths/MathsTypes.h>
1415

@@ -20,33 +21,15 @@ namespace maths
2021
{
2122
class CModelParams;
2223

23-
//! \brief Gatherers up extra parameters supplied when restoring
24-
//! time series decompositions.
25-
struct MATHS_EXPORT STimeSeriesDecompositionRestoreParams
26-
{
27-
STimeSeriesDecompositionRestoreParams(double decayRate,
28-
core_t::TTime minimumBucketLength,
29-
std::size_t componentSize);
30-
31-
//! The rate at which decomposition loses information.
32-
double s_DecayRate;
33-
34-
//! The data bucket length.
35-
core_t::TTime s_MinimumBucketLength;
36-
37-
//! The decomposition seasonal component size.
38-
std::size_t s_ComponentSize;
39-
};
40-
4124
//! \brief Gathers up extra parameters supplied when restoring
4225
//! distribution models.
4326
struct MATHS_EXPORT SDistributionRestoreParams
4427
{
4528
SDistributionRestoreParams(maths_t::EDataType dataType,
4629
double decayRate,
47-
double minimumClusterFraction,
48-
double minimumClusterCount,
49-
double minimumCategoryCount);
30+
double minimumClusterFraction = MINIMUM_CLUSTER_SPLIT_FRACTION,
31+
double minimumClusterCount = MINIMUM_CLUSTER_SPLIT_COUNT,
32+
double minimumCategoryCount = MINIMUM_CATEGORY_COUNT);
5033

5134
//! The type of data being clustered.
5235
maths_t::EDataType s_DataType;
@@ -66,6 +49,31 @@ struct MATHS_EXPORT SDistributionRestoreParams
6649

6750
//! \brief Gathers up extra parameters supplied when restoring
6851
//! time series decompositions.
52+
struct MATHS_EXPORT STimeSeriesDecompositionRestoreParams
53+
{
54+
STimeSeriesDecompositionRestoreParams(double decayRate,
55+
core_t::TTime minimumBucketLength,
56+
std::size_t componentSize,
57+
const SDistributionRestoreParams &changeModelParams);
58+
STimeSeriesDecompositionRestoreParams(double decayRate,
59+
core_t::TTime minimumBucketLength,
60+
const SDistributionRestoreParams &changeModelParams);
61+
62+
//! The rate at which decomposition loses information.
63+
double s_DecayRate;
64+
65+
//! The data bucket length.
66+
core_t::TTime s_MinimumBucketLength;
67+
68+
//! The decomposition seasonal component size.
69+
std::size_t s_ComponentSize;
70+
71+
//! The change model distributions' restore parameters.
72+
SDistributionRestoreParams s_ChangeModelParams;
73+
};
74+
75+
//! \brief Gathers up extra parameters supplied when restoring
76+
//! time series models.
6977
struct MATHS_EXPORT SModelRestoreParams
7078
{
7179
using TModelParamsCRef = boost::reference_wrapper<const CModelParams>;
@@ -80,7 +88,7 @@ struct MATHS_EXPORT SModelRestoreParams
8088
//! The time series decomposition restore parameters.
8189
STimeSeriesDecompositionRestoreParams s_DecompositionParams;
8290

83-
//! The time series decomposition restore parameters.
91+
//! The time series residual distribution restore parameters.
8492
SDistributionRestoreParams s_DistributionParams;
8593
};
8694

0 commit comments

Comments
 (0)