Skip to content

Commit 943e3cf

Browse files
authored
[ML] Improve change point detection for long bucket lengths (#95)
1 parent 4e1a83b commit 943e3cf

16 files changed

+323
-229
lines changed

include/maths/CBasicStatistics.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,13 @@ class MATHS_EXPORT CBasicStatistics {
6767
}
6868

6969
//! Compute the sample mean.
70-
static double mean(const TDoubleVec& sample);
70+
static double mean(const TDoubleVec& data);
7171

7272
//! Compute the sample median.
73-
static double median(const TDoubleVec& dataIn);
73+
static double median(const TDoubleVec& data);
74+
75+
//! Compute the median absolute deviation.
76+
static double mad(const TDoubleVec& data);
7477

7578
//! Compute the maximum of \p first, \p second and \p third.
7679
template<typename T>

include/maths/CTimeSeriesChangeDetector.h

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <core/CoreTypes.h>
1515

1616
#include <maths/CBasicStatistics.h>
17+
#include <maths/CRegression.h>
1718
#include <maths/ImportExport.h>
1819
#include <maths/MathsTypes.h>
1920

@@ -90,11 +91,17 @@ class MATHS_EXPORT CUnivariateTimeSeriesChangeDetector {
9091
//! if there has been.
9192
TOptionalChangeDescription change();
9293

93-
//! The function used to decide whether to accept a change.
94-
//! A change is accepted at a value of 1.0 for this function.
94+
//! Get an rough estimate of the chance that the change will
95+
//! eventually be accepted.
96+
double probabilityWillAccept() const;
97+
98+
//! Evaluate the function used to decide whether to accept
99+
//! a change.
100+
//!
101+
//! A change is accepted for values >= 1.0.
95102
//!
96-
//! \param[out] change Filled in with the index of the change
97-
//! the most likely change.
103+
//! \param[out] change Filled in with the index of the most
104+
//! likely change.
98105
double decisionFunction(std::size_t& change) const;
99106

100107
//! Add \p samples to the change detector.
@@ -117,6 +124,7 @@ class MATHS_EXPORT CUnivariateTimeSeriesChangeDetector {
117124
using TChangeModelPtr = std::shared_ptr<TChangeModel>;
118125
using TChangeModelPtr5Vec = core::CSmallVector<TChangeModelPtr, 5>;
119126
using TMinMaxAccumulator = CBasicStatistics::CMinMax<core_t::TTime>;
127+
using TRegression = CRegression::CLeastSquaresOnline<1, double>;
120128

121129
private:
122130
//! The minimum amount of time we need to observe before
@@ -135,8 +143,12 @@ class MATHS_EXPORT CUnivariateTimeSeriesChangeDetector {
135143
//! The count of samples added to the change models.
136144
std::size_t m_SampleCount;
137145

138-
//! The current evidence of a change.
139-
double m_CurrentEvidenceOfChange;
146+
//! The current value of the decision function.
147+
double m_DecisionFunction;
148+
149+
//! A least squares fit to the log of the inverse decision
150+
//! function as a function of time.
151+
TRegression m_LogInvDecisionFunctionTrend;
140152

141153
//! The change models.
142154
TChangeModelPtr5Vec m_ChangeModels;

include/maths/CTimeSeriesDecompositionDetail.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,8 +202,16 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail {
202202
//! Test to see whether any seasonal components are present.
203203
void test(const SAddValue& message);
204204

205-
//! Clear the test identified by \p test.
206-
void clear(ETest test, core_t::TTime time);
205+
//! Clear the test if the shift is large compared to the median
206+
//! absolute deviation in the window.
207+
//!
208+
//! There is no point in continuing to use the historical window
209+
//! if the signal has changed significantly w.r.t. the possible
210+
//! magnitude of any seasonal component. Çonversely, if we detect
211+
//! a small change we don't want to throw a lot of history: since,
212+
//! depending on the false positive rate, we may never accumulate
213+
//! enough history to detect long seasonal components.
214+
void maybeClear(core_t::TTime time, double shift);
207215

208216
//! Age the test to account for the interval \p end - \p start
209217
//! elapsed time.

lib/maths/CAdaptiveBucketing.cc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -290,11 +290,11 @@ void CAdaptiveBucketing::refine(core_t::TTime time) {
290290
// positions. Once they have stabilized on their desired location
291291
// for the trend, we are able to detect this by comparing the
292292
// time averaged desired displacement and the absolute desired
293-
// displacement. The lower the ratio the smaller more smoothing
294-
// we apply. Note we want to damp the noise out because the
295-
// process of adjusting the buckets end points loses a small
296-
// amount of information, see the comments at the start of
297-
// refresh for more details.
293+
// displacement. The lower the ratio the more smoothing we apply.
294+
// Note we want to damp the noise out because the process of
295+
// adjusting the buckets end points loses a small amount of
296+
// information, see the comments at the start of refresh for
297+
// more details.
298298
double alpha{
299299
ALPHA * (CBasicStatistics::mean(m_MeanAbsDesiredDisplacement) == 0.0
300300
? 1.0

lib/maths/CBasicStatistics.cc

Lines changed: 42 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,36 +13,19 @@
1313

1414
namespace ml {
1515
namespace maths {
16+
namespace {
1617

17-
double CBasicStatistics::mean(const TDoubleDoublePr& samples) {
18-
return 0.5 * (samples.first + samples.second);
19-
}
20-
21-
double CBasicStatistics::mean(const TDoubleVec& sample) {
22-
return std::accumulate(sample.begin(), sample.end(), 0.0) /
23-
static_cast<double>(sample.size());
24-
}
18+
//! Compute the median reordering \p samples in the process.
19+
double medianInPlace(std::vector<double>& data) {
20+
std::size_t size{data.size()};
2521

26-
double CBasicStatistics::median(const TDoubleVec& dataIn) {
27-
if (dataIn.empty()) {
28-
return 0.0;
29-
}
30-
31-
std::size_t size{dataIn.size()};
32-
if (size == 1) {
33-
return dataIn[0];
34-
}
35-
36-
TDoubleVec data{dataIn};
37-
38-
// If data size is even (1,2,3,4) then take mean of 2,3 = 2.5
39-
// If data size is odd (1,2,3,4,5) then take middle value = 3
40-
double median{0.0};
22+
// If sample size is even (1,2,3,4) then take mean of 2,3 = 2.5
23+
// If sample size is odd (1,2,3,4,5) then take middle value = 3
24+
bool useMean{size % 2 == 0};
4125

4226
// For an odd number of elements, this will get the median element into
4327
// place. For an even number of elements, it will get the second element
4428
// of the middle pair into place.
45-
bool useMean{size % 2 == 0};
4629
size_t index{size / 2};
4730
std::nth_element(data.begin(), data.begin() + index, data.end());
4831

@@ -52,12 +35,43 @@ double CBasicStatistics::median(const TDoubleVec& dataIn) {
5235
// before the nth one in the vector.
5336
auto left = std::max_element(data.begin(), data.begin() + index);
5437

55-
median = (*left + data[index]) / 2.0;
56-
} else {
57-
median = data[index];
38+
return (*left + data[index]) / 2.0;
39+
}
40+
41+
return data[index];
42+
}
43+
}
44+
45+
double CBasicStatistics::mean(const TDoubleDoublePr& data) {
46+
return 0.5 * (data.first + data.second);
47+
}
48+
49+
double CBasicStatistics::mean(const TDoubleVec& data) {
50+
return std::accumulate(data.begin(), data.end(), 0.0) /
51+
static_cast<double>(data.size());
52+
}
53+
54+
double CBasicStatistics::median(const TDoubleVec& data_) {
55+
if (data_.empty()) {
56+
return 0.0;
57+
}
58+
if (data_.size() == 1) {
59+
return data_[0];
5860
}
61+
TDoubleVec data{data_};
62+
return medianInPlace(data);
63+
}
5964

60-
return median;
65+
double CBasicStatistics::mad(const TDoubleVec& data_) {
66+
if (data_.size() < 2) {
67+
return 0.0;
68+
}
69+
TDoubleVec data{data_};
70+
double median{medianInPlace(data)};
71+
for (auto& datum : data) {
72+
datum = std::fabs(datum - median);
73+
}
74+
return medianInPlace(data);
6175
}
6276

6377
const char CBasicStatistics::INTERNAL_DELIMITER(':');

lib/maths/CModel.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,8 @@ CModelParams::CModelParams(core_t::TTime bucketLength,
7474
core_t::TTime maximumTimeToTestForChange)
7575
: m_BucketLength(bucketLength), m_LearnRate(learnRate), m_DecayRate(decayRate),
7676
m_MinimumSeasonalVarianceScale(minimumSeasonalVarianceScale),
77-
m_MinimumTimeToDetectChange(std::max(minimumTimeToDetectChange, 12 * bucketLength)),
78-
m_MaximumTimeToTestForChange(std::max(maximumTimeToTestForChange, 48 * bucketLength)),
77+
m_MinimumTimeToDetectChange(std::max(minimumTimeToDetectChange, 6 * bucketLength)),
78+
m_MaximumTimeToTestForChange(std::max(maximumTimeToTestForChange, 12 * bucketLength)),
7979
m_ProbabilityBucketEmpty(0.0) {
8080
}
8181

@@ -100,7 +100,7 @@ double CModelParams::minimumSeasonalVarianceScale() const {
100100
}
101101

102102
bool CModelParams::testForChange(core_t::TTime changeInterval) const {
103-
return changeInterval >= std::max(3 * m_BucketLength, 10 * core::constants::MINUTE);
103+
return changeInterval >= std::max(3 * m_BucketLength, core::constants::HOUR);
104104
}
105105

106106
core_t::TTime CModelParams::minimumTimeToDetectChange(void) const {

lib/maths/CPeriodicityHypothesisTests.cc

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1434,7 +1434,7 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec& windows,
14341434
}
14351435

14361436
double scale{1.0 / stats.s_M};
1437-
LOG_TRACE(<< "scale = " << scale);
1437+
LOG_TRACE(<< " scale = " << scale);
14381438

14391439
double v{residualVariance<double>(trend, scale)};
14401440
v = varianceAtPercentile(v, df1, 50.0 + CONFIDENCE_INTERVAL / 2.0);
@@ -1503,12 +1503,10 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec& windows,
15031503
std::size_t n{static_cast<std::size_t>(
15041504
std::ceil(Rt * static_cast<double>(windowLength / period_)))};
15051505
double at{stats.s_At * std::sqrt(v0 / scale)};
1506-
LOG_TRACE(<< " n = " << n << ", at = " << at << ", v = " << v);
1506+
LOG_TRACE(<< " n = " << n << ", at = " << at << ", v = " << v);
15071507
TMeanAccumulator level;
15081508
for (const auto& value : values) {
1509-
if (CBasicStatistics::count(value) > 0.0) {
1510-
level.add(CBasicStatistics::mean(value));
1511-
}
1509+
level.add(CBasicStatistics::mean(value), CBasicStatistics::count(value));
15121510
}
15131511
TMinAmplitudeVec amplitudes(period, {n, CBasicStatistics::mean(level)});
15141512
periodicTrend(values, window, m_BucketLength, amplitudes);

0 commit comments

Comments
 (0)