Skip to content

[ML] Improve change point detection for long bucket lengths #95

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions include/maths/CBasicStatistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,13 @@ class MATHS_EXPORT CBasicStatistics {
}

//! Compute the sample mean.
static double mean(const TDoubleVec& sample);
static double mean(const TDoubleVec& data);

//! Compute the sample median.
static double median(const TDoubleVec& dataIn);
static double median(const TDoubleVec& data);

//! Compute the median absolute deviation.
static double mad(const TDoubleVec& data);

//! Compute the maximum of \p first, \p second and \p third.
template<typename T>
Expand Down
24 changes: 18 additions & 6 deletions include/maths/CTimeSeriesChangeDetector.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <core/CoreTypes.h>

#include <maths/CBasicStatistics.h>
#include <maths/CRegression.h>
#include <maths/ImportExport.h>
#include <maths/MathsTypes.h>

Expand Down Expand Up @@ -90,11 +91,17 @@ class MATHS_EXPORT CUnivariateTimeSeriesChangeDetector {
//! if there has been.
TOptionalChangeDescription change();

//! The function used to decide whether to accept a change.
//! A change is accepted at a value of 1.0 for this function.
//! Get a rough estimate of the chance that the change will
//! eventually be accepted.
double probabilityWillAccept() const;

//! Evaluate the function used to decide whether to accept
//! a change.
//!
//! A change is accepted for values >= 1.0.
//!
//! \param[out] change Filled in with the index of the change
//! the most likely change.
//! \param[out] change Filled in with the index of the most
//! likely change.
double decisionFunction(std::size_t& change) const;

//! Add \p samples to the change detector.
Expand All @@ -117,6 +124,7 @@ class MATHS_EXPORT CUnivariateTimeSeriesChangeDetector {
using TChangeModelPtr = std::shared_ptr<TChangeModel>;
using TChangeModelPtr5Vec = core::CSmallVector<TChangeModelPtr, 5>;
using TMinMaxAccumulator = CBasicStatistics::CMinMax<core_t::TTime>;
using TRegression = CRegression::CLeastSquaresOnline<1, double>;

private:
//! The minimum amount of time we need to observe before
Expand All @@ -135,8 +143,12 @@ class MATHS_EXPORT CUnivariateTimeSeriesChangeDetector {
//! The count of samples added to the change models.
std::size_t m_SampleCount;

//! The current evidence of a change.
double m_CurrentEvidenceOfChange;
//! The current value of the decision function.
double m_DecisionFunction;

//! A least squares fit to the log of the inverse decision
//! function as a function of time.
TRegression m_LogInvDecisionFunctionTrend;

//! The change models.
TChangeModelPtr5Vec m_ChangeModels;
Expand Down
12 changes: 10 additions & 2 deletions include/maths/CTimeSeriesDecompositionDetail.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,16 @@ class MATHS_EXPORT CTimeSeriesDecompositionDetail {
//! Test to see whether any seasonal components are present.
void test(const SAddValue& message);

//! Clear the test identified by \p test.
void clear(ETest test, core_t::TTime time);
//! Clear the test if the shift is large compared to the median
//! absolute deviation in the window.
//!
//! There is no point in continuing to use the historical window
//! if the signal has changed significantly w.r.t. the possible
//! magnitude of any seasonal component. Conversely, if we detect
//! a small change we don't want to throw away a lot of history
//! since, depending on the false positive rate, we may never
//! accumulate enough history to detect long seasonal components.
void maybeClear(core_t::TTime time, double shift);

//! Age the test to account for the interval \p end - \p start
//! elapsed time.
Expand Down
10 changes: 5 additions & 5 deletions lib/maths/CAdaptiveBucketing.cc
Original file line number Diff line number Diff line change
Expand Up @@ -290,11 +290,11 @@ void CAdaptiveBucketing::refine(core_t::TTime time) {
// positions. Once they have stabilized on their desired location
// for the trend, we are able to detect this by comparing the
// time averaged desired displacement and the absolute desired
// displacement. The lower the ratio the smaller more smoothing
// we apply. Note we want to damp the noise out because the
// process of adjusting the buckets end points loses a small
// amount of information, see the comments at the start of
// refresh for more details.
// displacement. The lower the ratio the more smoothing we apply.
// Note we want to damp the noise out because the process of
// adjusting the bucket end points loses a small amount of
// information; see the comments at the start of refresh for
// more details.
double alpha{
ALPHA * (CBasicStatistics::mean(m_MeanAbsDesiredDisplacement) == 0.0
? 1.0
Expand Down
70 changes: 42 additions & 28 deletions lib/maths/CBasicStatistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,36 +13,19 @@

namespace ml {
namespace maths {
namespace {

double CBasicStatistics::mean(const TDoubleDoublePr& samples) {
return 0.5 * (samples.first + samples.second);
}

double CBasicStatistics::mean(const TDoubleVec& sample) {
return std::accumulate(sample.begin(), sample.end(), 0.0) /
static_cast<double>(sample.size());
}
//! Compute the median, reordering \p data in the process.
double medianInPlace(std::vector<double>& data) {
std::size_t size{data.size()};

double CBasicStatistics::median(const TDoubleVec& dataIn) {
if (dataIn.empty()) {
return 0.0;
}

std::size_t size{dataIn.size()};
if (size == 1) {
return dataIn[0];
}

TDoubleVec data{dataIn};

// If data size is even (1,2,3,4) then take mean of 2,3 = 2.5
// If data size is odd (1,2,3,4,5) then take middle value = 3
double median{0.0};
// If sample size is even (1,2,3,4) then take mean of 2,3 = 2.5
// If sample size is odd (1,2,3,4,5) then take middle value = 3
bool useMean{size % 2 == 0};

// For an odd number of elements, this will get the median element into
// place. For an even number of elements, it will get the second element
// of the middle pair into place.
bool useMean{size % 2 == 0};
size_t index{size / 2};
std::nth_element(data.begin(), data.begin() + index, data.end());

Expand All @@ -52,12 +35,43 @@ double CBasicStatistics::median(const TDoubleVec& dataIn) {
// before the nth one in the vector.
auto left = std::max_element(data.begin(), data.begin() + index);

median = (*left + data[index]) / 2.0;
} else {
median = data[index];
return (*left + data[index]) / 2.0;
}

return data[index];
}
}

//! Compute the mean of the two values held in \p data, i.e. half
//! the sum of its first and second members.
double CBasicStatistics::mean(const TDoubleDoublePr& data) {
    const auto sum = data.first + data.second;
    return 0.5 * sum;
}

double CBasicStatistics::mean(const TDoubleVec& data) {
return std::accumulate(data.begin(), data.end(), 0.0) /
static_cast<double>(data.size());
}

double CBasicStatistics::median(const TDoubleVec& data_) {
if (data_.empty()) {
return 0.0;
}
if (data_.size() == 1) {
return data_[0];
}
TDoubleVec data{data_};
return medianInPlace(data);
}

return median;
double CBasicStatistics::mad(const TDoubleVec& data_) {
if (data_.size() < 2) {
return 0.0;
}
TDoubleVec data{data_};
double median{medianInPlace(data)};
for (auto& datum : data) {
datum = std::fabs(datum - median);
}
return medianInPlace(data);
}

const char CBasicStatistics::INTERNAL_DELIMITER(':');
Expand Down
6 changes: 3 additions & 3 deletions lib/maths/CModel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ CModelParams::CModelParams(core_t::TTime bucketLength,
core_t::TTime maximumTimeToTestForChange)
: m_BucketLength(bucketLength), m_LearnRate(learnRate), m_DecayRate(decayRate),
m_MinimumSeasonalVarianceScale(minimumSeasonalVarianceScale),
m_MinimumTimeToDetectChange(std::max(minimumTimeToDetectChange, 12 * bucketLength)),
m_MaximumTimeToTestForChange(std::max(maximumTimeToTestForChange, 48 * bucketLength)),
m_MinimumTimeToDetectChange(std::max(minimumTimeToDetectChange, 6 * bucketLength)),
m_MaximumTimeToTestForChange(std::max(maximumTimeToTestForChange, 12 * bucketLength)),
m_ProbabilityBucketEmpty(0.0) {
}

Expand All @@ -100,7 +100,7 @@ double CModelParams::minimumSeasonalVarianceScale() const {
}

bool CModelParams::testForChange(core_t::TTime changeInterval) const {
return changeInterval >= std::max(3 * m_BucketLength, 10 * core::constants::MINUTE);
return changeInterval >= std::max(3 * m_BucketLength, core::constants::HOUR);
}

core_t::TTime CModelParams::minimumTimeToDetectChange(void) const {
Expand Down
8 changes: 3 additions & 5 deletions lib/maths/CPeriodicityHypothesisTests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1434,7 +1434,7 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec& windows,
}

double scale{1.0 / stats.s_M};
LOG_TRACE(<< "scale = " << scale);
LOG_TRACE(<< " scale = " << scale);

double v{residualVariance<double>(trend, scale)};
v = varianceAtPercentile(v, df1, 50.0 + CONFIDENCE_INTERVAL / 2.0);
Expand Down Expand Up @@ -1503,12 +1503,10 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec& windows,
std::size_t n{static_cast<std::size_t>(
std::ceil(Rt * static_cast<double>(windowLength / period_)))};
double at{stats.s_At * std::sqrt(v0 / scale)};
LOG_TRACE(<< " n = " << n << ", at = " << at << ", v = " << v);
LOG_TRACE(<< " n = " << n << ", at = " << at << ", v = " << v);
TMeanAccumulator level;
for (const auto& value : values) {
if (CBasicStatistics::count(value) > 0.0) {
level.add(CBasicStatistics::mean(value));
}
level.add(CBasicStatistics::mean(value), CBasicStatistics::count(value));
}
TMinAmplitudeVec amplitudes(period, {n, CBasicStatistics::mean(level)});
periodicTrend(values, window, m_BucketLength, amplitudes);
Expand Down
Loading