Skip to content

Commit 54c6257

Browse files
authored
[ML] Fix sparse data edge cases for periodicity testing (#28)
This fixes issue #20. Digging into the root cause, the failures were all down to very sparse data over the window we maintain to test for periodicity. This exposed the need to place a lower bound on the count of buckets with periodic repeats when testing for periodic partitions.
1 parent 188724f commit 54c6257

File tree

2 files changed

+55
-32
lines changed

2 files changed

+55
-32
lines changed

include/maths/CPeriodicityHypothesisTests.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -334,14 +334,15 @@ class MATHS_EXPORT CPeriodicityHypothesisTests
334334
testForDailyWithWeekend(const TFloatMeanAccumulatorCRng &buckets,
335335
STestStats &stats) const;
336336

337-
//! Test for a weekday/end partition with weekly .
337+
//! Test for a weekly period given we think there is a
338+
//! weekday/end partition.
338339
CPeriodicityHypothesisTestsResult
339340
testForWeeklyGivenDailyWithWeekend(const TTimeTimePr2Vec &window,
340341
const TFloatMeanAccumulatorCRng &buckets,
341342
STestStats &stats) const;
342343

343-
//! Test for the specified period given we think there is
344-
//! some diurnal periodicity.
344+
//! Test for the specified period given we think there is diurnal
345+
//! periodicity.
345346
CPeriodicityHypothesisTestsResult
346347
testForPeriod(const TTimeTimePr2Vec &window,
347348
const TFloatMeanAccumulatorCRng &buckets,
@@ -351,6 +352,11 @@ class MATHS_EXPORT CPeriodicityHypothesisTests
351352
bool seenSufficientDataToTest(core_t::TTime period,
352353
const TFloatMeanAccumulatorCRng &buckets) const;
353354

355+
//! Check if there are enough non-empty buckets which are repeated
356+
//! at least once at \p period in \p buckets.
357+
bool seenSufficientPeriodicallyPopulatedBucketsToTest(const TFloatMeanAccumulatorCRng &buckets,
358+
std::size_t period) const;
359+
354360
//! Compute various ancillary statistics for testing.
355361
bool testStatisticsFor(const TFloatMeanAccumulatorCRng &buckets,
356362
STestStats &stats) const;

lib/maths/CPeriodicityHypothesisTests.cc

Lines changed: 46 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,18 +1030,18 @@ CPeriodicityHypothesisTests::best(const TNestedHypothesesVec &hypotheses) const
10301030
{
10311031
STestStats stats;
10321032
CPeriodicityHypothesisTestsResult resultForHypothesis{hypothesis.test(stats)};
1033-
summaries.emplace_back(stats.s_V0, stats.s_B - stats.s_DF0,
1034-
std::move(resultForHypothesis));
1033+
if (stats.s_B > stats.s_DF0)
1034+
{
1035+
summaries.emplace_back(stats.s_V0, stats.s_B - stats.s_DF0,
1036+
std::move(resultForHypothesis));
1037+
}
10351038
}
10361039

10371040
TMinAccumulator vCutoff;
10381041
for (const auto &summary : summaries)
10391042
{
1040-
if (summary.s_DF > 0.0)
1041-
{
1042-
vCutoff.add(varianceAtPercentile(summary.s_V, summary.s_DF,
1043-
50.0 + CONFIDENCE_INTERVAL / 2.0));
1044-
}
1043+
vCutoff.add(varianceAtPercentile(summary.s_V, summary.s_DF,
1044+
50.0 + CONFIDENCE_INTERVAL / 2.0));
10451045
}
10461046
if (vCutoff.count() > 0)
10471047
{
@@ -1275,6 +1275,26 @@ bool CPeriodicityHypothesisTests::seenSufficientDataToTest(core_t::TTime period,
12751275
>= 2.0 * ACCURATE_TEST_POPULATED_FRACTION * static_cast<double>(period);
12761276
}
12771277

1278+
bool CPeriodicityHypothesisTests::seenSufficientPeriodicallyPopulatedBucketsToTest(const TFloatMeanAccumulatorCRng &buckets,
1279+
std::size_t period) const
1280+
{
1281+
double repeats{0.0};
1282+
for (std::size_t i = 0u; i < period; ++i)
1283+
{
1284+
for (std::size_t j = i + period; j < buckets.size(); j += period)
1285+
{
1286+
if ( CBasicStatistics::count(buckets[j])
1287+
* CBasicStatistics::count(buckets[j - period]) > 0.0)
1288+
{
1289+
repeats += 1.0;
1290+
break;
1291+
}
1292+
}
1293+
}
1294+
LOG_TRACE("repeated values = " << repeats);
1295+
return repeats >= static_cast<double>(period) * ACCURATE_TEST_POPULATED_FRACTION / 3.0;
1296+
}
1297+
12781298
bool CPeriodicityHypothesisTests::testStatisticsFor(const TFloatMeanAccumulatorCRng &buckets,
12791299
STestStats &stats) const
12801300
{
@@ -1424,21 +1444,7 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec &windows,
14241444

14251445
// We need to observe a minimum number of repeated values to test with
14261446
// an acceptable false positive rate.
1427-
double repeats{0.0};
1428-
for (std::size_t i = 0u; i < period; ++i)
1429-
{
1430-
for (std::size_t j = i + period; j < buckets.size(); j += period)
1431-
{
1432-
if ( CBasicStatistics::count(buckets[j])
1433-
* CBasicStatistics::count(buckets[j - period]) > 0.0)
1434-
{
1435-
repeats += 1.0;
1436-
break;
1437-
}
1438-
}
1439-
}
1440-
LOG_TRACE(" repeated values = " << repeats);
1441-
if (repeats < static_cast<double>(period) * ACCURATE_TEST_POPULATED_FRACTION / 3.0)
1447+
if (!this->seenSufficientPeriodicallyPopulatedBucketsToTest(buckets, period))
14421448
{
14431449
return false;
14441450
}
@@ -1484,7 +1490,8 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec &windows,
14841490
LOG_TRACE(" significance = " << CStatisticalTests::leftTailFTest(v1 / v0, df1, df0));
14851491

14861492
double Rt{stats.s_Rt * CTools::truncate(1.0 - 0.5 * (vt - v1) / vt, 0.9, 1.0)};
1487-
if (v1 < vt && CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE)
1493+
if ( v1 < vt && B > 1.0
1494+
&& CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE)
14881495
{
14891496
double R{CSignal::autocorrelation(period, values)};
14901497
R = autocorrelationAtPercentile(R, B, 50.0 - CONFIDENCE_INTERVAL / 2.0);
@@ -1558,6 +1565,15 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec &partition
15581565
{
15591566
return false;
15601567
}
1568+
1569+
std::size_t period{static_cast<std::size_t>(period_ / m_BucketLength)};
1570+
1571+
// We need to observe a minimum number of repeated values to test with
1572+
// an acceptable false positive rate.
1573+
if (!this->seenSufficientPeriodicallyPopulatedBucketsToTest(buckets, period))
1574+
{
1575+
return false;
1576+
}
15611577
if (stats.s_HasPartition)
15621578
{
15631579
return true;
@@ -1568,7 +1584,6 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec &partition
15681584
// evidence that it reduces the residual variance and repeats.
15691585

15701586
core_t::TTime windowLength{length(buckets, m_BucketLength)};
1571-
std::size_t period{static_cast<std::size_t>(period_ / m_BucketLength)};
15721587
core_t::TTime repeat{length(partition)};
15731588
core_t::TTime startOfPartition{stats.s_StartOfPartition};
15741589
double B{stats.s_B};
@@ -1723,11 +1738,13 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec &partition
17231738
double BW{std::accumulate(partitionValues.begin(), partitionValues.end(), 0.0,
17241739
[](double n, const TFloatMeanAccumulator &value)
17251740
{ return n + (CBasicStatistics::count(value) > 0.0 ? 1.0 : 0.0); })};
1726-
R = std::max(R, autocorrelationAtPercentile(CSignal::autocorrelation(
1727-
windowLength_ + period, partitionValues),
1728-
BW, 50.0 - CONFIDENCE_INTERVAL / 2.0));
1729-
LOG_TRACE(" autocorrelation = " << R);
1730-
LOG_TRACE(" autocorrelationThreshold = " << Rt);
1741+
if (BW > 1.0)
1742+
{
1743+
double RW{CSignal::autocorrelation(windowLength_ + period, partitionValues)};
1744+
R = std::max(R, autocorrelationAtPercentile(RW, BW, 50.0 - CONFIDENCE_INTERVAL / 2.0));
1745+
LOG_TRACE(" autocorrelation = " << R);
1746+
LOG_TRACE(" autocorrelationThreshold = " << Rt);
1747+
}
17311748
}
17321749

17331750
if (R > Rt)

0 commit comments

Comments
 (0)