Skip to content

Commit 09f401c

Browse files
authored
[ML] Fix sparse data edge cases for periodicity testing (#28)
This fixes issue #20. Digging into the root cause, the failures were all down to very sparse data in the window we maintain to test for periodicity. This showed the need to lower-bound the count of buckets with periodic repeats when testing for periodic partitions.
1 parent 8251de6 commit 09f401c

File tree

2 files changed

+55
-32
lines changed

2 files changed

+55
-32
lines changed

include/maths/CPeriodicityHypothesisTests.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -343,14 +343,15 @@ class MATHS_EXPORT CPeriodicityHypothesisTests
343343
testForDailyWithWeekend(const TFloatMeanAccumulatorCRng &buckets,
344344
STestStats &stats) const;
345345

346-
//! Test for a weekday/end partition with weekly .
346+
//! Test for a weekly period given we think there is a
347+
//! weekday/end partition.
347348
CPeriodicityHypothesisTestsResult
348349
testForWeeklyGivenDailyWithWeekend(const TTimeTimePr2Vec &window,
349350
const TFloatMeanAccumulatorCRng &buckets,
350351
STestStats &stats) const;
351352

352-
//! Test for the specified period given we think there is
353-
//! some diurnal periodicity.
353+
//! Test for the specified period given we think there is diurnal
354+
//! periodicity.
354355
CPeriodicityHypothesisTestsResult
355356
testForPeriod(const TTimeTimePr2Vec &window,
356357
const TFloatMeanAccumulatorCRng &buckets,
@@ -360,6 +361,11 @@ class MATHS_EXPORT CPeriodicityHypothesisTests
360361
bool seenSufficientDataToTest(core_t::TTime period,
361362
const TFloatMeanAccumulatorCRng &buckets) const;
362363

364+
//! Check if there are enough non-empty buckets which are repeated
365+
//! at least once exactly one \p period apart in \p buckets.
366+
bool seenSufficientPeriodicallyPopulatedBucketsToTest(const TFloatMeanAccumulatorCRng &buckets,
367+
std::size_t period) const;
368+
363369
//! Compute various ancillary statistics for testing.
364370
bool testStatisticsFor(const TFloatMeanAccumulatorCRng &buckets,
365371
STestStats &stats) const;

lib/maths/CPeriodicityHypothesisTests.cc

Lines changed: 46 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,18 +1039,18 @@ CPeriodicityHypothesisTests::best(const TNestedHypothesesVec &hypotheses) const
10391039
{
10401040
STestStats stats;
10411041
CPeriodicityHypothesisTestsResult resultForHypothesis{hypothesis.test(stats)};
1042-
summaries.emplace_back(stats.s_V0, stats.s_B - stats.s_DF0,
1043-
std::move(resultForHypothesis));
1042+
if (stats.s_B > stats.s_DF0)
1043+
{
1044+
summaries.emplace_back(stats.s_V0, stats.s_B - stats.s_DF0,
1045+
std::move(resultForHypothesis));
1046+
}
10441047
}
10451048

10461049
TMinAccumulator vCutoff;
10471050
for (const auto &summary : summaries)
10481051
{
1049-
if (summary.s_DF > 0.0)
1050-
{
1051-
vCutoff.add(varianceAtPercentile(summary.s_V, summary.s_DF,
1052-
50.0 + CONFIDENCE_INTERVAL / 2.0));
1053-
}
1052+
vCutoff.add(varianceAtPercentile(summary.s_V, summary.s_DF,
1053+
50.0 + CONFIDENCE_INTERVAL / 2.0));
10541054
}
10551055
if (vCutoff.count() > 0)
10561056
{
@@ -1284,6 +1284,26 @@ bool CPeriodicityHypothesisTests::seenSufficientDataToTest(core_t::TTime period,
12841284
>= 2.0 * ACCURATE_TEST_POPULATED_FRACTION * static_cast<double>(period);
12851285
}
12861286

1287+
//! Check whether enough offsets within one period have a pair of
//! populated buckets exactly \p period buckets apart.
//!
//! \param[in] buckets The bucket statistics to inspect.
//! \param[in] period The candidate period, used as an index stride
//! into \p buckets.
//! \return True if the number of offsets with at least one repeat is
//! at least period * ACCURATE_TEST_POPULATED_FRACTION / 3.
//!
//! NOTE(review): with period == 0 no offsets are scanned and the
//! final comparison has zero on the left and zero times the constant
//! on the right - confirm callers never pass a zero period.
bool CPeriodicityHypothesisTests::seenSufficientPeriodicallyPopulatedBucketsToTest(const TFloatMeanAccumulatorCRng &buckets,
1288+
std::size_t period) const
1289+
{
1290+
// Number of offsets in [0, period) for which a repeat was found.
double repeats{0.0};
1291+
for (std::size_t i = 0u; i < period; ++i)
1292+
{
1293+
// Visit the buckets at offset i in the second and later periods,
// comparing each with the bucket one period before it.
for (std::size_t j = i + period; j < buckets.size(); j += period)
1294+
{
1295+
// The product is positive only when both counts are non-zero
// (assuming counts are never negative), i.e. both buckets
// contain data.
if ( CBasicStatistics::count(buckets[j])
1296+
* CBasicStatistics::count(buckets[j - period]) > 0.0)
1297+
{
1298+
repeats += 1.0;
1299+
// Each offset contributes at most one repeat, so stop
// scanning this offset.
break;
1300+
}
1301+
}
1302+
}
1303+
LOG_TRACE("repeated values = " << repeats);
1304+
// Require repeats at a minimum fraction of the offsets in a period.
return repeats >= static_cast<double>(period) * ACCURATE_TEST_POPULATED_FRACTION / 3.0;
1305+
}
1306+
12871307
bool CPeriodicityHypothesisTests::testStatisticsFor(const TFloatMeanAccumulatorCRng &buckets,
12881308
STestStats &stats) const
12891309
{
@@ -1433,21 +1453,7 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec &windows,
14331453

14341454
// We need to observe a minimum number of repeated values to test with
14351455
// an acceptable false positive rate.
1436-
double repeats{0.0};
1437-
for (std::size_t i = 0u; i < period; ++i)
1438-
{
1439-
for (std::size_t j = i + period; j < buckets.size(); j += period)
1440-
{
1441-
if ( CBasicStatistics::count(buckets[j])
1442-
* CBasicStatistics::count(buckets[j - period]) > 0.0)
1443-
{
1444-
repeats += 1.0;
1445-
break;
1446-
}
1447-
}
1448-
}
1449-
LOG_TRACE(" repeated values = " << repeats);
1450-
if (repeats < static_cast<double>(period) * ACCURATE_TEST_POPULATED_FRACTION / 3.0)
1456+
if (!this->seenSufficientPeriodicallyPopulatedBucketsToTest(buckets, period))
14511457
{
14521458
return false;
14531459
}
@@ -1493,7 +1499,8 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec &windows,
14931499
LOG_TRACE(" significance = " << CStatisticalTests::leftTailFTest(v1 / v0, df1, df0));
14941500

14951501
double Rt{stats.s_Rt * CTools::truncate(1.0 - 0.5 * (vt - v1) / vt, 0.9, 1.0)};
1496-
if (v1 < vt && CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE)
1502+
if ( v1 < vt && B > 1.0
1503+
&& CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE)
14971504
{
14981505
double R{CSignal::autocorrelation(period, values)};
14991506
R = autocorrelationAtPercentile(R, B, 50.0 - CONFIDENCE_INTERVAL / 2.0);
@@ -1567,6 +1574,15 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec &partition
15671574
{
15681575
return false;
15691576
}
1577+
1578+
std::size_t period{static_cast<std::size_t>(period_ / m_BucketLength)};
1579+
1580+
// We need to observe a minimum number of repeated values to test with
1581+
// an acceptable false positive rate.
1582+
if (!this->seenSufficientPeriodicallyPopulatedBucketsToTest(buckets, period))
1583+
{
1584+
return false;
1585+
}
15701586
if (stats.s_HasPartition)
15711587
{
15721588
return true;
@@ -1577,7 +1593,6 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec &partition
15771593
// evidence that it reduces the residual variance and repeats.
15781594

15791595
core_t::TTime windowLength{length(buckets, m_BucketLength)};
1580-
std::size_t period{static_cast<std::size_t>(period_ / m_BucketLength)};
15811596
core_t::TTime repeat{length(partition)};
15821597
core_t::TTime startOfPartition{stats.s_StartOfPartition};
15831598
double B{stats.s_B};
@@ -1732,11 +1747,13 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec &partition
17321747
double BW{std::accumulate(partitionValues.begin(), partitionValues.end(), 0.0,
17331748
[](double n, const TFloatMeanAccumulator &value)
17341749
{ return n + (CBasicStatistics::count(value) > 0.0 ? 1.0 : 0.0); })};
1735-
R = std::max(R, autocorrelationAtPercentile(CSignal::autocorrelation(
1736-
windowLength_ + period, partitionValues),
1737-
BW, 50.0 - CONFIDENCE_INTERVAL / 2.0));
1738-
LOG_TRACE(" autocorrelation = " << R);
1739-
LOG_TRACE(" autocorrelationThreshold = " << Rt);
1750+
if (BW > 1.0)
1751+
{
1752+
double RW{CSignal::autocorrelation(windowLength_ + period, partitionValues)};
1753+
R = std::max(R, autocorrelationAtPercentile(RW, BW, 50.0 - CONFIDENCE_INTERVAL / 2.0));
1754+
LOG_TRACE(" autocorrelation = " << R);
1755+
LOG_TRACE(" autocorrelationThreshold = " << Rt);
1756+
}
17401757
}
17411758

17421759
if (R > Rt)

0 commit comments

Comments
 (0)