Skip to content

Commit f26b907

Browse files
committed
[ML] Fix sparse data edge cases for periodicity testing (#28)
This fixes issue #20. Digging into the root cause, the problems were all down to very sparse data over the window we maintain to test for periodicity. This revealed the need to lower bound the count of buckets with periodic repeats when testing for periodic partitions.
1 parent 9e07171 commit f26b907

File tree

2 files changed

+55
-32
lines changed

2 files changed

+55
-32
lines changed

include/maths/CPeriodicityHypothesisTests.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -335,14 +335,15 @@ class MATHS_EXPORT CPeriodicityHypothesisTests
335335
testForDailyWithWeekend(const TFloatMeanAccumulatorCRng &buckets,
336336
STestStats &stats) const;
337337

338-
//! Test for a weekday/end partition with weekly .
338+
//! Test for a weekly period given we think there is a
339+
//! weekday/end partition.
339340
CPeriodicityHypothesisTestsResult
340341
testForWeeklyGivenDailyWithWeekend(const TTimeTimePr2Vec &window,
341342
const TFloatMeanAccumulatorCRng &buckets,
342343
STestStats &stats) const;
343344

344-
//! Test for the specified period given we think there is
345-
//! some diurnal periodicity.
345+
//! Test for the specified period given we think there is diurnal
346+
//! periodicity.
346347
CPeriodicityHypothesisTestsResult
347348
testForPeriod(const TTimeTimePr2Vec &window,
348349
const TFloatMeanAccumulatorCRng &buckets,
@@ -352,6 +353,11 @@ class MATHS_EXPORT CPeriodicityHypothesisTests
352353
bool seenSufficientDataToTest(core_t::TTime period,
353354
const TFloatMeanAccumulatorCRng &buckets) const;
354355

356+
//! Check if there are enough non-empty buckets which are repeated
357+
//! at least once at a separation of \p period in \p buckets.
358+
bool seenSufficientPeriodicallyPopulatedBucketsToTest(const TFloatMeanAccumulatorCRng &buckets,
359+
std::size_t period) const;
360+
355361
//! Compute various ancillary statistics for testing.
356362
bool testStatisticsFor(const TFloatMeanAccumulatorCRng &buckets,
357363
STestStats &stats) const;

lib/maths/CPeriodicityHypothesisTests.cc

Lines changed: 46 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,18 +1030,18 @@ CPeriodicityHypothesisTests::best(const TNestedHypothesesVec &hypotheses) const
10301030
{
10311031
STestStats stats;
10321032
CPeriodicityHypothesisTestsResult resultForHypothesis{hypothesis.test(stats)};
1033-
summaries.emplace_back(stats.s_V0, stats.s_B - stats.s_DF0,
1034-
std::move(resultForHypothesis));
1033+
if (stats.s_B > stats.s_DF0)
1034+
{
1035+
summaries.emplace_back(stats.s_V0, stats.s_B - stats.s_DF0,
1036+
std::move(resultForHypothesis));
1037+
}
10351038
}
10361039

10371040
TMinAccumulator vCutoff;
10381041
for (const auto &summary : summaries)
10391042
{
1040-
if (summary.s_DF > 0.0)
1041-
{
1042-
vCutoff.add(varianceAtPercentile(summary.s_V, summary.s_DF,
1043-
50.0 + CONFIDENCE_INTERVAL / 2.0));
1044-
}
1043+
vCutoff.add(varianceAtPercentile(summary.s_V, summary.s_DF,
1044+
50.0 + CONFIDENCE_INTERVAL / 2.0));
10451045
}
10461046
if (vCutoff.count() > 0)
10471047
{
@@ -1275,6 +1275,26 @@ bool CPeriodicityHypothesisTests::seenSufficientDataToTest(core_t::TTime period,
12751275
>= 2.0 * ACCURATE_TEST_POPULATED_FRACTION * static_cast<double>(period);
12761276
}
12771277

1278+
// Check whether enough offsets within one period have a populated repeat.
//
// For each offset i in [0, period) this looks for at least one pair of
// buckets exactly `period` apart, (j - period, j) with j = i + k * period,
// whose counts are both non-zero. Each offset contributes at most one to
// the tally.
//
// \param buckets The bucket accumulators to inspect.
// \param period The candidate period expressed as a number of buckets.
// \return True if the number of offsets with a populated repeat is at least
// period * ACCURATE_TEST_POPULATED_FRACTION / 3.
bool CPeriodicityHypothesisTests::seenSufficientPeriodicallyPopulatedBucketsToTest(const TFloatMeanAccumulatorCRng &buckets,
1279+
std::size_t period) const
1280+
{
1281+
// Number of offsets in [0, period) for which a populated repeat was found.
double repeats{0.0};
1282+
for (std::size_t i = 0u; i < period; ++i)
1283+
{
1284+
// Walk the buckets at offset i, comparing each with the bucket one
// period earlier.
for (std::size_t j = i + period; j < buckets.size(); j += period)
1285+
{
1286+
// The product is positive only when both counts are non-zero
// (assuming counts are never negative).
if ( CBasicStatistics::count(buckets[j])
1287+
* CBasicStatistics::count(buckets[j - period]) > 0.0)
1288+
{
1289+
repeats += 1.0;
1290+
// One populated repeat is enough for this offset.
break;
1291+
}
1292+
}
1293+
}
1294+
LOG_TRACE("repeated values = " << repeats);
1295+
return repeats >= static_cast<double>(period) * ACCURATE_TEST_POPULATED_FRACTION / 3.0;
1296+
}
1297+
12781298
bool CPeriodicityHypothesisTests::testStatisticsFor(const TFloatMeanAccumulatorCRng &buckets,
12791299
STestStats &stats) const
12801300
{
@@ -1424,21 +1444,7 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec &windows,
14241444

14251445
// We need to observe a minimum number of repeated values to test with
14261446
// an acceptable false positive rate.
1427-
double repeats{0.0};
1428-
for (std::size_t i = 0u; i < period; ++i)
1429-
{
1430-
for (std::size_t j = i + period; j < buckets.size(); j += period)
1431-
{
1432-
if ( CBasicStatistics::count(buckets[j])
1433-
* CBasicStatistics::count(buckets[j - period]) > 0.0)
1434-
{
1435-
repeats += 1.0;
1436-
break;
1437-
}
1438-
}
1439-
}
1440-
LOG_TRACE(" repeated values = " << repeats);
1441-
if (repeats < static_cast<double>(period) * ACCURATE_TEST_POPULATED_FRACTION / 3.0)
1447+
if (!this->seenSufficientPeriodicallyPopulatedBucketsToTest(buckets, period))
14421448
{
14431449
return false;
14441450
}
@@ -1484,7 +1490,8 @@ bool CPeriodicityHypothesisTests::testPeriod(const TTimeTimePr2Vec &windows,
14841490
LOG_TRACE(" significance = " << CStatisticalTests::leftTailFTest(v1 / v0, df1, df0));
14851491

14861492
double Rt{stats.s_Rt * CTools::truncate(1.0 - 0.5 * (vt - v1) / vt, 0.9, 1.0)};
1487-
if (v1 < vt && CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE)
1493+
if ( v1 < vt && B > 1.0
1494+
&& CStatisticalTests::leftTailFTest(v1 / v0, df1, df0) <= MAXIMUM_SIGNIFICANCE)
14881495
{
14891496
double R{CSignal::autocorrelation(period, values)};
14901497
R = autocorrelationAtPercentile(R, B, 50.0 - CONFIDENCE_INTERVAL / 2.0);
@@ -1558,6 +1565,15 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec &partition
15581565
{
15591566
return false;
15601567
}
1568+
1569+
std::size_t period{static_cast<std::size_t>(period_ / m_BucketLength)};
1570+
1571+
// We need to observe a minimum number of repeated values to test with
1572+
// an acceptable false positive rate.
1573+
if (!this->seenSufficientPeriodicallyPopulatedBucketsToTest(buckets, period))
1574+
{
1575+
return false;
1576+
}
15611577
if (stats.s_HasPartition)
15621578
{
15631579
return true;
@@ -1568,7 +1584,6 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec &partition
15681584
// evidence that it reduces the residual variance and repeats.
15691585

15701586
core_t::TTime windowLength{length(buckets, m_BucketLength)};
1571-
std::size_t period{static_cast<std::size_t>(period_ / m_BucketLength)};
15721587
core_t::TTime repeat{length(partition)};
15731588
core_t::TTime startOfPartition{stats.s_StartOfPartition};
15741589
double B{stats.s_B};
@@ -1723,11 +1738,13 @@ bool CPeriodicityHypothesisTests::testPartition(const TTimeTimePr2Vec &partition
17231738
double BW{std::accumulate(partitionValues.begin(), partitionValues.end(), 0.0,
17241739
[](double n, const TFloatMeanAccumulator &value)
17251740
{ return n + (CBasicStatistics::count(value) > 0.0 ? 1.0 : 0.0); })};
1726-
R = std::max(R, autocorrelationAtPercentile(CSignal::autocorrelation(
1727-
windowLength_ + period, partitionValues),
1728-
BW, 50.0 - CONFIDENCE_INTERVAL / 2.0));
1729-
LOG_TRACE(" autocorrelation = " << R);
1730-
LOG_TRACE(" autocorrelationThreshold = " << Rt);
1741+
if (BW > 1.0)
1742+
{
1743+
double RW{CSignal::autocorrelation(windowLength_ + period, partitionValues)};
1744+
R = std::max(R, autocorrelationAtPercentile(RW, BW, 50.0 - CONFIDENCE_INTERVAL / 2.0));
1745+
LOG_TRACE(" autocorrelation = " << R);
1746+
LOG_TRACE(" autocorrelationThreshold = " << Rt);
1747+
}
17311748
}
17321749

17331750
if (R > Rt)

0 commit comments

Comments
 (0)