Skip to content

Commit 8355930

Browse files
authored
[7.x][ML] Recalculate memory usage before upgrading model state (#1587)
When a persist control message with arguments is received by the anomaly detector it doesn't go through the standard chain of persistence calls, as it unconditionally rewrites the state (even if no data has been seen) and includes only the anomaly detector state rather than the categorizer state too. Because of this the memory usage was not being recalculated prior to persisting the state as would normally happen. This PR rectifies that omission. Fixes one of the problems detailed in elastic/elasticsearch#64665 (review) Backport of #1585
1 parent f812ac8 commit 8355930

File tree

4 files changed

+37
-0
lines changed

4 files changed

+37
-0
lines changed

include/model/CResourceMonitor.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,9 @@ class MODEL_EXPORT CResourceMonitor {
102102
//! Recalculate the memory usage regardless of whether there is a memory limit
103103
void forceRefresh(CMonitoredResource& resource);
104104

105+
//! Recalculate the memory usage for all monitored resources
106+
void forceRefreshAll();
107+
105108
//! Set the internal memory limit, as specified in a limits config file
106109
void memoryLimit(std::size_t limitMBs);
107110

lib/api/CAnomalyJob.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,11 @@ bool CAnomalyJob::parsePersistControlMessageArgs(const std::string& controlMessa
411411

412412
void CAnomalyJob::processPersistControlMessage(const std::string& controlMessageArgs) {
413413
if (m_PersistenceManager != nullptr) {
414+
// There is a subtle difference between these two cases. When there
415+
// are no control message arguments this triggers persistence of all
416+
// chained processors, i.e. maybe the categorizer as well as the anomaly
417+
// detector if there is one. But when control message arguments are
418+
// passed, ONLY the persistence of the anomaly detector is triggered.
414419
if (controlMessageArgs.empty()) {
415420
if (this->isPersistenceNeeded("state")) {
416421
m_PersistenceManager->startPersist(core::CTimeUtils::now());
@@ -421,6 +426,10 @@ void CAnomalyJob::processPersistControlMessage(const std::string& controlMessage
421426
std::string snapshotDescription;
422427
if (parsePersistControlMessageArgs(controlMessageArgs, snapshotTimestamp,
423428
snapshotId, snapshotDescription)) {
429+
// Since this is not going through the full persistence call
430+
// chain, make sure model size stats are up to date before
431+
// persisting
432+
m_Limits.resourceMonitor().forceRefreshAll();
424433
if (m_PersistenceManager->doForegroundPersist(
425434
[this, &snapshotDescription, &snapshotId,
426435
&snapshotTimestamp](core::CDataAdder& persister) {

lib/model/CResourceMonitor.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,14 @@ void CResourceMonitor::forceRefresh(CMonitoredResource& resource) {
122122
this->updateAllowAllocations();
123123
}
124124

125+
void CResourceMonitor::forceRefreshAll() {
126+
for (auto& resource : m_Resources) {
127+
this->memUsage(resource.first);
128+
}
129+
130+
this->updateAllowAllocations();
131+
}
132+
125133
void CResourceMonitor::updateAllowAllocations() {
126134
std::size_t total{this->totalMemory()};
127135
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = total;

lib/model/unittest/CResourceMonitorTest.cc

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,23 @@ BOOST_FIXTURE_TEST_CASE(testMonitor, CTestFixture) {
264264
BOOST_REQUIRE_EQUAL(mem, m_ReportedModelSizeStats.s_Usage);
265265
BOOST_REQUIRE_EQUAL(model_t::E_MemoryStatusOk, m_ReportedModelSizeStats.s_MemoryStatus);
266266
}
267+
{
268+
// As above but refreshing all resources in one call
269+
CResourceMonitor mon;
270+
mon.registerComponent(categorizer);
271+
mon.registerComponent(detector1);
272+
mon.registerComponent(detector2);
273+
274+
mon.memoryUsageReporter(std::bind(&CTestFixture::reportCallback, this,
275+
std::placeholders::_1));
276+
m_ReportedModelSizeStats.s_Usage = 0;
277+
BOOST_REQUIRE_EQUAL(std::size_t(0), m_ReportedModelSizeStats.s_Usage);
278+
279+
mon.forceRefreshAll();
280+
mon.sendMemoryUsageReportIfSignificantlyChanged(0);
281+
BOOST_REQUIRE_EQUAL(mem, m_ReportedModelSizeStats.s_Usage);
282+
BOOST_REQUIRE_EQUAL(model_t::E_MemoryStatusOk, m_ReportedModelSizeStats.s_MemoryStatus);
283+
}
267284
{
268285
// Test the report callback for allocation failures
269286
CResourceMonitor mon;

0 commit comments

Comments
 (0)