Skip to content

Commit 8a02a57

Browse files
committed
Revert "Revert "[ML] Improve hard_limit audit message (#486)""
This reverts commit 5e9d610.
1 parent 5e9d610 commit 8a02a57

File tree

6 files changed

+78
-35
lines changed

6 files changed

+78
-35
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ to the model. (See {ml-pull}214[#214].)
4444
4545
* Stop linking to libcrypt on Linux. (See {ml-pull}480[#480].)
4646
47+
* Improvements to hard_limit audit message. (See {ml-pull}486[#486].)
48+
4749
=== Bug Fixes
4850
4951
* Handle NaNs when detrending seasonal components. {ml-pull}408[#408]

include/model/CResourceMonitor.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,15 @@ class MODEL_EXPORT CResourceMonitor {
3434
public:
3535
struct MODEL_EXPORT SResults {
3636
std::size_t s_Usage;
37+
std::size_t s_AdjustedUsage;
3738
std::size_t s_ByFields;
3839
std::size_t s_PartitionFields;
3940
std::size_t s_OverFields;
4041
std::size_t s_AllocationFailures;
4142
model_t::EMemoryStatus s_MemoryStatus;
4243
core_t::TTime s_BucketStartTime;
44+
std::size_t s_BytesExceeded;
45+
std::size_t s_BytesMemoryLimit;
4346
};
4447

4548
public:
@@ -65,10 +68,6 @@ class MODEL_EXPORT CResourceMonitor {
6568
//! taking up too much memory and further allocations should be banned
6669
bool areAllocationsAllowed() const;
6770

68-
//! Query the resource monitor to found out if it's Ok to
69-
//! create structures of a certain size
70-
bool areAllocationsAllowed(std::size_t size) const;
71-
7271
//! Return the amount of remaining space for allocations
7372
std::size_t allocationLimit() const;
7473

@@ -164,6 +163,11 @@ class MODEL_EXPORT CResourceMonitor {
164163
//! Returns the sum of used memory plus any extra memory
165164
std::size_t totalMemory() const;
166165

166+
//! Adjusts the amount of memory reported to take into
167+
//! account the current value of the byte limit margin and the effects
168+
//! of background persistence.
169+
std::size_t adjustedUsage(std::size_t usage) const;
170+
167171
private:
168172
//! The registered collection of components
169173
TDetectorPtrSizeUMap m_Detectors;
@@ -226,6 +230,9 @@ class MODEL_EXPORT CResourceMonitor {
226230
//! Don't do any sort of memory checking if this is set
227231
bool m_NoLimit;
228232

233+
//! The number of bytes over the high limit for memory usage at the last allocation failure
234+
std::size_t m_CurrentBytesExceeded;
235+
229236
//! Test friends
230237
friend class ::CResourceMonitorTest;
231238
friend class ::CResourceLimitTest;

lib/api/CModelSizeStatsJsonWriter.cc

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ namespace {
1616
const std::string JOB_ID("job_id");
1717
const std::string MODEL_SIZE_STATS("model_size_stats");
1818
const std::string MODEL_BYTES("model_bytes");
19+
const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded");
20+
const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit");
1921
const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count");
2022
const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count");
2123
const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count");
@@ -33,17 +35,15 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId,
3335

3436
writer.String(JOB_ID);
3537
writer.String(jobId);
38+
3639
writer.String(MODEL_BYTES);
37-
// Background persist causes the memory size to double due to copying
38-
// the models. On top of that, after the persist is done we may not
39-
// be able to retrieve that memory back. Thus, we report twice the
40-
// memory usage in order to allow for that.
41-
// See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
42-
// Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
43-
// discusses adding an option to perform only foreground persist.
44-
// If that gets implemented, we should only double when background
45-
// persist is configured.
46-
writer.Uint64(results.s_Usage * 2);
40+
writer.Uint64(results.s_AdjustedUsage);
41+
42+
writer.String(MODEL_BYTES_EXCEEDED);
43+
writer.Uint64(results.s_BytesExceeded);
44+
45+
writer.String(MODEL_BYTES_MEMORY_LIMIT);
46+
writer.Uint64(results.s_BytesMemoryLimit);
4747

4848
writer.String(TOTAL_BY_FIELD_COUNT);
4949
writer.Uint64(results.s_ByFields);

lib/api/unittest/CJsonOutputWriterTest.cc

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1555,12 +1555,15 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {
15551555

15561556
ml::model::CResourceMonitor::SResults resourceUsage;
15571557
resourceUsage.s_Usage = 1;
1558-
resourceUsage.s_ByFields = 2;
1559-
resourceUsage.s_PartitionFields = 3;
1560-
resourceUsage.s_OverFields = 4;
1561-
resourceUsage.s_AllocationFailures = 5;
1558+
resourceUsage.s_AdjustedUsage = 2;
1559+
resourceUsage.s_ByFields = 3;
1560+
resourceUsage.s_PartitionFields = 4;
1561+
resourceUsage.s_OverFields = 5;
1562+
resourceUsage.s_AllocationFailures = 6;
15621563
resourceUsage.s_MemoryStatus = ml::model_t::E_MemoryStatusHardLimit;
1563-
resourceUsage.s_BucketStartTime = 6;
1564+
resourceUsage.s_BucketStartTime = 7;
1565+
resourceUsage.s_BytesExceeded = 8;
1566+
resourceUsage.s_BytesMemoryLimit = 9;
15641567

15651568
writer.reportMemoryUsage(resourceUsage);
15661569
writer.endOutputBatch(false, 1ul);
@@ -1580,22 +1583,26 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {
15801583
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes"));
15811584
CPPUNIT_ASSERT_EQUAL(2, sizeStats["model_bytes"].GetInt());
15821585
CPPUNIT_ASSERT(sizeStats.HasMember("total_by_field_count"));
1583-
CPPUNIT_ASSERT_EQUAL(2, sizeStats["total_by_field_count"].GetInt());
1586+
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_by_field_count"].GetInt());
15841587
CPPUNIT_ASSERT(sizeStats.HasMember("total_partition_field_count"));
1585-
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_partition_field_count"].GetInt());
1588+
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_partition_field_count"].GetInt());
15861589
CPPUNIT_ASSERT(sizeStats.HasMember("total_over_field_count"));
1587-
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_over_field_count"].GetInt());
1590+
CPPUNIT_ASSERT_EQUAL(5, sizeStats["total_over_field_count"].GetInt());
15881591
CPPUNIT_ASSERT(sizeStats.HasMember("bucket_allocation_failures_count"));
1589-
CPPUNIT_ASSERT_EQUAL(5, sizeStats["bucket_allocation_failures_count"].GetInt());
1592+
CPPUNIT_ASSERT_EQUAL(6, sizeStats["bucket_allocation_failures_count"].GetInt());
15901593
CPPUNIT_ASSERT(sizeStats.HasMember("timestamp"));
1591-
CPPUNIT_ASSERT_EQUAL(6000, sizeStats["timestamp"].GetInt());
1594+
CPPUNIT_ASSERT_EQUAL(7000, sizeStats["timestamp"].GetInt());
15921595
CPPUNIT_ASSERT(sizeStats.HasMember("memory_status"));
15931596
CPPUNIT_ASSERT_EQUAL(std::string("hard_limit"),
15941597
std::string(sizeStats["memory_status"].GetString()));
15951598
CPPUNIT_ASSERT(sizeStats.HasMember("log_time"));
15961599
int64_t nowMs = ml::core::CTimeUtils::now() * 1000ll;
15971600
CPPUNIT_ASSERT(nowMs >= sizeStats["log_time"].GetInt64());
15981601
CPPUNIT_ASSERT(nowMs + 1000ll >= sizeStats["log_time"].GetInt64());
1602+
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_exceeded"));
1603+
CPPUNIT_ASSERT_EQUAL(8, sizeStats["model_bytes_exceeded"].GetInt());
1604+
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_memory_limit"));
1605+
CPPUNIT_ASSERT_EQUAL(9, sizeStats["model_bytes_memory_limit"].GetInt());
15991606
}
16001607

16011608
void CJsonOutputWriterTest::testWriteScheduledEvent() {

lib/api/unittest/CModelSnapshotJsonWriterTest.cc

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,15 @@ void CModelSnapshotJsonWriterTest::testWrite() {
3333
{
3434
model::CResourceMonitor::SResults modelSizeStats{
3535
10000, // bytes used
36+
20000, // bytes used (adjusted)
3637
3, // # by fields
3738
1, // # partition fields
3839
150, // # over fields
3940
4, // # allocation failures
4041
model_t::E_MemoryStatusOk, // memory status
41-
core_t::TTime(1521046309) // bucket start time
42+
core_t::TTime(1521046309), // bucket start time
43+
0, // model bytes exceeded
44+
50000 // model bytes memory limit
4245
};
4346

4447
CModelSnapshotJsonWriter::SModelSnapshotReport report{
@@ -114,6 +117,11 @@ void CModelSnapshotJsonWriterTest::testWrite() {
114117
CPPUNIT_ASSERT(modelSizeStats.HasMember("timestamp"));
115118
CPPUNIT_ASSERT_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64());
116119
CPPUNIT_ASSERT(modelSizeStats.HasMember("log_time"));
120+
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_exceeded"));
121+
CPPUNIT_ASSERT_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64());
122+
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_memory_limit"));
123+
CPPUNIT_ASSERT_EQUAL(int64_t(50000),
124+
modelSizeStats["model_bytes_memory_limit"].GetInt64());
117125

118126
CPPUNIT_ASSERT(snapshot.HasMember("quantiles"));
119127
const rapidjson::Value& quantiles = snapshot["quantiles"];

lib/model/CResourceMonitor.cc

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@ CResourceMonitor::CResourceMonitor(double byteLimitMargin)
3535
m_HasPruningStarted(false), m_PruneThreshold(0), m_LastPruneTime(0),
3636
m_PruneWindow(std::numeric_limits<std::size_t>::max()),
3737
m_PruneWindowMaximum(std::numeric_limits<std::size_t>::max()),
38-
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()), m_NoLimit(false) {
38+
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()),
39+
m_NoLimit(false), m_CurrentBytesExceeded(0) {
3940
this->updateMemoryLimitsAndPruneThreshold(DEFAULT_MEMORY_LIMIT_MB);
4041
}
4142

@@ -108,18 +109,21 @@ void CResourceMonitor::refresh(CAnomalyDetector& detector) {
108109

109110
void CResourceMonitor::forceRefresh(CAnomalyDetector& detector) {
110111
this->memUsage(&detector);
111-
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = this->totalMemory();
112-
LOG_TRACE(<< "Checking allocations: currently at " << this->totalMemory());
112+
113113
this->updateAllowAllocations();
114114
}
115115

116116
void CResourceMonitor::updateAllowAllocations() {
117117
std::size_t total{this->totalMemory()};
118+
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = total;
119+
LOG_TRACE(<< "Checking allocations: currently at " << total);
118120
if (m_AllowAllocations) {
119121
if (total > this->highLimit()) {
120122
LOG_INFO(<< "Over current allocation high limit. " << total
121123
<< " bytes used, the limit is " << this->highLimit());
122124
m_AllowAllocations = false;
125+
std::size_t bytesExceeded{total - this->highLimit()};
126+
m_CurrentBytesExceeded = this->adjustedUsage(bytesExceeded);
123127
}
124128
} else if (total < this->lowLimit()) {
125129
LOG_INFO(<< "Below current allocation low limit. " << total
@@ -204,13 +208,6 @@ bool CResourceMonitor::areAllocationsAllowed() const {
204208
return m_AllowAllocations;
205209
}
206210

207-
bool CResourceMonitor::areAllocationsAllowed(std::size_t size) const {
208-
if (m_AllowAllocations) {
209-
return this->totalMemory() + size < this->highLimit();
210-
}
211-
return false;
212-
}
213-
214211
std::size_t CResourceMonitor::allocationLimit() const {
215212
return this->highLimit() - std::min(this->highLimit(), this->totalMemory());
216213
}
@@ -268,6 +265,9 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
268265
res.s_OverFields = 0;
269266
res.s_PartitionFields = 0;
270267
res.s_Usage = this->totalMemory();
268+
res.s_AdjustedUsage = this->adjustedUsage(res.s_Usage);
269+
res.s_BytesMemoryLimit = 2 * m_ByteLimitHigh;
270+
res.s_BytesExceeded = m_CurrentBytesExceeded;
271271
res.s_AllocationFailures = 0;
272272
res.s_MemoryStatus = m_MemoryStatus;
273273
res.s_BucketStartTime = bucketStartTime;
@@ -281,6 +281,25 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
281281
return res;
282282
}
283283

284+
std::size_t CResourceMonitor::adjustedUsage(std::size_t usage) const {
285+
// Background persist causes the memory size to double due to copying
286+
// the models. On top of that, after the persist is done we may not
287+
// be able to retrieve that memory back. Thus, we report twice the
288+
// memory usage in order to allow for that.
289+
// See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
290+
// Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
291+
// discusses adding an option to perform only foreground persist.
292+
// If that gets implemented, we should only double when background
293+
// persist is configured.
294+
295+
// We also scale the reported memory usage by the inverse of the byte limit margin.
296+
// This gives the user a fairer indication of how close the job is to hitting
297+
// the model memory limit in a concise manner (as the limit is scaled down by
298+
// the margin during the beginning period of the job's existence).
299+
size_t adjustedUsage = static_cast<std::size_t>(2 * usage / m_ByteLimitMargin);
300+
return adjustedUsage;
301+
}
302+
284303
void CResourceMonitor::acceptAllocationFailureResult(core_t::TTime time) {
285304
m_MemoryStatus = model_t::E_MemoryStatusHardLimit;
286305
++m_AllocationFailures[time];

0 commit comments

Comments
 (0)