Skip to content

[7.2][ML] Improve hard_limit audit message (#486) #487

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ to the model. (See {ml-pull}214[#214].)

* Stop linking to libcrypt on Linux. (See {ml-pull}480[#480].)

* Improvements to hard_limit audit message. (See {ml-pull}486[#486].)

=== Bug Fixes

* Handle NaNs when detrending seasonal components. {ml-pull}408[#408]
Expand Down
15 changes: 11 additions & 4 deletions include/model/CResourceMonitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,15 @@ class MODEL_EXPORT CResourceMonitor {
public:
//! Snapshot of resource usage and memory status reported by the
//! resource monitor (serialized as the model_size_stats document).
struct MODEL_EXPORT SResults {
    //! Raw memory usage in bytes as measured by the monitor.
    std::size_t s_Usage;
    //! Usage adjusted for reporting, i.e. scaled to account for
    //! background persistence and the byte limit margin
    //! (see CResourceMonitor::adjustedUsage).
    std::size_t s_AdjustedUsage;
    //! Number of "by" fields seen.
    std::size_t s_ByFields;
    //! Number of partition fields seen.
    std::size_t s_PartitionFields;
    //! Number of "over" fields seen.
    std::size_t s_OverFields;
    //! Count of bucket allocation failures.
    std::size_t s_AllocationFailures;
    //! Current memory status (e.g. ok / hard_limit).
    model_t::EMemoryStatus s_MemoryStatus;
    //! Start time of the bucket this report refers to.
    core_t::TTime s_BucketStartTime;
    //! Adjusted number of bytes over the high memory limit at the
    //! last allocation failure (0 if the limit was never exceeded).
    std::size_t s_BytesExceeded;
    //! The configured memory limit in bytes, as reported to the user.
    std::size_t s_BytesMemoryLimit;
};

public:
Expand All @@ -65,10 +68,6 @@ class MODEL_EXPORT CResourceMonitor {
//! taking up too much memory and further allocations should be banned
bool areAllocationsAllowed() const;

//! Query the resource monitor to find out if it's OK to
//! create structures of a certain size
bool areAllocationsAllowed(std::size_t size) const;

//! Return the amount of remaining space for allocations
std::size_t allocationLimit() const;

Expand Down Expand Up @@ -164,6 +163,11 @@ class MODEL_EXPORT CResourceMonitor {
//! Returns the sum of used memory plus any extra memory
std::size_t totalMemory() const;

//! Adjusts the amount of memory reported to take into
//! account the current value of the byte limit margin and the effects
//! of background persistence.
std::size_t adjustedUsage(std::size_t usage) const;

private:
//! The registered collection of components
TDetectorPtrSizeUMap m_Detectors;
Expand Down Expand Up @@ -226,6 +230,9 @@ class MODEL_EXPORT CResourceMonitor {
//! Don't do any sort of memory checking if this is set
bool m_NoLimit;

//! The number of bytes over the high limit for memory usage at the last allocation failure
std::size_t m_CurrentBytesExceeded;

//! Test friends
friend class ::CResourceMonitorTest;
friend class ::CResourceLimitTest;
Expand Down
20 changes: 10 additions & 10 deletions lib/api/CModelSizeStatsJsonWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ namespace {
const std::string JOB_ID("job_id");
const std::string MODEL_SIZE_STATS("model_size_stats");
const std::string MODEL_BYTES("model_bytes");
const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded");
const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit");
const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count");
const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count");
const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count");
Expand All @@ -33,17 +35,15 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId,

writer.String(JOB_ID);
writer.String(jobId);

writer.String(MODEL_BYTES);
// Background persist causes the memory size to double due to copying
// the models. On top of that, after the persist is done we may not
// be able to retrieve that memory back. Thus, we report twice the
// memory usage in order to allow for that.
// See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
// Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
// discusses adding an option to perform only foreground persist.
// If that gets implemented, we should only double when background
// persist is configured.
writer.Uint64(results.s_Usage * 2);
writer.Uint64(results.s_AdjustedUsage);

writer.String(MODEL_BYTES_EXCEEDED);
writer.Uint64(results.s_BytesExceeded);

writer.String(MODEL_BYTES_MEMORY_LIMIT);
writer.Uint64(results.s_BytesMemoryLimit);

writer.String(TOTAL_BY_FIELD_COUNT);
writer.Uint64(results.s_ByFields);
Expand Down
27 changes: 17 additions & 10 deletions lib/api/unittest/CJsonOutputWriterTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1555,12 +1555,15 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {

ml::model::CResourceMonitor::SResults resourceUsage;
resourceUsage.s_Usage = 1;
resourceUsage.s_ByFields = 2;
resourceUsage.s_PartitionFields = 3;
resourceUsage.s_OverFields = 4;
resourceUsage.s_AllocationFailures = 5;
resourceUsage.s_AdjustedUsage = 2;
resourceUsage.s_ByFields = 3;
resourceUsage.s_PartitionFields = 4;
resourceUsage.s_OverFields = 5;
resourceUsage.s_AllocationFailures = 6;
resourceUsage.s_MemoryStatus = ml::model_t::E_MemoryStatusHardLimit;
resourceUsage.s_BucketStartTime = 6;
resourceUsage.s_BucketStartTime = 7;
resourceUsage.s_BytesExceeded = 8;
resourceUsage.s_BytesMemoryLimit = 9;

writer.reportMemoryUsage(resourceUsage);
writer.endOutputBatch(false, 1ul);
Expand All @@ -1580,22 +1583,26 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes"));
CPPUNIT_ASSERT_EQUAL(2, sizeStats["model_bytes"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_by_field_count"));
CPPUNIT_ASSERT_EQUAL(2, sizeStats["total_by_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_by_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_partition_field_count"));
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_partition_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_partition_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_over_field_count"));
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_over_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(5, sizeStats["total_over_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("bucket_allocation_failures_count"));
CPPUNIT_ASSERT_EQUAL(5, sizeStats["bucket_allocation_failures_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(6, sizeStats["bucket_allocation_failures_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("timestamp"));
CPPUNIT_ASSERT_EQUAL(6000, sizeStats["timestamp"].GetInt());
CPPUNIT_ASSERT_EQUAL(7000, sizeStats["timestamp"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("memory_status"));
CPPUNIT_ASSERT_EQUAL(std::string("hard_limit"),
std::string(sizeStats["memory_status"].GetString()));
CPPUNIT_ASSERT(sizeStats.HasMember("log_time"));
int64_t nowMs = ml::core::CTimeUtils::now() * 1000ll;
CPPUNIT_ASSERT(nowMs >= sizeStats["log_time"].GetInt64());
CPPUNIT_ASSERT(nowMs + 1000ll >= sizeStats["log_time"].GetInt64());
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_exceeded"));
CPPUNIT_ASSERT_EQUAL(8, sizeStats["model_bytes_exceeded"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_memory_limit"));
CPPUNIT_ASSERT_EQUAL(9, sizeStats["model_bytes_memory_limit"].GetInt());
}

void CJsonOutputWriterTest::testWriteScheduledEvent() {
Expand Down
10 changes: 9 additions & 1 deletion lib/api/unittest/CModelSnapshotJsonWriterTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,15 @@ void CModelSnapshotJsonWriterTest::testWrite() {
{
model::CResourceMonitor::SResults modelSizeStats{
10000, // bytes used
20000, // bytes used (adjusted)
3, // # by fields
1, // # partition fields
150, // # over fields
4, // # allocation failures
model_t::E_MemoryStatusOk, // memory status
core_t::TTime(1521046309) // bucket start time
core_t::TTime(1521046309), // bucket start time
0, // model bytes exceeded
50000 // model bytes memory limit
};

CModelSnapshotJsonWriter::SModelSnapshotReport report{
Expand Down Expand Up @@ -114,6 +117,11 @@ void CModelSnapshotJsonWriterTest::testWrite() {
CPPUNIT_ASSERT(modelSizeStats.HasMember("timestamp"));
CPPUNIT_ASSERT_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64());
CPPUNIT_ASSERT(modelSizeStats.HasMember("log_time"));
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_exceeded"));
CPPUNIT_ASSERT_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64());
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_memory_limit"));
CPPUNIT_ASSERT_EQUAL(int64_t(50000),
modelSizeStats["model_bytes_memory_limit"].GetInt64());

CPPUNIT_ASSERT(snapshot.HasMember("quantiles"));
const rapidjson::Value& quantiles = snapshot["quantiles"];
Expand Down
39 changes: 29 additions & 10 deletions lib/model/CResourceMonitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ CResourceMonitor::CResourceMonitor(double byteLimitMargin)
m_HasPruningStarted(false), m_PruneThreshold(0), m_LastPruneTime(0),
m_PruneWindow(std::numeric_limits<std::size_t>::max()),
m_PruneWindowMaximum(std::numeric_limits<std::size_t>::max()),
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()), m_NoLimit(false) {
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()),
m_NoLimit(false), m_CurrentBytesExceeded(0) {
this->updateMemoryLimitsAndPruneThreshold(DEFAULT_MEMORY_LIMIT_MB);
}

Expand Down Expand Up @@ -108,18 +109,21 @@ void CResourceMonitor::refresh(CAnomalyDetector& detector) {

void CResourceMonitor::forceRefresh(CAnomalyDetector& detector) {
this->memUsage(&detector);
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = this->totalMemory();
LOG_TRACE(<< "Checking allocations: currently at " << this->totalMemory());

this->updateAllowAllocations();
}

void CResourceMonitor::updateAllowAllocations() {
std::size_t total{this->totalMemory()};
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = total;
LOG_TRACE(<< "Checking allocations: currently at " << total);
if (m_AllowAllocations) {
if (total > this->highLimit()) {
LOG_INFO(<< "Over current allocation high limit. " << total
<< " bytes used, the limit is " << this->highLimit());
m_AllowAllocations = false;
std::size_t bytesExceeded{total - this->highLimit()};
m_CurrentBytesExceeded = this->adjustedUsage(bytesExceeded);
}
} else if (total < this->lowLimit()) {
LOG_INFO(<< "Below current allocation low limit. " << total
Expand Down Expand Up @@ -204,13 +208,6 @@ bool CResourceMonitor::areAllocationsAllowed() const {
return m_AllowAllocations;
}

bool CResourceMonitor::areAllocationsAllowed(std::size_t size) const {
if (m_AllowAllocations) {
return this->totalMemory() + size < this->highLimit();
}
return false;
}

//! Number of bytes that can still be allocated before the high memory
//! limit is reached; zero when usage is already at or above the limit.
std::size_t CResourceMonitor::allocationLimit() const {
    const std::size_t limit{this->highLimit()};
    const std::size_t used{this->totalMemory()};
    return used < limit ? limit - used : 0;
}
Expand Down Expand Up @@ -268,6 +265,9 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
res.s_OverFields = 0;
res.s_PartitionFields = 0;
res.s_Usage = this->totalMemory();
res.s_AdjustedUsage = this->adjustedUsage(res.s_Usage);
res.s_BytesMemoryLimit = 2 * m_ByteLimitHigh;
res.s_BytesExceeded = m_CurrentBytesExceeded;
res.s_AllocationFailures = 0;
res.s_MemoryStatus = m_MemoryStatus;
res.s_BucketStartTime = bucketStartTime;
Expand All @@ -281,6 +281,25 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
return res;
}

//! Scale a raw memory usage figure into the value we report to the user.
//!
//! The figure is doubled because background persistence copies the models,
//! and after the persist completes that memory may not be reclaimable, so
//! we allow for twice the measured usage.
//! See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
//! Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
//! discusses adding an option to perform only foreground persist; if that
//! gets implemented, we should only double when background persist is
//! configured.
//!
//! The result is also divided by the byte limit margin so the reported
//! number gives a fairer indication of how close the job is to its model
//! memory limit (the limit itself is scaled down by this margin while the
//! job is young).
std::size_t CResourceMonitor::adjustedUsage(std::size_t usage) const {
    return static_cast<std::size_t>(2 * usage / m_ByteLimitMargin);
}

void CResourceMonitor::acceptAllocationFailureResult(core_t::TTime time) {
m_MemoryStatus = model_t::E_MemoryStatusHardLimit;
++m_AllocationFailures[time];
Expand Down