Skip to content

[7.2][ML] Improve hard_limit audit message (#486) #487

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ to the model. (See {ml-pull}214[#214].)

* Stop linking to libcrypt on Linux. (See {ml-pull}480[#480].)

* Improvements to hard_limit audit message. (See {ml-pull}486[#486].)

=== Bug Fixes

* Handle NaNs when detrending seasonal components. {ml-pull}408[#408]
Expand Down
15 changes: 11 additions & 4 deletions include/model/CResourceMonitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,15 @@ class MODEL_EXPORT CResourceMonitor {
public:
//! Snapshot of resource usage and memory status reported by the
//! resource monitor (serialized as the model_size_stats document).
struct MODEL_EXPORT SResults {
    //! Raw memory usage in bytes as measured by the monitor.
    std::size_t s_Usage;
    //! Usage adjusted for reporting, i.e. scaled to account for
    //! background persistence and the byte limit margin
    //! (see CResourceMonitor::adjustedUsage).
    std::size_t s_AdjustedUsage;
    //! Number of "by" fields seen.
    std::size_t s_ByFields;
    //! Number of partition fields seen.
    std::size_t s_PartitionFields;
    //! Number of "over" fields seen.
    std::size_t s_OverFields;
    //! Count of bucket allocation failures.
    std::size_t s_AllocationFailures;
    //! Current memory status (e.g. ok / hard_limit).
    model_t::EMemoryStatus s_MemoryStatus;
    //! Start time of the bucket this report refers to.
    core_t::TTime s_BucketStartTime;
    //! Adjusted number of bytes over the high memory limit at the
    //! last allocation failure (0 if the limit was never exceeded).
    std::size_t s_BytesExceeded;
    //! The configured memory limit in bytes, as reported to the user.
    std::size_t s_BytesMemoryLimit;
};

public:
Expand All @@ -65,10 +68,6 @@ class MODEL_EXPORT CResourceMonitor {
//! taking up too much memory and further allocations should be banned
bool areAllocationsAllowed() const;

//! Query the resource monitor to find out if it's OK to
//! create structures of a certain size
bool areAllocationsAllowed(std::size_t size) const;

//! Return the amount of remaining space for allocations
std::size_t allocationLimit() const;

Expand Down Expand Up @@ -164,6 +163,11 @@ class MODEL_EXPORT CResourceMonitor {
//! Returns the sum of used memory plus any extra memory
std::size_t totalMemory() const;

//! Adjusts the amount of memory reported to take into
//! account the current value of the byte limit margin and the effects
//! of background persistence.
std::size_t adjustedUsage(std::size_t usage) const;

private:
//! The registered collection of components
TDetectorPtrSizeUMap m_Detectors;
Expand Down Expand Up @@ -226,6 +230,9 @@ class MODEL_EXPORT CResourceMonitor {
//! Don't do any sort of memory checking if this is set
bool m_NoLimit;

//! The number of bytes over the high limit for memory usage at the last allocation failure
std::size_t m_CurrentBytesExceeded;

//! Test friends
friend class ::CResourceMonitorTest;
friend class ::CResourceLimitTest;
Expand Down
20 changes: 10 additions & 10 deletions lib/api/CModelSizeStatsJsonWriter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ namespace {
const std::string JOB_ID("job_id");
const std::string MODEL_SIZE_STATS("model_size_stats");
const std::string MODEL_BYTES("model_bytes");
const std::string MODEL_BYTES_EXCEEDED("model_bytes_exceeded");
const std::string MODEL_BYTES_MEMORY_LIMIT("model_bytes_memory_limit");
const std::string TOTAL_BY_FIELD_COUNT("total_by_field_count");
const std::string TOTAL_OVER_FIELD_COUNT("total_over_field_count");
const std::string TOTAL_PARTITION_FIELD_COUNT("total_partition_field_count");
Expand All @@ -33,17 +35,15 @@ void CModelSizeStatsJsonWriter::write(const std::string& jobId,

writer.String(JOB_ID);
writer.String(jobId);

writer.String(MODEL_BYTES);
// Background persist causes the memory size to double due to copying
// the models. On top of that, after the persist is done we may not
// be able to retrieve that memory back. Thus, we report twice the
// memory usage in order to allow for that.
// See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
// Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
// discusses adding an option to perform only foreground persist.
// If that gets implemented, we should only double when background
// persist is configured.
writer.Uint64(results.s_Usage * 2);
writer.Uint64(results.s_AdjustedUsage);

writer.String(MODEL_BYTES_EXCEEDED);
writer.Uint64(results.s_BytesExceeded);

writer.String(MODEL_BYTES_MEMORY_LIMIT);
writer.Uint64(results.s_BytesMemoryLimit);

writer.String(TOTAL_BY_FIELD_COUNT);
writer.Uint64(results.s_ByFields);
Expand Down
27 changes: 17 additions & 10 deletions lib/api/unittest/CJsonOutputWriterTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1555,12 +1555,15 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {

ml::model::CResourceMonitor::SResults resourceUsage;
resourceUsage.s_Usage = 1;
resourceUsage.s_ByFields = 2;
resourceUsage.s_PartitionFields = 3;
resourceUsage.s_OverFields = 4;
resourceUsage.s_AllocationFailures = 5;
resourceUsage.s_AdjustedUsage = 2;
resourceUsage.s_ByFields = 3;
resourceUsage.s_PartitionFields = 4;
resourceUsage.s_OverFields = 5;
resourceUsage.s_AllocationFailures = 6;
resourceUsage.s_MemoryStatus = ml::model_t::E_MemoryStatusHardLimit;
resourceUsage.s_BucketStartTime = 6;
resourceUsage.s_BucketStartTime = 7;
resourceUsage.s_BytesExceeded = 8;
resourceUsage.s_BytesMemoryLimit = 9;

writer.reportMemoryUsage(resourceUsage);
writer.endOutputBatch(false, 1ul);
Expand All @@ -1580,22 +1583,26 @@ void CJsonOutputWriterTest::testReportMemoryUsage() {
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes"));
CPPUNIT_ASSERT_EQUAL(2, sizeStats["model_bytes"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_by_field_count"));
CPPUNIT_ASSERT_EQUAL(2, sizeStats["total_by_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_by_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_partition_field_count"));
CPPUNIT_ASSERT_EQUAL(3, sizeStats["total_partition_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_partition_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("total_over_field_count"));
CPPUNIT_ASSERT_EQUAL(4, sizeStats["total_over_field_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(5, sizeStats["total_over_field_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("bucket_allocation_failures_count"));
CPPUNIT_ASSERT_EQUAL(5, sizeStats["bucket_allocation_failures_count"].GetInt());
CPPUNIT_ASSERT_EQUAL(6, sizeStats["bucket_allocation_failures_count"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("timestamp"));
CPPUNIT_ASSERT_EQUAL(6000, sizeStats["timestamp"].GetInt());
CPPUNIT_ASSERT_EQUAL(7000, sizeStats["timestamp"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("memory_status"));
CPPUNIT_ASSERT_EQUAL(std::string("hard_limit"),
std::string(sizeStats["memory_status"].GetString()));
CPPUNIT_ASSERT(sizeStats.HasMember("log_time"));
int64_t nowMs = ml::core::CTimeUtils::now() * 1000ll;
CPPUNIT_ASSERT(nowMs >= sizeStats["log_time"].GetInt64());
CPPUNIT_ASSERT(nowMs + 1000ll >= sizeStats["log_time"].GetInt64());
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_exceeded"));
CPPUNIT_ASSERT_EQUAL(8, sizeStats["model_bytes_exceeded"].GetInt());
CPPUNIT_ASSERT(sizeStats.HasMember("model_bytes_memory_limit"));
CPPUNIT_ASSERT_EQUAL(9, sizeStats["model_bytes_memory_limit"].GetInt());
}

void CJsonOutputWriterTest::testWriteScheduledEvent() {
Expand Down
10 changes: 9 additions & 1 deletion lib/api/unittest/CModelSnapshotJsonWriterTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,15 @@ void CModelSnapshotJsonWriterTest::testWrite() {
{
model::CResourceMonitor::SResults modelSizeStats{
10000, // bytes used
20000, // bytes used (adjusted)
3, // # by fields
1, // # partition fields
150, // # over fields
4, // # allocation failures
model_t::E_MemoryStatusOk, // memory status
core_t::TTime(1521046309) // bucket start time
core_t::TTime(1521046309), // bucket start time
0, // model bytes exceeded
50000 // model bytes memory limit
};

CModelSnapshotJsonWriter::SModelSnapshotReport report{
Expand Down Expand Up @@ -114,6 +117,11 @@ void CModelSnapshotJsonWriterTest::testWrite() {
CPPUNIT_ASSERT(modelSizeStats.HasMember("timestamp"));
CPPUNIT_ASSERT_EQUAL(int64_t(1521046309000), modelSizeStats["timestamp"].GetInt64());
CPPUNIT_ASSERT(modelSizeStats.HasMember("log_time"));
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_exceeded"));
CPPUNIT_ASSERT_EQUAL(int64_t(0), modelSizeStats["model_bytes_exceeded"].GetInt64());
CPPUNIT_ASSERT(modelSizeStats.HasMember("model_bytes_memory_limit"));
CPPUNIT_ASSERT_EQUAL(int64_t(50000),
modelSizeStats["model_bytes_memory_limit"].GetInt64());

CPPUNIT_ASSERT(snapshot.HasMember("quantiles"));
const rapidjson::Value& quantiles = snapshot["quantiles"];
Expand Down
39 changes: 29 additions & 10 deletions lib/model/CResourceMonitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ CResourceMonitor::CResourceMonitor(double byteLimitMargin)
m_HasPruningStarted(false), m_PruneThreshold(0), m_LastPruneTime(0),
m_PruneWindow(std::numeric_limits<std::size_t>::max()),
m_PruneWindowMaximum(std::numeric_limits<std::size_t>::max()),
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()), m_NoLimit(false) {
m_PruneWindowMinimum(std::numeric_limits<std::size_t>::max()),
m_NoLimit(false), m_CurrentBytesExceeded(0) {
this->updateMemoryLimitsAndPruneThreshold(DEFAULT_MEMORY_LIMIT_MB);
}

Expand Down Expand Up @@ -108,18 +109,21 @@ void CResourceMonitor::refresh(CAnomalyDetector& detector) {

void CResourceMonitor::forceRefresh(CAnomalyDetector& detector) {
this->memUsage(&detector);
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = this->totalMemory();
LOG_TRACE(<< "Checking allocations: currently at " << this->totalMemory());

this->updateAllowAllocations();
}

void CResourceMonitor::updateAllowAllocations() {
std::size_t total{this->totalMemory()};
core::CProgramCounters::counter(counter_t::E_TSADMemoryUsage) = total;
LOG_TRACE(<< "Checking allocations: currently at " << total);
if (m_AllowAllocations) {
if (total > this->highLimit()) {
LOG_INFO(<< "Over current allocation high limit. " << total
<< " bytes used, the limit is " << this->highLimit());
m_AllowAllocations = false;
std::size_t bytesExceeded{total - this->highLimit()};
m_CurrentBytesExceeded = this->adjustedUsage(bytesExceeded);
}
} else if (total < this->lowLimit()) {
LOG_INFO(<< "Below current allocation low limit. " << total
Expand Down Expand Up @@ -204,13 +208,6 @@ bool CResourceMonitor::areAllocationsAllowed() const {
return m_AllowAllocations;
}

bool CResourceMonitor::areAllocationsAllowed(std::size_t size) const {
if (m_AllowAllocations) {
return this->totalMemory() + size < this->highLimit();
}
return false;
}

//! Number of bytes that can still be allocated before the high memory
//! limit is reached; zero when usage is already at or above the limit.
std::size_t CResourceMonitor::allocationLimit() const {
    const std::size_t limit{this->highLimit()};
    const std::size_t used{this->totalMemory()};
    return used < limit ? limit - used : 0;
}
Expand Down Expand Up @@ -268,6 +265,9 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
res.s_OverFields = 0;
res.s_PartitionFields = 0;
res.s_Usage = this->totalMemory();
res.s_AdjustedUsage = this->adjustedUsage(res.s_Usage);
res.s_BytesMemoryLimit = 2 * m_ByteLimitHigh;
res.s_BytesExceeded = m_CurrentBytesExceeded;
res.s_AllocationFailures = 0;
res.s_MemoryStatus = m_MemoryStatus;
res.s_BucketStartTime = bucketStartTime;
Expand All @@ -281,6 +281,25 @@ CResourceMonitor::SResults CResourceMonitor::createMemoryUsageReport(core_t::TTi
return res;
}

//! Scale a raw memory usage figure into the value we report to the user.
//!
//! The figure is doubled because background persistence copies the models,
//! and after the persist completes that memory may not be reclaimable, so
//! we allow for twice the measured usage.
//! See https://github.com/elastic/x-pack-elasticsearch/issues/1020.
//! Issue https://github.com/elastic/x-pack-elasticsearch/issues/857
//! discusses adding an option to perform only foreground persist; if that
//! gets implemented, we should only double when background persist is
//! configured.
//!
//! The result is also divided by the byte limit margin so the reported
//! number gives a fairer indication of how close the job is to its model
//! memory limit (the limit itself is scaled down by this margin while the
//! job is young).
std::size_t CResourceMonitor::adjustedUsage(std::size_t usage) const {
    return static_cast<std::size_t>(2 * usage / m_ByteLimitMargin);
}

void CResourceMonitor::acceptAllocationFailureResult(core_t::TTime time) {
m_MemoryStatus = model_t::E_MemoryStatusHardLimit;
++m_AllocationFailures[time];
Expand Down