[ML] Fix influencer count and influence calculation (#150)

Hendrik Muhs · web-flow · commit d41de3495f20 · 2018-07-11T16:45:07.000+02:00
Fix counting of influencers per bucket for metric population analyses; prior to this fix the count was always set to 1. Fixes #24
diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc
@@ -59,6 +59,7 @@ Age seasonal components in proportion to the fraction of values with which they'
 Persist and restore was missing some of the trend model state ({pull}#99[#99])
 Stop zero variance data generating a log error in the forecast confidence interval calculation ({pull}#107[#107])
 Fix corner case failing to calculate lgamma values and the correspoinding log errors ({pull}#126[#126])
+Influence count per bucket for metric population analyses was wrong and led to wrong influencer scoring ({pull}#150[#150])
 
 === Regressions
 
diff --git a/lib/model/CMetricPopulationModel.cc b/lib/model/CMetricPopulationModel.cc
@@ -943,7 +943,7 @@ void CMetricPopulationModel::fill(model_t::EFeature feature,
         this->currentBucketInterimCorrections().emplace(
             CCorrectionKey(feature, pid, cid), correction);
     }
-    params.s_Count = 1.0;
+    params.s_Count = bucket->count();
     params.s_ComputeProbabilityParams
         .tag(pid) // new line
         .addCalculation(model_t::probabilityCalculation(feature))
diff --git a/lib/model/CProbabilityAndInfluenceCalculator.cc b/lib/model/CProbabilityAndInfluenceCalculator.cc
@@ -144,43 +144,55 @@ double ratio(double numerator, double denominator, double zeroDividedByZero) {
     return numerator / denominator;
 }
 
+// Functions to compute influence based on different criteria
 //! \brief Computes the value of summed statistics on the set difference.
 class CValueDifference {
 public:
     //! Features.
-    void operator()(const TDouble2Vec& v,
+    bool operator()(const TDouble2Vec& v,
                     double n,
                     const TDouble1Vec& vi,
                     double ni,
                     maths::CModelProbabilityParams& params,
                     TDouble2Vec& difference) const {
+        if (n < ni) {
+            return false;
+        }
+
         for (std::size_t i = 0u; i < v.size(); ++i) {
             difference[i] = v[i] - vi[i];
         }
         params.addBucketEmpty({n == ni});
+
+        return true;
     }
 
     //! Correlates.
-    void operator()(const TDouble2Vec& v,
+    bool operator()(const TDouble2Vec& v,
                     const TDouble2Vec& n,
                     const TDouble1Vec& vi,
                     const TDouble1Vec& ni,
                     maths::CModelProbabilityParams& params,
                     TDouble2Vec& difference) const {
+        if (n < ni) {
+            return false;
+        }
         TBool2Vec bucketEmpty(2);
         for (std::size_t i = 0u; i < v.size(); ++i) {
             bucketEmpty[i] = ((n[i] - ni[i]) == 0);
             difference[i] = v[i] - vi[i];
         }
         params.addBucketEmpty(bucketEmpty);
+
+        return true;
     }
 };
 
 //! \brief Computes the value of min, max, dc, etc on the set intersection.
 class CValueIntersection {
 public:
     //! Features.
-    void operator()(const TDouble2Vec& /*v*/,
+    bool operator()(const TDouble2Vec& /*v*/,
                     double /*n*/,
                     const TDouble1Vec& vi,
                     double ni,
@@ -190,34 +202,50 @@ class CValueIntersection {
             intersection[i] = vi[i];
         }
         params.addBucketEmpty({ni == 0});
+
+        return true;
     }
 
     //! Correlates.
-    void operator()(const TDouble2Vec& /*v*/,
+    bool operator()(const TDouble2Vec& /*v*/,
                     const TDouble2Vec& /*n*/,
                     const TDouble1Vec& vi,
                     const TDouble1Vec& ni,
                     maths::CModelProbabilityParams& params,
                     TDouble2Vec& intersection) const {
+
         TBool2Vec bucketEmpty(2);
         for (std::size_t i = 0u; i < vi.size(); ++i) {
             bucketEmpty[i] = (ni[i] == 0);
             intersection[i] = vi[i];
         }
         params.addBucketEmpty(bucketEmpty);
+
+        return true;
     }
 };
 
 //! \brief Computes the value of the mean statistic on a set difference.
 class CMeanDifference {
 public:
     //! Features.
-    void operator()(const TDouble2Vec& v,
+    //!
+    //! \param[in] v overall mean
+    //! \param[in] n overall count
+    //! \param[in] vi influencer mean
+    //! \param[in] ni influencer count
+    //! \param[out] params model parameters to be updated
+    //! \param[out] difference computed mean difference
+    bool operator()(const TDouble2Vec& v,
                     double n,
                     const TDouble1Vec& vi,
                     double ni,
                     maths::CModelProbabilityParams& params,
                     TDouble2Vec& difference) const {
+        if (n <= ni) {
+            return false;
+        }
+
         for (std::size_t d = 0u; d < v.size(); ++d) {
             difference[d] = maths::CBasicStatistics::mean(
                 maths::CBasicStatistics::accumulator(n, v[d]) -
@@ -226,15 +254,21 @@ class CMeanDifference {
         maths_t::multiplyCountVarianceScale(TDouble2Vec(v.size(), n / (n - ni)),
                                             params.weights()[0]);
         params.addBucketEmpty({n == ni});
+
+        return true;
     }
 
     //! Correlates.
-    void operator()(const TDouble2Vec& v,
+    bool operator()(const TDouble2Vec& v,
                     const TDouble2Vec& n,
                     const TDouble1Vec& vi,
                     const TDouble1Vec& ni,
                     maths::CModelProbabilityParams& params,
                     TDouble2Vec& difference) const {
+        if (n <= ni) {
+            return false;
+        }
+
         TBool2Vec bucketEmpty(2);
         for (std::size_t d = 0u; d < 2; ++d) {
             bucketEmpty[d] = ((n[d] - ni[d]) == 0);
@@ -246,19 +280,31 @@ class CMeanDifference {
             TDouble2Vec{n[0] / (n[0] - ni[0]), n[1] / (n[1] - ni[1])},
             params.weights()[0]);
         params.addBucketEmpty(bucketEmpty);
+
+        return true;
     }
 };
 
 //! \brief Computes the value of the variance statistic on a set difference.
 class CVarianceDifference {
 public:
     //! Features.
-    void operator()(const TDouble1Vec& v,
+    //!
+    //! \param[in] v overall variance and mean
+    //! \param[in] n overall count
+    //! \param[in] vi influencer variance and mean
+    //! \param[in] ni influencer count
+    //! \param[out] params model parameters to be updated
+    //! \param[out] difference computed variance difference
+    bool operator()(const TDouble1Vec& v,
                     double n,
                     const TDouble1Vec& vi,
                     double ni,
                     maths::CModelProbabilityParams& params,
                     TDouble2Vec& difference) const {
+        if (n < ni) {
+            return false;
+        }
         std::size_t dimension = v.size() / 2;
         for (std::size_t d = 0u; d < dimension; ++d) {
             difference[d] = maths::CBasicStatistics::maximumLikelihoodVariance(
@@ -268,15 +314,20 @@ class CVarianceDifference {
         maths_t::multiplyCountVarianceScale(TDouble2Vec(dimension, n / (n - ni)),
                                             params.weights()[0]);
         params.addBucketEmpty({n == ni});
+
+        return true;
     }
 
     //! Correlates.
-    void operator()(const TDouble2Vec& v,
+    bool operator()(const TDouble2Vec& v,
                     const TDouble2Vec& n,
                     const TDouble1Vec& vi,
                     const TDouble1Vec& ni,
                     maths::CModelProbabilityParams& params,
                     TDouble2Vec& difference) const {
+        if (n < ni) {
+            return false;
+        }
         TBool2Vec bucketEmpty(2);
         for (std::size_t d = 0u; d < 2; ++d) {
             bucketEmpty[d] = ((n[d] - ni[d]) == 0);
@@ -288,6 +339,8 @@ class CVarianceDifference {
         maths_t::multiplyCountVarianceScale(
             TDouble2Vec{n[0] / (n[0] - ni[0]), n[1] / (n[1] - ni[1])},
             params.weights()[0]);
+
+        return true;
     }
 };
 
@@ -370,8 +423,14 @@ void doComputeInfluences(model_t::EFeature feature,
     for (auto i = influencerValues.begin(); i != influencerValues.end(); ++i) {
         params.weights(weights).updateAnomalyModel(false);
 
-        computeInfluencedValue(value, count, i->second.first, i->second.second,
-                               params, influencedValue[0]);
+        if (computeInfluencedValue(value, count, i->second.first, i->second.second,
+                                   params, influencedValue[0]) == false) {
+            LOG_ERROR(<< "Failed to compute influencer value (value = " << value
+                      << " , count = " << count
+                      << " , influencer value = " << i->second.first
+                      << " , influencer count = " << i->second.second << ")");
+            continue;
+        }
 
         double pi;
         bool conditional;
diff --git a/lib/model/unittest/CMetricPopulationModelTest.cc b/lib/model/unittest/CMetricPopulationModelTest.cc
@@ -1548,6 +1548,8 @@ CppUnit::Test* CMetricPopulationModelTest::suite() {
     suiteOfTests->addTest(new CppUnit::TestCaller<CMetricPopulationModelTest>(
         "CMetricPopulationModelTest::testMinMaxAndMean",
         &CMetricPopulationModelTest::testMinMaxAndMean));
+    suiteOfTests->addTest(new CppUnit::TestCaller<CMetricPopulationModelTest>(
+        "CMetricPopulationModelTest::testVarp", &CMetricPopulationModelTest::testVarp));
     suiteOfTests->addTest(new CppUnit::TestCaller<CMetricPopulationModelTest>(
         "CMetricPopulationModelTest::testComputeProbability",
         &CMetricPopulationModelTest::testComputeProbability));

Original file line number	Diff line number	Diff line change
`@@ -943,7 +943,7 @@ void CMetricPopulationModel::fill(model_t::EFeature feature,`
`943`	`943`	`this->currentBucketInterimCorrections().emplace(`
`944`	`944`	`CCorrectionKey(feature, pid, cid), correction);`
`945`	`945`	`}`
`946`		`- params.s_Count = 1.0;`
	`946`	`+ params.s_Count = bucket->count();`
`947`	`947`	`params.s_ComputeProbabilityParams`
`948`	`948`	`.tag(pid) // new line`
`949`	`949`	`.addCalculation(model_t::probabilityCalculation(feature))`