Skip to content

Commit 5587587

Browse files
authored
[ML] First pass implementation of support functionality for change detection and modelling (#9)
This implements 1) a naive Bayes classifier, built on our distribution models, which will be used for modelling the probability of a change, and 2) a change detector framework, which currently supports detecting level shifts and time shifts. The detector works by comparing the Bayesian Information Criterion (BIC) of the various possible change hypotheses against one another and against a null hypothesis that there is no change.
1 parent 5fb3037 commit 5587587

14 files changed

+2337
-3
lines changed

include/maths/CNaiveBayes.h

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
/*
2+
* ELASTICSEARCH CONFIDENTIAL
3+
*
4+
* Copyright (c) 2018 Elasticsearch BV. All Rights Reserved.
5+
*
6+
* Notice: this software, and all information contained
7+
* therein, is the exclusive property of Elasticsearch BV
8+
* and its licensors, if any, and is protected under applicable
9+
* domestic and foreign law, and international treaties.
10+
*
11+
* Reproduction, republication or distribution without the
12+
* express written consent of Elasticsearch BV is
13+
* strictly prohibited.
14+
*/
15+
16+
#ifndef INCLUDED_ml_maths_CNaiveBayes_h
17+
#define INCLUDED_ml_maths_CNaiveBayes_h
18+
19+
#include <maths/ImportExport.h>
20+
21+
#include <maths/CPrior.h>
22+
23+
#include <boost/unordered_map.hpp>
24+
25+
#include <cstddef>
26+
#include <vector>
27+
28+
namespace ml
29+
{
30+
// Forward declarations of the persistence types. Within this header they
// are only ever used as reference parameters, so their full definitions
// are not required here.
namespace core
31+
{
32+
class CStatePersistInserter;
33+
class CStateRestoreTraverser;
34+
}
35+
namespace maths
36+
{
37+
// Forward declaration: within this header the restore parameters are only
// ever passed by const reference.
struct SDistributionRestoreParams;
38+
39+
//! \brief The interface expected by CNaiveBayes for implementations
40+
//! of the class conditional density functions.
41+
class MATHS_EXPORT CNaiveBayesFeatureDensity
42+
{
43+
public:
44+
using TDouble1Vec = core::CSmallVector<double, 1>;
45+
46+
public:
47+
virtual ~CNaiveBayesFeatureDensity() = default;
48+
49+
//! Create and return a clone.
50+
//!
51+
//! \note The caller owns this.
52+
virtual CNaiveBayesFeatureDensity *clone() const = 0;
53+
54+
//! Initialize by reading state from \p traverser.
55+
virtual bool acceptRestoreTraverser(const SDistributionRestoreParams &params,
56+
core::CStateRestoreTraverser &traverser) = 0;
57+
58+
//! Persist state by passing information to \p inserter.
59+
virtual void acceptPersistInserter(core::CStatePersistInserter &inserter) const = 0;
60+
61+
//! Add the value \p x.
62+
virtual void add(const TDouble1Vec &x) = 0;
63+
64+
//! Compute the log value of the density function at \p x.
65+
virtual double logValue(const TDouble1Vec &x) const = 0;
66+
67+
//! Age out old values density to account for \p time passing.
68+
virtual void propagateForwardsByTime(double time) = 0;
69+
70+
//! Debug the memory used by this object.
71+
virtual void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const = 0;
72+
73+
//! Get the static size of this object.
74+
virtual std::size_t staticSize() const = 0;
75+
76+
//! Get the memory used by this object.
77+
virtual std::size_t memoryUsage() const = 0;
78+
79+
//! Get a checksum for this object.
80+
virtual uint64_t checksum(uint64_t seed) const = 0;
81+
};
82+
83+
//! \brief An implementation of the class conditional density function
84+
//! based on the CPrior hierarchy.
85+
class MATHS_EXPORT CNaiveBayesFeatureDensityFromPrior final : public CNaiveBayesFeatureDensity
86+
{
87+
public:
88+
CNaiveBayesFeatureDensityFromPrior() = default;
89+
CNaiveBayesFeatureDensityFromPrior(CPrior &prior);
90+
91+
//! Create and return a clone.
92+
//!
93+
//! \note The caller owns this.
94+
virtual CNaiveBayesFeatureDensityFromPrior *clone() const;
95+
96+
//! Initialize by reading state from \p traverser.
97+
virtual bool acceptRestoreTraverser(const SDistributionRestoreParams &params,
98+
core::CStateRestoreTraverser &traverser);
99+
100+
//! Persist state by passing information to \p inserter.
101+
virtual void acceptPersistInserter(core::CStatePersistInserter &inserter) const;
102+
103+
//! Add the value \p x.
104+
virtual void add(const TDouble1Vec &x);
105+
106+
//! Compute the log value of the density function at \p x.
107+
virtual double logValue(const TDouble1Vec &x) const;
108+
109+
//! Age out old values density to account for \p time passing.
110+
virtual void propagateForwardsByTime(double time);
111+
112+
//! Debug the memory used by this object.
113+
virtual void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
114+
115+
//! Get the static size of this object.
116+
virtual std::size_t staticSize() const;
117+
118+
//! Get the memory used by this object.
119+
virtual std::size_t memoryUsage() const;
120+
121+
//! Get a checksum for this object.
122+
virtual uint64_t checksum(uint64_t seed) const;
123+
124+
private:
125+
using TPriorPtr = boost::shared_ptr<CPrior>;
126+
127+
private:
128+
//! The density model.
129+
TPriorPtr m_Prior;
130+
};
131+
132+
//! \brief Implements a Naive Bayes classifier.
133+
class MATHS_EXPORT CNaiveBayes
134+
{
135+
public:
136+
using TDoubleSizePr = std::pair<double, std::size_t>;
137+
using TDoubleSizePrVec = std::vector<TDoubleSizePr>;
138+
using TDouble1Vec = core::CSmallVector<double, 1>;
139+
using TDouble1VecVec = std::vector<TDouble1Vec>;
140+
141+
public:
142+
explicit CNaiveBayes(const CNaiveBayesFeatureDensity &exemplar,
143+
double decayRate = 0.0);
144+
CNaiveBayes(const SDistributionRestoreParams &params,
145+
core::CStateRestoreTraverser &traverser);
146+
147+
//! Persist state by passing information to \p inserter.
148+
void acceptPersistInserter(core::CStatePersistInserter &inserter) const;
149+
150+
//! This can be used to optionally seed the class counts
151+
//! with \p counts. These are added on to data class counts
152+
//! to compute the class posterior probabilities.
153+
void initialClassCounts(const TDoubleSizePrVec &counts);
154+
155+
//! Add a training data point comprising the pair \f$(x,l)\f$
156+
//! for feature vector \f$x\f$ and class label \f$l\f$.
157+
//!
158+
//! \param[in] label The class label for \p x.
159+
//! \param[in] x The feature values.
160+
//! \note \p x size should be equal to the number of features.
161+
//! A feature is missing is indicated by passing an empty vector
162+
//! for that feature.
163+
void addTrainingDataPoint(std::size_t label, const TDouble1VecVec &x);
164+
165+
//! Age out old values from the class conditional densities
166+
//! to account for \p time passing.
167+
void propagateForwardsByTime(double time);
168+
169+
//! Get the top \p n class probabilities for \p features.
170+
//!
171+
//! \param[in] n The number of class probabilities to estimate.
172+
//! \param[in] x The feature values.
173+
//! \note \p x size should be equal to the number of features.
174+
//! A feature is missing is indicated by passing an empty vector
175+
//! for that feature.
176+
TDoubleSizePrVec highestClassProbabilities(std::size_t n,
177+
const TDouble1VecVec &x) const;
178+
179+
//! Debug the memory used by this object.
180+
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
181+
182+
//! Get the memory used by this object.
183+
std::size_t memoryUsage() const;
184+
185+
//! Get a checksum for this object.
186+
uint64_t checksum(uint64_t seed = 0) const;
187+
188+
private:
189+
using TFeatureDensityPtr = boost::shared_ptr<CNaiveBayesFeatureDensity>;
190+
using TFeatureDensityPtrVec = std::vector<TFeatureDensityPtr>;
191+
192+
//! \brief The data associated with a class.
193+
struct SClass
194+
{
195+
//! Initialize by reading state from \p traverser.
196+
bool acceptRestoreTraverser(const SDistributionRestoreParams &params,
197+
core::CStateRestoreTraverser &traverser);
198+
//! Persist state by passing information to \p inserter.
199+
void acceptPersistInserter(core::CStatePersistInserter &inserter) const;
200+
//! Debug the memory used by this object.
201+
void debugMemoryUsage(core::CMemoryUsage::TMemoryUsagePtr mem) const;
202+
//! Get the memory used by this object.
203+
std::size_t memoryUsage() const;
204+
//! Get a checksum for this object.
205+
uint64_t checksum(uint64_t seed = 0) const;
206+
207+
//! The number of examples in this class.
208+
double s_Count = 0.0;
209+
//! The feature conditional densities for this class.
210+
TFeatureDensityPtrVec s_ConditionalDensities;
211+
};
212+
213+
using TSizeClassUMap = boost::unordered_map<std::size_t, SClass>;
214+
215+
private:
216+
//! Initialize by reading state from \p traverser.
217+
bool acceptRestoreTraverser(const SDistributionRestoreParams &params,
218+
core::CStateRestoreTraverser &traverser);
219+
220+
//! Validate \p x.
221+
bool validate(const TDouble1VecVec &x) const;
222+
223+
private:
224+
//! Controls the rate at which data are aged out.
225+
double m_DecayRate;
226+
227+
//! An exemplar for creating conditional densities.
228+
TFeatureDensityPtr m_Exemplar;
229+
230+
//! The class conditional density estimates and weights.
231+
TSizeClassUMap m_ClassConditionalDensities;
232+
};
233+
234+
}
235+
}
236+
237+
#endif // INCLUDED_ml_maths_CNaiveBayes_h

0 commit comments

Comments
 (0)