
Commit 0353d16

[ML] Remove old per-partition normalization code (#184) (#186)
Per-partition normalization is an old, undocumented feature that was never used by clients. It has been superseded by per-partition maximum scoring (see #32748). This PR removes the now redundant code. Relates elastic/elasticsearch#32816
1 parent 1fb820e commit 0353d16

21 files changed: +35 −311 lines

bin/autodetect/CCmdLineParser.cc
Lines changed: 0 additions & 6 deletions

@@ -52,7 +52,6 @@ bool CCmdLineParser::parse(int argc,
                            bool& memoryUsage,
                            std::size_t& bucketResultsDelay,
                            bool& multivariateByFields,
-                           bool& perPartitionNormalization,
                            TStrVec& clauseTokens) {
     try {
         boost::program_options::options_description desc(DESCRIPTION);
@@ -116,8 +115,6 @@ bool CCmdLineParser::parse(int argc,
             "The numer of half buckets to store before choosing which overlapping bucket has the biggest anomaly")
         ("multivariateByFields",
             "Optional flag to enable multi-variate analysis of correlated by fields")
-        ("perPartitionNormalization",
-            "Optional flag to enable per partition normalization")
         ;
         // clang-format on

@@ -231,9 +228,6 @@ bool CCmdLineParser::parse(int argc,
         if (vm.count("multivariateByFields") > 0) {
             multivariateByFields = true;
         }
-        if (vm.count("perPartitionNormalization") > 0) {
-            perPartitionNormalization = true;
-        }

         boost::program_options::collect_unrecognized(
             parsed.options, boost::program_options::include_positional)
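The deleted option was wired up with the same boost::program_options idiom the surviving flags use: declare a valueless switch on the options_description, then treat it as set when vm.count() is positive. A minimal, self-contained sketch of that idiom (the flag name below is purely illustrative, not an ml-cpp option):

```cpp
#include <boost/program_options.hpp>
#include <iostream>

int main(int argc, char** argv) {
    namespace po = boost::program_options;

    // Declare a valueless switch, mirroring how flags such as
    // "multivariateByFields" are declared in CCmdLineParser.
    po::options_description desc("Options");
    desc.add_options()
        ("help", "Display this help")
        ("exampleFlag", "Optional flag used purely for illustration");

    po::variables_map vm;
    po::store(po::parse_command_line(argc, argv, desc), vm);
    po::notify(vm);

    // A switch counts as set when it appears at least once on the command line.
    bool exampleFlag = false;
    if (vm.count("exampleFlag") > 0) {
        exampleFlag = true;
    }

    std::cout << "exampleFlag = " << std::boolalpha << exampleFlag << std::endl;
    return 0;
}
```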

bin/autodetect/CCmdLineParser.h
Lines changed: 0 additions & 1 deletion

@@ -64,7 +64,6 @@ class CCmdLineParser {
                       bool& memoryUsage,
                       std::size_t& bucketResultsDelay,
                       bool& multivariateByFields,
-                      bool& perPartitionNormalization,
                       TStrVec& clauseTokens);

 private:

bin/autodetect/Main.cc
Lines changed: 1 addition & 3 deletions

@@ -88,7 +88,6 @@ int main(int argc, char** argv) {
     bool memoryUsage(false);
     std::size_t bucketResultsDelay(0);
     bool multivariateByFields(false);
-    bool perPartitionNormalization(false);
     TStrVec clauseTokens;
     if (ml::autodetect::CCmdLineParser::parse(
             argc, argv, limitConfigFile, modelConfigFile, fieldConfigFile,
@@ -98,7 +97,7 @@ int main(int argc, char** argv) {
             maxQuantileInterval, inputFileName, isInputFileNamedPipe, outputFileName,
             isOutputFileNamedPipe, restoreFileName, isRestoreFileNamedPipe, persistFileName,
             isPersistFileNamedPipe, maxAnomalyRecords, memoryUsage, bucketResultsDelay,
-            multivariateByFields, perPartitionNormalization, clauseTokens) == false) {
+            multivariateByFields, clauseTokens) == false) {
         return EXIT_FAILURE;
     }

@@ -146,7 +145,6 @@ int main(int argc, char** argv) {
         ml::model::CAnomalyDetectorModelConfig::defaultConfig(
             bucketSpan, summaryMode, summaryCountFieldName, latency,
             bucketResultsDelay, multivariateByFields);
-    modelConfig.perPartitionNormalization(perPartitionNormalization);
     modelConfig.detectionRules(ml::model::CAnomalyDetectorModelConfig::TIntDetectionRuleVecUMapCRef(
         fieldConfig.detectionRules()));
     modelConfig.scheduledEvents(ml::model::CAnomalyDetectorModelConfig::TStrDetectionRulePrVecCRef(

bin/normalize/CCmdLineParser.cc
Lines changed: 1 addition & 7 deletions

@@ -30,8 +30,7 @@ bool CCmdLineParser::parse(int argc,
                            bool& isOutputFileNamedPipe,
                            std::string& quantilesState,
                            bool& deleteStateFiles,
-                           bool& writeCsv,
-                           bool& perPartitionNormalization) {
+                           bool& writeCsv) {
     try {
         boost::program_options::options_description desc(DESCRIPTION);
         // clang-format off
@@ -60,8 +59,6 @@ bool CCmdLineParser::parse(int argc,
             "If this flag is set then delete the normalizer state files once they have been read")
         ("writeCsv",
             "Write the results in CSV format (default is lineified JSON)")
-        ("perPartitionNormalization",
-            "Optional flag to enable per partition normalization")
         ;
         // clang-format on

@@ -114,9 +111,6 @@ bool CCmdLineParser::parse(int argc,
         if (vm.count("writeCsv") > 0) {
             writeCsv = true;
         }
-        if (vm.count("perPartitionNormalization") > 0) {
-            perPartitionNormalization = true;
-        }
     } catch (std::exception& e) {
         std::cerr << "Error processing command line: " << e.what() << std::endl;
         return false;

bin/normalize/CCmdLineParser.h
Lines changed: 1 addition & 2 deletions

@@ -43,8 +43,7 @@ class CCmdLineParser {
                       bool& isOutputFileNamedPipe,
                       std::string& quantilesState,
                       bool& deleteStateFiles,
-                      bool& writeCsv,
-                      bool& perPartitionNormalization);
+                      bool& writeCsv);

 private:
     static const std::string DESCRIPTION;

bin/normalize/Main.cc
Lines changed: 3 additions & 6 deletions

@@ -54,12 +54,10 @@ int main(int argc, char** argv) {
     std::string quantilesStateFile;
     bool deleteStateFiles(false);
     bool writeCsv(false);
-    bool perPartitionNormalization(false);
     if (ml::normalize::CCmdLineParser::parse(
-            argc, argv, modelConfigFile, logProperties, logPipe, bucketSpan,
-            lengthEncodedInput, inputFileName, isInputFileNamedPipe,
-            outputFileName, isOutputFileNamedPipe, quantilesStateFile,
-            deleteStateFiles, writeCsv, perPartitionNormalization) == false) {
+            argc, argv, modelConfigFile, logProperties, logPipe, bucketSpan, lengthEncodedInput,
+            inputFileName, isInputFileNamedPipe, outputFileName, isOutputFileNamedPipe,
+            quantilesStateFile, deleteStateFiles, writeCsv) == false) {
         return EXIT_FAILURE;
     }

@@ -93,7 +91,6 @@ int main(int argc, char** argv) {
         LOG_FATAL(<< "Ml model config file '" << modelConfigFile << "' could not be loaded");
         return EXIT_FAILURE;
     }
-    modelConfig.perPartitionNormalization(perPartitionNormalization);

     // There's a choice of input and output formats for the numbers to be normalised
     using TInputParserCUPtr = const std::unique_ptr<ml::api::CInputParser>;

include/api/CHierarchicalResultsWriter.h
Lines changed: 1 addition & 9 deletions

@@ -51,12 +51,7 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
     using TStr1Vec = core::CSmallVector<std::string, 1>;

 public:
-    enum EResultType {
-        E_SimpleCountResult,
-        E_PopulationResult,
-        E_PartitionResult,
-        E_Result
-    };
+    enum EResultType { E_SimpleCountResult, E_PopulationResult, E_Result };
     //! Type which wraps up the results of anomaly detection.
     struct API_EXPORT SResults {
         //! Construct for population results
@@ -168,9 +163,6 @@ class API_EXPORT CHierarchicalResultsWriter : public model::CHierarchicalResults
     //! pivot.
     void writePivotResult(const model::CHierarchicalResults& results, const TNode& node);

-    //! Write partition result if \p node is a partition level result
-    void writePartitionResult(const model::CHierarchicalResults& results, const TNode& node);
-
     //! Write out a simple count result if \p node is simple
     //! count.
     void writeSimpleCountResult(const TNode& node);

include/api/CJsonOutputWriter.h
Lines changed: 0 additions & 7 deletions

@@ -162,9 +162,6 @@ class API_EXPORT CJsonOutputWriter : public COutputHandler {
         // when the number to write is limited
         double s_LowestBucketInfluencerScore;

-        //! Partition scores
-        TDocumentWeakPtrVec s_PartitionScoreDocuments;
-
         //! scheduled event descriptions
         TStr1Vec s_ScheduledEventDescriptions;
     };
@@ -304,10 +301,6 @@ class API_EXPORT CJsonOutputWriter : public COutputHandler {
     void addInfluences(const CHierarchicalResultsWriter::TStoredStringPtrStoredStringPtrPrDoublePrVec& influenceResults,
                        TDocumentWeakPtr weakDoc);

-    //! Write partition score & probability
-    void addPartitionScores(const CHierarchicalResultsWriter::TResults& results,
-                            TDocumentWeakPtr weakDoc);
-
 private:
     //! The job ID
     std::string m_JobId;

include/api/CResultNormalizer.h
Lines changed: 0 additions & 9 deletions

@@ -93,15 +93,6 @@ class API_EXPORT CResultNormalizer {
                          std::string& valueFieldName,
                          double& probability);

-    bool parseDataFields(const TStrStrUMap& dataRowFields,
-                         std::string& level,
-                         std::string& partition,
-                         std::string& partitionValue,
-                         std::string& person,
-                         std::string& function,
-                         std::string& valueFieldName,
-                         double& probability);
-
     template<typename T>
     bool parseDataField(const TStrStrUMap& dataRowFields,
                         const std::string& fieldName,

include/model/CAnomalyDetectorModelConfig.h
Lines changed: 0 additions & 9 deletions

@@ -418,12 +418,6 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
     const TDoubleDoublePrVec& normalizedScoreKnotPoints() const;
     //@}

-    //! Check if we should create one normalizer per partition field value.
-    bool perPartitionNormalization() const;
-
-    //! Set whether we should create one normalizer per partition field value.
-    void perPartitionNormalization(bool value);
-
     //! Sets the reference to the detection rules map
     void detectionRules(TIntDetectionRuleVecUMapCRef detectionRules);

@@ -494,9 +488,6 @@ class MODEL_EXPORT CAnomalyDetectorModelConfig {
     //! and the normalized anomaly score with these knot points.
     //! \see DEFAULT_NORMALIZED_SCORE_KNOT_POINTS for details.
     TDoubleDoublePrVec m_NormalizedScoreKnotPoints;
-
-    //! If true then create one normalizer per partition field value.
-    bool m_PerPartitionNormalisation;
     //@}

     //! A reference to the map containing detection rules per
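The deleted declarations followed the class's overloaded accessor convention: a const getter and a setter sharing one name, backed by a private member (here m_PerPartitionNormalisation). A minimal sketch of that convention, using an invented class and option name rather than the real config:

```cpp
#include <iostream>

// Illustrative stand-in for a config class that uses the same
// same-name getter/setter convention as the removed accessors.
class CExampleConfig {
public:
    //! Check whether the illustrative option is enabled.
    bool exampleOption() const { return m_ExampleOption; }

    //! Enable or disable the illustrative option.
    void exampleOption(bool value) { m_ExampleOption = value; }

private:
    bool m_ExampleOption = false;
};

int main() {
    CExampleConfig config;
    config.exampleOption(true);
    std::cout << std::boolalpha << config.exampleOption() << std::endl;
    return 0;
}
```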

include/model/CHierarchicalResultsLevelSet.h
Lines changed: 13 additions & 18 deletions

@@ -162,11 +162,7 @@ class CHierarchicalResultsLevelSet : public CHierarchicalResultsVisitor {

     //! Get and possibly add a normalizer for \p node.
     template<typename FACTORY>
-    void elements(const TNode& node,
-                  bool pivot,
-                  const FACTORY& factory,
-                  TTypePtrVec& result,
-                  bool distinctLeavesPerPartition = false) {
+    void elements(const TNode& node, bool pivot, const FACTORY& factory, TTypePtrVec& result) {
         result.clear();
         if (this->isSimpleCount(node)) {
             return;
@@ -193,39 +189,38 @@ class CHierarchicalResultsLevelSet : public CHierarchicalResultsVisitor {
             return;
         }

-        std::string partitionKey = distinctLeavesPerPartition
-                                       ? *node.s_Spec.s_PartitionFieldName +
-                                             *node.s_Spec.s_PartitionFieldValue
-                                       : *node.s_Spec.s_PartitionFieldName;
-
         if (this->isLeaf(node)) {
-            TWord word = ms_Dictionary.word(partitionKey, *node.s_Spec.s_PersonFieldName,
-                                            *node.s_Spec.s_FunctionName,
-                                            *node.s_Spec.s_ValueFieldName);
+            TWord word = ms_Dictionary.word(
+                *node.s_Spec.s_PartitionFieldName, *node.s_Spec.s_PersonFieldName,
+                *node.s_Spec.s_FunctionName, *node.s_Spec.s_ValueFieldName);
             TWordTypePrVecItr i = element(m_LeafSet, word);
             if (i == m_LeafSet.end() || i->first != word) {
                 i = m_LeafSet.insert(
-                    i, TWordTypePr(word, factory.make(partitionKey, *node.s_Spec.s_PersonFieldName,
+                    i, TWordTypePr(word, factory.make(*node.s_Spec.s_PartitionFieldName,
+                                                      *node.s_Spec.s_PersonFieldName,
                                                       *node.s_Spec.s_FunctionName,
                                                       *node.s_Spec.s_ValueFieldName)));
             }
             result.push_back(&i->second);
         }
         if (this->isPerson(node)) {
-            TWord word = ms_Dictionary.word(partitionKey, *node.s_Spec.s_PersonFieldName);
+            TWord word = ms_Dictionary.word(*node.s_Spec.s_PartitionFieldName,
+                                            *node.s_Spec.s_PersonFieldName);
             TWordTypePrVecItr i = element(m_PersonSet, word);
             if (i == m_PersonSet.end() || i->first != word) {
                 i = m_PersonSet.insert(
-                    i, TWordTypePr(word, factory.make(partitionKey, *node.s_Spec.s_PersonFieldName)));
+                    i, TWordTypePr(word, factory.make(*node.s_Spec.s_PartitionFieldName,
+                                                      *node.s_Spec.s_PersonFieldName)));
             }
             result.push_back(&i->second);
         }
         if (this->isPartition(node)) {
-            TWord word = ms_Dictionary.word(partitionKey);
+            TWord word = ms_Dictionary.word(*node.s_Spec.s_PartitionFieldName);

             TWordTypePrVecItr i = element(m_PartitionSet, word);
             if (i == m_PartitionSet.end() || i->first != word) {
-                i = m_PartitionSet.insert(i, TWordTypePr(word, factory.make(partitionKey)));
+                i = m_PartitionSet.insert(
+                    i, TWordTypePr(word, factory.make(*node.s_Spec.s_PartitionFieldName)));
             }
             result.push_back(&i->second);
         }
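The substance of this hunk is the normalizer lookup key: previously, with per-partition normalization enabled, the partitionKey concatenated the partition field name and value, so every partition value got its own normalizer; after this commit the dictionary word always uses the partition field name alone. A minimal sketch of that keying difference, using a plain std::map in place of the dictionary and normalizer types (struct, function, and field/value names below are illustrative, not the ml-cpp types):

```cpp
#include <iostream>
#include <map>
#include <string>

// Stand-in for a normalizer held per dictionary word; the real code keeps
// normalizer objects in level sets keyed by words built from these fields.
struct SNormalizer {
    double s_MaximumScore = 0.0;
};

// Build the lookup key for a partition-level normalizer.
std::string partitionKey(const std::string& fieldName,
                         const std::string& fieldValue,
                         bool perPartitionNormalization) {
    // Old behaviour: with per-partition normalization the value is folded
    // into the key, so every partition value owns a separate normalizer.
    // New behaviour (this commit): the field name alone is the key.
    return perPartitionNormalization ? fieldName + fieldValue : fieldName;
}

int main() {
    const std::string field = "airline"; // illustrative field and values
    const std::string values[] = {"AAL", "JAL", "KLM"};

    for (bool perPartition : {true, false}) {
        std::map<std::string, SNormalizer> normalizers;
        for (const std::string& value : values) {
            normalizers[partitionKey(field, value, perPartition)];
        }
        std::cout << (perPartition ? "per-partition: " : "shared: ")
                  << normalizers.size() << " normalizer(s)" << std::endl;
    }
    return 0;
}
```

With the value folded into the key, each distinct partition value accumulated its own score distribution; that is exactly the plumbing this commit removes.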

lib/api/CHierarchicalResultsWriter.cc
Lines changed: 0 additions & 29 deletions

@@ -126,7 +126,6 @@ void CHierarchicalResultsWriter::visit(const model::CHierarchicalResults& result
     } else {
         this->writePopulationResult(results, node);
         this->writeIndividualResult(results, node);
-        this->writePartitionResult(results, node);
         this->writeSimpleCountResult(node);
     }
 }
@@ -258,34 +257,6 @@ void CHierarchicalResultsWriter::writeIndividualResult(const model::CHierarchica
         node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST));
 }

-void CHierarchicalResultsWriter::writePartitionResult(const model::CHierarchicalResults& results,
-                                                      const TNode& node) {
-    if (!m_ModelConfig.perPartitionNormalization() || this->isSimpleCount(node) ||
-        this->isPopulation(node) || !this->isPartition(node) ||
-        !this->shouldWriteResult(m_Limits, results, node, false)) {
-        return;
-    }
-
-    model_t::EFeature feature =
-        node.s_AnnotatedProbability.s_AttributeProbabilities.empty()
-            ? model_t::E_IndividualCountByBucketAndPerson
-            : node.s_AnnotatedProbability.s_AttributeProbabilities[0].s_Feature;
-
-    TDouble1Vec emptyDoubleVec;
-
-    m_ResultWriterFunc(TResults(
-        E_PartitionResult, *node.s_Spec.s_PartitionFieldName,
-        *node.s_Spec.s_PartitionFieldValue, *node.s_Spec.s_ByFieldName,
-        *node.s_Spec.s_PersonFieldValue, EMPTY_STRING, node.s_BucketStartTime,
-        *node.s_Spec.s_FunctionName, model_t::outputFunctionName(feature),
-        node.s_AnnotatedProbability.s_BaselineBucketCount,
-        node.s_AnnotatedProbability.s_CurrentBucketCount, emptyDoubleVec, emptyDoubleVec,
-        node.s_RawAnomalyScore, node.s_NormalizedAnomalyScore, node.probability(),
-        *node.s_Spec.s_ValueFieldName, node.s_AnnotatedProbability.s_Influences,
-        node.s_Spec.s_UseNull, model::function_t::isMetric(node.s_Spec.s_Function),
-        node.s_Spec.s_Detector, node.s_BucketLength, EMPTY_STRING_LIST));
-}
-
 void CHierarchicalResultsWriter::writePivotResult(const model::CHierarchicalResults& results,
                                                   const TNode& node) {
     if (this->isSimpleCount(node) ||

lib/api/CJsonOutputWriter.cc
Lines changed: 0 additions & 47 deletions

@@ -70,7 +70,6 @@ const std::string EXAMPLES("examples");
 const std::string BUCKET_SPAN("bucket_span");
 const std::string PROCESSING_TIME("processing_time_ms");
 const std::string TIME_INFLUENCER("bucket_time");
-const std::string PARTITION_SCORES("partition_scores");
 const std::string SCHEDULED_EVENTS("scheduled_events");
 const std::string QUANTILES("quantiles");

@@ -191,14 +190,6 @@ bool CJsonOutputWriter::acceptResult(const CHierarchicalResultsWriter::TResults&
         return true;
     }

-    if (results.s_ResultType == CHierarchicalResultsWriter::E_PartitionResult) {
-        TDocumentWeakPtr partitionDoc = m_Writer.makeStorableDoc();
-        this->addPartitionScores(results, partitionDoc);
-        bucketData.s_PartitionScoreDocuments.push_back(partitionDoc);
-
-        return true;
-    }
-
     ++bucketData.s_RecordCount;

     TDocumentWeakPtrIntPrVec& detectorDocumentsToWrite = bucketData.s_DocumentsToWrite;
@@ -513,26 +504,6 @@ void CJsonOutputWriter::writeBucket(bool isInterim,
         m_Writer.EndArray();
     }

-    if (!bucketData.s_PartitionScoreDocuments.empty()) {
-        // Write the array of partition-anonaly score pairs
-        m_Writer.String(PARTITION_SCORES);
-        m_Writer.StartArray();
-        for (TDocumentWeakPtrVecItr partitionScoresIter =
-                 bucketData.s_PartitionScoreDocuments.begin();
-             partitionScoresIter != bucketData.s_PartitionScoreDocuments.end();
-             ++partitionScoresIter) {
-            TDocumentWeakPtr weakDoc = *partitionScoresIter;
-            TDocumentPtr docPtr = weakDoc.lock();
-            if (!docPtr) {
-                LOG_ERROR(<< "Inconsistent program state. JSON document unavailable.");
-                continue;
-            }
-
-            m_Writer.write(*docPtr);
-        }
-        m_Writer.EndArray();
-    }
-
     m_Writer.String(PROCESSING_TIME);
     m_Writer.Uint64(bucketProcessingTime);

@@ -816,24 +787,6 @@ void CJsonOutputWriter::addInfluencerFields(bool isBucketInfluencer,
     }
 }

-void CJsonOutputWriter::addPartitionScores(const CHierarchicalResultsWriter::TResults& results,
-                                           TDocumentWeakPtr weakDoc) {
-    TDocumentPtr docPtr = weakDoc.lock();
-    if (!docPtr) {
-        LOG_ERROR(<< "Inconsistent program state. JSON document unavailable.");
-        return;
-    }
-
-    m_Writer.addDoubleFieldToObj(PROBABILITY, results.s_Probability, *docPtr);
-    m_Writer.addStringFieldCopyToObj(PARTITION_FIELD_NAME,
-                                     results.s_PartitionFieldName, *docPtr);
-    m_Writer.addStringFieldCopyToObj(PARTITION_FIELD_VALUE,
-                                     results.s_PartitionFieldValue, *docPtr, true);
-    m_Writer.addDoubleFieldToObj(INITIAL_RECORD_SCORE,
-                                 results.s_NormalizedAnomalyScore, *docPtr);
-    m_Writer.addDoubleFieldToObj(RECORD_SCORE, results.s_NormalizedAnomalyScore, *docPtr);
-}
-
 void CJsonOutputWriter::limitNumberRecords(size_t count) {
     m_RecordOutputLimit = count;
 }
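For reference, the removed addPartitionScores populated each entry of the now-removed partition_scores array with a probability, the partition field name and value, and initial/current record scores. A rough sketch of the shape of one such document, written with raw RapidJSON rather than the project's writer wrapper; the snake_case field names are assumptions based on the constant names (PROBABILITY, PARTITION_FIELD_NAME, and so on) and the numeric values are made up:

```cpp
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>

#include <iostream>

int main() {
    rapidjson::StringBuffer buffer;
    rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);

    // One illustrative entry of the removed "partition_scores" array,
    // mirroring the fields addPartitionScores used to add.
    writer.StartObject();
    writer.Key("probability");
    writer.Double(0.0032);
    writer.Key("partition_field_name");
    writer.String("airline");
    writer.Key("partition_field_value");
    writer.String("AAL");
    writer.Key("initial_record_score");
    writer.Double(87.1);
    writer.Key("record_score");
    writer.Double(87.1);
    writer.EndObject();

    std::cout << buffer.GetString() << std::endl;
    return 0;
}
```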
