Skip to content

Commit 06cc5dc

Browse files
[ML] Change outlier detection feature influence format to nested object (#1475)
This changes the format of `feature_influence` for outlier detection so that the feature name is no longer part of the field name, which helps reduce field explosion in the results index. Feature influence is now an array of nested objects, each containing a `feature_name` and its `influence` value; for example, a field such as `feature_influence.c1` becomes an array entry whose `feature_name` is `c1`.
1 parent c3eebc9 commit 06cc5dc

File tree

3 files changed

+24
-7
lines changed

3 files changed

+24
-7
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
=== Enhancements
4545

4646
* Calculate total feature importance to store with model metadata. (See {ml-pull}1387[#1387].)
47+
* Change outlier detection feature_influence format to an array of nested objects. (See {ml-pull}1475[#1475], {es-pull}62068[#62068].)
4748

4849
=== Bug Fixes
4950

lib/api/CDataFrameOutliersRunner.cc

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,9 @@ const CDataFrameAnalysisConfigReader& parameterReader() {
5252

5353
// Output
5454
const std::string OUTLIER_SCORE_FIELD_NAME{"outlier_score"};
55-
const std::string FEATURE_INFLUENCE_FIELD_NAME_PREFIX{"feature_influence."};
55+
const std::string FEATURE_NAME_FIELD_NAME{"feature_name"};
56+
const std::string FEATURE_INFLUENCE_FIELD_NAME{"feature_influence"};
57+
const std::string INFLUENCE_FIELD_NAME{"influence"};
5658
}
5759

5860
CDataFrameOutliersRunner::CDataFrameOutliersRunner(const CDataFrameAnalysisSpecification& spec,
@@ -93,11 +95,19 @@ void CDataFrameOutliersRunner::writeOneRow(const core::CDataFrame& frame,
9395
writer.StartObject();
9496
writer.Key(OUTLIER_SCORE_FIELD_NAME);
9597
writer.Double(row[scoreColumn]);
96-
if (row[scoreColumn] > m_FeatureInfluenceThreshold) {
98+
if (row[scoreColumn] > m_FeatureInfluenceThreshold && numberFeatureScoreColumns > 0) {
99+
writer.Key(FEATURE_INFLUENCE_FIELD_NAME);
100+
writer.StartArray();
101+
97102
for (std::size_t i = 0; i < numberFeatureScoreColumns; ++i) {
98-
writer.Key(FEATURE_INFLUENCE_FIELD_NAME_PREFIX + frame.columnNames()[i]);
103+
writer.StartObject();
104+
writer.Key(FEATURE_NAME_FIELD_NAME);
105+
writer.String(frame.columnNames()[i]);
106+
writer.Key(INFLUENCE_FIELD_NAME);
99107
writer.Double(row[beginFeatureScoreColumns + i]);
108+
writer.EndObject();
100109
}
110+
writer.EndArray();
101111
}
102112
writer.EndObject();
103113
}

lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -285,8 +285,7 @@ BOOST_AUTO_TEST_CASE(testRunOutlierFeatureInfluences) {
285285

286286
TDoubleVec expectedScores;
287287
TDoubleVecVec expectedFeatureInfluences;
288-
TStrVec expectedNames{"feature_influence.c1", "feature_influence.c2", "feature_influence.c3",
289-
"feature_influence.c4", "feature_influence.c5"};
288+
TStrVec expectedNames{"c1", "c2", "c3", "c4", "c5"};
290289

291290
TStrVec fieldNames{"c1", "c2", "c3", "c4", "c5", ".", "."};
292291
TStrVec fieldValues{"", "", "", "", "", "0", ""};
@@ -301,12 +300,19 @@ BOOST_AUTO_TEST_CASE(testRunOutlierFeatureInfluences) {
301300
auto expectedFeatureInfluence = expectedFeatureInfluences.begin();
302301
for (const auto& result : results.GetArray()) {
303302
if (result.HasMember("row_results")) {
303+
304304
BOOST_TEST_REQUIRE(expectedFeatureInfluence !=
305305
expectedFeatureInfluences.end());
306-
for (std::size_t i = 0; i < 5; ++i) {
306+
for (int i = 0; i < 5; ++i) {
307+
BOOST_REQUIRE_EQUAL(
308+
expectedNames[i].c_str(),
309+
result["row_results"]["results"]["ml"]["feature_influence"][i]["feature_name"]
310+
.GetString());
311+
307312
BOOST_REQUIRE_CLOSE_ABSOLUTE(
308313
(*expectedFeatureInfluence)[i],
309-
result["row_results"]["results"]["ml"][expectedNames[i]].GetDouble(),
314+
result["row_results"]["results"]["ml"]["feature_influence"][i]["influence"]
315+
.GetDouble(),
310316
1e-4 * (*expectedFeatureInfluence)[i]);
311317
}
312318
++expectedFeatureInfluence;

0 commit comments

Comments
 (0)