Skip to content

Commit

Permalink
[ML] Change outlier detection feature influence format to nested object (#1475)
Browse files Browse the repository at this point in the history

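This changes the format of `feature_influence` for outlier detection so
that the feature name is no longer part of the field name. This helps
reduce field explosion in the results index.

Feature influence is now an array of nested objects, each containing the
`feature_name` and its `influence` value. For example, a result previously
written as `"feature_influence.c1": 0.3` is now written as
`"feature_influence": [{"feature_name": "c1", "influence": 0.3}]`.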
  • Loading branch information
dimitris-athanasiou authored Sep 8, 2020
1 parent c3eebc9 commit 06cc5dc
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 7 deletions.
1 change: 1 addition & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
=== Enhancements

* Calculate total feature importance to store with model metadata. (See {ml-pull}1387[#1387].)
* Change outlier detection feature_influence format to array with nested objects. (See {ml-pull}1475[#1475], {es-pull}62068[#62068].)

=== Bug Fixes

Expand Down
16 changes: 13 additions & 3 deletions lib/api/CDataFrameOutliersRunner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ const CDataFrameAnalysisConfigReader& parameterReader() {

// Output
const std::string OUTLIER_SCORE_FIELD_NAME{"outlier_score"};
const std::string FEATURE_INFLUENCE_FIELD_NAME_PREFIX{"feature_influence."};
const std::string FEATURE_NAME_FIELD_NAME{"feature_name"};
const std::string FEATURE_INFLUENCE_FIELD_NAME{"feature_influence"};
const std::string INFLUENCE_FIELD_NAME{"influence"};
}

CDataFrameOutliersRunner::CDataFrameOutliersRunner(const CDataFrameAnalysisSpecification& spec,
Expand Down Expand Up @@ -93,11 +95,19 @@ void CDataFrameOutliersRunner::writeOneRow(const core::CDataFrame& frame,
writer.StartObject();
writer.Key(OUTLIER_SCORE_FIELD_NAME);
writer.Double(row[scoreColumn]);
if (row[scoreColumn] > m_FeatureInfluenceThreshold) {
if (row[scoreColumn] > m_FeatureInfluenceThreshold && numberFeatureScoreColumns > 0) {
writer.Key(FEATURE_INFLUENCE_FIELD_NAME);
writer.StartArray();

for (std::size_t i = 0; i < numberFeatureScoreColumns; ++i) {
writer.Key(FEATURE_INFLUENCE_FIELD_NAME_PREFIX + frame.columnNames()[i]);
writer.StartObject();
writer.Key(FEATURE_NAME_FIELD_NAME);
writer.String(frame.columnNames()[i]);
writer.Key(INFLUENCE_FIELD_NAME);
writer.Double(row[beginFeatureScoreColumns + i]);
writer.EndObject();
}
writer.EndArray();
}
writer.EndObject();
}
Expand Down
14 changes: 10 additions & 4 deletions lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,7 @@ BOOST_AUTO_TEST_CASE(testRunOutlierFeatureInfluences) {

TDoubleVec expectedScores;
TDoubleVecVec expectedFeatureInfluences;
TStrVec expectedNames{"feature_influence.c1", "feature_influence.c2", "feature_influence.c3",
"feature_influence.c4", "feature_influence.c5"};
TStrVec expectedNames{"c1", "c2", "c3", "c4", "c5"};

TStrVec fieldNames{"c1", "c2", "c3", "c4", "c5", ".", "."};
TStrVec fieldValues{"", "", "", "", "", "0", ""};
Expand All @@ -301,12 +300,19 @@ BOOST_AUTO_TEST_CASE(testRunOutlierFeatureInfluences) {
auto expectedFeatureInfluence = expectedFeatureInfluences.begin();
for (const auto& result : results.GetArray()) {
if (result.HasMember("row_results")) {

BOOST_TEST_REQUIRE(expectedFeatureInfluence !=
expectedFeatureInfluences.end());
for (std::size_t i = 0; i < 5; ++i) {
for (int i = 0; i < 5; ++i) {
BOOST_REQUIRE_EQUAL(
expectedNames[i].c_str(),
result["row_results"]["results"]["ml"]["feature_influence"][i]["feature_name"]
.GetString());

BOOST_REQUIRE_CLOSE_ABSOLUTE(
(*expectedFeatureInfluence)[i],
result["row_results"]["results"]["ml"][expectedNames[i]].GetDouble(),
result["row_results"]["results"]["ml"]["feature_influence"][i]["influence"]
.GetDouble(),
1e-4 * (*expectedFeatureInfluence)[i]);
}
++expectedFeatureInfluence;
Expand Down

0 comments on commit 06cc5dc

Please sign in to comment.