From 8090d1e7eb9bbb2b68bcefa2e6e03633ae253054 Mon Sep 17 00:00:00 2001 From: Dimitris Athanasiou Date: Wed, 9 Sep 2020 10:09:50 +0300 Subject: [PATCH] [7.x][ML] Change outlier detection feature influence format to nested object (#1475) (#1479) This changes the format of `feature_influence` for outlier detection so that the feature name is not part of the field name. This helps reduce field explosion in the results index. Feature influence is now an array with nested objects. Each of them contains the `feature_name` and the `influence` value. Backport of #1475 --- docs/CHANGELOG.asciidoc | 1 + lib/api/CDataFrameOutliersRunner.cc | 16 +++++++++++++--- .../unittest/CDataFrameAnalyzerOutlierTest.cc | 14 ++++++++++---- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index 3b4ba28927..32c69102ec 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -33,6 +33,7 @@ === Enhancements * Calculate total feature importance to store with model metadata. (See {ml-pull}1387[#1387].) +* Change outlier detection feature_influence format to array with nested objects. (See {ml-pull}1475[#1475], {es-pull}62068[#62068].) === Bug Fixes diff --git a/lib/api/CDataFrameOutliersRunner.cc b/lib/api/CDataFrameOutliersRunner.cc index 6e28a35152..d785a2a5a7 100644 --- a/lib/api/CDataFrameOutliersRunner.cc +++ b/lib/api/CDataFrameOutliersRunner.cc @@ -52,7 +52,9 @@ const CDataFrameAnalysisConfigReader& parameterReader() { // Output const std::string OUTLIER_SCORE_FIELD_NAME{"outlier_score"}; -const std::string FEATURE_INFLUENCE_FIELD_NAME_PREFIX{"feature_influence."}; +const std::string FEATURE_NAME_FIELD_NAME{"feature_name"}; +const std::string FEATURE_INFLUENCE_FIELD_NAME{"feature_influence"}; +const std::string INFLUENCE_FIELD_NAME{"influence"}; } CDataFrameOutliersRunner::CDataFrameOutliersRunner(const CDataFrameAnalysisSpecification& spec, @@ -93,11 +95,19 @@ void CDataFrameOutliersRunner::writeOneRow(const core::CDataFrame& frame, writer.StartObject(); writer.Key(OUTLIER_SCORE_FIELD_NAME); writer.Double(row[scoreColumn]); - if (row[scoreColumn] > m_FeatureInfluenceThreshold) { + if (row[scoreColumn] > m_FeatureInfluenceThreshold && numberFeatureScoreColumns > 0) { + writer.Key(FEATURE_INFLUENCE_FIELD_NAME); + writer.StartArray(); + for (std::size_t i = 0; i < numberFeatureScoreColumns; ++i) { - writer.Key(FEATURE_INFLUENCE_FIELD_NAME_PREFIX + frame.columnNames()[i]); + writer.StartObject(); + writer.Key(FEATURE_NAME_FIELD_NAME); + writer.String(frame.columnNames()[i]); + writer.Key(INFLUENCE_FIELD_NAME); writer.Double(row[beginFeatureScoreColumns + i]); + writer.EndObject(); } + writer.EndArray(); } writer.EndObject(); } diff --git a/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc b/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc index 564ee5f0c3..020174ab7b 100644 --- a/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerOutlierTest.cc @@ -285,8 +285,7 @@ BOOST_AUTO_TEST_CASE(testRunOutlierFeatureInfluences) { TDoubleVec expectedScores; TDoubleVecVec expectedFeatureInfluences; - TStrVec expectedNames{"feature_influence.c1", "feature_influence.c2", "feature_influence.c3", - "feature_influence.c4", "feature_influence.c5"}; + TStrVec expectedNames{"c1", "c2", "c3", "c4", "c5"}; TStrVec fieldNames{"c1", "c2", "c3", "c4", "c5", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; @@ -301,12 +300,19 @@ BOOST_AUTO_TEST_CASE(testRunOutlierFeatureInfluences) { auto expectedFeatureInfluence = expectedFeatureInfluences.begin(); for (const auto& result : results.GetArray()) { if (result.HasMember("row_results")) { + BOOST_TEST_REQUIRE(expectedFeatureInfluence != expectedFeatureInfluences.end()); - for (std::size_t i = 0; i < 5; ++i) { + for (int i = 0; i < 5; ++i) { + BOOST_REQUIRE_EQUAL( + expectedNames[i].c_str(), + result["row_results"]["results"]["ml"]["feature_influence"][i]["feature_name"] + .GetString()); + BOOST_REQUIRE_CLOSE_ABSOLUTE( (*expectedFeatureInfluence)[i], - result["row_results"]["results"]["ml"][expectedNames[i]].GetDouble(), + result["row_results"]["results"]["ml"]["feature_influence"][i]["influence"] + .GetDouble(), 1e-4 * (*expectedFeatureInfluence)[i]); } ++expectedFeatureInfluence;