Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Include supported aggregations in field caps api response #93884

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from

Conversation

martijnvg
Copy link
Member

Improve field caps api to include what aggregations are supported on a per field bases. For fields that are aggregatable, a list of supported aggregations is included.

Closes #93539

@martijnvg
Copy link
Member Author

Example usage: GET /k8s/_field_caps?fields=kubernetes.node.network*

Response (with the new supported_aggregations field):

{
    "indices": [
        ".ds-k8s-2022.10.13-000001"
    ],
    "fields": {
        "kubernetes": {
            "object": {
                "type": "object",
                "metadata_field": false,
                "searchable": false,
                "aggregatable": false
            }
        },
        "kubernetes.node.network.tx.errors": {
            "long": {
                "type": "long",
                "metadata_field": false,
                "searchable": true,
                "aggregatable": true,
                "time_series_metric": "counter",
                "supported_aggregations": [
                    [
                        "histogram",
                        "variable_width_histogram",
                        "rate",
                        "min",
                        "max",
                        "top_metrics",
                        "range"
                    ]
                ]
            }
        },
        "kubernetes.node": {
            "object": {
                "type": "object",
                "metadata_field": false,
                "searchable": false,
                "aggregatable": false
            }
        },
        "kubernetes.node.network.rx.errors": {
            "long": {
                "type": "long",
                "metadata_field": false,
                "searchable": true,
                "aggregatable": true,
                "time_series_metric": "counter",
                "supported_aggregations": [
                    [
                        "histogram",
                        "variable_width_histogram",
                        "rate",
                        "min",
                        "max",
                        "top_metrics",
                        "range"
                    ]
                ]
            }
        },
        "kubernetes.node.network.tx.bytes": {
            "long": {
                "type": "long",
                "metadata_field": false,
                "searchable": true,
                "aggregatable": true,
                "time_series_metric": "gauge",
                "supported_aggregations": [
                    [
                        "max",
                        "top_metrics",
                        "missing",
                        "date_histogram",
                        "sum",
                        "rate",
                        "boxplot",
                        "value_count",
                        "avg",
                        "percentiles",
                        "cardinality",
                        "histogram",
                        "variable_width_histogram",
                        "frequent_item_sets",
                        "min",
                        "stats",
                        "diversified_sampler",
                        "percentile_ranks",
                        "median_absolute_deviation",
                        "multi_terms",
                        "auto_date_histogram",
                        "rare_terms",
                        "range",
                        "extended_stats",
                        "date_range",
                        "terms",
                        "significant_terms"
                    ]
                ]
            }
        },
        "kubernetes.node.network.rx.bytes": {
            "long": {
                "type": "long",
                "metadata_field": false,
                "searchable": true,
                "aggregatable": true,
                "time_series_metric": "gauge",
                "supported_aggregations": [
                    [
                        "max",
                        "top_metrics",
                        "missing",
                        "date_histogram",
                        "sum",
                        "rate",
                        "boxplot",
                        "value_count",
                        "avg",
                        "percentiles",
                        "cardinality",
                        "histogram",
                        "variable_width_histogram",
                        "frequent_item_sets",
                        "min",
                        "stats",
                        "diversified_sampler",
                        "percentile_ranks",
                        "median_absolute_deviation",
                        "multi_terms",
                        "auto_date_histogram",
                        "rare_terms",
                        "range",
                        "extended_stats",
                        "date_range",
                        "terms",
                        "significant_terms"
                    ]
                ]
            }
        },
        "kubernetes.node.network": {
            "object": {
                "type": "object",
                "metadata_field": false,
                "searchable": false,
                "aggregatable": false
            }
        },
        "kubernetes.node.network.rx": {
            "object": {
                "type": "object",
                "metadata_field": false,
                "searchable": false,
                "aggregatable": false
            }
        },
        "kubernetes.node.network.tx": {
            "object": {
                "type": "object",
                "metadata_field": false,
                "searchable": false,
                "aggregatable": false
            }
        }
    }
}

Improve field caps api to include what aggregations are supported on a per field bases.
For fields that are aggregatable, a list of supported aggregations is included.

Closes elastic#93539
@martijnvg martijnvg force-pushed the field_caps/supported_aggregations branch from ce8be27 to 8bf158e Compare February 18, 2023 14:14
@@ -50,6 +50,12 @@ public String familyTypeName() {
return KeywordFieldMapper.CONTENT_TYPE;
}

@Override
public boolean isAggregatable() {
// Doesn't support field data (yet) and therefor attempting to aggregate on field of this field type fail
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The field caps api loads index field data to determine a field's value source type.
This field type doesn't have field data support (it doesn't overwrite fielddataBuilder(...) method).
So this field of this type never really supported aggregations, and there for returning false here is ok.

public AggregationUsageService getUsageService() {
return usageService;
}

private static Map<ValuesSourceType, Set<String>> buildSupportedAggregations(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that all aggregations use the ValuesSourceRegistry. So not all aggregations have registered an aggregatorSupplier on a per values source type basis yet. We need to check which aggregations are not using ValuesSourceRegistry and make these aggregations use it.

For counter value source type, it looks that all supported aggregations use ValuesSourceRegistry.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Anything that uses either ArrayValuesSourceAggregationBuilder or MultiValuesSourceAggregationBuilder isn't using the registry, because we never decided how we wanted that to work. There are a few others; IIRC the joins (nested, parent-child) don't. Might be more. There are others that use it in non-obvious ways, like Top metrics and Composite.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think for some aggregations it is ok to not be registered.

The only subclass of ArrayValuesSourceAggregationBuilder is matrix_stats and maybe we can leave this out initially? I think usage of this agg is low.

The test, geo_line and weighted avg extend from MultiValuesSourceAggregationBuilder. Not sure how to handle these aggregations. Maybe in case of geo_line the point field should determine with what value source type it gets registered?

@@ -117,6 +117,13 @@ static Map<String, IndexFieldCapabilities> retrieveFieldCaps(
for (String field : fieldNames) {
MappedFieldType ft = context.getFieldType(field);
if (filter.test(ft)) {
Set<String> supportedAggregations;
if (ft.isAggregatable()) {
var valuesSourceType = context.getForField(ft, MappedFieldType.FielddataOperation.SEARCH).getValuesSourceType();
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe not load index field data? The only reason that index field data is loaded, is to figure out the value source type. Maybe getValueSourceType() method can be added to MappedFieldType? This avoids having to load field data. For some fields like _id this causes warnings (see note in FieldCapabilitiesFilterTests). On the other hand verifying that index field data can be loaded is a good test that the field is actually aggregatable?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I vaguely recall that there was some difficulty with putting getValuesSourceType() on MappedFieldType, but I don't recall what it was. I think it's a good idea if it's not a ton of work.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, if it isn't too much then I think we should try it.

It does avoid many deprecation warning when asking field caps for all fields, since it will try to load Index field data for _id field and that triggers deprecation warning (I had to alter a test to take into account these deprecation warnings).

Copy link
Member

@not-napoleon not-napoleon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is the right approach. Worth noting that there are some "supported" aggregation-values source combinations that make no sense, like date aggs on boolean fields or stats aggs on murmur3 fields, so this isn't going to be perfect. Over time, we'd like to eliminate those, but so far we have considered doing so breaking. I wonder if we should have some way for this API to indicate "support is deprecated"?

@@ -117,6 +117,13 @@ static Map<String, IndexFieldCapabilities> retrieveFieldCaps(
for (String field : fieldNames) {
MappedFieldType ft = context.getFieldType(field);
if (filter.test(ft)) {
Set<String> supportedAggregations;
if (ft.isAggregatable()) {
var valuesSourceType = context.getForField(ft, MappedFieldType.FielddataOperation.SEARCH).getValuesSourceType();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I vaguely recall that there was some difficulty with putting getValuesSourceType() on MappedFieldType, but I don't recall what it was. I think it's a good idea if it's not a ton of work.

public AggregationUsageService getUsageService() {
return usageService;
}

private static Map<ValuesSourceType, Set<String>> buildSupportedAggregations(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Anything that uses either ArrayValuesSourceAggregationBuilder or MultiValuesSourceAggregationBuilder isn't using the registry, because we never decided how we wanted that to work. There are a few others; IIRC the joins (nested, parent-child) don't. Might be more. There are others that use it in non-obvious ways, like Top metrics and Composite.

@@ -35,11 +35,6 @@ public ConstantFieldType(String name, Map<String, String> meta) {
assert isSearchable();
}

@Override
public final boolean isAggregatable() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we changing it so Constant Fields are no longer aggregatable, or did you discover that the default implementation works for this field type? The former seems like a breaking change.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It turns out that _tier meta field doesn't support aggregating. It doesn't overwrite fielddataBuilder(...) method.
I removed this method because this assumes that all constant fields are aggregatable and this isn't the case. Only constant keyword and the index meta fields are aggregatable.

Removing this method fixes this, the base class delegates to hasDocValues() method do determine whether a field is aggreagatable. Both constant keyword and index meta field types are mapped as if they have doc values, but the tier meta field is mapped that it doesn't have doc values. So the isAggregatable() in the super class is doing the right thing here.

@martijnvg
Copy link
Member Author

Thanks for taking a look Mark.

I wonder if we should have some way for this API to indicate "support is deprecated"?

I like that idea. We can return two lists of aggregations. One supported list and one supported but deprecated list. The former list would be a subset of the latter list. In order for this to work we would need to indicate in the places where aggregations get registered that a registration to a value source type is deprecated.

@not-napoleon
Copy link
Member

In order for this to work we would need to indicate in the places where aggregations get registered that a registration to a value source type is deprecate

I thought about adding a registerDeprecated method to the values source registry, but I didn't think we'd get enough use out of it. If you look in (e.g.) DateHistogramAggregatorFactory#registerAggregators, you can see my ad-hoc way of registering deprecated pairings.

@martijnvg martijnvg mentioned this pull request Mar 8, 2023
4 tasks
@droberts195
Copy link
Contributor

For simple fields like long or keyword with no special time series settings, will this return a very long list of aggregations? It would be good if your example comment could include simple long and keyword fields to show what the output will look like for them. Will the list include aggregations defined in plugins like frequent_item_sets?

@martijnvg
Copy link
Member Author

For simple fields like long or keyword with no special time series settings, will this return a very long list of aggregations?

Yes, this PR will return a long list for these kinds of fields.

It would be good if your #93884 (comment) could include simple long and keyword fields to show what the output will look like for them.

I will update the example.

Will the list include aggregations defined in plugins like frequent_item_sets?

Any aggregation that registers itself with the ValuesSourceRegistry should be included. Looking at FrequentItemSetsAggregationBuilder#registerAggregators(...) this is the case for frequent_item_sets aggregation. @not-napoleon pointed out that not all aggregations use the ValuesSourceRegistry. I think for some of these aggregations it is ok not to be included. I think we may not to look at weighted_avg, geo_line, t_test and matrix_stats (not sure about these last two) and see how these aggregations can get registered with ValuesSourceRegistry.

@droberts195
Copy link
Contributor

One more question: what will happen if field caps is called on an 8.8 cluster but for index patterns that cover indices on remote 8.7 clusters?

For example, suppose someone asks for field caps for *:metrics* against the 8.8 central management cluster of a large setup with 10 remote clusters, some on 8.7 and others on 8.8, that all contain metrics* data streams.

@martijnvg
Copy link
Member Author

what will happen if field caps is called on an 8.8 cluster but for index patterns that cover indices on remote 8.7 clusters?

Right now, an empty list of supported aggregations is returned. The logic looks the two fields with the same name of different indices and intersects the lists of supported aggregations. This felt like a safe approach for me, since if more aggregations are supported in a newer version, but older version doesn't yet have this extended support then it is best to return supported aggregations that both fields (from different indices on different nodes) have in common.

@droberts195
Copy link
Contributor

Right now, an empty list of supported aggregations is returned.

This means that in 8.8 Kibana should not make use of the list of aggregations for simple field types that have existed for a long time. If the supported list of aggregations were applied everywhere in Kibana in 8.8 then it might become impossible to chart a simple integer or long field on a search being done against 8.7 indices in a remote cluster.

I added elastic/kibana#152912 (comment) to the Kibana issue to make sure this gets taken into account in deciding what to do for 8.8.

It's still good to add the list of aggregations so that it can be used as the source of truth eventually. But it cannot be the source of truth until the oldest remote cluster is on 8.8, not just the local cluster.

@martijnvg
Copy link
Member Author

It's still good to add the list of aggregations so that it can be used as the source of truth eventually. But it cannot be the source of truth until the oldest remote cluster is on 8.8, not just the local cluster.

True, with the way this PR works now.

But maybe we can define a pre-8.8.0 list of support aggregations per value source type. For each value source type we define a list of aggregations we know is supported before 8.8.0. Nodes that are on 8.8.0 and later can use that in case a pre 8.8.0 node is part of the field caps execution. I think this way this PR can still be useful for mixed and remote clusters with version pre and post 8.8.0?

@ppisljar
Copy link
Member

this increases field caps response size significantly, in my tests 3-5 times. @mattkime do you think this could be a problem for some of our integrations that use indices with huge amount of fields ?

@mattkime
Copy link

@ppisljar It definitely would. While we have plans to reduce the number of fields we're loading via data views, we haven't started the work yet. @ppisljar you have a better idea of what assumptions kibana makes about supported aggregations - perhaps we should avoid increasing the payload size unless its necessary.

@martijnvg
Copy link
Member Author

We can definitely optionally return the list of supported aggregations and introduce a parameter for this.

@ppisljar
Copy link
Member

ppisljar commented Mar 22, 2023

also it seems there is a lot of duplicated data in the response, so maybe we can optimize this into something like this? :
(our field lists will contain thousands of fields with same type (like keyword or number))

{
    "indices": [
        ".ds-k8s-2022.10.13-000001"
    ],
    "supported_aggregations": {
        "counter": [
                        "histogram",
                        "variable_width_histogram",
                        "rate",
                        "min",
                        "max",
                        "top_metrics",
                        "range"
                    ],
           "gauge": [
                    [
                        "max",
                        "top_metrics",
                        "missing",
                        "date_histogram",
                        "sum",
                        "rate",
                        "boxplot",
                        "value_count",
                        "avg",
                        "percentiles",
                        "cardinality",
                        "histogram",
                        "variable_width_histogram",
                        "frequent_item_sets",
                        "min",
                        "stats",
                        "diversified_sampler",
                        "percentile_ranks",
                        "median_absolute_deviation",
                        "multi_terms",
                        "auto_date_histogram",
                        "rare_terms",
                        "range",
                        "extended_stats",
                        "date_range",
                        "terms",
                        "significant_terms"
                    ]
                ]
    },
    "fields": {
        "kubernetes": {
            "object": {
                "type": "object",
                "metadata_field": false,
                "searchable": false,
                "aggregatable": false
            }
        },
        "kubernetes.node.network.tx.errors": {
            "long": {
                "type": "long",
                "metadata_field": false,
                "searchable": true,
                "aggregatable": true,
                "time_series_metric": "counter",
                "supported_aggregations": "counter",
            }
        },
        "kubernetes.node": {
            "object": {
                "type": "object",
                "metadata_field": false,
                "searchable": false,
                "aggregatable": false
            }
        },
        "kubernetes.node.network.rx.errors": {
            "long": {
                "type": "long",
                "metadata_field": false,
                "searchable": true,
                "aggregatable": true,
                "time_series_metric": "counter",
                "supported_aggregations": "counter"
            }
        },
        "kubernetes.node.network.tx.bytes": {
            "long": {
                "type": "long",
                "metadata_field": false,
                "searchable": true,
                "aggregatable": true,
                "time_series_metric": "gauge",
                "supported_aggregations": "gauge",
            }
        },
        "kubernetes.node.network.rx.bytes": {
            "long": {
                "type": "long",
                "metadata_field": false,
                "searchable": true,
                "aggregatable": true,
                "time_series_metric": "gauge",
                "supported_aggregations": "gauge",
            }
        },
        "kubernetes.node.network": {
            "object": {
                "type": "object",
                "metadata_field": false,
                "searchable": false,
                "aggregatable": false
            }
        },
        "kubernetes.node.network.rx": {
            "object": {
                "type": "object",
                "metadata_field": false,
                "searchable": false,
                "aggregatable": false
            }
        },
        "kubernetes.node.network.tx": {
            "object": {
                "type": "object",
                "metadata_field": false,
                "searchable": false,
                "aggregatable": false
            }
        }
    }
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

Successfully merging this pull request may close these issues.

Certain Aggregation functions not supported when TSDB datastream is used.