Skip to content

Commit

Permalink
Adds hard_bounds to histogram aggregations (#59175) (#59656)
Browse files Browse the repository at this point in the history
Adds a hard_bounds parameter to explicitly limit the buckets that a histogram
can generate. This is especially useful in case of open ended ranges that can
produce a very large number of buckets.
  • Loading branch information
imotov authored Jul 16, 2020
1 parent 10be10c commit 2408803
Show file tree
Hide file tree
Showing 27 changed files with 865 additions and 119 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -600,3 +600,48 @@ setup:
- match: { profile.shards.0.aggregations.0.description: histo }
- match: { profile.shards.0.aggregations.0.breakdown.collect_count: 4 }
- match: { profile.shards.0.aggregations.0.debug.total_buckets: 3 }

---
# Checks that hard_bounds limits the buckets produced by a histogram over a
# long_range field whose indexed ranges are open ended on one side.
"histogram with hard bounds":
- skip:
version: " - 7.9.99"
reason: hard_bounds were introduced in 7.10.0

# Create an index with a single long_range field.
- do:
indices.create:
index: test_3
body:
mappings:
properties:
range:
type: long_range

# Index two ranges: one with only an upper limit (lte 10) and one with only a
# lower limit (gte 15).
- do:
bulk:
index: test_3
refresh: true
body:
- '{"index": {}}'
- '{"range": {"lte": 10}}'
- '{"index": {}}'
- '{"range": {"gte": 15}}'

# Histogram with interval 1, restricted by hard_bounds to [0, 20].
- do:
search:
index: test_3
body:
size: 0
aggs:
histo:
histogram:
field: range
interval: 1
hard_bounds:
min: 0
max: 20
# Both docs match; the test expects 21 buckets, the first keyed 0 and the last
# keyed 20, each of those two holding one doc.
- match: { hits.total.value: 2 }
- length: { aggregations.histo.buckets: 21 }
- match: { aggregations.histo.buckets.0.key: 0 }
- match: { aggregations.histo.buckets.0.doc_count: 1 }
- match: { aggregations.histo.buckets.20.key: 20 }
- match: { aggregations.histo.buckets.20.doc_count: 1 }
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Shared setup: creates the test_date_hist index with a date_range field and
# bulk-indexes six range documents, two of which are open ended.
setup:
- skip:
version: " - 7.1.99"
reason: calendar_interval introduced in 7.2.0

- do:
indices.create:
index: test_date_hist
body:
settings:
# There was a BWC issue that only showed up on empty shards. This setup
# indexes 6 docs into 5 shards.
# NOTE(review): the earlier wording said "4 docs", which would guarantee an
# empty shard; with 6 docs an empty shard is likely but not guaranteed.
number_of_shards: 5
mappings:
properties:
range:
type: date_range

# Four one-day ranges, one range with only a lower limit (gte 2016-04-01) and
# one with only an upper limit (lt 2016-02-01).
- do:
bulk:
index: test_date_hist
refresh: true
body:
- '{"index": {}}'
- '{"range": {"gte": "2016-01-01", "lt": "2016-01-02"}}'
- '{"index": {}}'
- '{"range": {"gte": "2016-01-02", "lt": "2016-01-03"}}'
- '{"index": {}}'
- '{"range": {"gte": "2016-02-01", "lt": "2016-02-02"}}'
- '{"index": {}}'
- '{"range": {"gte": "2016-03-01", "lt": "2016-03-02"}}'
- '{"index": {}}'
- '{"range": {"gte": "2016-04-01"}}'
- '{"index": {}}'
- '{"range": {"lt": "2016-02-01"}}'

---
# Checks that hard_bounds limits the buckets of a monthly date_histogram over a
# date_range field that contains open-ended ranges.
"date_histogram on range with hard bounds":
- skip:
version: " - 7.9.99"
reason: hard_bounds introduced in 7.10.0

# Monthly calendar histogram restricted to 2015-06-01 .. 2016-06-01.
- do:
search:
body:
size: 0
aggs:
histo:
date_histogram:
field: range
calendar_interval: month
hard_bounds:
"min": "2015-06-01"
"max": "2016-06-01"

# All six docs match; the test expects 13 monthly buckets and spot-checks the
# first (2015-06), the ninth (2016-02) and the last (2016-06), each expected to
# hold one doc.
- match: { hits.total.value: 6 }
- length: { aggregations.histo.buckets: 13 }
- match: { aggregations.histo.buckets.0.key_as_string: "2015-06-01T00:00:00.000Z" }
- match: { aggregations.histo.buckets.0.doc_count: 1 }
- match: { aggregations.histo.buckets.8.key_as_string: "2016-02-01T00:00:00.000Z" }
- match: { aggregations.histo.buckets.8.doc_count: 1 }
- match: { aggregations.histo.buckets.12.key_as_string: "2016-06-01T00:00:00.000Z" }
- match: { aggregations.histo.buckets.12.doc_count: 1 }
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
import org.elasticsearch.search.aggregations.BucketOrder;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval;
import org.elasticsearch.search.aggregations.bucket.histogram.ExtendedBounds;
import org.elasticsearch.search.aggregations.bucket.histogram.LongBounds;
import org.elasticsearch.search.aggregations.bucket.histogram.Histogram;
import org.elasticsearch.search.aggregations.bucket.histogram.Histogram.Bucket;
import org.elasticsearch.search.aggregations.bucket.histogram.InternalDateHistogram;
Expand Down Expand Up @@ -1085,7 +1085,7 @@ public void testSingleValueFieldWithExtendedBounds() throws Exception {
.dateHistogramInterval(DateHistogramInterval.days(interval))
.minDocCount(0)
// when explicitly specifying a format, the extended bounds should be defined by the same format
.extendedBounds(new ExtendedBounds(format(boundsMin, pattern), format(boundsMax, pattern)))
.extendedBounds(new LongBounds(format(boundsMin, pattern), format(boundsMax, pattern)))
.format(pattern))
.get();

Expand Down Expand Up @@ -1153,7 +1153,7 @@ public void testSingleValueFieldWithExtendedBoundsTimezone() throws Exception {
.from("now/d").to("now/d").includeLower(true).includeUpper(true).timeZone(timezone.getId()))
.addAggregation(
dateHistogram("histo").field("date").dateHistogramInterval(DateHistogramInterval.hours(1))
.timeZone(timezone).minDocCount(0).extendedBounds(new ExtendedBounds("now/d", "now/d+23h"))
.timeZone(timezone).minDocCount(0).extendedBounds(new LongBounds("now/d", "now/d+23h"))
).get();
assertSearchResponse(response);

Expand Down Expand Up @@ -1206,7 +1206,7 @@ public void testSingleValueFieldWithExtendedBoundsOffset() throws Exception {
.addAggregation(
dateHistogram("histo").field("date").dateHistogramInterval(DateHistogramInterval.days(1))
.offset("+6h").minDocCount(0)
.extendedBounds(new ExtendedBounds("2016-01-01T06:00:00Z", "2016-01-08T08:00:00Z"))
.extendedBounds(new LongBounds("2016-01-01T06:00:00Z", "2016-01-08T08:00:00Z"))
).get();
assertSearchResponse(response);

Expand Down Expand Up @@ -1378,7 +1378,7 @@ public void testFormatIndexUnmapped() throws InterruptedException, ExecutionExce
SearchResponse response = client().prepareSearch(indexDateUnmapped)
.addAggregation(
dateHistogram("histo").field("dateField").dateHistogramInterval(DateHistogramInterval.MONTH).format("yyyy-MM")
.minDocCount(0).extendedBounds(new ExtendedBounds("2018-01", "2018-01")))
.minDocCount(0).extendedBounds(new LongBounds("2018-01", "2018-01")))
.get();
assertSearchResponse(response);
Histogram histo = response.getAggregations().get("histo");
Expand Down Expand Up @@ -1434,7 +1434,7 @@ public void testDSTEndTransition() throws Exception {
.setQuery(new MatchNoneQueryBuilder())
.addAggregation(dateHistogram("histo").field("date").timeZone(ZoneId.of("Europe/Oslo"))
.calendarInterval(DateHistogramInterval.HOUR).minDocCount(0).extendedBounds(
new ExtendedBounds("2015-10-25T02:00:00.000+02:00", "2015-10-25T04:00:00.000+01:00")))
new LongBounds("2015-10-25T02:00:00.000+02:00", "2015-10-25T04:00:00.000+01:00")))
.get();

Histogram histo = response.getAggregations().get("histo");
Expand All @@ -1451,7 +1451,7 @@ public void testDSTEndTransition() throws Exception {
.setQuery(new MatchNoneQueryBuilder())
.addAggregation(dateHistogram("histo").field("date").timeZone(ZoneId.of("Europe/Oslo"))
.dateHistogramInterval(DateHistogramInterval.HOUR).minDocCount(0).extendedBounds(
new ExtendedBounds("2015-10-25T02:00:00.000+02:00", "2015-10-25T04:00:00.000+01:00")))
new LongBounds("2015-10-25T02:00:00.000+02:00", "2015-10-25T04:00:00.000+01:00")))
.get();

histo = response.getAggregations().get("histo");
Expand Down Expand Up @@ -1649,4 +1649,23 @@ public void testDateKeyFormatting() {
assertThat(buckets.get(1).getKeyAsString(), equalTo("2012-02-01T00:00:00.000-07:00"));
assertThat(buckets.get(2).getKeyAsString(), equalTo("2012-03-01T00:00:00.000-07:00"));
}

/**
 * Runs a daily date histogram on the {@code date} field of index {@code idx} with hard bounds
 * given as date strings, and checks the number of returned buckets as well as the formatted
 * keys of the second and the last bucket.
 */
public void testHardBoundsOnDates() {
    final LongBounds hardLimits = new LongBounds("2012-02-01T00:00:00.000", "2012-03-03T00:00:00.000");

    final SearchResponse searchResponse = client().prepareSearch("idx")
        .addAggregation(
            dateHistogram("histo").field("date").calendarInterval(DateHistogramInterval.DAY).hardBounds(hardLimits))
        .get();
    assertSearchResponse(searchResponse);

    final InternalDateHistogram dateHisto = searchResponse.getAggregations().get("histo");
    final List<InternalDateHistogram.Bucket> dayBuckets = dateHisto.getBuckets();

    // 30 daily buckets are expected inside the hard bounds.
    assertThat(dayBuckets.size(), equalTo(30));
    assertThat(dayBuckets.get(1).getKeyAsString(), equalTo("2012-02-03T00:00:00.000Z"));
    assertThat(dayBuckets.get(29).getKeyAsString(), equalTo("2012-03-02T00:00:00.000Z"));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.elasticsearch.search.aggregations.AggregationExecutionException;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.bucket.filter.Filter;
import org.elasticsearch.search.aggregations.bucket.histogram.DoubleBounds;
import org.elasticsearch.search.aggregations.bucket.histogram.Histogram;
import org.elasticsearch.search.aggregations.bucket.histogram.Histogram.Bucket;
import org.elasticsearch.search.aggregations.metrics.Avg;
Expand Down Expand Up @@ -1194,6 +1195,58 @@ public void testSingleValuedFieldOrderedBySingleValueSubAggregationAscAsCompound
assertMultiSortResponse(expectedKeys, BucketOrder.aggregation("avg_l", true));
}

/**
 * Checks that a histogram request combining hard bounds with extended bounds that reach
 * outside of them fails with a {@link SearchPhaseExecutionException} carrying the
 * "Extended bounds have to be inside hard bounds" message.
 */
public void testInvalidBounds() {
    final String expectedMessage = "Extended bounds have to be inside hard bounds, hard bounds";

    // Extended max (20) lies above the hard max (10).
    SearchPhaseExecutionException upperViolation = expectThrows(
        SearchPhaseExecutionException.class,
        () -> client().prepareSearch("empty_bucket_idx")
            .addAggregation(
                histogram("histo").field(SINGLE_VALUED_FIELD_NAME)
                    .hardBounds(new DoubleBounds(0.0, 10.0))
                    .extendedBounds(3, 20))
            .get()
    );
    assertThat(upperViolation.toString(), containsString(expectedMessage));

    // Extended min (0) lies below the hard min (3); the hard max is passed as null.
    SearchPhaseExecutionException lowerViolation = expectThrows(
        SearchPhaseExecutionException.class,
        () -> client().prepareSearch("empty_bucket_idx")
            .addAggregation(
                histogram("histo").field(SINGLE_VALUED_FIELD_NAME)
                    .hardBounds(new DoubleBounds(3.0, null))
                    .extendedBounds(0, 20))
            .get()
    );
    assertThat(lowerViolation.toString(), containsString(expectedMessage));
}

/**
 * Exercises {@code hard_bounds} on a numeric histogram with interval 0.1 over three documents
 * whose {@code d} values are -0.6, 0.1 and 0.5. Three scenarios are covered:
 * <ol>
 *   <li>lower bound only (0.0): five buckets are expected, from 0.1 to 0.5;</li>
 *   <li>upper bound only (0.0): a single bucket at -0.6 is expected;</li>
 *   <li>both bounds (0.0 to 0.3): a single bucket at 0.1 is expected.</li>
 * </ol>
 *
 * @throws Exception if indexing the test documents fails
 */
public void testHardBounds() throws Exception {
    assertAcked(prepareCreate("test").addMapping("type", "d", "type=double").get());
    indexRandom(true,
        client().prepareIndex("test", "type", "1").setSource("d", -0.6),
        client().prepareIndex("test", "type", "2").setSource("d", 0.5),
        client().prepareIndex("test", "type", "3").setSource("d", 0.1));

    // Scenario 1: only a lower hard bound, so the -0.6 document is excluded.
    SearchResponse r = client().prepareSearch("test")
        .addAggregation(histogram("histo").field("d").interval(0.1).hardBounds(new DoubleBounds(0.0, null)))
        .get();
    assertSearchResponse(r);

    Histogram histogram = r.getAggregations().get("histo");
    List<? extends Bucket> buckets = histogram.getBuckets();
    assertEquals(5, buckets.size());
    assertEquals(0.1, (double) buckets.get(0).getKey(), 0.01d);
    assertEquals(0.5, (double) buckets.get(4).getKey(), 0.01d);

    // Scenario 2: only an upper hard bound, so just the -0.6 document remains.
    r = client().prepareSearch("test")
        .addAggregation(histogram("histo").field("d").interval(0.1).hardBounds(new DoubleBounds(null, 0.0)))
        .get();
    assertSearchResponse(r);

    histogram = r.getAggregations().get("histo");
    buckets = histogram.getBuckets();
    assertEquals(1, buckets.size());
    assertEquals(-0.6, (double) buckets.get(0).getKey(), 0.01d);

    // Scenario 3: both bounds. The upper bound must be below 0.5 so that only the 0.1 document
    // falls inside; an upper bound of 3.0 would also admit 0.5 and yield the same buckets as
    // scenario 1, contradicting the single-bucket assertion below.
    r = client().prepareSearch("test")
        .addAggregation(histogram("histo").field("d").interval(0.1).hardBounds(new DoubleBounds(0.0, 0.3)))
        .get();
    assertSearchResponse(r);

    histogram = r.getAggregations().get("histo");
    buckets = histogram.getBuckets();
    assertEquals(1, buckets.size());
    assertEquals(0.1, (double) buckets.get(0).getKey(), 0.01d);
}

private void assertMultiSortResponse(long[] expectedKeys, BucketOrder... order) {
SearchResponse response = client()
.prepareSearch("sort_idx")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public abstract class AbstractHistogramAggregator extends BucketsAggregator {
protected final long minDocCount;
protected final double minBound;
protected final double maxBound;
protected final DoubleBounds hardBounds;
protected final LongKeyedBucketOrds bucketOrds;

public AbstractHistogramAggregator(
Expand All @@ -62,6 +63,7 @@ public AbstractHistogramAggregator(
long minDocCount,
double minBound,
double maxBound,
DoubleBounds hardBounds,
DocValueFormat formatter,
SearchContext context,
Aggregator parent,
Expand All @@ -80,6 +82,7 @@ public AbstractHistogramAggregator(
this.minDocCount = minDocCount;
this.minBound = minBound;
this.maxBound = maxBound;
this.hardBounds = hardBounds;
this.formatter = formatter;
bucketOrds = LongKeyedBucketOrds.build(context.bigArrays(), cardinalityUpperBound);
}
Expand Down
Loading

0 comments on commit 2408803

Please sign in to comment.