
Commit

Address PR comments
jon-wei committed Jan 25, 2017
1 parent b67e83e commit 95e2e50
Showing 47 changed files with 738 additions and 628 deletions.
@@ -87,27 +87,13 @@ public boolean apply(String input)
@Override
public DruidLongPredicate makeLongPredicate()
{
return new DruidLongPredicate()
{
@Override
public boolean applyLong(long input)
{
return false;
}
};
return DruidLongPredicate.FALSE_PREDICATE;
}

@Override
public DruidFloatPredicate makeFloatPredicate()
{
return new DruidFloatPredicate()
{
@Override
public boolean applyFloat(float input)
{
return false;
}
};
return DruidFloatPredicate.FALSE_PREDICATE;
}
},
null
@@ -627,27 +627,13 @@ public boolean apply(String input)
@Override
public DruidLongPredicate makeLongPredicate()
{
return new DruidLongPredicate()
{
@Override
public boolean applyLong(long input)
{
return false;
}
};
return DruidLongPredicate.FALSE_PREDICATE;
}

@Override
public DruidFloatPredicate makeFloatPredicate()
{
return new DruidFloatPredicate()
{
@Override
public boolean applyFloat(float input)
{
return false;
}
};
return DruidFloatPredicate.FALSE_PREDICATE;
}
};
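
Both hunks above swap a per-call anonymous predicate class for a shared constant. A minimal sketch of how such a constant could be declared on the predicate interface, assuming `FALSE_PREDICATE` is simply a singleton that always returns false (the field name comes from the diff; the declaration itself is not shown in this commit):

```java
// Hypothetical declaration of the shared "always false" long predicate.
// DruidFloatPredicate.FALSE_PREDICATE would be the analogous constant for floats.
public interface DruidLongPredicate
{
  DruidLongPredicate FALSE_PREDICATE = new DruidLongPredicate()
  {
    @Override
    public boolean applyLong(long input)
    {
      return false;
    }
  };

  boolean applyLong(long input);
}
```

Reusing one instance avoids allocating a new anonymous predicate object every time makeLongPredicate() or makeFloatPredicate() is called for a filter that can never match.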

@@ -29,16 +29,19 @@ public class BenchmarkSchemaInfo
private List<BenchmarkColumnSchema> columnSchemas;
private List<AggregatorFactory> aggs;
private Interval dataInterval;
private boolean withRollup;

public BenchmarkSchemaInfo (
List<BenchmarkColumnSchema> columnSchemas,
List<AggregatorFactory> aggs,
Interval dataInterval
Interval dataInterval,
boolean withRollup
)
{
this.columnSchemas = columnSchemas;
this.aggs = aggs;
this.dataInterval = dataInterval;
this.withRollup = withRollup;
}

public List<BenchmarkColumnSchema> getColumnSchemas()
@@ -61,4 +64,8 @@ public Interval getDataInterval()
return dataInterval;
}

public boolean isWithRollup()
{
return withRollup;
}
}
@@ -89,8 +89,71 @@ public class BenchmarkSchemas
BenchmarkSchemaInfo basicSchema = new BenchmarkSchemaInfo(
basicSchemaColumns,
basicSchemaIngestAggs,
basicSchemaDataInterval
basicSchemaDataInterval,
true
);
SCHEMA_MAP.put("basic", basicSchema);
}

static { // simple single string column and count agg schema, no rollup
List<BenchmarkColumnSchema> basicSchemaColumns = ImmutableList.of(
// dims
BenchmarkColumnSchema.makeSequential("dimSequential", ValueType.STRING, false, 1, null, 0, 1000000)
);

List<AggregatorFactory> basicSchemaIngestAggs = new ArrayList<>();
basicSchemaIngestAggs.add(new CountAggregatorFactory("rows"));

Interval basicSchemaDataInterval = new Interval(0, 1000000);

BenchmarkSchemaInfo basicSchema = new BenchmarkSchemaInfo(
basicSchemaColumns,
basicSchemaIngestAggs,
basicSchemaDataInterval,
false
);
SCHEMA_MAP.put("simple", basicSchema);
}

static { // simple single long column and count agg schema, no rollup
List<BenchmarkColumnSchema> basicSchemaColumns = ImmutableList.of(
// dims, ingest as a metric for now with rollup off, until numeric dims at ingestion are supported
BenchmarkColumnSchema.makeSequential("dimSequential", ValueType.LONG, true, 1, null, 0, 1000000)
);

List<AggregatorFactory> basicSchemaIngestAggs = new ArrayList<>();
basicSchemaIngestAggs.add(new LongSumAggregatorFactory("dimSequential", "dimSequential"));
basicSchemaIngestAggs.add(new CountAggregatorFactory("rows"));

Interval basicSchemaDataInterval = new Interval(0, 1000000);

BenchmarkSchemaInfo basicSchema = new BenchmarkSchemaInfo(
basicSchemaColumns,
basicSchemaIngestAggs,
basicSchemaDataInterval,
false
);
SCHEMA_MAP.put("simpleLong", basicSchema);
}

static { // simple single float column and count agg schema, no rollup
List<BenchmarkColumnSchema> basicSchemaColumns = ImmutableList.of(
// dims, ingest as a metric for now with rollup off, until numeric dims at ingestion are supported
BenchmarkColumnSchema.makeSequential("dimSequential", ValueType.FLOAT, true, 1, null, 0, 1000000)
);

List<AggregatorFactory> basicSchemaIngestAggs = new ArrayList<>();
basicSchemaIngestAggs.add(new DoubleSumAggregatorFactory("dimSequential", "dimSequential"));
basicSchemaIngestAggs.add(new CountAggregatorFactory("rows"));

Interval basicSchemaDataInterval = new Interval(0, 1000000);

BenchmarkSchemaInfo basicSchema = new BenchmarkSchemaInfo(
basicSchemaColumns,
basicSchemaIngestAggs,
basicSchemaDataInterval,
false
);
SCHEMA_MAP.put("simpleFloat", basicSchema);
}
}
@@ -73,6 +73,7 @@
import io.druid.segment.QueryableIndex;
import io.druid.segment.QueryableIndexSegment;
import io.druid.segment.column.ColumnConfig;
import io.druid.segment.column.ValueType;
import io.druid.segment.incremental.IncrementalIndex;
import io.druid.segment.incremental.IncrementalIndexSchema;
import io.druid.segment.incremental.OnheapIncrementalIndex;
@@ -237,8 +238,90 @@ private void setupQueries()

basicQueries.put("nested", queryA);
}

SCHEMA_QUERY_MAP.put("basic", basicQueries);

// simple one column schema, for testing performance difference between querying on numeric values as Strings and
// directly as longs
Map<String, GroupByQuery> simpleQueries = new LinkedHashMap<>();
BenchmarkSchemaInfo simpleSchema = BenchmarkSchemas.SCHEMA_MAP.get("simple");

{ // simple.A
QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Arrays.asList(simpleSchema.getDataInterval()));
List<AggregatorFactory> queryAggs = new ArrayList<>();
queryAggs.add(new LongSumAggregatorFactory(
"rows",
"rows"
));
GroupByQuery queryA = GroupByQuery
.builder()
.setDataSource("blah")
.setQuerySegmentSpec(intervalSpec)
.setDimensions(Lists.<DimensionSpec>newArrayList(
new DefaultDimensionSpec("dimSequential", "dimSequential", ValueType.STRING)
))
.setAggregatorSpecs(
queryAggs
)
.setGranularity(QueryGranularity.fromString(queryGranularity))
.build();

simpleQueries.put("A", queryA);
}
SCHEMA_QUERY_MAP.put("simple", simpleQueries);


Map<String, GroupByQuery> simpleLongQueries = new LinkedHashMap<>();
BenchmarkSchemaInfo simpleLongSchema = BenchmarkSchemas.SCHEMA_MAP.get("simpleLong");
{ // simpleLong.A
QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Arrays.asList(simpleLongSchema.getDataInterval()));
List<AggregatorFactory> queryAggs = new ArrayList<>();
queryAggs.add(new LongSumAggregatorFactory(
"rows",
"rows"
));
GroupByQuery queryA = GroupByQuery
.builder()
.setDataSource("blah")
.setQuerySegmentSpec(intervalSpec)
.setDimensions(Lists.<DimensionSpec>newArrayList(
new DefaultDimensionSpec("dimSequential", "dimSequential", ValueType.LONG)
))
.setAggregatorSpecs(
queryAggs
)
.setGranularity(QueryGranularity.fromString(queryGranularity))
.build();

simpleLongQueries.put("A", queryA);
}
SCHEMA_QUERY_MAP.put("simpleLong", simpleLongQueries);


Map<String, GroupByQuery> simpleFloatQueries = new LinkedHashMap<>();
BenchmarkSchemaInfo simpleFloatSchema = BenchmarkSchemas.SCHEMA_MAP.get("simpleFloat");
{ // simpleFloat.A
QuerySegmentSpec intervalSpec = new MultipleIntervalSegmentSpec(Arrays.asList(simpleFloatSchema.getDataInterval()));
List<AggregatorFactory> queryAggs = new ArrayList<>();
queryAggs.add(new LongSumAggregatorFactory(
"rows",
"rows"
));
GroupByQuery queryA = GroupByQuery
.builder()
.setDataSource("blah")
.setQuerySegmentSpec(intervalSpec)
.setDimensions(Lists.<DimensionSpec>newArrayList(
new DefaultDimensionSpec("dimSequential", "dimSequential", ValueType.FLOAT)
))
.setAggregatorSpecs(
queryAggs
)
.setGranularity(QueryGranularity.fromString(queryGranularity))
.build();

simpleFloatQueries.put("A", queryA);
}
SCHEMA_QUERY_MAP.put("simpleFloat", simpleFloatQueries);
}

@Setup(Level.Trial)
@@ -278,7 +361,7 @@ public void setup() throws IOException
for (int i = 0; i < numSegments; i++) {
log.info("Generating rows for segment %d/%d", i + 1, numSegments);

final IncrementalIndex index = makeIncIndex();
final IncrementalIndex index = makeIncIndex(schemaInfo.isWithRollup());

for (int j = 0; j < rowsPerSegment; j++) {
final InputRow row = dataGenerator.nextRow();
@@ -393,13 +476,14 @@ public String getFormatString()
);
}

private IncrementalIndex makeIncIndex()
private IncrementalIndex makeIncIndex(boolean withRollup)
{
return new OnheapIncrementalIndex(
new IncrementalIndexSchema.Builder()
.withQueryGranularity(QueryGranularities.NONE)
.withMetrics(schemaInfo.getAggsArray())
.withDimensionsSpec(new DimensionsSpec(null, null, null))
.withRollup(withRollup)
.build(),
true,
false,
11 changes: 10 additions & 1 deletion docs/content/querying/dimensionspecs.md
@@ -15,9 +15,18 @@ The following JSON fields can be used in a query to operate on dimension values.
Returns dimension values as is and optionally renames the dimension.

```json
{ "type" : "default", "dimension" : <dimension>, "outputName": <output_name> }
{
"type" : "default",
"dimension" : <dimension>,
"outputName": <output_name>,
"outputType": <"STRING"|"LONG"|"FLOAT">
}
```

When specifying a DimensionSpec on a numeric column, the user should include the type of the column in the `outputType` field. This is necessary because it is possible for a column with a given name to have different value types in different segments: result merging may fail unless results of different types are converted to the type specified by `outputType`.

If left unspecified, the `outputType` defaults to STRING.
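
For instance, grouping on a hypothetical long-typed column named "longNumbers" might use a spec along these lines:

```json
{
  "type" : "default",
  "dimension" : "longNumbers",
  "outputName" : "longNumbers",
  "outputType" : "LONG"
}
```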

### Extraction DimensionSpec

Returns dimension values transformed using the given [extraction function](#extraction-functions).
1 change: 0 additions & 1 deletion docs/content/querying/query-context.md
@@ -21,4 +21,3 @@ The query context is used for various query configuration parameters.
|`maxResults`|500000|Maximum number of results groupBy query can process. Default value used can be changed by `druid.query.groupBy.maxResults` in druid configuration at broker and historical nodes. At query time you can only lower the value.|
|`maxIntermediateRows`|50000|Maximum number of intermediate rows while processing single segment for groupBy query. Default value used can be changed by `druid.query.groupBy.maxIntermediateRows` in druid configuration at broker and historical nodes. At query time you can only lower the value.|
|`groupByIsSingleThreaded`|false|Whether to run single threaded group By queries. Default value used can be changed by `druid.query.groupBy.singleThreaded` in druid configuration at historical nodes.|
|`typeHints`|{}| A map of column name -> column type (String, Long, Float). By default, druid returns all column values as strings within query results. If querying on a non-String column, `typeHints` must be included in a query, containing a mapping of the name of the non-String column to its desired return type. This is necessary because columns with the same name in different segments do not necessarily have the same value type, and a type must be chosen when merging results.|
18 changes: 0 additions & 18 deletions docs/content/querying/querying.md
@@ -99,21 +99,3 @@
|`Query cancelled`|The query was cancelled through the query cancellation API.|
|`Resource limit exceeded`|The query exceeded a configured resource limit (e.g. groupBy maxResults).|
|`Unknown exception`|Some other exception occurred. Check errorMessage and errorClass for details, although keep in mind that the contents of those fields are free-form and may change from release to release.|


Column Types
-----------
It is possible to run queries with non-String columns as dimensions, for example, grouping on a column previously ingested as a long metric.

When including a non-String column as a dimension, its desired return type must be specified in the query context. See `typeHints` in [Context](../querying/query-context.html) for more information.

For example, if grouping on a long column named "longNumbers", the GroupBy query should contain a `typeHints` map within its query context:

```json

"context": {
"typeHints" : {
"longNumbers": "LONG"
}
}
```