Allow using 'serverReturnFinalResult' to optimize server partitioned table #13208

Merged
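For context, a rough usage sketch of the option pair this PR wires up (not part of the diff; the SET syntax is the usual Pinot query-option mechanism, and the option key strings are assumed to mirror the QueryOptionKey constants in the changes below). serverReturnFinalResult targets the case where the group-by key is the partition column, so each server can fully finish its groups; the new serverReturnFinalResultKeyUnpartitioned variant appears to target the case where the aggregated column is partitioned but the group-by key is not, so servers still return final results and the broker merges them per key via the new mergeFinalResult hook.

// Illustrative queries only; the table and column names are made up.
// Group-by key equals the partition column: servers may sort, trim and finalize.
String keyPartitionedQuery =
    "SET serverReturnFinalResult=true; "
        + "SELECT memberId, DISTINCTCOUNT(eventId) FROM myTable GROUP BY memberId";

// Group-by key is not the partition column, but the DISTINCTCOUNT argument is:
// servers return final results and the broker merges them with mergeFinalResult.
String keyUnpartitionedQuery =
    "SET serverReturnFinalResultKeyUnpartitioned=true; "
        + "SELECT country, DISTINCTCOUNT(memberId) FROM myTable GROUP BY country";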
@@ -236,6 +236,10 @@ public static boolean isServerReturnFinalResult(Map<String, String> queryOptions
return Boolean.parseBoolean(queryOptions.get(QueryOptionKey.SERVER_RETURN_FINAL_RESULT));
}

public static boolean isServerReturnFinalResultKeyUnpartitioned(Map<String, String> queryOptions) {
return Boolean.parseBoolean(queryOptions.get(QueryOptionKey.SERVER_RETURN_FINAL_RESULT_KEY_UNPARTITIONED));
}

@Nullable
public static String getOrderByAlgorithm(Map<String, String> queryOptions) {
return queryOptions.get(QueryOptionKey.ORDER_BY_ALGORITHM);
@@ -25,6 +25,7 @@
import io.grpc.netty.shaded.io.netty.handler.ssl.SslContext;
import io.grpc.netty.shaded.io.netty.handler.ssl.SslContextBuilder;
import io.grpc.netty.shaded.io.netty.handler.ssl.SslProvider;
import java.io.Closeable;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
@@ -42,7 +43,7 @@
import org.slf4j.LoggerFactory;


public class GrpcQueryClient {
public class GrpcQueryClient implements Closeable {
private static final Logger LOGGER = LoggerFactory.getLogger(GrpcQueryClient.class);
private static final int DEFAULT_CHANNEL_SHUTDOWN_TIMEOUT_SECOND = 10;
// the key is the hashCode of the TlsConfig, the value is the SslContext
@@ -74,9 +75,8 @@ private SslContext buildSslContext(TlsConfig tlsConfig) {
LOGGER.info("Building gRPC SSL context");
SslContext sslContext = CLIENT_SSL_CONTEXTS_CACHE.computeIfAbsent(tlsConfig.hashCode(), tlsConfigHashCode -> {
try {
SSLFactory sslFactory =
RenewableTlsUtils.createSSLFactoryAndEnableAutoRenewalWhenUsingFileStores(
tlsConfig, PinotInsecureMode::isPinotInInsecureMode);
SSLFactory sslFactory = RenewableTlsUtils.createSSLFactoryAndEnableAutoRenewalWhenUsingFileStores(tlsConfig,
PinotInsecureMode::isPinotInInsecureMode);
SslContextBuilder sslContextBuilder = SslContextBuilder.forClient();
sslFactory.getKeyManagerFactory().ifPresent(sslContextBuilder::keyManager);
sslFactory.getTrustManagerFactory().ifPresent(sslContextBuilder::trustManager);
@@ -98,6 +98,7 @@ public Iterator<Server.ServerResponse> submit(Server.ServerRequest request) {
return _blockingStub.submit(request);
}

@Override
public void close() {
if (!_managedChannel.isShutdown()) {
try {
@@ -34,7 +34,12 @@ public class ConcurrentIndexedTable extends IndexedTable {

public ConcurrentIndexedTable(DataSchema dataSchema, QueryContext queryContext, int resultSize, int trimSize,
int trimThreshold) {
super(dataSchema, queryContext, resultSize, trimSize, trimThreshold, new ConcurrentHashMap<>());
this(dataSchema, false, queryContext, resultSize, trimSize, trimThreshold);
}

public ConcurrentIndexedTable(DataSchema dataSchema, boolean hasFinalInput, QueryContext queryContext, int resultSize,
int trimSize, int trimThreshold) {
super(dataSchema, hasFinalInput, queryContext, resultSize, trimSize, trimThreshold, new ConcurrentHashMap<>());
}

/**
@@ -38,6 +38,7 @@
@SuppressWarnings({"rawtypes", "unchecked"})
public abstract class IndexedTable extends BaseTable {
protected final Map<Key, Record> _lookupMap;
protected final boolean _hasFinalInput;
Contributor:
Since this IndexedTable has multiple aggregations, does _hasFinalInput also need to be per aggregation function, or is it just a single indicator?
E.g. SELECT distinctCount(pk), avg(other_column) from myTable

Contributor (author):
In the V1 engine we don't have a way to pass a per-function hint. We can make it smarter in V2, which can carry more information.

protected final int _resultSize;
protected final int _numKeyColumns;
protected final AggregationFunction[] _aggregationFunctions;
@@ -54,16 +55,18 @@ public abstract class IndexedTable extends BaseTable {
* Constructor for the IndexedTable.
*
* @param dataSchema Data schema of the table
* @param hasFinalInput Whether the input is the final aggregate result
* @param queryContext Query context
* @param resultSize Number of records to keep in the final result after calling {@link #finish(boolean, boolean)}
* @param trimSize Number of records to keep when trimming the table
* @param trimThreshold Trim the table when the number of records exceeds the threshold
* @param lookupMap Map from keys to records
*/
protected IndexedTable(DataSchema dataSchema, QueryContext queryContext, int resultSize, int trimSize,
int trimThreshold, Map<Key, Record> lookupMap) {
protected IndexedTable(DataSchema dataSchema, boolean hasFinalInput, QueryContext queryContext, int resultSize,
int trimSize, int trimThreshold, Map<Key, Record> lookupMap) {
super(dataSchema);
_lookupMap = lookupMap;
_hasFinalInput = hasFinalInput;
_resultSize = resultSize;

List<ExpressionContext> groupByExpressions = queryContext.getGroupByExpressions();
@@ -74,7 +77,7 @@ protected IndexedTable(DataSchema dataSchema, QueryContext queryContext, int res
if (orderByExpressions != null) {
// GROUP BY with ORDER BY
_hasOrderBy = true;
_tableResizer = new TableResizer(dataSchema, queryContext);
_tableResizer = new TableResizer(dataSchema, hasFinalInput, queryContext);
// NOTE: trimSize is bounded by trimThreshold/2 to protect the server from using too much memory.
// TODO: Re-evaluate it as it can lead to in-accurate results
_trimSize = Math.min(trimSize, trimThreshold / 2);
@@ -102,34 +105,32 @@ public boolean upsert(Record record) {
* Adds a record with new key or updates a record with existing key.
*/
protected void addOrUpdateRecord(Key key, Record newRecord) {
_lookupMap.compute(key, (k, v) -> {
if (v == null) {
return newRecord;
} else {
Object[] existingValues = v.getValues();
Object[] newValues = newRecord.getValues();
int aggNum = 0;
for (int i = _numKeyColumns; i < _numColumns; i++) {
existingValues[i] = _aggregationFunctions[aggNum++].merge(existingValues[i], newValues[i]);
}
return v;
}
});
_lookupMap.compute(key, (k, v) -> v == null ? newRecord : updateRecord(v, newRecord));
}

/**
* Updates a record with existing key. Record with new key will be ignored.
*/
protected void updateExistingRecord(Key key, Record newRecord) {
_lookupMap.computeIfPresent(key, (k, v) -> {
Object[] existingValues = v.getValues();
Object[] newValues = newRecord.getValues();
int aggNum = 0;
for (int i = _numKeyColumns; i < _numColumns; i++) {
existingValues[i] = _aggregationFunctions[aggNum++].merge(existingValues[i], newValues[i]);
_lookupMap.computeIfPresent(key, (k, v) -> updateRecord(v, newRecord));
}

private Record updateRecord(Record existingRecord, Record newRecord) {
Object[] existingValues = existingRecord.getValues();
Object[] newValues = newRecord.getValues();
int numAggregations = _aggregationFunctions.length;
int index = _numKeyColumns;
if (!_hasFinalInput) {
for (int i = 0; i < numAggregations; i++, index++) {
existingValues[index] = _aggregationFunctions[i].merge(existingValues[index], newValues[index]);
}
return v;
});
} else {
for (int i = 0; i < numAggregations; i++, index++) {
existingValues[index] = _aggregationFunctions[i].mergeFinalResult((Comparable) existingValues[index],
(Comparable) newValues[index]);
}
}
return existingRecord;
}

/**
@@ -156,7 +157,8 @@ public void finish(boolean sort, boolean storeFinalResult) {
_topRecords = _lookupMap.values();
}
// TODO: Directly return final result in _tableResizer.getTopRecords to avoid extracting final result multiple times
if (storeFinalResult) {
assert !(_hasFinalInput && !storeFinalResult);
if (storeFinalResult && !_hasFinalInput) {
ColumnDataType[] columnDataTypes = _dataSchema.getColumnDataTypes();
int numAggregationFunctions = _aggregationFunctions.length;
for (int i = 0; i < numAggregationFunctions; i++) {
@@ -32,7 +32,12 @@ public class SimpleIndexedTable extends IndexedTable {

public SimpleIndexedTable(DataSchema dataSchema, QueryContext queryContext, int resultSize, int trimSize,
int trimThreshold) {
super(dataSchema, queryContext, resultSize, trimSize, trimThreshold, new HashMap<>());
this(dataSchema, false, queryContext, resultSize, trimSize, trimThreshold);
}

public SimpleIndexedTable(DataSchema dataSchema, boolean hasFinalInput, QueryContext queryContext, int resultSize,
int trimSize, int trimThreshold) {
super(dataSchema, hasFinalInput, queryContext, resultSize, trimSize, trimThreshold, new HashMap<>());
}

/**
@@ -50,6 +50,7 @@
@SuppressWarnings({"rawtypes", "unchecked"})
public class TableResizer {
private final DataSchema _dataSchema;
private final boolean _hasFinalInput;
private final int _numGroupByExpressions;
private final Map<ExpressionContext, Integer> _groupByExpressionIndexMap;
private final AggregationFunction[] _aggregationFunctions;
@@ -61,7 +62,12 @@ public class TableResizer {
private final Comparator<IntermediateRecord> _intermediateRecordComparator;

public TableResizer(DataSchema dataSchema, QueryContext queryContext) {
this(dataSchema, false, queryContext);
}

public TableResizer(DataSchema dataSchema, boolean hasFinalInput, QueryContext queryContext) {
_dataSchema = dataSchema;
_hasFinalInput = hasFinalInput;

// NOTE: The data schema will always have group-by expressions in the front, followed by aggregation functions of
// the same order as in the query context. This is handled in AggregationGroupByOrderByOperator.
@@ -144,16 +150,20 @@ private OrderByValueExtractor getOrderByValueExtractor(ExpressionContext express
expression);
if (function.getType() == FunctionContext.Type.AGGREGATION) {
// Aggregation function
return new AggregationFunctionExtractor(_aggregationFunctionIndexMap.get(function));
} else if (function.getType() == FunctionContext.Type.TRANSFORM
&& "FILTER".equalsIgnoreCase(function.getFunctionName())) {
int index = _aggregationFunctionIndexMap.get(function);
// For final aggregate result, we can handle it the same way as group key
return _hasFinalInput ? new GroupByExpressionExtractor(_numGroupByExpressions + index)
: new AggregationFunctionExtractor(index);
} else if (function.getType() == FunctionContext.Type.TRANSFORM && "FILTER".equalsIgnoreCase(
function.getFunctionName())) {
// Filtered aggregation
FunctionContext aggregation = function.getArguments().get(0).getFunction();
ExpressionContext filterExpression = function.getArguments().get(1);
FilterContext filter = RequestContextUtils.getFilter(filterExpression);

int functionIndex = _filteredAggregationIndexMap.get(Pair.of(aggregation, filter));
AggregationFunction aggregationFunction = _filteredAggregationFunctions.get(functionIndex).getLeft();
return new AggregationFunctionExtractor(functionIndex, aggregationFunction);
int index = _filteredAggregationIndexMap.get(Pair.of(aggregation, filter));
// For final aggregate result, we can handle it the same way as group key
return _hasFinalInput ? new GroupByExpressionExtractor(_numGroupByExpressions + index)
: new AggregationFunctionExtractor(index, _filteredAggregationFunctions.get(index).getLeft());
} else {
// Post-aggregation function
return new PostAggregationFunctionExtractor(function);
@@ -36,7 +36,12 @@
public class UnboundedConcurrentIndexedTable extends ConcurrentIndexedTable {

public UnboundedConcurrentIndexedTable(DataSchema dataSchema, QueryContext queryContext, int resultSize) {
super(dataSchema, queryContext, resultSize, Integer.MAX_VALUE, Integer.MAX_VALUE);
this(dataSchema, false, queryContext, resultSize);
}

public UnboundedConcurrentIndexedTable(DataSchema dataSchema, boolean hasFinalInput, QueryContext queryContext,
int resultSize) {
super(dataSchema, hasFinalInput, queryContext, resultSize, Integer.MAX_VALUE, Integer.MAX_VALUE);
}

@Override
@@ -239,10 +239,12 @@ public BaseResultsBlock mergeResults()
}

IndexedTable indexedTable = _indexedTable;
if (!_queryContext.isServerReturnFinalResult()) {
indexedTable.finish(false);
} else {
if (_queryContext.isServerReturnFinalResult()) {
indexedTable.finish(true, true);
} else if (_queryContext.isServerReturnFinalResultKeyUnpartitioned()) {
indexedTable.finish(false, true);
} else {
indexedTable.finish(false);
}
GroupByResultsBlock mergedBlock = new GroupByResultsBlock(indexedTable, _queryContext);
mergedBlock.setNumGroupsLimitReached(_numGroupsLimitReached);
@@ -244,10 +244,12 @@ private BaseResultsBlock getFinalResult()
}

IndexedTable indexedTable = _indexedTable;
if (!_queryContext.isServerReturnFinalResult()) {
indexedTable.finish(false);
} else {
if (_queryContext.isServerReturnFinalResult()) {
indexedTable.finish(true, true);
} else if (_queryContext.isServerReturnFinalResultKeyUnpartitioned()) {
indexedTable.finish(false, true);
} else {
indexedTable.finish(false);
}
GroupByResultsBlock mergedBlock = new GroupByResultsBlock(indexedTable, _queryContext);
mergedBlock.setNumGroupsLimitReached(_numGroupsLimitReached);
@@ -124,6 +124,14 @@ void aggregateGroupByMV(int length, int[][] groupKeysArray, GroupByResultHolder
*/
FinalResult extractFinalResult(IntermediateResult intermediateResult);

/**
* Merges two final results. This can be used to optimize certain functions (e.g. DISTINCT_COUNT) when data is
* partitioned on each server, where we may directly request servers to return the final result and merge the
* results on the broker.
*/
default FinalResult mergeFinalResult(FinalResult finalResult1, FinalResult finalResult2) {
throw new UnsupportedOperationException("Cannot merge final results for function: " + getType());
}

/** @return Description of this operator for Explain Plan */
String toExplainString();
}
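To make the intent of the new default method concrete, here is a minimal broker-side sketch (not from this PR; the helper, its java.util.List parameter, and the simplified generics are assumptions): once every server has produced a final result for a given group key, the broker folds those values with mergeFinalResult instead of merging intermediate results.

// Hypothetical broker-side helper: folds per-server *final* results for one group key.
// Functions that do not support the optimization surface the default
// UnsupportedOperationException thrown by mergeFinalResult.
static <F extends Comparable<F>> F mergeServerFinalResults(
    AggregationFunction<?, F> aggregationFunction, List<F> perServerFinalResults) {
  F merged = perServerFinalResults.get(0);
  for (int i = 1; i < perServerFinalResults.size(); i++) {
    merged = aggregationFunction.mergeFinalResult(merged, perServerFinalResults.get(i));
  }
  return merged;
}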
@@ -18,6 +18,11 @@
*/
package org.apache.pinot.core.query.aggregation.function;

import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import it.unimi.dsi.fastutil.floats.FloatArrayList;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Collections;
@@ -141,7 +146,7 @@ public static Map<ExpressionContext, BlockValSet> getBlockValSetMap(
* TODO: Move ser/de into AggregationFunction interface
*/
public static Object getIntermediateResult(DataTable dataTable, ColumnDataType columnDataType, int rowId, int colId) {
switch (columnDataType) {
switch (columnDataType.getStoredType()) {
case INT:
return dataTable.getInt(rowId, colId);
case LONG:
@@ -156,9 +161,43 @@ public static Object getIntermediateResult(DataTable dataTable, ColumnDataType c
}
}

/**
* Reads the final result from the {@link DataTable}.
*/
public static Comparable getFinalResult(DataTable dataTable, ColumnDataType columnDataType, int rowId, int colId) {
switch (columnDataType.getStoredType()) {
case INT:
return dataTable.getInt(rowId, colId);
case LONG:
return dataTable.getLong(rowId, colId);
case FLOAT:
return dataTable.getFloat(rowId, colId);
case DOUBLE:
return dataTable.getDouble(rowId, colId);
case BIG_DECIMAL:
return dataTable.getBigDecimal(rowId, colId);
case STRING:
return dataTable.getString(rowId, colId);
case BYTES:
return dataTable.getBytes(rowId, colId);
case INT_ARRAY:
return IntArrayList.wrap(dataTable.getIntArray(rowId, colId));
case LONG_ARRAY:
return LongArrayList.wrap(dataTable.getLongArray(rowId, colId));
case FLOAT_ARRAY:
return FloatArrayList.wrap(dataTable.getFloatArray(rowId, colId));
case DOUBLE_ARRAY:
return DoubleArrayList.wrap(dataTable.getDoubleArray(rowId, colId));
case STRING_ARRAY:
return ObjectArrayList.wrap(dataTable.getStringArray(rowId, colId));
default:
throw new IllegalStateException("Illegal column data type in final result: " + columnDataType);
}
}

/**
* Reads the converted final result from the {@link DataTable}. It should be equivalent to running
* {@link AggregationFunction#extractFinalResult(Object)} and {@link ColumnDataType#convert(Object)}.
* {@link #getFinalResult} and {@link ColumnDataType#convert}.
*/
public static Object getConvertedFinalResult(DataTable dataTable, ColumnDataType columnDataType, int rowId,
int colId) {
@@ -246,6 +246,11 @@ public Integer extractFinalResult(Integer intermediateResult) {
return intermediateResult;
}

@Override
public Integer mergeFinalResult(Integer finalResult1, Integer finalResult2) {
return merge(finalResult1, finalResult2);
}

private int getInt(Integer val) {
return val == null ? _merger.getDefaultValue() : val;
}
@@ -119,6 +119,11 @@ public final Long extractFinalResult(Long longValue) {
return 0L;
}

@Override
public Long mergeFinalResult(Long finalResult1, Long finalResult2) {
return 0L;
}

/**
* The name of the column as follows:
* CHILD_AGGREGATION_NAME_PREFIX + actual function type + operands + CHILD_AGGREGATION_SEPERATOR
@@ -204,6 +204,11 @@ public Long extractFinalResult(Long intermediateResult) {
return intermediateResult;
}

@Override
public Long mergeFinalResult(Long finalResult1, Long finalResult2) {
return finalResult1 + finalResult2;
}

@Override
public String toExplainString() {
StringBuilder stringBuilder = new StringBuilder(getType().getName()).append('(');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,9 @@ public ColumnDataType getFinalResultColumnType() {
public Integer extractFinalResult(Set intermediateResult) {
return intermediateResult.size();
}

@Override
public Integer mergeFinalResult(Integer finalResult1, Integer finalResult2) {
return finalResult1 + finalResult2;
}
}
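A quick worked check on the summing merge above (numbers made up): when the DISTINCTCOUNT argument is the partition column, the value sets seen by different servers for the same group are disjoint, so adding their final counts is exact.

// Made-up figures: both servers report a final DISTINCTCOUNT(memberId) for group "US".
int serverOneFinal = 120;  // distinct memberIds on server 1's partitions
int serverTwoFinal = 80;   // distinct memberIds on server 2's partitions (disjoint set)
int brokerMerged = serverOneFinal + serverTwoFinal;  // 200 -- exact only because partitioning keeps the sets disjoint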