Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-12706] [SQL] grouping() and grouping_id() #10677

Closed
wants to merge 11 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -233,14 +233,39 @@ class Analyzer(
}
}.toMap

val aggregations: Seq[NamedExpression] = x.aggregations.map {
// If an expression is an aggregate (contains an AggregateExpression) then we don't change
// it so that the aggregation is computed on the unmodified value of its argument
// expressions.
case expr if expr.find(_.isInstanceOf[AggregateExpression]).nonEmpty => expr
// If not then its a grouping expression and we need to use the modified (with nulls from
// Expand) value of the expression.
case expr => expr.transformDown {
val aggregations: Seq[NamedExpression] = x.aggregations.map { case expr =>
// collect all the found AggregateExpression, so we can check an expression is part of
// any AggregateExpression or not.
val aggsBuffer = ArrayBuffer[Expression]()
def isPartOfAggregation(e: Expression): Boolean = {
aggsBuffer.exists(a => a.find(_ eq e).isDefined)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to take a long hard look to understand this. You are comparing references not structure. Maybe a small piece of documentation?

}
expr.transformDown {
// AggregateExpression should be computed on the unmodified value of its argument
// expressions, so we should not replace any references to grouping expression
// inside it.
case e: AggregateExpression =>
aggsBuffer += e
e
case e if isPartOfAggregation(e) => e
case e: GroupingID =>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably a dumb question. What happens if we use these functions without grouping sets? Do we get a nice analysis exception?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right now, it will fail to resolve; agreed that it should have a better error message.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have captured this in CheckAnalysis.

if (e.groupByExprs == x.groupByExprs) {
// the bitmask follows Hive's convention, which is wrong; we need to reverse it here
// TODO: do not follow Hive
BitwiseReverse(BitwiseNot(gid), e.groupByExprs.length)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not break compatibility here? We are doing that in quite a lot of places anyway.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Try to re-use the Hive tests, will pull them out.

} else {
throw new AnalysisException(
s"Columns of grouping_id (${e.groupByExprs.mkString(",")}) does not match " +
s"grouping columns (${x.groupByExprs.mkString(",")})")
}
case Grouping(col: Expression) =>
val idx = x.groupByExprs.indexOf(col)
if (idx >= 0) {
Cast(BitwiseAnd(ShiftRight(BitwiseNot(gid), Literal(idx)), Literal(1)), ByteType)
} else {
throw new AnalysisException(s"Column of grouping ($col) can't be found " +
s"in grouping columns ${x.groupByExprs.mkString(",")}")
}
case e =>
groupByAliases.find(_.child.semanticEquals(e)).map(attributeMap(_)).getOrElse(e)
}.asInstanceOf[NamedExpression]
Expand Down Expand Up @@ -707,8 +732,11 @@ class Analyzer(
}
}

/** Returns true if `e` is an aggregate function or one of the grouping operators. */
private def isAggregateExpression(e: Expression): Boolean = {
  e.isInstanceOf[AggregateExpression] || e.isInstanceOf[Grouping] || e.isInstanceOf[GroupingID]
}

/**
 * Returns true if any node in `condition`'s expression tree is an aggregate
 * expression (including Grouping/GroupingID, which are rewritten by the analyzer).
 */
protected def containsAggregate(condition: Expression): Boolean = {
  condition.find(isAggregateExpression).isDefined
}
}

Expand Down Expand Up @@ -889,7 +917,7 @@ class Analyzer(
_.transform {
// Extracts children expressions of a WindowFunction (input parameters of
// a WindowFunction).
case wf : WindowFunction =>
case wf: WindowFunction =>
val newChildren = wf.children.map(extractExpr)
wf.withNewChildren(newChildren)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,8 @@ object FunctionRegistry {
// grouping sets
expression[Cube]("cube"),
expression[Rollup]("rollup"),
expression[Grouping]("grouping"),
expression[GroupingID]("grouping_id"),

// window functions
expression[Lead]("lead"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -344,4 +344,3 @@ abstract class DeclarativeAggregate
def right: AttributeReference = inputAggBufferAttributes(aggBufferAttributes.indexOf(a))
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,47 @@ case class BitwiseNot(child: Expression) extends UnaryExpression with ExpectsInp

protected override def nullSafeEval(input: Any): Any = not(input)
}

/**
 * A function that reverses the lowest `width` bits of an integer.
 *
 * Note: this is only used for grouping_id(); the bitmask produced by Expand
 * follows Hive's (reversed) bit order, so it must be reversed here.
 */
case class BitwiseReverse(child: Expression, width: Int)
  extends UnaryExpression with ExpectsInputTypes {

  override def inputTypes: Seq[AbstractDataType] = Seq(IntegerType)

  override def dataType: DataType = IntegerType

  override def toString: String = s"^$child"

  // NOTE(review): Integer.reverse plus a right shift could replace the manual
  // loops below; it saves some code and a branch. Kept as loops for now.
  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    nullSafeCodeGen(ctx, ev, c => {
      val v = ctx.freshName("v")
      val i = ctx.freshName("i")
      // Shift the lowest `width` bits of the input into the result in reversed order.
      s"""
       | int $v = $c;
       | ${ev.value} = 0;
       | for (int $i = 0; $i < $width; $i ++) {
       |   ${ev.value} <<= 1;
       |   ${ev.value} |= $v & 1;
       |   $v >>>= 1;
       | }
      """.stripMargin
    })
  }

  // Interpreted path: same bit-reversal as the generated code above.
  protected override def nullSafeEval(input: Any): Any = {
    var v = input.asInstanceOf[Int]
    var r = 0
    var i = 0
    while (i < width) {
      r <<= 1
      r |= v & 1
      v >>>= 1
      i += 1
    }
    r
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,24 @@ trait GroupingSet extends Expression with CodegenFallback {
// Marks a CUBE grouping in GROUP BY; registered as the SQL function "cube".
case class Cube(groupByExprs: Seq[Expression]) extends GroupingSet {}

// Marks a ROLLUP grouping in GROUP BY; registered as the SQL function "rollup".
case class Rollup(groupByExprs: Seq[Expression]) extends GroupingSet {}

/**
 * Indicates whether a specified column expression in a GROUP BY list is aggregated or not.
 * GROUPING returns 1 for aggregated or 0 for not aggregated in the result set.
 */
case class Grouping(child: Expression) extends Expression with Unevaluable {
  // Resolution only depends on the virtual grouping-id column produced by Expand;
  // the analyzer rewrites this expression in terms of that column.
  override def references: AttributeSet = AttributeSet(VirtualColumn.groupingIdAttribute :: Nil)
  override def children: Seq[Expression] = child :: Nil
  // The analyzer replaces Grouping with Cast(..., ByteType), so the declared type
  // must be ByteType (the previous IntegerType did not match the replacement).
  override def dataType: DataType = ByteType
  override def nullable: Boolean = false
}

/**
 * GroupingID is a function that computes the level of grouping.
 *
 * Note: the argument list must match the grouping columns exactly, otherwise the
 * analyzer throws an AnalysisException.
 */
case class GroupingID(groupByExprs: Seq[Expression]) extends Expression with Unevaluable {
  // Resolution only depends on the virtual grouping-id column produced by Expand.
  override def references: AttributeSet = AttributeSet(VirtualColumn.groupingIdAttribute :: Nil)
  override def children: Seq[Expression] = groupByExprs
  // The analyzer replaces GroupingID with BitwiseReverse(...), whose dataType is
  // IntegerType, so the declared type must be IntegerType (the previous ByteType
  // did not match the replacement).
  override def dataType: DataType = IntegerType
  override def nullable: Boolean = false
}
Original file line number Diff line number Diff line change
Expand Up @@ -131,4 +131,17 @@ class BitwiseFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkConsistencyBetweenInterpretedAndCodegen(BitwiseXor, dt, dt)
}
}

test("BitwiseReverse") {
  // Evaluates BitwiseReverse(input1, width) and compares against the expected
  // value (checkEvaluation exercises both interpreted and codegen paths).
  def check(input1: Any, width: Int, expected: Any): Unit = {
    val expr = BitwiseReverse(Literal(input1), width)
    checkEvaluation(expr, expected)
  }

  check(1, 1, 1)   // single bit: unchanged
  check(0, 1, 0)   // zero stays zero
  check(1, 2, 2)   // 0b01 -> 0b10
  check(3, 4, 12)  // 0b0011 -> 0b1100
  check(9, 4, 9)   // 0b1001 is a palindrome over 4 bits
}
}
45 changes: 45 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,51 @@ object functions extends LegacyFunctions {
*/
def first(columnName: String): Column = first(Column(columnName))


/**
 * Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated
 * or not, returns 1 for aggregated or 0 for not aggregated in the result set.
 *
 * Equivalent to the SQL `grouping(col)` function.
 *
 * @group agg_funcs
 * @since 2.0.0
 */
def grouping(e: Column): Column = Column(Grouping(e.expr))

/**
 * Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated
 * or not, returns 1 for aggregated or 0 for not aggregated in the result set.
 *
 * Equivalent to the SQL `grouping(col)` function.
 *
 * @group agg_funcs
 * @since 2.0.0
 */
def grouping(columnName: String): Column = grouping(Column(columnName))

/**
* Aggregate function: returns the level of grouping, equals to
*
* (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Second term should be grouping(c2)

*
* Note: the list of columns should match with grouping columns exactly.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So why have people pass a list of columns at all? Do you want to support subsets of the group by clause in the future?

We could at least remove this for the DataFrame API.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed that this is useless; this is to match the API in other databases (PostgreSQL, SQL Server, Oracle).

*
* @group agg_funcs
* @since 2.0.0
*/
def grouping_id(cols: Column*): Column = Column(GroupingID(cols.map(_.expr)))

/**
 * Aggregate function: returns the level of grouping, equals to
 *
 * (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)
 *
 * Note: the list of columns should match with grouping columns exactly.
 *
 * @group agg_funcs
 * @since 2.0.0
 */
def grouping_id(colName: String, colNames: String*): Column = {
  grouping_id((Seq(colName) ++ colNames).map(n => Column(n)): _*)
}

/**
* Aggregate function: returns the kurtosis of the values in a group.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.spark.sql

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.DecimalType
Expand Down Expand Up @@ -98,6 +99,42 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext {
assert(cube0.where("date IS NULL").count > 0)
}

test("grouping and grouping_id") {
  // For each cube cell: grouping(col) is 1 when that column is aggregated away,
  // and grouping_id combines the flags into a single level number.
  val expected = Seq(
    Row("Java", 2012, 0, 0, 0),
    Row("Java", 2013, 0, 0, 0),
    Row("Java", null, 0, 1, 1),
    Row("dotNET", 2012, 0, 0, 0),
    Row("dotNET", 2013, 0, 0, 0),
    Row("dotNET", null, 0, 1, 1),
    Row(null, 2012, 1, 0, 2),
    Row(null, 2013, 1, 0, 2),
    Row(null, null, 1, 1, 3))
  checkAnswer(
    courseSales.cube("course", "year")
      .agg(grouping("course"), grouping("year"), grouping_id("course", "year")),
    expected)
}

test("grouping/grouping_id inside window function") {
  // Rank within each grouping level: partitioning the window by grouping_id
  // ranks rows from different cube levels independently.
  // (Removed an unused local `val w = Window.orderBy(sum("earnings"))`.)
  checkAnswer(
    courseSales.cube("course", "year")
      .agg(sum("earnings"),
        grouping_id("course", "year"),
        rank().over(Window.partitionBy(grouping_id("course", "year")).orderBy(sum("earnings")))),
    Row("Java", 2012, 20000.0, 0, 2) ::
      Row("Java", 2013, 30000.0, 0, 3) ::
      Row("Java", null, 50000.0, 1, 1) ::
      Row("dotNET", 2012, 15000.0, 0, 1) ::
      Row("dotNET", 2013, 48000.0, 0, 4) ::
      Row("dotNET", null, 63000.0, 1, 2) ::
      Row(null, 2012, 35000.0, 2, 1) ::
      Row(null, 2013, 78000.0, 2, 2) ::
      Row(null, null, 113000.0, 3, 1) :: Nil
  )
}

test("rollup overlapping columns") {
checkAnswer(
testData2.rollup($"a" + $"b" as "foo", $"b" as "bar").agg(sum($"a" - $"b") as "foo"),
Expand Down
16 changes: 16 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2057,6 +2057,22 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
)
}

test("grouping and grouping_id") {
  // Same semantics as the DataFrame variant, exercised through the SQL parser.
  val query = "select course, year, grouping(course), grouping(year), grouping_id(course, year)" +
    " from courseSales group by cube(course, year)"
  val expected = Seq(
    Row("Java", 2012, 0, 0, 0),
    Row("Java", 2013, 0, 0, 0),
    Row("Java", null, 0, 1, 1),
    Row("dotNET", 2012, 0, 0, 0),
    Row("dotNET", 2013, 0, 0, 0),
    Row("dotNET", null, 0, 1, 1),
    Row(null, 2012, 1, 0, 2),
    Row(null, 2013, 1, 0, 2),
    Row(null, null, 1, 1, 3))
  checkAnswer(sql(query), expected)
}

test("hash function") {
val df = Seq(1 -> "a", 2 -> "b").toDF("i", "j")
withTempTable("tbl") {
Expand Down