-
Notifications
You must be signed in to change notification settings - Fork 28.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-12789] [SQL] Support Order By Ordinal in SQL #11815
Changes from 7 commits
aee1023
3342798
240aeb3
e0dce39
256814b
b04529b
d52e1c7
4d44b75
46e3f69
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.{CatalystConf, ScalaReflection, SimpleCatal | |
import org.apache.spark.sql.catalyst.encoders.OuterScopes | ||
import org.apache.spark.sql.catalyst.expressions._ | ||
import org.apache.spark.sql.catalyst.expressions.aggregate._ | ||
import org.apache.spark.sql.catalyst.planning.IntegerIndex | ||
import org.apache.spark.sql.catalyst.plans._ | ||
import org.apache.spark.sql.catalyst.plans.logical._ | ||
import org.apache.spark.sql.catalyst.rules._ | ||
|
@@ -40,7 +41,10 @@ import org.apache.spark.sql.types._ | |
* references. | ||
*/ | ||
object SimpleAnalyzer | ||
extends Analyzer(EmptyCatalog, EmptyFunctionRegistry, new SimpleCatalystConf(true)) | ||
extends Analyzer( | ||
EmptyCatalog, | ||
EmptyFunctionRegistry, | ||
new SimpleCatalystConf(caseSensitiveAnalysis = true)) | ||
|
||
/** | ||
* Provides a logical query plan analyzer, which translates [[UnresolvedAttribute]]s and | ||
|
@@ -618,13 +622,36 @@ class Analyzer( | |
* clause. This rule detects such queries and adds the required attributes to the original | ||
* projection, so that they will be available during sorting. Another projection is added to | ||
* remove these attributes after sorting. | ||
* | ||
* This rule also resolves the position number in sort references. This support is introduced | ||
* in Spark 2.0. Before Spark 2.0, the integers in Order By have no effect on output sorting. | ||
* - When the sort references are not integer but foldable expressions, ignore them. | ||
* - When spark.sql.orderByOrdinal is set to false, ignore the position numbers too. | ||
*/ | ||
object ResolveSortReferences extends Rule[LogicalPlan] { | ||
def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { | ||
case s: Sort if !s.child.resolved => s | ||
// Replace the index with the related attribute for ORDER BY | ||
// which is a 1-based position of the projection list. | ||
case s @ Sort(orders, global, child) | ||
if conf.orderByOrdinal && orders.exists(o => IntegerIndex.unapply(o.child).nonEmpty) => | ||
val newOrders = orders map { | ||
case s @ SortOrder(IntegerIndex(index), direction) => | ||
if (index > 0 && index <= child.output.size) { | ||
SortOrder(child.output(index - 1), direction) | ||
} else { | ||
throw new UnresolvedException(s, | ||
s"Order/sort By position: $index does not exist " + | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. seems you miss a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. True. Let me correct it. Thanks! |
||
s"The Select List is indexed from 1 to ${child.output.size}") | ||
} | ||
case o => o | ||
} | ||
Sort(newOrders, global, child) | ||
|
||
// Skip sort with aggregate. This will be handled in ResolveAggregateFunctions | ||
case sa @ Sort(_, _, child: Aggregate) => sa | ||
|
||
case s @ Sort(order, _, child) if !s.resolved && child.resolved => | ||
case s @ Sort(order, _, child) if !s.resolved => | ||
try { | ||
val newOrder = order.map(resolveExpressionRecursively(_, child).asInstanceOf[SortOrder]) | ||
val requiredAttrs = AttributeSet(newOrder).filter(_.resolved) | ||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -24,6 +24,7 @@ import org.apache.spark.internal.Logging | |||
import org.apache.spark.sql.catalyst.expressions._ | ||||
import org.apache.spark.sql.catalyst.plans._ | ||||
import org.apache.spark.sql.catalyst.plans.logical._ | ||||
import org.apache.spark.sql.types.IntegerType | ||||
|
||||
/** | ||||
* A pattern that matches any number of project or filter operations on top of another relational | ||||
|
@@ -202,3 +203,15 @@ object Unions { | |||
} | ||||
} | ||||
} | ||||
|
||||
/** | ||||
* Extractor for retrieving an Int value. | ||||
*/ | ||||
object IntegerIndex { | ||||
def unapply(a: Any): Option[Int] = a match { | ||||
case Literal(a: Int, IntegerType) => Some(a) | ||||
// When resolving ordinal in Sort, negative values are extracted for issuing error messages. | ||||
case UnaryMinus(IntegerLiteral(v)) => Some(-v) | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i still don't get it - why are we matching unary minus ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As shown in the following link, our parser converts spark/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/CatalystQl.scala Line 641 in 637a78f
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But why are we getting negative values? Ordinals can only be positive. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When users input negative values, we just detect and then issue an exception message, like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah ic so this is used to detect errors. i'd add some comment here explaining why we are having this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure, will do. Thanks! |
||||
case _ => None | ||||
} | ||||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -435,6 +435,11 @@ object SQLConf { | |
defaultValue = Some(true), | ||
doc = "When false, we will treat bucketed table as normal table") | ||
|
||
val ORDER_BY_ORDINAL = booleanConf("spark.sql.orderByOrdinal", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's a similar configuration value in hive since 0.11 and later, which is I think we also need to read from ConfVar for this config. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't mind just diverging from Hive over time and not support any of the config options. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds good to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @adrian-wang Thank you for your input! Glad to learn it. |
||
defaultValue = Some(true), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we turn it off by default? It's not a standard feature and hive also turn it off by default. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We are facing two options. Not sure which one is better. For RDBMS users, the default is true. |
||
doc = "When true, the ordinal numbers are treated as the position in the select list. " + | ||
"When false, the ordinal numbers in order/sort By clause are ignored.") | ||
|
||
// The output committer class used by HadoopFsRelation. The specified class needs to be a | ||
// subclass of org.apache.hadoop.mapreduce.OutputCommitter. | ||
// | ||
|
@@ -631,6 +636,8 @@ class SQLConf extends Serializable with CatalystConf with ParserConf with Loggin | |
|
||
def supportSQL11ReservedKeywords: Boolean = getConf(PARSER_SUPPORT_SQL11_RESERVED_KEYWORDS) | ||
|
||
override def orderByOrdinal: Boolean = getConf(ORDER_BY_ORDINAL) | ||
|
||
/** ********************** SQLConf functionality methods ************ */ | ||
|
||
/** Set Spark SQL configuration properties. */ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
last question: should we have 2 configs for order by and group by? Their behaviours are similar, maybe we can just use one config?
cc @rxin
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do you have a good name?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how about
positionAlias
? It's inspired by hive's config:hive.groupby.orderby.position.alias
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yea the problem is that once you don't have the groupby or order by there, it is very unclear what position alias means.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Or how about simply call it
groupByOrderByOrdinal
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Doesn't seem like a great name ... if we can come up with a good name I think having a single config makes sense. If not, doesn't matter that much to have two.