[SPARK-16248][SQL] Whitelist the list of Hive fallback functions
## What changes were proposed in this pull request?
This patch removes the blind fallback into Hive for function lookup. Instead, it introduces a whitelist and adds only a small number of functions to it, i.e. the ones we intend to support in Spark in the long run.
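For illustration (a hypothetical `spark-shell` session, inferred from the diff below rather than part of the commit): a whitelisted function such as `xpath_int` still resolves through the Hive fallback, while a function left off the whitelist now fails analysis instead of being loaded blindly from Hive.

```scala
// Whitelisted in hiveFunctions: still falls back to Hive's implementation.
spark.sql("SELECT xpath_int('<a>10</a>', 'a')").show()  // => 10

// Not whitelisted (e.g. reflect2): lookupFunction now calls
// failFunctionLookup rather than loading the function from Hive,
// so analysis fails with roughly "Undefined function: 'reflect2'".
spark.sql("SELECT reflect2('spark', 'length')").show()
```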

## How was this patch tested?
Updated tests to reflect the change.

Author: Reynold Xin <[email protected]>

Closes apache#13939 from rxin/hive-whitelist.
rxin committed Jun 29, 2016
1 parent 5bf8881 commit 363bced
Showing 4 changed files with 40 additions and 50 deletions.
@@ -248,6 +248,7 @@ object FunctionRegistry {
expression[Average]("mean"),
expression[Min]("min"),
expression[Skewness]("skewness"),
expression[StddevSamp]("std"),
expression[StddevSamp]("stddev"),
expression[StddevPop]("stddev_pop"),
expression[StddevSamp]("stddev_samp"),
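Since `std` is now registered natively as an alias for `StddevSamp`, queries using it no longer touch the Hive fallback at all. A quick sanity check (hypothetical data, assuming a `spark-shell` session):

```scala
import spark.implicits._  // pre-imported in spark-shell

// "std", "stddev", and "stddev_samp" all resolve to the native
// StddevSamp expression, so the three columns agree (~1.29 here).
Seq(1.0, 2.0, 3.0, 4.0).toDF("x").createOrReplaceTempView("t")
spark.sql("SELECT std(x), stddev(x), stddev_samp(x) FROM t").show()
```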
@@ -517,6 +517,18 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
// This test uses CREATE EXTERNAL TABLE without specifying LOCATION
"alter2",

// [SPARK-16248][SQL] Whitelist the list of Hive fallback functions
"udf_field",
"udf_reflect2",
"udf_xpath",
"udf_xpath_boolean",
"udf_xpath_double",
"udf_xpath_float",
"udf_xpath_int",
"udf_xpath_long",
"udf_xpath_short",
"udf_xpath_string",

// These tests DROP TABLE that don't exist (but do not specify IF EXISTS)
"alter_rename_partition1",
"date_1",
@@ -1004,7 +1016,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
"udf_elt",
"udf_equal",
"udf_exp",
"udf_field",
"udf_find_in_set",
"udf_float",
"udf_floor",
@@ -1049,7 +1060,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
"udf_power",
"udf_radians",
"udf_rand",
"udf_reflect2",
"udf_regexp",
"udf_regexp_extract",
"udf_regexp_replace",
@@ -1090,14 +1100,6 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter {
"udf_variance",
"udf_weekofyear",
"udf_when",
"udf_xpath",
"udf_xpath_boolean",
"udf_xpath_double",
"udf_xpath_float",
"udf_xpath_int",
"udf_xpath_long",
"udf_xpath_short",
"udf_xpath_string",
"union10",
"union11",
"union13",
@@ -534,31 +534,6 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfter {
| rows between 2 preceding and 2 following);
""".stripMargin, reset = false)

// collect_set() outputs its array in an arbitrary order, which causes different
// results when running this test suite under Java 7 and 8.
// We change the original sql query a little bit so that the test suite passes
// under different JDKs.
/* Disabled because:
- Spark uses a different default stddev.
- Tiny numerical differences in stddev results.
createQueryTest("windowing.q -- 20. testSTATs",
"""
|select p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp
|from (
|select p_mfgr,p_name, p_size,
|stddev(p_retailprice) over w1 as sdev,
|stddev_pop(p_retailprice) over w1 as sdev_pop,
|collect_set(p_size) over w1 as uniq_size,
|variance(p_retailprice) over w1 as var,
|corr(p_size, p_retailprice) over w1 as cor,
|covar_pop(p_size, p_retailprice) over w1 as covarp
|from part
|window w1 as (distribute by p_mfgr sort by p_mfgr, p_name
| rows between 2 preceding and 2 following)
|) t lateral view explode(uniq_size) d as uniq_data
|order by p_mfgr,p_name, p_size, sdev, sdev_pop, uniq_data, var, cor, covarp
""".stripMargin, reset = false)
*/
createQueryTest("windowing.q -- 21. testDISTs",
"""
|select p_mfgr,p_name, p_size,
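For context on the test disabled above (our reading of the comment, not stated in the diff itself): Hive's `stddev` computes the population standard deviation, while Spark's `stddev` defaults to the sample version, so the windowed results differ slightly. A small worked comparison of the two estimators:

```scala
// Population vs. sample standard deviation over the same values,
// illustrating the "different default stddev" noted in the comment.
val xs = Seq(1.0, 2.0, 3.0, 4.0)
val mean = xs.sum / xs.size
val ssq  = xs.map(x => (x - mean) * (x - mean)).sum
val stddevPop  = math.sqrt(ssq / xs.size)        // ~1.118
val stddevSamp = math.sqrt(ssq / (xs.size - 1))  // ~1.291
```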
@@ -162,17 +162,6 @@ private[sql] class HiveSessionCatalog(
}
}

// We have a list of Hive built-in functions that we do not support. So, we will check
// Hive's function registry and lazily load needed functions into our own function registry.
// Those Hive built-in functions are
// assert_true, collect_list, collect_set, compute_stats, context_ngrams, create_union,
// current_user, elt, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field,
// histogram_numeric, in_file, index, inline, java_method, map_keys, map_values,
// matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming,
// parse_url, parse_url_tuple, percentile, percentile_approx, posexplode, reflect, reflect2,
// regexp, sentences, stack, std, str_to_map, windowingtablefunction, xpath, xpath_boolean,
// xpath_double, xpath_float, xpath_int, xpath_long, xpath_number,
// xpath_short, and xpath_string.
override def lookupFunction(name: FunctionIdentifier, children: Seq[Expression]): Expression = {
// TODO: Once lookupFunction accepts a FunctionIdentifier, we should refactor this method to
// if (super.functionExists(name)) {
@@ -196,10 +185,12 @@
// built-in function.
// Hive is case insensitive.
val functionName = funcName.unquotedString.toLowerCase
// TODO: This may not really work for current_user because current_user is not evaluated
// with session info.
// We do not need to use executionHive here because we only load
// Hive's builtin functions, which do not need current db.
if (!hiveFunctions.contains(functionName)) {
failFunctionLookup(funcName.unquotedString)
}

// TODO: Remove this fallback path once we implement the list of fallback functions
// defined below in hiveFunctions.
val functionInfo = {
try {
Option(HiveFunctionRegistry.getFunctionInfo(functionName)).getOrElse(
@@ -221,4 +212,25 @@
}
}
}

/** List of functions we pass over to Hive. Note that over time this list should go to 0. */
// We have a list of Hive built-in functions that we do not support. So, we will check
// Hive's function registry and lazily load needed functions into our own function registry.
// The functions we explicitly do not support are:
// compute_stats, context_ngrams, create_union,
// current_user, ewah_bitmap, ewah_bitmap_and, ewah_bitmap_empty, ewah_bitmap_or, field,
// in_file, index, java_method,
// matchpath, ngrams, noop, noopstreaming, noopwithmap, noopwithmapstreaming,
// parse_url_tuple, posexplode, reflect2,
// str_to_map, windowingtablefunction.
private val hiveFunctions = Seq(
"elt", "hash", "java_method", "histogram_numeric",
"map_keys", "map_values",
"parse_url", "percentile", "percentile_approx", "reflect", "sentences", "stack", "str_to_map",
"xpath", "xpath_boolean", "xpath_double", "xpath_float", "xpath_int", "xpath_long",
"xpath_number", "xpath_short", "xpath_string",

// table generating function
"inline", "posexplode"
)
}
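To make the control flow concrete, here is a minimal, self-contained sketch of the whitelist-gated lookup above (simplified stand-in names and types, not the actual Spark classes):

```scala
object FallbackDemo {
  // Stand-in for hiveFunctions above.
  private val hiveFunctions = Set("xpath_int", "percentile", "stack")

  // Placeholder for the real Hive registry lookup.
  private def loadFromHive(name: String): String =
    s"<hive implementation of $name>"

  def lookupFunction(name: String): String = {
    val functionName = name.toLowerCase  // Hive is case insensitive
    if (!hiveFunctions.contains(functionName)) {
      // Mirrors failFunctionLookup: fail fast instead of asking Hive.
      throw new IllegalArgumentException(s"Undefined function: '$name'")
    }
    loadFromHive(functionName)
  }

  def main(args: Array[String]): Unit = {
    println(lookupFunction("XPATH_INT")) // whitelisted: falls back to Hive
    println(lookupFunction("reflect2"))  // not whitelisted: throws
  }
}
```

Whitelisted names fall through to the (stubbed) Hive lookup; everything else fails fast, which is the behavior `failFunctionLookup` provides in the real code.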
