From bd40a33d06797b13ccbe60705e6e6df01ba3ccc4 Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Mon, 28 Jul 2014 17:16:09 -0700 Subject: [PATCH] Address comments. --- python/pyspark/sql.py | 88 +++++++++++-------- .../sql/catalyst/analysis/Analyzer.scala | 4 +- .../catalyst/analysis/HiveTypeCoercion.scala | 8 +- .../catalyst/expressions/BoundAttribute.scala | 5 +- .../sql/catalyst/planning/QueryPlanner.scala | 3 +- .../sql/catalyst/planning/patterns.scala | 7 +- .../spark/sql/catalyst/rules/Rule.scala | 3 +- .../sql/catalyst/rules/RuleExecutor.scala | 13 ++- .../spark/sql/api/java/types/ArrayType.java | 9 +- .../spark/sql/api/java/types/BinaryType.java | 2 + .../spark/sql/api/java/types/BooleanType.java | 5 ++ .../spark/sql/api/java/types/ByteType.java | 4 +- .../spark/sql/api/java/types/DataType.java | 21 ++++- .../spark/sql/api/java/types/DecimalType.java | 2 + .../spark/sql/api/java/types/DoubleType.java | 4 +- .../spark/sql/api/java/types/FloatType.java | 4 +- .../spark/sql/api/java/types/IntegerType.java | 4 +- .../spark/sql/api/java/types/LongType.java | 4 +- .../spark/sql/api/java/types/MapType.java | 9 +- .../spark/sql/api/java/types/ShortType.java | 4 +- .../spark/sql/api/java/types/StringType.java | 2 + .../spark/sql/api/java/types/StructField.java | 4 + .../spark/sql/api/java/types/StructType.java | 5 ++ .../sql/api/java/types/TimestampType.java | 2 + .../sql/api/java/types/package-info.java | 2 +- .../org/apache/spark/sql/SQLContext.scala | 3 +- .../apache/spark/sql/execution/Exchange.scala | 2 +- .../scala/org/apache/spark/sql/package.scala | 9 +- 28 files changed, 154 insertions(+), 78 deletions(-) diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index c0d086acbe955..1e91fdc60b46f 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -41,18 +41,18 @@ class StringType(object): """ __metaclass__ = PrimitiveTypeSingleton - def _get_scala_type_string(self): + def __repr__(self): return "StringType" class BinaryType(object): """Spark SQL BinaryType - The data type representing bytes values and bytearray values. + The data type representing bytearray values. """ __metaclass__ = PrimitiveTypeSingleton - def _get_scala_type_string(self): + def __repr__(self): return "BinaryType" class BooleanType(object): @@ -63,14 +63,18 @@ class BooleanType(object): """ __metaclass__ = PrimitiveTypeSingleton - def _get_scala_type_string(self): + def __repr__(self): return "BooleanType" class TimestampType(object): - """Spark SQL TimestampType""" + """Spark SQL TimestampType + + The data type representing datetime.datetime values. + + """ __metaclass__ = PrimitiveTypeSingleton - def _get_scala_type_string(self): + def __repr__(self): return "TimestampType" class DecimalType(object): @@ -81,40 +85,48 @@ class DecimalType(object): """ __metaclass__ = PrimitiveTypeSingleton - def _get_scala_type_string(self): + def __repr__(self): return "DecimalType" class DoubleType(object): """Spark SQL DoubleType - The data type representing float values. Because a float value + The data type representing float values. """ __metaclass__ = PrimitiveTypeSingleton - def _get_scala_type_string(self): + def __repr__(self): return "DoubleType" class FloatType(object): """Spark SQL FloatType - For PySpark, please use L{DoubleType} instead of using L{FloatType}. + For now, please use L{DoubleType} instead of using L{FloatType}. + Because query evaluation is done in Scala, java.lang.Double will be be used + for Python float numbers. 
Because the underlying JVM type of FloatType is + java.lang.Float (in Java) and Float (in Scala), there will be a java.lang.ClassCastException + if FloatType (Python) is used. """ __metaclass__ = PrimitiveTypeSingleton - def _get_scala_type_string(self): + def __repr__(self): return "FloatType" class ByteType(object): """Spark SQL ByteType - For PySpark, please use L{IntegerType} instead of using L{ByteType}. + For now, please use L{IntegerType} instead of using L{ByteType}. + Because query evaluation is done in Scala, java.lang.Integer will be used + for Python int numbers. Because the underlying JVM type of ByteType is + java.lang.Byte (in Java) and Byte (in Scala), there will be a java.lang.ClassCastException + if ByteType (Python) is used. """ __metaclass__ = PrimitiveTypeSingleton - def _get_scala_type_string(self): + def __repr__(self): return "ByteType" class IntegerType(object): @@ -125,7 +137,7 @@ class IntegerType(object): """ __metaclass__ = PrimitiveTypeSingleton - def _get_scala_type_string(self): + def __repr__(self): return "IntegerType" class LongType(object): @@ -137,18 +149,22 @@ class LongType(object): """ __metaclass__ = PrimitiveTypeSingleton - def _get_scala_type_string(self): + def __repr__(self): return "LongType" class ShortType(object): """Spark SQL ShortType - For PySpark, please use L{IntegerType} instead of using L{ShortType}. + For now, please use L{IntegerType} instead of using L{ShortType}. + Because query evaluation is done in Scala, java.lang.Integer will be used + for Python int numbers. Because the underlying JVM type of ShortType is + java.lang.Short (in Java) and Short (in Scala), there will be a java.lang.ClassCastException + if ShortType (Python) is used. """ __metaclass__ = PrimitiveTypeSingleton - def _get_scala_type_string(self): + def __repr__(self): return "ShortType" class ArrayType(object): @@ -157,23 +173,23 @@ class ArrayType(object): The data type representing list values. """ - def __init__(self, elementType, containsNull): + def __init__(self, elementType, containsNull=False): """Creates an ArrayType :param elementType: the data type of elements. :param containsNull: indicates whether the list contains null values.
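For illustration, a sketch of how the new containsNull default combines with the numeric-type guidance above (it assumes the singleton primitive types defined earlier in this module; DoubleType/IntegerType stand in for FloatType/ByteType/ShortType):

        >>> ArrayType(DoubleType()) == ArrayType(DoubleType(), False)
        True
        >>> ArrayType(IntegerType()).containsNull
        False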
:return: - >>> ArrayType(StringType, True) == ArrayType(StringType, False) - False - >>> ArrayType(StringType, True) == ArrayType(StringType, True) + >>> ArrayType(StringType) == ArrayType(StringType, False) True + >>> ArrayType(StringType, True) == ArrayType(StringType) + False """ self.elementType = elementType self.containsNull = containsNull - def _get_scala_type_string(self): - return "ArrayType(" + self.elementType._get_scala_type_string() + "," + \ + def __repr__(self): + return "ArrayType(" + self.elementType.__repr__() + "," + \ str(self.containsNull).lower() + ")" def __eq__(self, other): @@ -207,9 +223,9 @@ def __init__(self, keyType, valueType, valueContainsNull=True): self.valueType = valueType self.valueContainsNull = valueContainsNull - def _get_scala_type_string(self): - return "MapType(" + self.keyType._get_scala_type_string() + "," + \ - self.valueType._get_scala_type_string() + "," + \ + def __repr__(self): + return "MapType(" + self.keyType.__repr__() + "," + \ + self.valueType.__repr__() + "," + \ str(self.valueContainsNull).lower() + ")" def __eq__(self, other): @@ -243,9 +259,9 @@ def __init__(self, name, dataType, nullable): self.dataType = dataType self.nullable = nullable - def _get_scala_type_string(self): + def __repr__(self): return "StructField(" + self.name + "," + \ - self.dataType._get_scala_type_string() + "," + \ + self.dataType.__repr__() + "," + \ str(self.nullable).lower() + ")" def __eq__(self, other): @@ -280,9 +296,9 @@ def __init__(self, fields): """ self.fields = fields - def _get_scala_type_string(self): + def __repr__(self): return "StructType(List(" + \ - ",".join([field._get_scala_type_string() for field in self.fields]) + "))" + ",".join([field.__repr__() for field in self.fields]) + "))" def __eq__(self, other): return (isinstance(other, self.__class__) and \ @@ -319,7 +335,7 @@ def _parse_datatype_string(datatype_string): :return: >>> def check_datatype(datatype): - ... scala_datatype = sqlCtx._ssql_ctx.parseDataType(datatype._get_scala_type_string()) + ... scala_datatype = sqlCtx._ssql_ctx.parseDataType(datatype.__repr__()) ... python_datatype = _parse_datatype_string(scala_datatype.toString()) ... return datatype == python_datatype >>> check_datatype(StringType()) @@ -536,7 +552,7 @@ def applySchema(self, rdd, schema): True """ jrdd = self._pythonToJavaMap(rdd._jrdd) - srdd = self._ssql_ctx.applySchema(jrdd.rdd(), schema._get_scala_type_string()) + srdd = self._ssql_ctx.applySchema(jrdd.rdd(), schema.__repr__()) return SchemaRDD(srdd, self) def registerRDDAsTable(self, rdd, tableName): @@ -569,7 +585,7 @@ def parquetFile(self, path): jschema_rdd = self._ssql_ctx.parquetFile(path) return SchemaRDD(jschema_rdd, self) - def jsonFile(self, path, schema = None): + def jsonFile(self, path, schema=None): """Loads a text file storing one JSON object per line as a L{SchemaRDD}. If the schema is provided, applies the given schema to this JSON dataset. @@ -618,11 +634,11 @@ def jsonFile(self, path, schema = None): if schema is None: jschema_rdd = self._ssql_ctx.jsonFile(path) else: - scala_datatype = self._ssql_ctx.parseDataType(schema._get_scala_type_string()) + scala_datatype = self._ssql_ctx.parseDataType(schema.__repr__()) jschema_rdd = self._ssql_ctx.jsonFile(path, scala_datatype) return SchemaRDD(jschema_rdd, self) - def jsonRDD(self, rdd, schema = None): + def jsonRDD(self, rdd, schema=None): """Loads an RDD storing one JSON object per string as a L{SchemaRDD}. If the schema is provided, applies the given schema to this JSON dataset. 
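A minimal sketch of the schema-aware call (it assumes a running `sc` and `sqlCtx`, as the other doctests in this module do; the JSON data and field names are made up for illustration):

        >>> strings = sc.parallelize(['{"name": "alice", "age": 2}'])
        >>> schema = StructType([StructField("name", StringType(), True),
        ...                      StructField("age", LongType(), True)])
        >>> srdd = sqlCtx.jsonRDD(strings, schema)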
@@ -672,7 +688,7 @@ def func(split, iterator): if schema is None: jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd()) else: - scala_datatype = self._ssql_ctx.parseDataType(schema._get_scala_type_string()) + scala_datatype = self._ssql_ctx.parseDataType(schema.__repr__()) jschema_rdd = self._ssql_ctx.jsonRDD(jrdd.rdd(), scala_datatype) return SchemaRDD(jschema_rdd, self) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 02bdb64f308a5..f847355a43537 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -109,12 +109,12 @@ class Analyzer(catalog: Catalog, registry: FunctionRegistry, caseSensitive: Bool object ResolveReferences extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case q: LogicalPlan if q.childrenResolved => - logger.trace(s"Attempting to resolve ${q.simpleString}") + logTrace(s"Attempting to resolve ${q.simpleString}") q transformExpressions { case u @ UnresolvedAttribute(name) => // Leave unchanged if resolution fails. Hopefully will be resolved next round. val result = q.resolve(name).getOrElse(u) - logger.debug(s"Resolving $u to $result") + logDebug(s"Resolving $u to $result") result } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 47c7ad076ad07..e94f2a3bea63e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -75,7 +75,7 @@ trait HiveTypeCoercion { // Leave the same if the dataTypes match. 
case Some(newType) if a.dataType == newType.dataType => a case Some(newType) => - logger.debug(s"Promoting $a to $newType in ${q.simpleString}}") + logDebug(s"Promoting $a to $newType in ${q.simpleString}}") newType } } @@ -154,7 +154,7 @@ trait HiveTypeCoercion { (Alias(Cast(l, StringType), l.name)(), r) case (l, r) if l.dataType != r.dataType => - logger.debug(s"Resolving mismatched union input ${l.dataType}, ${r.dataType}") + logDebug(s"Resolving mismatched union input ${l.dataType}, ${r.dataType}") findTightestCommonType(l.dataType, r.dataType).map { widestType => val newLeft = if (l.dataType == widestType) l else Alias(Cast(l, widestType), l.name)() @@ -170,7 +170,7 @@ trait HiveTypeCoercion { val newLeft = if (castedLeft.map(_.dataType) != left.output.map(_.dataType)) { - logger.debug(s"Widening numeric types in union $castedLeft ${left.output}") + logDebug(s"Widening numeric types in union $castedLeft ${left.output}") Project(castedLeft, left) } else { left @@ -178,7 +178,7 @@ trait HiveTypeCoercion { val newRight = if (castedRight.map(_.dataType) != right.output.map(_.dataType)) { - logger.debug(s"Widening numeric types in union $castedRight ${right.output}") + logDebug(s"Widening numeric types in union $castedRight ${right.output}") Project(castedRight, right) } else { right diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala index cbc214d442064..92a30810c736d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/BoundAttribute.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.catalyst.expressions -import com.typesafe.scalalogging.slf4j.Logging - +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.trees import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.plans.QueryPlan @@ -80,7 +79,7 @@ object BindReferences extends Logging { // produce new attributes that can't be bound. Likely the right thing to do is remove // this rule and require all operators to explicitly bind to the input schema that // they specify. 
- logger.debug(s"Couldn't find $a in ${input.mkString("[", ",", "]")}") + logDebug(s"Couldn't find $a in ${input.mkString("[", ",", "]")}") a } else { BoundReference(ordinal, a) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala index 4ff5791635f4c..5839c9f7c43ef 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/QueryPlanner.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.catalyst.planning -import com.typesafe.scalalogging.slf4j.Logging - +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.trees.TreeNode diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala index b8ae326be6fab..820eaeb75f768 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/planning/patterns.scala @@ -19,8 +19,7 @@ package org.apache.spark.sql.catalyst.planning import scala.annotation.tailrec -import com.typesafe.scalalogging.slf4j.Logging - +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -114,7 +113,7 @@ object ExtractEquiJoinKeys extends Logging with PredicateHelper { def unapply(plan: LogicalPlan): Option[ReturnType] = plan match { case join @ Join(left, right, joinType, condition) => - logger.debug(s"Considering join on: $condition") + logDebug(s"Considering join on: $condition") // Find equi-join predicates that can be evaluated before the join, and thus can be used // as join keys. 
val (joinPredicates, otherPredicates) = @@ -132,7 +131,7 @@ object ExtractEquiJoinKeys extends Logging with PredicateHelper { val rightKeys = joinKeys.map(_._2) if (joinKeys.nonEmpty) { - logger.debug(s"leftKeys:${leftKeys} | rightKeys:${rightKeys}") + logDebug(s"leftKeys:${leftKeys} | rightKeys:${rightKeys}") Some((joinType, leftKeys, rightKeys, otherPredicates.reduceOption(And), left, right)) } else { None diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala index f39bff8c25164..03414b2301e81 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/Rule.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.catalyst.rules -import com.typesafe.scalalogging.slf4j.Logging - +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.trees.TreeNode abstract class Rule[TreeType <: TreeNode[_]] extends Logging { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala index e70ce66cb745f..e73515ff29377 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/rules/RuleExecutor.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.catalyst.rules -import com.typesafe.scalalogging.slf4j.Logging - +import org.apache.spark.Logging import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.catalyst.util.sideBySide @@ -61,7 +60,7 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { case (plan, rule) => val result = rule(plan) if (!result.fastEquals(plan)) { - logger.trace( + logTrace( s""" |=== Applying Rule ${rule.ruleName} === |${sideBySide(plan.treeString, result.treeString).mkString("\n")} @@ -72,25 +71,25 @@ abstract class RuleExecutor[TreeType <: TreeNode[_]] extends Logging { } iteration += 1 if (iteration > batch.strategy.maxIterations) { - logger.info(s"Max iterations ($iteration) reached for batch ${batch.name}") + logInfo(s"Max iterations ($iteration) reached for batch ${batch.name}") continue = false } if (curPlan.fastEquals(lastPlan)) { - logger.trace(s"Fixed point reached for batch ${batch.name} after $iteration iterations.") + logTrace(s"Fixed point reached for batch ${batch.name} after $iteration iterations.") continue = false } lastPlan = curPlan } if (!batchStartPlan.fastEquals(curPlan)) { - logger.debug( + logDebug( s""" |=== Result of Batch ${batch.name} === |${sideBySide(plan.treeString, curPlan.treeString).mkString("\n")} """.stripMargin) } else { - logger.trace(s"Batch ${batch.name} has no effect.") + logTrace(s"Batch ${batch.name} has no effect.") } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/ArrayType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/ArrayType.java index 61f52055842e6..17334ca31b2b7 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/ArrayType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/ArrayType.java @@ -21,8 +21,13 @@ * The data type representing Lists. * An ArrayType object comprises two fields, {@code DataType elementType} and * {@code boolean containsNull}. The field of {@code elementType} is used to specify the type of - * array elements. 
The field of {@code containsNull} is used to specify if the array can have - * any {@code null} value. + * array elements. The field of {@code containsNull} is used to specify if the array has + * {@code null} values. + * + * To create an {@link ArrayType}, + * {@link org.apache.spark.sql.api.java.types.DataType#createArrayType(DataType)} or + * {@link org.apache.spark.sql.api.java.types.DataType#createArrayType(DataType, boolean)} + * should be used. */ public class ArrayType extends DataType { private DataType elementType; diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/BinaryType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/BinaryType.java index c33ee5e25cd32..61703179850e9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/BinaryType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/BinaryType.java @@ -19,6 +19,8 @@ /** * The data type representing byte[] values. + * + * {@code BinaryType} is represented by the singleton object {@link DataType#BinaryType}. */ public class BinaryType extends DataType { protected BinaryType() {} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/BooleanType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/BooleanType.java index 38981a21da58d..8fa24d85d1238 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/BooleanType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/BooleanType.java @@ -17,6 +17,11 @@ package org.apache.spark.sql.api.java.types; +/** + * The data type representing boolean and Boolean values. + * + * {@code BooleanType} is represented by the singleton object {@link DataType#BooleanType}. + */ public class BooleanType extends DataType { protected BooleanType() {} } diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/ByteType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/ByteType.java index 16b0d9ecf688c..2de32978e2705 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/ByteType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/ByteType.java @@ -18,7 +18,9 @@ package org.apache.spark.sql.api.java.types; /** - * The data type representing Byte values. + * The data type representing byte and Byte values. + * + * {@code ByteType} is represented by the singleton object {@link DataType#ByteType}. */ public class ByteType extends DataType { protected ByteType() {} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/DataType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/DataType.java index c67287dea8ba6..6fd04aa2c6c9c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/DataType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/DataType.java @@ -23,6 +23,9 @@ /** * The base type of all Spark SQL data types. + * + * To get/create specific data type, users should use singleton objects and factory methods + * provided by this class. */ public abstract class DataType { @@ -81,6 +84,21 @@ public abstract class DataType { */ public static final ShortType ShortType = new ShortType(); + /** + * Creates an ArrayType by specifying the data type of elements ({@code elementType}). + * The field of {@code containsNull} is set to {@code false}. 
+ * + * @param elementType + * @return + */ + public static ArrayType createArrayType(DataType elementType) { + if (elementType == null) { + throw new IllegalArgumentException("elementType should not be null."); + } + + return new ArrayType(elementType, false); + } + /** * Creates an ArrayType by specifying the data type of elements ({@code elementType}) and * whether the array contains null values ({@code containsNull}). @@ -98,7 +116,8 @@ public static ArrayType createArrayType(DataType elementType, boolean containsNu /** * Creates a MapType by specifying the data type of keys ({@code keyType}) and values - * ({@code keyType}). + * ({@code keyType}). The field of {@code valueContainsNull} is set to {@code true}. + * * @param keyType * @param valueType * @return diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/DecimalType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/DecimalType.java index d483824999e85..9250491a2d2ca 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/DecimalType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/DecimalType.java @@ -19,6 +19,8 @@ /** * The data type representing java.math.BigDecimal values. + * + * {@code DecimalType} is represented by the singleton object {@link DataType#DecimalType}. */ public class DecimalType extends DataType { protected DecimalType() {} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/DoubleType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/DoubleType.java index 13a7bf6bbb5ed..3e86917fddc4b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/DoubleType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/DoubleType.java @@ -18,7 +18,9 @@ package org.apache.spark.sql.api.java.types; /** - * The data type representing Double values. + * The data type representing double and Double values. + * + * {@code DoubleType} is represented by the singleton object {@link DataType#DoubleType}. */ public class DoubleType extends DataType { protected DoubleType() {} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/FloatType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/FloatType.java index bf47d4fc1fa07..fa860d40176ef 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/FloatType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/FloatType.java @@ -18,7 +18,9 @@ package org.apache.spark.sql.api.java.types; /** - * The data type representing Float values. + * The data type representing float and Float values. + * + * {@code FloatType} is represented by the singleton object {@link DataType#FloatType}. */ public class FloatType extends DataType { protected FloatType() {} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/IntegerType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/IntegerType.java index f41ec2260df6b..bd973eca2c3ce 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/IntegerType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/IntegerType.java @@ -18,7 +18,9 @@ package org.apache.spark.sql.api.java.types; /** - * The data type representing Int values. + * The data type representing int and Integer values. + * + * {@code IntegerType} is represented by the singleton object {@link DataType#IntegerType}. 
*/ public class IntegerType extends DataType { protected IntegerType() {} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/LongType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/LongType.java index 7c73a7b506a2b..e00233304cefa 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/LongType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/LongType.java @@ -18,7 +18,9 @@ package org.apache.spark.sql.api.java.types; /** - * The data type representing Long values. + * The data type representing long and Long values. + * + * {@code LongType} is represented by the singleton object {@link DataType#LongType}. */ public class LongType extends DataType { protected LongType() {} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/MapType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/MapType.java index d116241a0e407..d2270d4b6ff9c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/MapType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/MapType.java @@ -19,9 +19,16 @@ /** * The data type representing Maps. A MapType object comprises two fields, - * {@code DataType keyType} and {@code DataType valueType}. + * {@code DataType keyType}, {@code DataType valueType}, and {@code boolean valueContainsNull}. * The field of {@code keyType} is used to specify the type of keys in the map. * The field of {@code valueType} is used to specify the type of values in the map. + * The field of {@code valueContainsNull} is used to specify if map values have + * {@code null} values. + * + * To create a {@link MapType}, + * {@link org.apache.spark.sql.api.java.types.DataType#createMapType(DataType, DataType)} or + * {@link org.apache.spark.sql.api.java.types.DataType#createMapType(DataType, DataType, boolean)} + * should be used. */ public class MapType extends DataType { private DataType keyType; diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/ShortType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/ShortType.java index 8ffa75a835e63..98f9507acf121 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/ShortType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/ShortType.java @@ -18,7 +18,9 @@ package org.apache.spark.sql.api.java.types; /** - * The data type representing Short values. + * The data type representing short and Short values. + * + * {@code ShortType} is represented by the singleton object {@link DataType#ShortType}. */ public class ShortType extends DataType { protected ShortType() {} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/StringType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/StringType.java index dd9be52f8c53b..b8e7dbe646071 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/StringType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/StringType.java @@ -19,6 +19,8 @@ /** * The data type representing String values. + * + * {@code StringType} is represented by the singleton object {@link DataType#StringType}. 
*/ public class StringType extends DataType { protected StringType() {} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/StructField.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/StructField.java index 25c82de9641c5..54e9c11ea415e 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/StructField.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/StructField.java @@ -24,6 +24,10 @@ * The field of {@code dataType} specifies the data type of a StructField. * The field of {@code nullable} specifies if values of a StructField can contain {@code null} * values. + * + * To create a {@link StructField}, + * {@link org.apache.spark.sql.api.java.types.DataType#createStructField(String, DataType, boolean)} + * should be used. */ public class StructField { private String name; diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/StructType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/StructType.java index 17142ff672822..33a42f4b16265 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/StructType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/StructType.java @@ -23,6 +23,11 @@ /** * The data type representing Rows. * A StructType object comprises an array of StructFields. + * + * To create an {@link StructType}, + * {@link org.apache.spark.sql.api.java.types.DataType#createStructType(java.util.List)} or + * {@link org.apache.spark.sql.api.java.types.DataType#createStructType(StructField[])} + * should be used. */ public class StructType extends DataType { private StructField[] fields; diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/TimestampType.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/TimestampType.java index 8c2f203d950c4..65295779f71ec 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/TimestampType.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/TimestampType.java @@ -19,6 +19,8 @@ /** * The data type representing java.sql.Timestamp values. + * + * {@code TimestampType} is represented by the singleton object {@link DataType#TimestampType}. */ public class TimestampType extends DataType { protected TimestampType() {} diff --git a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/package-info.java b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/package-info.java index a1c6fcf1430f5..f169ac65e226f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/api/java/types/package-info.java +++ b/sql/core/src/main/java/org/apache/spark/sql/api/java/types/package-info.java @@ -19,4 +19,4 @@ /** * Allows users to get and create Spark SQL data types. */ -package org.apache.spark.sql.api.java.types; \ No newline at end of file +package org.apache.spark.sql.api.java.types; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index cb48b689903c7..f93839c66c5d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -107,7 +107,8 @@ class SQLContext(@transient val sparkContext: SparkContext) /** * Parses the data type in our internal string representation. The data type string should - * have the same format as the one generate by `toString` in scala. + * have the same format as the one generated by `toString` in scala. + * It is only used by PySpark. 
*/ private[sql] def parseDataType(dataTypeString: String): DataType = { val parser = org.apache.spark.sql.catalyst.types.DataType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala index 00010ef6e798a..67d70d599c3f7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala @@ -99,7 +99,7 @@ private[sql] case class AddExchange(sqlContext: SQLContext) extends Rule[SparkPl !operator.requiredChildDistribution.zip(operator.children).map { case (required, child) => val valid = child.outputPartitioning.satisfies(required) - logger.debug( + logDebug( s"${if (valid) "Valid" else "Invalid"} distribution," + s"required: $required current: ${child.outputPartitioning}") valid diff --git a/sql/core/src/main/scala/org/apache/spark/sql/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/package.scala index 819e36bbf9c02..e7732fd7d8336 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/package.scala @@ -17,10 +17,7 @@ package org.apache.spark -import scala.collection.JavaConverters._ - import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.sql.api.java.types.{DataType => JDataType, StructField => JStructField} /** * Allows the execution of relational queries, including those expressed in SQL using Spark. @@ -243,10 +240,12 @@ package object sql { /** * :: DeveloperApi :: * - * The data type representing `Seq`s. + * The data type for collections of multiple values. + * Internally these are represented as columns that contain a ``scala.collection.Seq``. + * * An [[ArrayType]] object comprises two fields, `elementType: [[DataType]]` and * `containsNull: Boolean`. The field of `elementType` is used to specify the type of - * array elements. The field of `containsNull` is used to specify if the array has `null` valus. + * array elements. The field of `containsNull` is used to specify if the array has `null` values. * * @group dataType */
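Taken together, the Python side of this API can be exercised roughly as follows (an illustrative sketch, assuming a running SparkContext `sc`, a SQLContext `sqlCtx`, and an RDD of Python dicts whose keys match the field names; all names and data are made up):

    from pyspark.sql import (StructType, StructField, StringType, IntegerType,
                             ArrayType, MapType)

    # Build a schema with the public type constructors. Per the guidance above,
    # prefer IntegerType/DoubleType over ByteType/ShortType/FloatType in Python.
    schema = StructType([
        StructField("name", StringType(), False),
        StructField("age", IntegerType(), True)])

    # Complex types compose; containsNull defaults to False for ArrayType and
    # valueContainsNull defaults to True for MapType.
    nested = ArrayType(MapType(StringType(), IntegerType()))

    people = sc.parallelize([{"name": "alice", "age": 2}])
    srdd = sqlCtx.applySchema(people, schema)   # schema.__repr__() is parsed on the JVM side
    sqlCtx.registerRDDAsTable(srdd, "people")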