-
Notifications
You must be signed in to change notification settings - Fork 28.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-24305][SQL][FOLLOWUP] Avoid serialization of private fields in collection expressions. #21352
[SPARK-24305][SQL][FOLLOWUP] Avoid serialization of private fields in collection expressions. #21352
Changes from 7 commits
ded67f5
e96962e
f6368b5
2862d3e
a4d1e7f
62c55ad
294ac69
94b86a2
a0abc25
872ef99
fd3a945
922d2f0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -168,27 +168,21 @@ case class ArraysZip(children: Seq[Expression]) extends Expression with ExpectsI | |
|
||
override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.length)(ArrayType) | ||
|
||
override def dataType: DataType = ArrayType(mountSchema) | ||
|
||
override def nullable: Boolean = children.exists(_.nullable) | ||
|
||
private lazy val arrayTypes = children.map(_.dataType.asInstanceOf[ArrayType]) | ||
|
||
private lazy val arrayElementTypes = arrayTypes.map(_.elementType) | ||
|
||
@transient private lazy val mountSchema: StructType = { | ||
@transient override lazy val dataType: DataType = { | ||
val fields = children.zip(arrayElementTypes).zipWithIndex.map { | ||
case ((expr: NamedExpression, elementType), _) => | ||
StructField(expr.name, elementType, nullable = true) | ||
case ((_, elementType), idx) => | ||
StructField(idx.toString, elementType, nullable = true) | ||
} | ||
StructType(fields) | ||
ArrayType(StructType(fields), containsNull = false) | ||
} | ||
|
||
@transient lazy val numberOfArrays: Int = children.length | ||
override def nullable: Boolean = children.exists(_.nullable) | ||
|
||
@transient lazy val genericArrayData = classOf[GenericArrayData].getName | ||
private def arrayElementTypes = children.map(_.dataType.asInstanceOf[ArrayType].elementType) | ||
|
||
private def genericArrayData = classOf[GenericArrayData].getName | ||
|
||
def emptyInputGenCode(ev: ExprCode): ExprCode = { | ||
ev.copy(code""" | ||
|
@@ -256,7 +250,7 @@ case class ArraysZip(children: Seq[Expression]) extends Expression with ExpectsI | |
("ArrayData[]", arrVals) :: Nil) | ||
|
||
val initVariables = s""" | ||
|ArrayData[] $arrVals = new ArrayData[$numberOfArrays]; | ||
|ArrayData[] $arrVals = new ArrayData[${children.length}]; | ||
|int $biggestCardinality = 0; | ||
|${CodeGenerator.javaType(dataType)} ${ev.value} = null; | ||
""".stripMargin | ||
|
@@ -268,7 +262,7 @@ case class ArraysZip(children: Seq[Expression]) extends Expression with ExpectsI | |
|if (!${ev.isNull}) { | ||
| Object[] $args = new Object[$biggestCardinality]; | ||
| for (int $i = 0; $i < $biggestCardinality; $i ++) { | ||
| Object[] $currentRow = new Object[$numberOfArrays]; | ||
| Object[] $currentRow = new Object[${children.length}]; | ||
| $getValueForTypeSplitted | ||
| $args[$i] = new $genericInternalRow($currentRow); | ||
| } | ||
|
@@ -278,7 +272,7 @@ case class ArraysZip(children: Seq[Expression]) extends Expression with ExpectsI | |
} | ||
|
||
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { | ||
if (numberOfArrays == 0) { | ||
if (children.length == 0) { | ||
emptyInputGenCode(ev) | ||
} else { | ||
nonEmptyInputGenCode(ctx, ev) | ||
|
@@ -360,7 +354,7 @@ case class MapEntries(child: Expression) extends UnaryExpression with ExpectsInp | |
|
||
override def inputTypes: Seq[AbstractDataType] = Seq(MapType) | ||
|
||
lazy val childDataType: MapType = child.dataType.asInstanceOf[MapType] | ||
private def childDataType: MapType = child.dataType.asInstanceOf[MapType] | ||
|
||
override def dataType: DataType = { | ||
ArrayType( | ||
|
@@ -741,14 +735,15 @@ case class MapConcat(children: Seq[Expression]) extends Expression { | |
since = "2.4.0") | ||
case class MapFromEntries(child: Expression) extends UnaryExpression { | ||
|
||
@transient | ||
private lazy val dataTypeDetails: Option[(MapType, Boolean, Boolean)] = child.dataType match { | ||
case ArrayType( | ||
StructType(Array( | ||
StructField(_, keyType, keyNullable, _), | ||
StructField(_, valueType, valueNullable, _))), | ||
containsNull) => Some((MapType(keyType, valueType, valueNullable), keyNullable, containsNull)) | ||
case _ => None | ||
@transient private lazy val dataTypeDetails: Option[(MapType, Boolean, Boolean)] = { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an unneeded change, isn't it? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Here I wanted to be consistent in terms of formatting. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see, but this seems an unneeded change to me, and I think there are other places where we use this syntax, so I see no reason to change it. |
||
child.dataType match { | ||
case ArrayType( | ||
StructType(Array( | ||
StructField(_, kt, kn, _), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: Is there any reason to change variable names? It would be good to minimize differences for review and ease of understanding. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The motivation is described here. I will revert this piece of code shortly. |
||
StructField(_, vt, vn, _))), | ||
cn) => Some((MapType(kt, vt, vn), kn, cn)) | ||
case _ => None | ||
} | ||
} | ||
|
||
private def nullEntries: Boolean = dataTypeDetails.get._3 | ||
|
@@ -953,8 +948,7 @@ trait ArraySortLike extends ExpectsInputTypes { | |
|
||
protected def nullOrder: NullOrder | ||
|
||
@transient | ||
private lazy val lt: Comparator[Any] = { | ||
@transient private lazy val lt: Comparator[Any] = { | ||
val ordering = arrayExpression.dataType match { | ||
case _ @ ArrayType(n: AtomicType, _) => n.ordering.asInstanceOf[Ordering[Any]] | ||
case _ @ ArrayType(a: ArrayType, _) => a.interpretedOrdering.asInstanceOf[Ordering[Any]] | ||
|
@@ -976,8 +970,7 @@ trait ArraySortLike extends ExpectsInputTypes { | |
} | ||
} | ||
|
||
@transient | ||
private lazy val gt: Comparator[Any] = { | ||
@transient private lazy val gt: Comparator[Any] = { | ||
val ordering = arrayExpression.dataType match { | ||
case _ @ ArrayType(n: AtomicType, _) => n.ordering.asInstanceOf[Ordering[Any]] | ||
case _ @ ArrayType(a: ArrayType, _) => a.interpretedOrdering.asInstanceOf[Ordering[Any]] | ||
|
@@ -1215,7 +1208,7 @@ case class Reverse(child: Expression) extends UnaryExpression with ImplicitCastI | |
|
||
override def dataType: DataType = child.dataType | ||
|
||
lazy val elementType: DataType = dataType.asInstanceOf[ArrayType].elementType | ||
private def elementType: DataType = dataType.asInstanceOf[ArrayType].elementType | ||
|
||
override def nullSafeEval(input: Any): Any = input match { | ||
case a: ArrayData => new GenericArrayData(a.toObjectArray(elementType).reverse) | ||
|
@@ -1607,7 +1600,7 @@ case class Slice(x: Expression, start: Expression, length: Expression) | |
|
||
override def children: Seq[Expression] = Seq(x, start, length) | ||
|
||
lazy val elementType: DataType = x.dataType.asInstanceOf[ArrayType].elementType | ||
private def elementType: DataType = x.dataType.asInstanceOf[ArrayType].elementType | ||
|
||
override def nullSafeEval(xVal: Any, startVal: Any, lengthVal: Any): Any = { | ||
val startInt = startVal.asInstanceOf[Int] | ||
|
@@ -1893,7 +1886,7 @@ case class ArrayMin(child: Expression) extends UnaryExpression with ImplicitCast | |
|
||
override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType) | ||
|
||
private lazy val ordering = TypeUtils.getInterpretedOrdering(dataType) | ||
@transient private lazy val ordering = TypeUtils.getInterpretedOrdering(dataType) | ||
|
||
override def checkInputDataTypes(): TypeCheckResult = { | ||
val typeCheckResult = super.checkInputDataTypes() | ||
|
@@ -1958,7 +1951,7 @@ case class ArrayMax(child: Expression) extends UnaryExpression with ImplicitCast | |
|
||
override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType) | ||
|
||
private lazy val ordering = TypeUtils.getInterpretedOrdering(dataType) | ||
@transient private lazy val ordering = TypeUtils.getInterpretedOrdering(dataType) | ||
|
||
override def checkInputDataTypes(): TypeCheckResult = { | ||
val typeCheckResult = super.checkInputDataTypes() | ||
|
@@ -2213,9 +2206,7 @@ case class ElementAt(left: Expression, right: Expression) extends GetMapValueUti | |
""") | ||
case class Concat(children: Seq[Expression]) extends Expression { | ||
|
||
private val MAX_ARRAY_LENGTH: Int = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH | ||
|
||
val allowedTypes = Seq(StringType, BinaryType, ArrayType) | ||
private def allowedTypes: Seq[AbstractDataType] = Seq(StringType, BinaryType, ArrayType) | ||
|
||
override def checkInputDataTypes(): TypeCheckResult = { | ||
if (children.isEmpty) { | ||
|
@@ -2234,7 +2225,7 @@ case class Concat(children: Seq[Expression]) extends Expression { | |
|
||
override def dataType: DataType = children.map(_.dataType).headOption.getOrElse(StringType) | ||
|
||
lazy val javaType: String = CodeGenerator.javaType(dataType) | ||
private def javaType: String = CodeGenerator.javaType(dataType) | ||
|
||
override def nullable: Boolean = children.exists(_.nullable) | ||
|
||
|
@@ -2254,9 +2245,10 @@ case class Concat(children: Seq[Expression]) extends Expression { | |
} else { | ||
val arrayData = inputs.map(_.asInstanceOf[ArrayData]) | ||
val numberOfElements = arrayData.foldLeft(0L)((sum, ad) => sum + ad.numElements()) | ||
if (numberOfElements > MAX_ARRAY_LENGTH) { | ||
if (numberOfElements > ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) { | ||
throw new RuntimeException(s"Unsuccessful try to concat arrays with $numberOfElements" + | ||
s" elements due to exceeding the array size limit $MAX_ARRAY_LENGTH.") | ||
" elements due to exceeding the array size limit " + | ||
ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH + ".") | ||
} | ||
val finalData = new Array[AnyRef](numberOfElements.toInt) | ||
var position = 0 | ||
|
@@ -2314,9 +2306,10 @@ case class Concat(children: Seq[Expression]) extends Expression { | |
|for (int z = 0; z < ${children.length}; z++) { | ||
| $numElements += args[z].numElements(); | ||
|} | ||
|if ($numElements > $MAX_ARRAY_LENGTH) { | ||
|if ($numElements > ${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}) { | ||
| throw new RuntimeException("Unsuccessful try to concat arrays with " + $numElements + | ||
| " elements due to exceeding the array size limit $MAX_ARRAY_LENGTH."); | ||
| " elements due to exceeding the array size limit" + | ||
| " ${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}."); | ||
|} | ||
""".stripMargin | ||
|
||
|
@@ -2411,15 +2404,13 @@ case class Concat(children: Seq[Expression]) extends Expression { | |
since = "2.4.0") | ||
case class Flatten(child: Expression) extends UnaryExpression { | ||
|
||
private val MAX_ARRAY_LENGTH = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH | ||
|
||
private lazy val childDataType: ArrayType = child.dataType.asInstanceOf[ArrayType] | ||
private def childDataType: ArrayType = child.dataType.asInstanceOf[ArrayType] | ||
|
||
override def nullable: Boolean = child.nullable || childDataType.containsNull | ||
|
||
override def dataType: DataType = childDataType.elementType | ||
|
||
lazy val elementType: DataType = dataType.asInstanceOf[ArrayType].elementType | ||
private def elementType: DataType = dataType.asInstanceOf[ArrayType].elementType | ||
|
||
override def checkInputDataTypes(): TypeCheckResult = child.dataType match { | ||
case ArrayType(_: ArrayType, _) => | ||
|
@@ -2439,9 +2430,10 @@ case class Flatten(child: Expression) extends UnaryExpression { | |
} else { | ||
val arrayData = elements.map(_.asInstanceOf[ArrayData]) | ||
val numberOfElements = arrayData.foldLeft(0L)((sum, e) => sum + e.numElements()) | ||
if (numberOfElements > MAX_ARRAY_LENGTH) { | ||
if (numberOfElements > ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) { | ||
throw new RuntimeException("Unsuccessful try to flatten an array of arrays with " + | ||
s"$numberOfElements elements due to exceeding the array size limit $MAX_ARRAY_LENGTH.") | ||
s"$numberOfElements elements due to exceeding the array size limit " + | ||
ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH + ".") | ||
} | ||
val flattenedData = new Array(numberOfElements.toInt) | ||
var position = 0 | ||
|
@@ -2474,9 +2466,10 @@ case class Flatten(child: Expression) extends UnaryExpression { | |
|for (int z = 0; z < $childVariableName.numElements(); z++) { | ||
| $variableName += $childVariableName.getArray(z).numElements(); | ||
|} | ||
|if ($variableName > $MAX_ARRAY_LENGTH) { | ||
|if ($variableName > ${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}) { | ||
| throw new RuntimeException("Unsuccessful try to flatten an array of arrays with " + | ||
| $variableName + " elements due to exceeding the array size limit $MAX_ARRAY_LENGTH."); | ||
| $variableName + " elements due to exceeding the array size limit" + | ||
| " ${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}."); | ||
|} | ||
""".stripMargin | ||
(code, variableName) | ||
|
@@ -2600,7 +2593,7 @@ case class Sequence( | |
|
||
override def nullable: Boolean = children.exists(_.nullable) | ||
|
||
override lazy val dataType: ArrayType = ArrayType(start.dataType, containsNull = false) | ||
override def dataType: ArrayType = ArrayType(start.dataType, containsNull = false) | ||
|
||
override def checkInputDataTypes(): TypeCheckResult = { | ||
val startType = start.dataType | ||
|
@@ -2631,7 +2624,7 @@ case class Sequence( | |
stepOpt.map(step => if (step.dataType != CalendarIntervalType) Cast(step, widerType) else step), | ||
timeZoneId) | ||
|
||
private lazy val impl: SequenceImpl = dataType.elementType match { | ||
@transient private lazy val impl: SequenceImpl = dataType.elementType match { | ||
case iType: IntegralType => | ||
type T = iType.InternalType | ||
val ct = ClassTag[T](iType.tag.mirror.runtimeClass(iType.tag.tpe)) | ||
|
@@ -2951,8 +2944,6 @@ object Sequence { | |
case class ArrayRepeat(left: Expression, right: Expression) | ||
extends BinaryExpression with ExpectsInputTypes { | ||
|
||
private val MAX_ARRAY_LENGTH = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH | ||
|
||
override def dataType: ArrayType = ArrayType(left.dataType, left.nullable) | ||
|
||
override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType, IntegerType) | ||
|
@@ -2964,9 +2955,9 @@ case class ArrayRepeat(left: Expression, right: Expression) | |
if (count == null) { | ||
null | ||
} else { | ||
if (count.asInstanceOf[Int] > MAX_ARRAY_LENGTH) { | ||
if (count.asInstanceOf[Int] > ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) { | ||
throw new RuntimeException(s"Unsuccessful try to create array with $count elements " + | ||
s"due to exceeding the array size limit $MAX_ARRAY_LENGTH."); | ||
s"due to exceeding the array size limit ${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}."); | ||
} | ||
val element = left.eval(input) | ||
new GenericArrayData(Array.fill(count.asInstanceOf[Int])(element)) | ||
|
@@ -3025,9 +3016,10 @@ case class ArrayRepeat(left: Expression, right: Expression) | |
|if ($count > 0) { | ||
| $numElements = $count; | ||
|} | ||
|if ($numElements > $MAX_ARRAY_LENGTH) { | ||
|if ($numElements > ${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}) { | ||
| throw new RuntimeException("Unsuccessful try to create array with " + $numElements + | ||
| " elements due to exceeding the array size limit $MAX_ARRAY_LENGTH."); | ||
| " elements due to exceeding the array size limit" + | ||
| " ${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}."); | ||
|} | ||
""".stripMargin | ||
|
||
|
@@ -3109,7 +3101,7 @@ case class ArrayRemove(left: Expression, right: Expression) | |
Seq(ArrayType, elementType) | ||
} | ||
|
||
lazy val elementType: DataType = left.dataType.asInstanceOf[ArrayType].elementType | ||
private def elementType: DataType = left.dataType.asInstanceOf[ArrayType].elementType | ||
|
||
@transient private lazy val ordering: Ordering[Any] = | ||
TypeUtils.getInterpretedOrdering(right.dataType) | ||
|
@@ -3226,7 +3218,7 @@ case class ArrayDistinct(child: Expression) | |
|
||
override def dataType: DataType = child.dataType | ||
|
||
@transient lazy val elementType: DataType = dataType.asInstanceOf[ArrayType].elementType | ||
private def elementType: DataType = dataType.asInstanceOf[ArrayType].elementType | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: as this is used in the eval method, with this PR we are re-evaluating this code for each row. Although it is probably not a big issue, I'd rather not introduce a perf regression. WDYT @cloud-fan? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yea, makes sense. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1. If it's used in There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @mgaido91 Thanks for your point! |
||
|
||
@transient private lazy val ordering: Ordering[Any] = | ||
TypeUtils.getInterpretedOrdering(elementType) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this should be a lazy val
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I missed that one. Thanks!