[SPARK-12398] Smart truncation of DataFrame / Dataset toString
When a DataFrame or Dataset has a long schema, we should intelligently truncate its string representation to avoid flooding the screen with unreadable information.
// Standard output
[a: int, b: int]

// Truncate many top-level fields
[a: int, b: string ... 10 more fields]

// Truncate long inner structs
[a: struct<a: int ... 10 more fields>]
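
To make the intent concrete, here is a rough sketch of what this looks like from a spark-shell session (the DataFrame below is invented for illustration, but the expected string matches the df3 case in the new DataFrameSuite test further down):

// Hypothetical spark-shell session; only the first two columns are spelled out
val df = Seq((1L, "c2", false, 10)).toDF("c1", "c2", "c3", "c4")
df.toString
// [c1: bigint, c2: string ... 2 more fields]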

Author: Dilip Biswal <[email protected]>

Closes #10373 from dilipbiswal/spark-12398.
dilipbiswal authored and marmbrus committed Dec 21, 2015
1 parent 1920d72 commit 474eb21
Showing 4 changed files with 73 additions and 1 deletion.
3 changes: 3 additions & 0 deletions sql/catalyst/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -66,6 +66,9 @@ abstract class DataType extends AbstractDataType {
  /** Readable string representation for the type. */
  def simpleString: String = typeName

  /** Readable string representation for the type with truncation */
  private[sql] def simpleString(maxNumberFields: Int): String = simpleString

  /**
   * Check if `this` and `other` are the same data type when ignoring nullability
   * (`StructField.nullable`, `ArrayType.containsNull`, and `MapType.valueContainsNull`).
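The new overload defaults to the untruncated simpleString, so non-struct types are unaffected; only StructType overrides it in this change (next file). For reference, a small sketch of the untruncated names that show up in the examples and tests here, assuming the standard org.apache.spark.sql.types imports:

import org.apache.spark.sql.types._

IntegerType.simpleString            // "int"
LongType.simpleString               // "bigint"
StringType.simpleString             // "string"
ArrayType(StringType).simpleString  // "array<string>"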
17 changes: 17 additions & 0 deletions sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala
@@ -278,6 +278,23 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru
s"struct<${fieldTypes.mkString(",")}>"
}

private[sql] override def simpleString(maxNumberFields: Int): String = {
val builder = new StringBuilder
val fieldTypes = fields.take(maxNumberFields).map {
case f => s"${f.name}: ${f.dataType.simpleString(maxNumberFields)}"
}
builder.append("struct<")
builder.append(fieldTypes.mkString(", "))
if (fields.length > 2) {
if (fields.length - fieldTypes.size == 1) {
builder.append(" ... 1 more field")
} else {
builder.append(" ... " + (fields.length - 2) + " more fields")
}
}
builder.append(">").toString()
}

/**
* Merges with another schema (`StructType`). For a struct field A from `this` and a struct field
* B from `that`,
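To see the truncation rule in isolation, here is a standalone sketch (not Spark's private[sql] API) that mimics StructType.simpleString(maxNumberFields) for the default limit of 2 that Queryable.toString passes in; note the committed code hard-codes 2 in the "... N more fields" arithmetic, so the sketch agrees with it only for that limit:

// Standalone approximation of the truncated struct rendering, for illustration only
def truncatedStruct(fields: Seq[(String, String)], maxNumberFields: Int = 2): String = {
  val shown = fields.take(maxNumberFields).map { case (name, tpe) => s"$name: $tpe" }
  val hidden = fields.length - shown.length
  val suffix =
    if (hidden == 1) " ... 1 more field"
    else if (hidden > 1) s" ... $hidden more fields"
    else ""
  s"struct<${shown.mkString(", ")}$suffix>"
}

truncatedStruct(Seq("_1" -> "bigint", "_2" -> "string", "_3" -> "int"))
// struct<_1: bigint, _2: string ... 1 more field>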
15 changes: 14 additions & 1 deletion sql/core/src/main/scala/org/apache/spark/sql/execution/Queryable.scala
@@ -31,7 +31,20 @@ private[sql] trait Queryable {

  override def toString: String = {
    try {
      schema.map(f => s"${f.name}: ${f.dataType.simpleString}").mkString("[", ", ", "]")
      val builder = new StringBuilder
      val fields = schema.take(2).map {
        case f => s"${f.name}: ${f.dataType.simpleString(2)}"
      }
      builder.append("[")
      builder.append(fields.mkString(", "))
      if (schema.length > 2) {
        if (schema.length - fields.size == 1) {
          builder.append(" ... 1 more field")
        } else {
          builder.append(" ... " + (schema.length - 2) + " more fields")
        }
      }
      builder.append("]").toString()
    } catch {
      case NonFatal(e) =>
        s"Invalid tree; ${e.getMessage}:\n$queryExecution"
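Putting the two pieces together, toString on a wide DataFrame with a nested struct now renders as below (the expected string is taken verbatim from the df7 case in DataFrameSuite; the shell setup and implicits are assumed):

val df = Seq((1L, (1L, "val", 2), 20.0, 1)).toDF("c1", "c2", "c3", "c4")
df.toString
// [c1: bigint, c2: struct<_1: bigint, _2: string ... 1 more field> ... 2 more fields]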
39 changes: 39 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -1177,4 +1177,43 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
    val primitiveUDF = udf((i: Int) => i * 2)
    checkAnswer(df.select(primitiveUDF($"age")), Row(44) :: Row(null) :: Nil)
  }

  test("SPARK-12398 truncated toString") {
    val df1 = Seq((1L, "row1")).toDF("id", "name")
    assert(df1.toString() === "[id: bigint, name: string]")

    val df2 = Seq((1L, "c2", false)).toDF("c1", "c2", "c3")
    assert(df2.toString === "[c1: bigint, c2: string ... 1 more field]")

    val df3 = Seq((1L, "c2", false, 10)).toDF("c1", "c2", "c3", "c4")
    assert(df3.toString === "[c1: bigint, c2: string ... 2 more fields]")

    val df4 = Seq((1L, Tuple2(1L, "val"))).toDF("c1", "c2")
    assert(df4.toString === "[c1: bigint, c2: struct<_1: bigint, _2: string>]")

    val df5 = Seq((1L, Tuple2(1L, "val"), 20.0)).toDF("c1", "c2", "c3")
    assert(df5.toString === "[c1: bigint, c2: struct<_1: bigint, _2: string> ... 1 more field]")

    val df6 = Seq((1L, Tuple2(1L, "val"), 20.0, 1)).toDF("c1", "c2", "c3", "c4")
    assert(df6.toString === "[c1: bigint, c2: struct<_1: bigint, _2: string> ... 2 more fields]")

    val df7 = Seq((1L, Tuple3(1L, "val", 2), 20.0, 1)).toDF("c1", "c2", "c3", "c4")
    assert(
      df7.toString ===
        "[c1: bigint, c2: struct<_1: bigint, _2: string ... 1 more field> ... 2 more fields]")

    val df8 = Seq((1L, Tuple7(1L, "val", 2, 3, 4, 5, 6), 20.0, 1)).toDF("c1", "c2", "c3", "c4")
    assert(
      df8.toString ===
        "[c1: bigint, c2: struct<_1: bigint, _2: string ... 5 more fields> ... 2 more fields]")

    val df9 =
      Seq((1L, Tuple4(1L, Tuple4(1L, 2L, 3L, 4L), 2L, 3L), 20.0, 1)).toDF("c1", "c2", "c3", "c4")
    assert(
      df9.toString ===
        "[c1: bigint, c2: struct<_1: bigint," +
          " _2: struct<_1: bigint," +
          " _2: bigint ... 2 more fields> ... 2 more fields> ... 2 more fields]")

  }
}
