diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala
index 1a7b9864a6f8f..80a417697eb8b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala
@@ -87,6 +87,10 @@ class JavaSQLContext(val sqlContext: SQLContext) extends UDFRegistration {
 
   /**
    * Applies a schema to an RDD of Java Beans.
+   *
+   * WARNING: The ordering of elements in the schema may differ from Scala.
+   * If you create a [[org.apache.spark.sql.SchemaRDD]] using [[SQLContext]]
+   * with the same Java Bean, row elements may be in a different order.
    */
   def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): JavaSchemaRDD = {
     val attributeSeq = getSchema(beanClass)
@@ -193,11 +197,16 @@ class JavaSQLContext(val sqlContext: SQLContext) extends UDFRegistration {
     sqlContext.registerRDDAsTable(rdd.baseSchemaRDD, tableName)
   }
 
-  /** Returns a Catalyst Schema for the given java bean class. */
+  /**
+   * Returns a Catalyst Schema for the given java bean class.
+   */
   protected def getSchema(beanClass: Class[_]): Seq[AttributeReference] = {
     // TODO: All of this could probably be moved to Catalyst as it is mostly not Spark specific.
     val beanInfo = Introspector.getBeanInfo(beanClass)
 
+    // Note: The ordering of elements may differ from when the schema is inferred in Scala.
+    //       This is because beanInfo.getPropertyDescriptors gives no guarantees about
+    //       element ordering.
     val fields = beanInfo.getPropertyDescriptors.filterNot(_.getName == "class")
     fields.map { property =>
       val (dataType, nullable) = property.getPropertyType match {
diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaUserDefinedTypeSuite.java b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaUserDefinedTypeSuite.java
index f6a68b52a4cbf..0caa8219a63e9 100644
--- a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaUserDefinedTypeSuite.java
+++ b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaUserDefinedTypeSuite.java
@@ -75,42 +75,14 @@ public void useScalaUDT() {
       Assert.assertTrue(actualFeatures.contains(lp.features()));
     }
 
-    List actual = javaSqlCtx.sql("SELECT * FROM points").collect();
+    List actual = javaSqlCtx.sql("SELECT label, features FROM points").collect();
     List actualPoints = new LinkedList();
     for (Row r : actual) {
-      // Note: JavaSQLContext.getSchema switches the ordering of the Row elements
-      //       in the MyLabeledPoint case class.
-      actualPoints.add(new MyLabeledPoint(
-          r.getDouble(1), (MyDenseVector)r.get(0)));
+      actualPoints.add(new MyLabeledPoint(r.getDouble(0), (MyDenseVector)r.get(1)));
     }
     for (MyLabeledPoint lp : points) {
       Assert.assertTrue(actualPoints.contains(lp));
     }
 
-    /*
-    // THIS FAILS BECAUSE JavaSQLContext.getSchema switches the ordering of the Row elements
-    // in the MyLabeledPoint case class.
-    List expected = new LinkedList();
-    expected.add(Row.create(new MyLabeledPoint(1.0,
-        new MyDenseVector(new double[]{0.1, 1.0}))));
-    expected.add(Row.create(new MyLabeledPoint(0.0,
-        new MyDenseVector(new double[]{0.2, 2.0}))));
-    System.out.println("Expected:");
-    for (Row r : expected) {
-      System.out.println("r: " + r.toString());
-      for (int i = 0; i < r.length(); ++i) {
-        System.out.println("  r[i]: " + r.get(i).toString());
-      }
-    }
-
-    System.out.println("Actual:");
-    for (Row r : actual) {
-      System.out.println("r: " + r.toString());
-      for (int i = 0; i < r.length(); ++i) {
-        System.out.println("  r[i]: " + r.get(i).toString());
-      }
-      Assert.assertTrue(expected.contains(r));
-    }
-    */
   }
 }
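The usage pattern this patch points users toward is to project columns by name when querying a table built from a Java Bean, rather than depending on the positional order of Row elements, since beanInfo.getPropertyDescriptors makes no ordering guarantee. The sketch below is only an illustration of that pattern against the Spark 1.x Java SQL API shown in the diff (JavaSQLContext.applySchema, registerRDDAsTable, sql); the Person bean, its fields, and the "people" table name are assumptions made up for the example, not part of the patch.

    import java.io.Serializable;
    import java.util.Arrays;
    import java.util.List;

    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.sql.api.java.JavaSQLContext;
    import org.apache.spark.sql.api.java.JavaSchemaRDD;
    import org.apache.spark.sql.api.java.Row;

    public class BeanSchemaExample {
      // Illustrative bean: after applySchema, the column order derived from its
      // properties is not guaranteed, so queries should name the columns they want.
      public static class Person implements Serializable {
        private String name;
        private double score;
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public double getScore() { return score; }
        public void setScore(double score) { this.score = score; }
      }

      public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local", "BeanSchemaExample");
        JavaSQLContext sqlCtx = new JavaSQLContext(sc);

        Person p = new Person();
        p.setName("alice");
        p.setScore(1.5);
        JavaRDD<Person> people = sc.parallelize(Arrays.asList(p));

        // Apply the bean-derived schema and register the result as a table.
        JavaSchemaRDD schemaRDD = sqlCtx.applySchema(people, Person.class);
        sqlCtx.registerRDDAsTable(schemaRDD, "people");

        // Select columns by name: the projected Row order is then explicit,
        // regardless of how getPropertyDescriptors ordered the bean's fields.
        List<Row> rows = sqlCtx.sql("SELECT name, score FROM people").collect();
        for (Row r : rows) {
          System.out.println(r.getString(0) + ": " + r.getDouble(1));
        }

        sc.stop();
      }
    }

This mirrors the change in JavaUserDefinedTypeSuite above, which replaces "SELECT * FROM points" with an explicit "SELECT label, features FROM points" so the test no longer relies on positional ordering.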