Cleaned up Java UDT Suite, and added warning about element ordering w…

…hen creating schema from Java Bean
apache · Nov 2, 2014 · d063380 · d063380
1 parent a571bb6
commit d063380
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 31 deletions.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/java/JavaSQLContext.scala
@@ -87,6 +87,10 @@ class JavaSQLContext(val sqlContext: SQLContext) extends UDFRegistration {
 
   /**
    * Applies a schema to an RDD of Java Beans.
+   *
+   * WARNING: The ordering of elements in the schema may differ from Scala.
+   *          If you create a [[org.apache.spark.sql.SchemaRDD]] using [[SQLContext]]
+   *          with the same Java Bean, row elements may be in a different order.
    */
   def applySchema(rdd: JavaRDD[_], beanClass: Class[_]): JavaSchemaRDD = {
     val attributeSeq = getSchema(beanClass)
@@ -193,11 +197,16 @@ class JavaSQLContext(val sqlContext: SQLContext) extends UDFRegistration {
     sqlContext.registerRDDAsTable(rdd.baseSchemaRDD, tableName)
   }
 
-  /** Returns a Catalyst Schema for the given java bean class. */
+  /**
+   * Returns a Catalyst Schema for the given java bean class.
+   */
   protected def getSchema(beanClass: Class[_]): Seq[AttributeReference] = {
     // TODO: All of this could probably be moved to Catalyst as it is mostly not Spark specific.
     val beanInfo = Introspector.getBeanInfo(beanClass)
 
+    // Note: The ordering of elements may differ from when the schema is inferred in Scala.
+    //       This is because beanInfo.getPropertyDescriptors gives no guarantees about
+    //       element ordering.
     val fields = beanInfo.getPropertyDescriptors.filterNot(_.getName == "class")
     fields.map { property =>
       val (dataType, nullable) = property.getPropertyType match {

diff --git a/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaUserDefinedTypeSuite.java b/sql/core/src/test/java/org/apache/spark/sql/api/java/JavaUserDefinedTypeSuite.java
@@ -75,42 +75,14 @@ public void useScalaUDT() {
       Assert.assertTrue(actualFeatures.contains(lp.features()));
     }
 
-    List<Row> actual = javaSqlCtx.sql("SELECT * FROM points").collect();
+    List<Row> actual = javaSqlCtx.sql("SELECT label, features FROM points").collect();
     List<MyLabeledPoint> actualPoints =
         new LinkedList<MyLabeledPoint>();
     for (Row r : actual) {
-      // Note: JavaSQLContext.getSchema switches the ordering of the Row elements
-      //       in the MyLabeledPoint case class.
-      actualPoints.add(new MyLabeledPoint(
-          r.getDouble(1), (MyDenseVector)r.get(0)));
+      actualPoints.add(new MyLabeledPoint(r.getDouble(0), (MyDenseVector)r.get(1)));
     }
     for (MyLabeledPoint lp : points) {
       Assert.assertTrue(actualPoints.contains(lp));
     }
-    /*
-    // THIS FAILS BECAUSE JavaSQLContext.getSchema switches the ordering of the Row elements
-    //  in the MyLabeledPoint case class.
-    List<Row> expected = new LinkedList<Row>();
-    expected.add(Row.create(new MyLabeledPoint(1.0,
-        new MyDenseVector(new double[]{0.1, 1.0}))));
-    expected.add(Row.create(new MyLabeledPoint(0.0,
-        new MyDenseVector(new double[]{0.2, 2.0}))));
-    System.out.println("Expected:");
-    for (Row r : expected) {
-      System.out.println("r: " + r.toString());
-      for (int i = 0; i < r.length(); ++i) {
-        System.out.println("  r[i]: " + r.get(i).toString());
-      }
-    }
-
-    System.out.println("Actual:");
-    for (Row r : actual) {
-      System.out.println("r: " + r.toString());
-      for (int i = 0; i < r.length(); ++i) {
-        System.out.println("  r[i]: " + r.get(i).toString());
-      }
-      Assert.assertTrue(expected.contains(r));
-    }
-    */
   }
 }