ORC-946: Unified json library (#869)

### What changes were proposed in this pull request? The Java project depends on several JSON libraries. This PR replaces jackson/jettison with Gson. ``` use jackson-core in orc-benchmarks-core org.apache.orc.bench.core.convert.json.JsonWriter use jettison in tools org.apache.orc.tools.KeyTool org.apache.orc.tools.JsonFileDump org.apache.orc.tools.PrintData ``` Gson and jettison behave inconsistently in three ways: 1. The scope of character escaping differs; jettison escapes '/', whereas Gson leaves it unmodified. 2. Floating-point values are written differently; jettison removes trailing zeros and decimal points where possible, whereas Gson leaves the value as is. 3. prettyPrint; jettison's prettyPrint is faulty ('[' and '{' are often joined together without a line break), whereas Gson's prettyPrint is fine. This PR does not preserve jettison's output for these three behaviours. I don't think prettyPrint needs to be made compatible. I would like to hear your opinion on the other two. ### Why are the changes needed? To reduce dependencies and use a single, unified JSON library. ### How was this patch tested? Passes the CIs.
apache · Aug 16, 2021 · 2690bd3 · 2690bd3
1 parent daa57f8
commit 2690bd3
Show file tree

Hide file tree

Showing 9 changed files with 821 additions and 838 deletions.
diff --git a/java/bench/core/pom.xml b/java/bench/core/pom.xml
@@ -31,10 +31,6 @@
   </description>
 
   <dependencies>
-    <dependency>
-      <groupId>com.fasterxml.jackson.core</groupId>
-      <artifactId>jackson-core</artifactId>
-    </dependency>
     <dependency>
       <groupId>com.google.auto.service</groupId>
       <artifactId>auto-service</artifactId>

diff --git a/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonWriter.java b/java/bench/core/src/java/org/apache/orc/bench/core/convert/json/JsonWriter.java
@@ -18,8 +18,6 @@
 
 package org.apache.orc.bench.core.convert.json;
 
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonGenerator;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
@@ -47,7 +45,7 @@
 
 public class JsonWriter implements BatchWriter {
   private final Writer outStream;
-  private final JsonGenerator writer;
+  private final com.google.gson.stream.JsonWriter writer;
   private final TypeDescription schema;
 
   public JsonWriter(Path path, TypeDescription schema,
@@ -56,114 +54,113 @@ public JsonWriter(Path path, TypeDescription schema,
     OutputStream file = path.getFileSystem(conf).create(path, true);
     outStream = new OutputStreamWriter(compression.create(file),
         StandardCharsets.UTF_8);
-    JsonFactory factory = new JsonFactory();
-    factory.setRootValueSeparator("\n");
-    writer = factory.createGenerator(outStream);
+    writer = new com.google.gson.stream.JsonWriter(outStream);
+    writer.setLenient(true);
     this.schema = schema;
   }
 
-  private static void printMap(JsonGenerator writer,
+  private static void printMap(com.google.gson.stream.JsonWriter writer,
                                MapColumnVector vector,
                                TypeDescription schema,
                                int row) throws IOException {
-    writer.writeStartArray();
+    writer.beginArray();
     TypeDescription keyType = schema.getChildren().get(0);
     TypeDescription valueType = schema.getChildren().get(1);
     int offset = (int) vector.offsets[row];
     for (int i = 0; i < vector.lengths[row]; ++i) {
-      writer.writeStartObject();
-      writer.writeFieldName("_key");
+      writer.beginObject();
+      writer.name("_key");
       printValue(writer, vector.keys, keyType, offset + i);
-      writer.writeFieldName("_value");
+      writer.name("_value");
       printValue(writer, vector.values, valueType, offset + i);
-      writer.writeEndObject();
+      writer.endObject();
     }
-    writer.writeEndArray();
+    writer.endArray();
   }
 
-  private static void printList(JsonGenerator writer,
+  private static void printList(com.google.gson.stream.JsonWriter writer,
                                 ListColumnVector vector,
                                 TypeDescription schema,
                                 int row) throws IOException {
-    writer.writeStartArray();
+    writer.beginArray();
     int offset = (int) vector.offsets[row];
     TypeDescription childType = schema.getChildren().get(0);
     for (int i = 0; i < vector.lengths[row]; ++i) {
       printValue(writer, vector.child, childType, offset + i);
     }
-    writer.writeEndArray();
+    writer.endArray();
   }
 
-  private static void printUnion(JsonGenerator writer,
+  private static void printUnion(com.google.gson.stream.JsonWriter writer,
                                  UnionColumnVector vector,
                                  TypeDescription schema,
                                  int row) throws IOException {
     int tag = vector.tags[row];
     printValue(writer, vector.fields[tag], schema.getChildren().get(tag), row);
   }
 
-  static void printStruct(JsonGenerator writer,
+  static void printStruct(com.google.gson.stream.JsonWriter writer,
                           StructColumnVector batch,
                           TypeDescription schema,
                           int row) throws IOException {
-    writer.writeStartObject();
+    writer.beginObject();
     List<String> fieldNames = schema.getFieldNames();
     List<TypeDescription> fieldTypes = schema.getChildren();
     for (int i = 0; i < fieldTypes.size(); ++i) {
-      writer.writeFieldName(fieldNames.get(i));
+      writer.name(fieldNames.get(i));
       printValue(writer, batch.fields[i], fieldTypes.get(i), row);
     }
-    writer.writeEndObject();
+    writer.endObject();
   }
 
-  static void printBinary(JsonGenerator writer, BytesColumnVector vector,
+  static void printBinary(com.google.gson.stream.JsonWriter writer, BytesColumnVector vector,
                           int row) throws IOException {
     StringBuilder buffer = new StringBuilder();
     int offset = vector.start[row];
     for(int i=0; i < vector.length[row]; ++i) {
       int value = 0xff & (int) vector.vector[row][offset + i];
       buffer.append(String.format("%02x", value));
     }
-    writer.writeString(buffer.toString());
+    writer.value(buffer.toString());
   }
 
-  static void printValue(JsonGenerator writer, ColumnVector vector,
+  static void printValue(com.google.gson.stream.JsonWriter writer, ColumnVector vector,
                          TypeDescription schema, int row) throws IOException {
     if (vector.isRepeating) {
       row = 0;
     }
     if (vector.noNulls || !vector.isNull[row]) {
       switch (schema.getCategory()) {
         case BOOLEAN:
-          writer.writeBoolean(((LongColumnVector) vector).vector[row] != 0);
+          writer.value(((LongColumnVector) vector).vector[row] != 0);
           break;
         case BYTE:
         case SHORT:
         case INT:
         case LONG:
-          writer.writeNumber(((LongColumnVector) vector).vector[row]);
+          writer.value(((LongColumnVector) vector).vector[row]);
           break;
         case FLOAT:
         case DOUBLE:
-          writer.writeNumber(((DoubleColumnVector) vector).vector[row]);
+          writer.value(((DoubleColumnVector) vector).vector[row]);
           break;
         case STRING:
         case CHAR:
         case VARCHAR:
-          writer.writeString(((BytesColumnVector) vector).toString(row));
+          writer.value(((BytesColumnVector) vector).toString(row));
           break;
         case BINARY:
           printBinary(writer, (BytesColumnVector) vector, row);
           break;
         case DECIMAL:
-          writer.writeString(((DecimalColumnVector) vector).vector[row].toString());
+          writer.value(((DecimalColumnVector) vector).vector[row].toString());
           break;
         case DATE:
-          writer.writeString(new DateWritable(
+          writer.value(new DateWritable(
               (int) ((LongColumnVector) vector).vector[row]).toString());
           break;
         case TIMESTAMP:
-          writer.writeString(((TimestampColumnVector) vector)
+          writer.value(((TimestampColumnVector) vector)
               .asScratchTimestamp(row).toString());
           break;
         case LIST:
@@ -179,27 +176,26 @@ static void printValue(JsonGenerator writer, ColumnVector vector,
           printUnion(writer, (UnionColumnVector) vector, schema, row);
           break;
         default:
-          throw new IllegalArgumentException("Unknown type " +
-              schema.toString());
+          throw new IllegalArgumentException("Unknown type " + schema);
       }
     } else {
-      writer.writeNull();
+      writer.nullValue();
     }
   }
 
-  static void printRow(JsonGenerator writer,
-                              VectorizedRowBatch batch,
-                              TypeDescription schema,
-                              int row) throws IOException {
+  static void printRow(com.google.gson.stream.JsonWriter writer,
+                       VectorizedRowBatch batch,
+                       TypeDescription schema,
+                       int row) throws IOException {
     if (schema.getCategory() == TypeDescription.Category.STRUCT) {
       List<TypeDescription> fieldTypes = schema.getChildren();
       List<String> fieldNames = schema.getFieldNames();
-      writer.writeStartObject();
+      writer.beginObject();
       for (int c = 0; c < batch.cols.length; ++c) {
-        writer.writeFieldName(fieldNames.get(c));
+        writer.name(fieldNames.get(c));
         printValue(writer, batch.cols[c], fieldTypes.get(c), row);
       }
-      writer.writeEndObject();
+      writer.endObject();
     } else {
       printValue(writer, batch.cols[0], schema, row);
     }
@@ -208,6 +204,7 @@ static void printRow(JsonGenerator writer,
   public void writeBatch(VectorizedRowBatch batch) throws IOException {
     for (int r = 0; r < batch.size; ++r) {
       printRow(writer, batch, schema, r);
+      outStream.write("\n");
     }
   }
 

diff --git a/java/pom.xml b/java/pom.xml
@@ -766,11 +766,6 @@
           </exclusion>
         </exclusions>
       </dependency>
-      <dependency>
-        <groupId>org.codehaus.jettison</groupId>
-        <artifactId>jettison</artifactId>
-        <version>1.4.1</version>
-      </dependency>
       <dependency>
         <groupId>org.jetbrains</groupId>
         <artifactId>annotations</artifactId>

diff --git a/java/tools/pom.xml b/java/tools/pom.xml
@@ -74,10 +74,6 @@
       <groupId>org.apache.hive</groupId>
       <artifactId>hive-storage-api</artifactId>
     </dependency>
-    <dependency>
-      <groupId>org.codehaus.jettison</groupId>
-      <artifactId>jettison</artifactId>
-    </dependency>
     <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>