ORC-1017: Add sizes tool to determine and display the sizes of each column in a set of files. (#925)

### What changes were proposed in this pull request?

This patch adds a new tool that accounts for the total size of a set of ORC files. For files written by ORC 1.5 or later, you'll get a breakdown of the file by column. Several virtual columns are also included (a sample report follows the list):
- `_index`: the indexes that are used for skipping inside the stripes
- `_data`: the data in files written prior to ORC 1.5
- `_stripe_footer`: the stripe metadata
- `_file_footer`: the file metadata
- `_padding`: padding added to align stripes to HDFS block boundaries
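
For illustration, a report would look roughly like the following; the column names and numbers here are made up, not taken from a real run:

```
Percent Bytes/Row Name
 58.31 42.17     credit_cards._elem.card_number
 23.08 16.69     address.street
 9.90  7.16      _index
 5.12  3.70      _stripe_footer
 3.59  2.60      _file_footer
```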

I also added a new method on `TypeDescription` that gets the full field name, which is the inverse of `findSubtype`.
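
A minimal sketch of that round trip, using a made-up schema:

```java
import org.apache.orc.TypeDescription;

public class FullFieldNameDemo {
  public static void main(String[] args) {
    TypeDescription schema = TypeDescription.fromString(
        "struct<name:struct<first:string,last:string>,tags:array<string>>");
    // findSubtype walks down from the root to a nested column...
    TypeDescription first = schema.findSubtype("name.first");
    // ...and getFullFieldName walks back up, rebuilding the dotted path.
    System.out.println(first.getFullFieldName());  // prints "name.first"
    // List elements use the virtual name "_elem".
    System.out.println(schema.findSubtype("tags._elem").getFullFieldName());
  }
}
```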

### Why are the changes needed?

The tool helps diagnose the compression of a set of files.
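
As a sketch of how the tool can be driven programmatically, using the `main(Configuration, String[])` entry point from `ColumnSizes.java` below (the directory path is a placeholder):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.orc.tools.ColumnSizes;

public class SizesRunner {
  public static void main(String[] args) throws Exception {
    // Recursively scans the given roots for "*.orc" files, aggregates the
    // per-column byte counts, and prints the report to stdout. Exits with
    // status 1 if any files fail to read or have a mismatched schema.
    ColumnSizes.main(new Configuration(), new String[]{"/data/events/2021"});
  }
}
```

The uber jar also points its manifest at `org.apache.orc.tools.Driver`, so the same tool is reachable from the command line through that entry point.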

### How was this patch tested?

I added a test of the new `TypeDescription.getFullFieldName` method. I ran the tool over some of the example files and some multi-terabyte directories of production ORC files.

(cherry picked from commit be0762b)
Signed-off-by: Dongjoon Hyun <[email protected]>
omalley authored and dongjoon-hyun committed Dec 16, 2021
1 parent 43b53ea commit 0fbfa66
Showing 6 changed files with 336 additions and 26 deletions.
67 changes: 67 additions & 0 deletions java/core/src/java/org/apache/orc/TypeDescription.java
@@ -862,4 +862,71 @@ public void annotateEncryption(String encryption, String masks) {
+ source);
}
}

/**
* Find the index of a given child object using == comparison.
* @param child The child type
* @return the index, from 0 to N-1, of the child
*/
private int getChildIndex(TypeDescription child) {
for(int i=children.size() - 1; i >= 0; --i) {
if (children.get(i) == child) {
return i;
}
}
throw new IllegalArgumentException("Child not found");
}

/**
* For a complex type, get the partial name for this child. For structures,
* it returns the corresponding field name. For lists and maps, it uses the
* special names "_elem", "_key", and "_value". Unions use the integer index.
* @param child The desired child, which must be the same object (==)
* @return The name of the field for the given child.
*/
private String getPartialName(TypeDescription child) {
switch (category) {
case LIST:
return "_elem";
case MAP:
return getChildIndex(child) == 0 ? "_key" : "_value";
case STRUCT:
return fieldNames.get(getChildIndex(child));
case UNION:
return Integer.toString(getChildIndex(child));
default:
throw new IllegalArgumentException(
"Can't get the field name of a primitive type");
}
}

/**
* Get the full field name for the given type. For
* "struct&lt;a:struct&lt;list&lt;struct&lt;b:int,c:int&gt;&gt;&gt;&gt;" when
* called on c, would return "a._elem.c".
* @return A string that is the inverse of findSubtype
*/
public String getFullFieldName() {
List<String> parts = new ArrayList<>(getId());
TypeDescription current = this;
TypeDescription parent = current.getParent();
// Handle the root as a special case so that it isn't an empty string.
if (parent == null) {
return Integer.toString(current.getId());
}
while (parent != null) {
parts.add(parent.getPartialName(current));
current = parent;
parent = current.getParent();
}
// Put the string together backwards
StringBuilder buffer = new StringBuilder();
for (int part=parts.size() - 1; part >= 0; --part) {
buffer.append(parts.get(part));
if (part != 0) {
buffer.append('.');
}
}
return buffer.toString();
}
}
19 changes: 19 additions & 0 deletions java/core/src/test/org/apache/orc/TestTypeDescription.java
@@ -490,4 +490,23 @@ public void testMaskConflict() {
assertThrows(IllegalArgumentException.class, () ->
schema.annotateEncryption(null,"nullify:name;sha256:name"));
}

@Test
public void testGetFullFieldName() {
TypeDescription schema = TypeDescription.fromString(
"struct<" +
"name:struct<first:string,last:string>," +
"address:struct<street:string,city:string,country:string,post_code:string>," +
"credit_cards:array<struct<card_number:string,expire:date,ccv:string>>," +
"properties:map<string,uniontype<int,string>>>");
for (String column: new String[]{"0", "name", "name.first", "name.last",
"address.street", "address.city",
"credit_cards", "credit_cards._elem",
"credit_cards._elem.card_number",
"properties", "properties._key", "properties._value",
"properties._value.0", "properties._value.1"}) {
assertEquals(column,
schema.findSubtype(column, true).getFullFieldName());
}
}
}
64 changes: 49 additions & 15 deletions java/tools/pom.xml
@@ -115,25 +115,59 @@
<artifactId>maven-compiler-plugin</artifactId>
</plugin>
 <plugin>
-  <artifactId>maven-assembly-plugin</artifactId>
-  <version>${maven-assembly-plugin.version}</version>
-  <configuration>
-    <archive>
-      <manifest>
-        <mainClass>org.apache.orc.tools.Driver</mainClass>
-      </manifest>
-    </archive>
-    <descriptors>
-      <descriptor>src/assembly/uber.xml</descriptor>
-    </descriptors>
-  </configuration>
+  <groupId>org.apache.maven.plugins</groupId>
+  <artifactId>maven-shade-plugin</artifactId>
+  <version>${maven-shade-plugin.version}</version>
   <executions>
     <execution>
-      <id>make-assembly</id> <!-- this is used for inheritance merges -->
-      <phase>package</phase> <!-- bind to the packaging phase -->
+      <phase>package</phase>
       <goals>
-        <goal>single</goal>
+        <goal>shade</goal>
       </goals>
+      <configuration>
+        <artifactSet>
+          <includes>
+            <include>*:*</include>
+          </includes>
+        </artifactSet>
+        <transformers>
+          <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+            <mainClass>org.apache.orc.tools.Driver</mainClass>
+          </transformer>
+        </transformers>
+        <shadedArtifactAttached>true</shadedArtifactAttached>
+        <shadedClassifierName>uber</shadedClassifierName>
+        <relocations>
+          <relocation>
+            <pattern>com.google.protobuf</pattern>
+            <shadedPattern>com.google.protobuf25</shadedPattern>
+          </relocation>
+          <relocation>
+            <pattern>org.apache.hadoop.hive</pattern>
+            <shadedPattern>org.apache.orc.storage</shadedPattern>
+          </relocation>
+          <relocation>
+            <pattern>org.apache.hive</pattern>
+            <shadedPattern>org.apache.orc.storage</shadedPattern>
+          </relocation>
+          <relocation>
+            <pattern>org.apache.commons</pattern>
+            <shadedPattern>org.apache.orc.shade.commons</shadedPattern>
+          </relocation>
+        </relocations>
+        <filters>
+          <filter>
+            <artifact>*:*</artifact>
+            <excludes>
+              <exclude>module-info.class</exclude>
+              <exclude>META-INF/MANIFEST.MF</exclude>
+              <exclude>META-INF/DEPENDENCIES</exclude>
+              <exclude>META-INF/LICENSE</exclude>
+              <exclude>META-INF/NOTICE</exclude>
+            </excludes>
+          </filter>
+        </filters>
+      </configuration>
     </execution>
   </executions>
 </plugin>
12 changes: 1 addition & 11 deletions java/tools/src/findbugs/exclude.xml
@@ -19,17 +19,7 @@
<!-- Java's try with resources causes a false positive.
See https://github.com/SERG-Delft/jpacman/pull/27 . -->
 <Match>
-  <Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE"/>
-  <Class name="~org\.apache\.orc\.tools\.(ScanData|PrintVersion)"/>
-  <Method name="main"/>
-</Match>
-<Match>
-  <Bug pattern="RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE"/>
-  <Class name="~org\.apache\.orc.*\.Test.*"/>
-</Match>
-<Match>
-  <Bug pattern="RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE"/>
-  <Class name="~org\.apache\.orc.tools.(RowCount|ScanData)"/>
+  <Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE,RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE"/>
 </Match>
<Match>
<Bug pattern="REC_CATCH_EXCEPTION"/>
196 changes: 196 additions & 0 deletions java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
@@ -0,0 +1,196 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.orc.tools;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;
import org.apache.orc.TypeDescription;

import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;

/**
* Given a set of paths, finds all of the "*.orc" files under them and
* prints the size of each column, both as a percentage of the total and as
* the number of bytes per row.
*/
public class ColumnSizes {
final Configuration conf;
final TypeDescription schema;
final long[] columnSizes;
int goodFiles = 0;
long rows = 0;
long padding = 0;
long totalSize = 0;
long stripeFooterSize = 0;
long fileFooterSize = 0;
long stripeIndex = 0;
// data bytes that aren't assigned to a specific column
long stripeData = 0;

public ColumnSizes(Configuration conf,
LocatedFileStatus file) throws IOException {
this.conf = conf;
try (Reader reader = OrcFile.createReader(file.getPath(),
OrcFile.readerOptions(conf))) {
this.schema = reader.getSchema();
columnSizes = new long[schema.getMaximumId() + 1];
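      // One counter per column id; ids run from 0 (the root) to getMaximumId().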
addReader(file, reader);
}
}

private void checkStripes(LocatedFileStatus file,
Reader reader) {
// Count the magic as file overhead
long offset = OrcFile.MAGIC.length();
fileFooterSize += offset;

for (StripeInformation stripe: reader.getStripes()) {
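      // Any gap between the end of the previous stripe (or the file header)
      // and this stripe's offset is padding.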
padding += stripe.getOffset() - offset;
stripeIndex += stripe.getIndexLength();
stripeData += stripe.getDataLength();
stripeFooterSize += stripe.getFooterLength();
offset = stripe.getOffset() + stripe.getLength();
}
// Add everything else as the file footer
fileFooterSize += file.getLen() - offset;
}

private boolean addReader(LocatedFileStatus file,
Reader reader) {
// Validate that the schemas are the same
TypeDescription newSchema = reader.getSchema();
if (schema.equals(newSchema)) {
goodFiles += 1;
rows += reader.getNumberOfRows();
totalSize += file.getLen();
checkStripes(file, reader);
ColumnStatistics[] colStats = reader.getStatistics();
for (int c = 0; c < colStats.length && c < columnSizes.length; c++) {
columnSizes[c] += colStats[c].getBytesOnDisk();
// Don't double count. Either count the bytes as stripe data or as
// part of a column.
stripeData -= colStats[c].getBytesOnDisk();
}
} else {
System.err.println("Ignoring " + file.getPath()
+ " because of schema mismatch: " + newSchema);
return false;
}
return true;
}

public boolean addFile(LocatedFileStatus file) throws IOException {
try (Reader reader = OrcFile.createReader(file.getPath(),
OrcFile.readerOptions(conf))) {
return addReader(file, reader);
}
}

private static class StringLongPair {
final String name;
final long size;
StringLongPair(String name, long size) {
this.name = name;
this.size = size;
}
}

private void printResults(PrintStream out) {
List<StringLongPair> sizes = new ArrayList<>(columnSizes.length + 5);
for(int column = 0; column < columnSizes.length; ++column) {
if (columnSizes[column] > 0) {
sizes.add(new StringLongPair(
schema.findSubtype(column).getFullFieldName(),
columnSizes[column]));
}
}
if (padding > 0) {
sizes.add(new StringLongPair("_padding", padding));
}
if (stripeFooterSize > 0) {
sizes.add(new StringLongPair("_stripe_footer", stripeFooterSize));
}
if (fileFooterSize > 0) {
sizes.add(new StringLongPair("_file_footer", fileFooterSize));
}
if (stripeIndex > 0) {
sizes.add(new StringLongPair("_index", stripeIndex));
}
if (stripeData > 0) {
sizes.add(new StringLongPair("_data", stripeData));
}
// sort by descending size, ascending name
sizes.sort((x, y) -> x.size != y.size ?
Long.compare(y.size, x.size) : x.name.compareTo(y.name));
out.println("Percent Bytes/Row Name");
for (StringLongPair item: sizes) {
out.println(String.format(" %-5.2f %-9.2f %s",
100.0 * item.size / totalSize, (double) item.size / rows, item.name));
}
}

public static void main(Configuration conf, String[] args) throws IOException {
ColumnSizes result = null;
int badFiles = 0;
for(String root: args) {
Path rootPath = new Path(root);
FileSystem fs = rootPath.getFileSystem(conf);
for(RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, true); itr.hasNext(); ) {
LocatedFileStatus status = itr.next();
if (status.isFile() && status.getPath().getName().endsWith(".orc")) {
try {
if (result == null) {
result = new ColumnSizes(conf, status);
} else {
if (!result.addFile(status)) {
badFiles += 1;
}
}
} catch (IOException err) {
badFiles += 1;
System.err.println("Failed to read " + status.getPath());
}
}
}
}
if (result == null) {
System.err.println("No files found");
} else {
result.printResults(System.out);
}
if (badFiles > 0) {
System.err.println(badFiles + " bad ORC files found.");
System.exit(1);
}
}

public static void main(String[] args) throws IOException {
main(new Configuration(), args);
}
}
