From 327a7226da1628bfe8cd21ead5df02fe0498828b Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Thu, 30 Sep 2021 21:35:19 -0700 Subject: [PATCH 1/2] ORC-1017: Add sizes tool to determine and display the sizes of each column in a set of files. --- .../java/org/apache/orc/TypeDescription.java | 67 ++++++ .../org/apache/orc/TestTypeDescription.java | 19 ++ .../org/apache/orc/tools/ColumnSizes.java | 196 ++++++++++++++++++ .../src/java/org/apache/orc/tools/Driver.java | 4 + 4 files changed, 286 insertions(+) create mode 100644 java/tools/src/java/org/apache/orc/tools/ColumnSizes.java diff --git a/java/core/src/java/org/apache/orc/TypeDescription.java b/java/core/src/java/org/apache/orc/TypeDescription.java index c4b7b26015..f422ea442e 100644 --- a/java/core/src/java/org/apache/orc/TypeDescription.java +++ b/java/core/src/java/org/apache/orc/TypeDescription.java @@ -862,4 +862,71 @@ public void annotateEncryption(String encryption, String masks) { + source); } } + + /** + * Find the index of a given child object using == comparison. + * @param child The child type + * @return the index (0 to N-1) of the child within this type's children. + */ + private int getChildIndex(TypeDescription child) { + for(int i=children.size() - 1; i >= 0; --i) { + if (children.get(i) == child) { + return i; + } + } + throw new IllegalArgumentException("Child not found"); + } + + /** + * For a complex type, get the partial name for this child. For structures, + * it returns the corresponding field name. For lists and maps, it uses the + * special names "_elem", "_key", and "_value". Unions use the integer index. + * @param child The desired child, which must be the same object (==) + * @return The name of the field for the given child. + */ + private String getPartialName(TypeDescription child) { + switch (category) { + case LIST: + return "_elem"; + case MAP: + return getChildIndex(child) == 0 ? 
"_key" : "_value"; + case STRUCT: + return fieldNames.get(getChildIndex(child)); + case UNION: + return Integer.toString(getChildIndex(child)); + default: + throw new IllegalArgumentException( + "Can't get the field name of a primitive type"); + } + } + + /** + * Get the full field name for the given type. For + * "struct<a:array<struct<b:int,c:int>>>", calling this + * on c returns "a._elem.c". + * @return A string that is the inverse of findSubtype + */ + public String getFullFieldName() { + List parts = new ArrayList<>(getId()); + TypeDescription current = this; + TypeDescription parent = current.getParent(); + // Handle the root as a special case so that it isn't an empty string. + if (parent == null) { + return Integer.toString(current.getId()); + } + while (parent != null) { + parts.add(parent.getPartialName(current)); + current = parent; + parent = current.getParent(); + } + // Put the string together backwards + StringBuilder buffer = new StringBuilder(); + for (int part=parts.size() - 1; part >= 0; --part) { + buffer.append(parts.get(part)); + if (part != 0) { + buffer.append('.'); + } + } + return buffer.toString(); + } } diff --git a/java/core/src/test/org/apache/orc/TestTypeDescription.java b/java/core/src/test/org/apache/orc/TestTypeDescription.java index 80e3b77fc4..9203eaf6b1 100644 --- a/java/core/src/test/org/apache/orc/TestTypeDescription.java +++ b/java/core/src/test/org/apache/orc/TestTypeDescription.java @@ -490,4 +490,23 @@ public void testMaskConflict() { assertThrows(IllegalArgumentException.class, () -> schema.annotateEncryption(null,"nullify:name;sha256:name")); } + + @Test + public void testGetFullFieldName() { + TypeDescription schema = TypeDescription.fromString( + "struct<" + + "name:struct," + + "address:struct," + + "credit_cards:array>," + + "properties:map>>"); + for (String column: new String[]{"0", "name", "name.first", "name.last", + "address.street", "address.city", + "credit_cards", "credit_cards._elem", + 
"credit_cards._elem.card_number", + "properties", "properties._key", "properties._value", + "properties._value.0", "properties._value.1"}) { + assertEquals(column, + schema.findSubtype(column, true).getFullFieldName()); + } + } } diff --git a/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java new file mode 100644 index 0000000000..49dbc2be94 --- /dev/null +++ b/java/tools/src/java/org/apache/orc/tools/ColumnSizes.java @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.orc.tools; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.orc.ColumnStatistics; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.StripeInformation; +import org.apache.orc.TypeDescription; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.List; + +/** + * Given a set of paths, finds all of the "*.orc" files under them and + * prints the sizes of each column, both as a percentage and the number of + * bytes per row. + */ +public class ColumnSizes { + final Configuration conf; + final TypeDescription schema; + final long[] columnSizes; + int goodFiles = 0; + long rows = 0; + long padding = 0; + long totalSize = 0; + long stripeFooterSize = 0; + long fileFooterSize = 0; + long stripeIndex = 0; + // data bytes that aren't assigned to a specific column + long stripeData = 0; + + public ColumnSizes(Configuration conf, + LocatedFileStatus file) throws IOException { + this.conf = conf; + try (Reader reader = OrcFile.createReader(file.getPath(), + OrcFile.readerOptions(conf))) { + this.schema = reader.getSchema(); + columnSizes = new long[schema.getMaximumId() + 1]; + addReader(file, reader); + } + } + + private void checkStripes(LocatedFileStatus file, + Reader reader) { + // Count the magic as file overhead + long offset = OrcFile.MAGIC.length(); + fileFooterSize += offset; + + for (StripeInformation stripe: reader.getStripes()) { + padding += stripe.getOffset() - offset; + stripeIndex += stripe.getIndexLength(); + stripeData += stripe.getDataLength(); + stripeFooterSize += stripe.getFooterLength(); + offset = stripe.getOffset() + stripe.getLength(); + } + // Add everything else as the file footer + fileFooterSize += file.getLen() - offset; + } + + private boolean 
addReader(LocatedFileStatus file, + Reader reader) { + // Validate that the schemas are the same + TypeDescription newSchema = reader.getSchema(); + if (schema.equals(newSchema)) { + goodFiles += 1; + rows += reader.getNumberOfRows(); + totalSize += file.getLen(); + checkStripes(file, reader); + ColumnStatistics[] colStats = reader.getStatistics(); + for (int c = 0; c < colStats.length && c < columnSizes.length; c++) { + columnSizes[c] += colStats[c].getBytesOnDisk(); + // Don't double count. Either count the bytes as stripe data or as + // part of a column. + stripeData -= colStats[c].getBytesOnDisk(); + } + } else { + System.err.println("Ignoring " + file.getPath() + + " because of schema mismatch: " + newSchema); + return false; + } + return true; + } + + public boolean addFile(LocatedFileStatus file) throws IOException { + try (Reader reader = OrcFile.createReader(file.getPath(), + OrcFile.readerOptions(conf))) { + return addReader(file, reader); + } + } + + private static class StringLongPair { + final String name; + final long size; + StringLongPair(String name, long size) { + this.name = name; + this.size = size; + } + } + + private void printResults(PrintStream out) { + List sizes = new ArrayList<>(columnSizes.length + 5); + for(int column = 0; column < columnSizes.length; ++column) { + if (columnSizes[column] > 0) { + sizes.add(new StringLongPair( + schema.findSubtype(column).getFullFieldName(), + columnSizes[column])); + } + } + if (padding > 0) { + sizes.add(new StringLongPair("_padding", padding)); + } + if (stripeFooterSize > 0) { + sizes.add(new StringLongPair("_stripe_footer", stripeFooterSize)); + } + if (fileFooterSize > 0) { + sizes.add(new StringLongPair("_file_footer", fileFooterSize)); + } + if (stripeIndex > 0) { + sizes.add(new StringLongPair("_index", stripeIndex)); + } + if (stripeData > 0) { + sizes.add(new StringLongPair("_data", stripeData)); + } + // sort by descending size, ascending name + sizes.sort((x, y) -> x.size != y.size ? 
+ Long.compare(y.size, x.size) : x.name.compareTo(y.name)); + out.println("Percent Bytes/Row Name"); + for (StringLongPair item: sizes) { + out.println(String.format(" %-5.2f %-9.2f %s", + 100.0 * item.size / totalSize, (double) item.size / rows, item.name)); + } + } + + public static void main(Configuration conf, String[] args) throws IOException { + ColumnSizes result = null; + int badFiles = 0; + for(String root: args) { + Path rootPath = new Path(root); + FileSystem fs = rootPath.getFileSystem(conf); + for(RemoteIterator itr = fs.listFiles(rootPath, true); itr.hasNext(); ) { + LocatedFileStatus status = itr.next(); + if (status.isFile() && status.getPath().getName().endsWith(".orc")) { + try { + if (result == null) { + result = new ColumnSizes(conf, status); + } else { + if (!result.addFile(status)) { + badFiles += 1; + } + } + } catch (IOException err) { + badFiles += 1; + System.err.println("Failed to read " + status.getPath()); + } + } + } + } + if (result == null) { + System.err.println("No files found"); + } else { + result.printResults(System.out); + } + if (badFiles > 0) { + System.err.println(badFiles + " bad ORC files found."); + System.exit(1); + } + } + + public static void main(String[] args) throws IOException { + main(new Configuration(), args); + } +} diff --git a/java/tools/src/java/org/apache/orc/tools/Driver.java b/java/tools/src/java/org/apache/orc/tools/Driver.java index 8e1850b870..c846e875b2 100644 --- a/java/tools/src/java/org/apache/orc/tools/Driver.java +++ b/java/tools/src/java/org/apache/orc/tools/Driver.java @@ -93,6 +93,7 @@ public static void main(String[] args) throws Exception { System.err.println(" key - print information about the keys"); System.err.println(" meta - print the metadata about the ORC file"); System.err.println(" scan - scan the ORC file"); + System.err.println(" sizes - list size on disk of each column"); System.err.println(" version - print the version of this ORC tool"); System.err.println(); 
System.err.println("To get more help, provide -h to the command"); @@ -125,6 +126,9 @@ public static void main(String[] args) throws Exception { case "scan": ScanData.main(conf, options.commandArgs); break; + case "sizes": + ColumnSizes.main(conf, options.commandArgs); + break; case "version": PrintVersion.main(conf, options.commandArgs); break; From c427f700de64633d5cf64d50bd9de89e5808ef45 Mon Sep 17 00:00:00 2001 From: Owen O'Malley Date: Fri, 1 Oct 2021 11:32:48 -0700 Subject: [PATCH 2/2] Mark the spurious findbugs warnings to ignore so that we can stop getting nailed on each patch when CI tests it. --- java/tools/pom.xml | 64 ++++++++++++++++++++++------- java/tools/src/findbugs/exclude.xml | 12 +----- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/java/tools/pom.xml b/java/tools/pom.xml index c375b7df54..6ee1c0792a 100644 --- a/java/tools/pom.xml +++ b/java/tools/pom.xml @@ -111,25 +111,59 @@ maven-compiler-plugin - maven-assembly-plugin - ${maven-assembly-plugin.version} - - - - org.apache.orc.tools.Driver - - - - src/assembly/uber.xml - - + org.apache.maven.plugins + maven-shade-plugin + ${maven-shade-plugin.version} - make-assembly - package + package - single + shade + + + + *:* + + + + + org.apache.orc.tools.Driver + + + true + uber + + + com.google.protobuf + com.google.protobuf25 + + + org.apache.hadoop.hive + org.apache.orc.storage + + + org.apache.hive + org.apache.orc.storage + + + org.apache.commons + org.apache.orc.shade.commons + + + + + *:* + + module-info.class + META-INF/MANIFEST.MF + META-INF/DEPENDENCIES + META-INF/LICENSE + META-INF/NOTICE + + + + diff --git a/java/tools/src/findbugs/exclude.xml b/java/tools/src/findbugs/exclude.xml index 813a8e0ba3..555068bc93 100644 --- a/java/tools/src/findbugs/exclude.xml +++ b/java/tools/src/findbugs/exclude.xml @@ -19,17 +19,7 @@ - - - - - - - - - - - +