ORC-1017: Add sizes tool to determine and display the sizes of each column in a set of files. (#925)

### What changes were proposed in this pull request?

This patch adds a new tool that accounts for the total size of a set of ORC files. For files written by ORC 1.5 or later, you'll get a breakdown of the file by column. Several virtual columns are also included (a sample report follows the list):
- `_index`: the indexes that are used for skipping inside the stripes
- `_data`: the data in files written prior to ORC 1.5
- `_stripe_footer`: the stripe metadata
- `_file_footer`: the file metadata
- `_padding`: padding added to align stripes to HDFS block boundaries
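
For illustration, a report would look roughly like the following; the column names and numbers here are made up, not taken from a real run:

```
Percent Bytes/Row Name
 58.31 42.17     credit_cards._elem.card_number
 23.08 16.69     address.street
 9.90  7.16      _index
 5.12  3.70      _stripe_footer
 3.59  2.60      _file_footer
```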

I also added a new method on `TypeDescription` that gets the full field name, which is the inverse of `findSubtype`.
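
A minimal sketch of that round trip, using a made-up schema:

```java
import org.apache.orc.TypeDescription;

public class FullFieldNameDemo {
  public static void main(String[] args) {
    TypeDescription schema = TypeDescription.fromString(
        "struct<name:struct<first:string,last:string>,tags:array<string>>");
    // findSubtype walks down from the root to a nested column...
    TypeDescription first = schema.findSubtype("name.first");
    // ...and getFullFieldName walks back up, rebuilding the dotted path.
    System.out.println(first.getFullFieldName());  // prints "name.first"
    // List elements use the virtual name "_elem".
    System.out.println(schema.findSubtype("tags._elem").getFullFieldName());
  }
}
```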

### Why are the changes needed?

The tool helps diagnose the compression of a set of files.
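
As a sketch of how the tool can be driven programmatically, using the `main(Configuration, String[])` entry point from `ColumnSizes.java` below (the directory path is a placeholder):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.orc.tools.ColumnSizes;

public class SizesRunner {
  public static void main(String[] args) throws Exception {
    // Recursively scans the given roots for "*.orc" files, aggregates the
    // per-column byte counts, and prints the report to stdout. Exits with
    // status 1 if any files fail to read or have a mismatched schema.
    ColumnSizes.main(new Configuration(), new String[]{"/data/events/2021"});
  }
}
```

The uber jar also points its manifest at `org.apache.orc.tools.Driver`, so the same tool is reachable from the command line through that entry point.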

### How was this patch tested?

I added a test of the new `TypeDescription.getFullFieldName` method. I ran the tool over some of the example files and some multi-terabyte directories of production ORC files.

(cherry picked from commit be0762b)
Signed-off-by: Dongjoon Hyun <[email protected]>
omalley authored and dongjoon-hyun committed Dec 16, 2021
1 parent 43b53ea commit 0fbfa66
Showing 6 changed files with 336 additions and 26 deletions.
67 changes: 67 additions & 0 deletions java/core/src/java/org/apache/orc/TypeDescription.java
@@ -862,4 +862,71 @@ public void annotateEncryption(String encryption, String masks) {
+ source);
}
}

/**
* Find the index of a given child object using == comparison.
* @param child The child type
* @return the index, from 0 to N-1, of the child
*/
private int getChildIndex(TypeDescription child) {
for(int i=children.size() - 1; i >= 0; --i) {
if (children.get(i) == child) {
return i;
}
}
throw new IllegalArgumentException("Child not found");
}

/**
* For a complex type, get the partial name for this child. For structures,
* it returns the corresponding field name. For lists and maps, it uses the
* special names "_elem", "_key", and "_value". Unions use the integer index.
* @param child The desired child, which must be the same object (==)
* @return The name of the field for the given child.
*/
private String getPartialName(TypeDescription child) {
switch (category) {
case LIST:
return "_elem";
case MAP:
return getChildIndex(child) == 0 ? "_key" : "_value";
case STRUCT:
return fieldNames.get(getChildIndex(child));
case UNION:
return Integer.toString(getChildIndex(child));
default:
throw new IllegalArgumentException(
"Can't get the field name of a primitive type");
}
}

/**
* Get the full field name for the given type. For
* "struct&lt;a:struct&lt;list&lt;struct&lt;b:int,c:int&gt;&gt;&gt;&gt;" when
* called on c, would return "a._elem.c".
* @return A string that is the inverse of findSubtype
*/
public String getFullFieldName() {
List<String> parts = new ArrayList<>(getId());
TypeDescription current = this;
TypeDescription parent = current.getParent();
// Handle the root as a special case so that it isn't an empty string.
if (parent == null) {
return Integer.toString(current.getId());
}
while (parent != null) {
parts.add(parent.getPartialName(current));
current = parent;
parent = current.getParent();
}
// Put the string together backwards
StringBuilder buffer = new StringBuilder();
for (int part=parts.size() - 1; part >= 0; --part) {
buffer.append(parts.get(part));
if (part != 0) {
buffer.append('.');
}
}
return buffer.toString();
}
}
19 changes: 19 additions & 0 deletions java/core/src/test/org/apache/orc/TestTypeDescription.java
@@ -490,4 +490,23 @@ public void testMaskConflict() {
assertThrows(IllegalArgumentException.class, () ->
schema.annotateEncryption(null,"nullify:name;sha256:name"));
}

@Test
public void testGetFullFieldName() {
TypeDescription schema = TypeDescription.fromString(
"struct<" +
"name:struct<first:string,last:string>," +
"address:struct<street:string,city:string,country:string,post_code:string>," +
"credit_cards:array<struct<card_number:string,expire:date,ccv:string>>," +
"properties:map<string,uniontype<int,string>>>");
for (String column: new String[]{"0", "name", "name.first", "name.last",
"address.street", "address.city",
"credit_cards", "credit_cards._elem",
"credit_cards._elem.card_number",
"properties", "properties._key", "properties._value",
"properties._value.0", "properties._value.1"}) {
assertEquals(column,
schema.findSubtype(column, true).getFullFieldName());
}
}
}
64 changes: 49 additions & 15 deletions java/tools/pom.xml
@@ -115,25 +115,59 @@
<artifactId>maven-compiler-plugin</artifactId>
</plugin>
 <plugin>
-  <artifactId>maven-assembly-plugin</artifactId>
-  <version>${maven-assembly-plugin.version}</version>
-  <configuration>
-    <archive>
-      <manifest>
-        <mainClass>org.apache.orc.tools.Driver</mainClass>
-      </manifest>
-    </archive>
-    <descriptors>
-      <descriptor>src/assembly/uber.xml</descriptor>
-    </descriptors>
-  </configuration>
+  <groupId>org.apache.maven.plugins</groupId>
+  <artifactId>maven-shade-plugin</artifactId>
+  <version>${maven-shade-plugin.version}</version>
   <executions>
     <execution>
-      <id>make-assembly</id> <!-- this is used for inheritance merges -->
-      <phase>package</phase> <!-- bind to the packaging phase -->
+      <phase>package</phase>
       <goals>
-        <goal>single</goal>
+        <goal>shade</goal>
       </goals>
+      <configuration>
+        <artifactSet>
+          <includes>
+            <include>*:*</include>
+          </includes>
+        </artifactSet>
+        <transformers>
+          <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+            <mainClass>org.apache.orc.tools.Driver</mainClass>
+          </transformer>
+        </transformers>
+        <shadedArtifactAttached>true</shadedArtifactAttached>
+        <shadedClassifierName>uber</shadedClassifierName>
+        <relocations>
+          <relocation>
+            <pattern>com.google.protobuf</pattern>
+            <shadedPattern>com.google.protobuf25</shadedPattern>
+          </relocation>
+          <relocation>
+            <pattern>org.apache.hadoop.hive</pattern>
+            <shadedPattern>org.apache.orc.storage</shadedPattern>
+          </relocation>
+          <relocation>
+            <pattern>org.apache.hive</pattern>
+            <shadedPattern>org.apache.orc.storage</shadedPattern>
+          </relocation>
+          <relocation>
+            <pattern>org.apache.commons</pattern>
+            <shadedPattern>org.apache.orc.shade.commons</shadedPattern>
+          </relocation>
+        </relocations>
+        <filters>
+          <filter>
+            <artifact>*:*</artifact>
+            <excludes>
+              <exclude>module-info.class</exclude>
+              <exclude>META-INF/MANIFEST.MF</exclude>
+              <exclude>META-INF/DEPENDENCIES</exclude>
+              <exclude>META-INF/LICENSE</exclude>
+              <exclude>META-INF/NOTICE</exclude>
+            </excludes>
+          </filter>
+        </filters>
+      </configuration>
     </execution>
   </executions>
 </plugin>
12 changes: 1 addition & 11 deletions java/tools/src/findbugs/exclude.xml
@@ -19,17 +19,7 @@
<!-- Java's try with resources causes a false positive.
See https://github.com/SERG-Delft/jpacman/pull/27 . -->
 <Match>
-  <Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE"/>
-  <Class name="~org\.apache\.orc\.tools\.(ScanData|PrintVersion)"/>
-  <Method name="main"/>
-</Match>
-<Match>
-  <Bug pattern="RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE"/>
-  <Class name="~org\.apache\.orc.*\.Test.*"/>
-</Match>
-<Match>
-  <Bug pattern="RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE"/>
-  <Class name="~org\.apache\.orc.tools.(RowCount|ScanData)"/>
+  <Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE,RCN_REDUNDANT_NULLCHECK_WOULD_HAVE_BEEN_A_NPE"/>
 </Match>
<Match>
<Bug pattern="REC_CATCH_EXCEPTION"/>
196 changes: 196 additions & 0 deletions java/tools/src/java/org/apache/orc/tools/ColumnSizes.java
@@ -0,0 +1,196 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.orc.tools;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;
import org.apache.orc.TypeDescription;

import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;

/**
* Given a set of paths, finds all of the "*.orc" files under them and
* prints the size of each column, both as a percentage of the total and as
* the number of bytes per row.
*/
public class ColumnSizes {
final Configuration conf;
final TypeDescription schema;
final long[] columnSizes;
int goodFiles = 0;
long rows = 0;
long padding = 0;
long totalSize = 0;
long stripeFooterSize = 0;
long fileFooterSize = 0;
long stripeIndex = 0;
// data bytes that aren't assigned to a specific column
long stripeData = 0;

public ColumnSizes(Configuration conf,
LocatedFileStatus file) throws IOException {
this.conf = conf;
try (Reader reader = OrcFile.createReader(file.getPath(),
OrcFile.readerOptions(conf))) {
this.schema = reader.getSchema();
columnSizes = new long[schema.getMaximumId() + 1];
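      // One counter per column id; ids run from 0 (the root) to getMaximumId().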
addReader(file, reader);
}
}

private void checkStripes(LocatedFileStatus file,
Reader reader) {
// Count the magic as file overhead
long offset = OrcFile.MAGIC.length();
fileFooterSize += offset;

for (StripeInformation stripe: reader.getStripes()) {
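      // Any gap between the end of the previous stripe (or the file header)
      // and this stripe's offset is padding.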
padding += stripe.getOffset() - offset;
stripeIndex += stripe.getIndexLength();
stripeData += stripe.getDataLength();
stripeFooterSize += stripe.getFooterLength();
offset = stripe.getOffset() + stripe.getLength();
}
// Add everything else as the file footer
fileFooterSize += file.getLen() - offset;
}

private boolean addReader(LocatedFileStatus file,
Reader reader) {
// Validate that the schemas are the same
TypeDescription newSchema = reader.getSchema();
if (schema.equals(newSchema)) {
goodFiles += 1;
rows += reader.getNumberOfRows();
totalSize += file.getLen();
checkStripes(file, reader);
ColumnStatistics[] colStats = reader.getStatistics();
for (int c = 0; c < colStats.length && c < columnSizes.length; c++) {
columnSizes[c] += colStats[c].getBytesOnDisk();
// Don't double count. Either count the bytes as stripe data or as
// part of a column.
stripeData -= colStats[c].getBytesOnDisk();
}
} else {
System.err.println("Ignoring " + file.getPath()
+ " because of schema mismatch: " + newSchema);
return false;
}
return true;
}

public boolean addFile(LocatedFileStatus file) throws IOException {
try (Reader reader = OrcFile.createReader(file.getPath(),
OrcFile.readerOptions(conf))) {
return addReader(file, reader);
}
}

private static class StringLongPair {
final String name;
final long size;
StringLongPair(String name, long size) {
this.name = name;
this.size = size;
}
}

private void printResults(PrintStream out) {
List<StringLongPair> sizes = new ArrayList<>(columnSizes.length + 5);
for(int column = 0; column < columnSizes.length; ++column) {
if (columnSizes[column] > 0) {
sizes.add(new StringLongPair(
schema.findSubtype(column).getFullFieldName(),
columnSizes[column]));
}
}
if (padding > 0) {
sizes.add(new StringLongPair("_padding", padding));
}
if (stripeFooterSize > 0) {
sizes.add(new StringLongPair("_stripe_footer", stripeFooterSize));
}
if (fileFooterSize > 0) {
sizes.add(new StringLongPair("_file_footer", fileFooterSize));
}
if (stripeIndex > 0) {
sizes.add(new StringLongPair("_index", stripeIndex));
}
if (stripeData > 0) {
sizes.add(new StringLongPair("_data", stripeData));
}
// sort by descending size, ascending name
sizes.sort((x, y) -> x.size != y.size ?
Long.compare(y.size, x.size) : x.name.compareTo(y.name));
out.println("Percent Bytes/Row Name");
for (StringLongPair item: sizes) {
out.println(String.format(" %-5.2f %-9.2f %s",
100.0 * item.size / totalSize, (double) item.size / rows, item.name));
}
}

public static void main(Configuration conf, String[] args) throws IOException {
ColumnSizes result = null;
int badFiles = 0;
for(String root: args) {
Path rootPath = new Path(root);
FileSystem fs = rootPath.getFileSystem(conf);
for(RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, true); itr.hasNext(); ) {
LocatedFileStatus status = itr.next();
if (status.isFile() && status.getPath().getName().endsWith(".orc")) {
try {
if (result == null) {
result = new ColumnSizes(conf, status);
} else {
if (!result.addFile(status)) {
badFiles += 1;
}
}
} catch (IOException err) {
badFiles += 1;
System.err.println("Failed to read " + status.getPath());
}
}
}
}
if (result == null) {
System.err.println("No files found");
} else {
result.printResults(System.out);
}
if (badFiles > 0) {
System.err.println(badFiles + " bad ORC files found.");
System.exit(1);
}
}

public static void main(String[] args) throws IOException {
main(new Configuration(), args);
}
}
