Skip to content

Commit

Permalink
ARROW-6460: [Java] Add benchmark and large fake data UT for avro adapter
Browse files Browse the repository at this point in the history
Related to [ARROW-6460](https://issues.apache.org/jira/browse/ARROW-6460).

i. This change adds tests that run a large fake data set (600000 rows) through the Avro adapter to ensure that no OOMs occur.
ii. It also adds a benchmark for the Avro iterator to establish a baseline number.

Closes #5317 from tianchen92/ARROW-6460 and squashes the following commits:

d061f5a <tianchen> resolve comments
e9abe5e <tianchen> ARROW-6460:  Add benchmark and large fake data UT for avro adapter

Authored-by: tianchen <[email protected]>
Signed-off-by: Micah Kornfield <[email protected]>
  • Loading branch information
tianchen92 authored and emkornfield committed Sep 18, 2019
1 parent 500b828 commit 40eddfe
Show file tree
Hide file tree
Showing 4 changed files with 378 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,14 @@

package org.apache.arrow;

import static org.junit.Assert.assertEquals;

import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
Expand All @@ -37,8 +42,10 @@
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.util.Utf8;
import org.junit.Test;

public class AvroToArrowIteratorTest extends AvroTestBase {
Expand Down Expand Up @@ -158,4 +165,149 @@ public void testArrayType() throws Exception {
checkArrayResult(data, vectors);
AutoCloseables.close(roots);
}

/**
 * Streams a large number of fake rows through the Avro-to-Arrow iterator and verifies that the
 * row counts of all returned batches add up to the number of rows the fake decoder produced.
 *
 * <p>Every batch is closed as soon as its row count has been read, so batches are not kept
 * around while the remaining rows are consumed.
 *
 * @throws Exception if the schema cannot be loaded or the conversion fails
 */
@Test
public void runLargeNumberOfRows() throws Exception {
  final int targetRows = 600000;
  final Schema schema = getSchema("test_large_data.avsc");
  final Decoder fakeDecoder = new FakeDecoder(targetRows);

  int totalRows = 0;
  try (AvroToArrowVectorIterator iter =
      AvroToArrow.avroToArrowIterator(schema, fakeDecoder, new AvroToArrowConfig(config.getAllocator()))) {
    while (iter.hasNext()) {
      // try-with-resources closes the batch even when reading its row count throws.
      try (VectorSchemaRoot root = iter.next()) {
        totalRows += root.getRowCount();
      }
    }
  }

  // assertEquals takes (expected, actual); this order keeps the failure message accurate.
  assertEquals(targetRows, totalRows);
}

/**
* Fake avro decoder to test large data.
*/
private class FakeDecoder extends Decoder {

private int numRows;

FakeDecoder(int numRows) {
this.numRows = numRows;
}

// note that Decoder has no hasNext() API, assume enum is the first type in schema
// and fixed is the last type in schema and they are unique.
private void validate() throws EOFException {
if (numRows <= 0) {
throw new EOFException();
}
}

@Override
public void readNull() throws IOException {
}

@Override
public boolean readBoolean() throws IOException {
return false;
}

@Override
public int readInt() throws IOException {
return 0;
}

@Override
public long readLong() throws IOException {
return 0;
}

@Override
public float readFloat() throws IOException {
return 0;
}

@Override
public double readDouble() throws IOException {
return 0;
}

@Override
public Utf8 readString(Utf8 old) throws IOException {
return new Utf8("test123test123" + numRows);
}

@Override
public String readString() throws IOException {
return "test123test123" + numRows;
}

@Override
public void skipString() throws IOException {

}

@Override
public ByteBuffer readBytes(ByteBuffer old) throws IOException {
return ByteBuffer.allocate(0);
}

@Override
public void skipBytes() throws IOException {

}

@Override
public void readFixed(byte[] bytes, int start, int length) throws IOException {
// fixed type is last column, after read value, decrease numRows
numRows--;
}

@Override
public void skipFixed(int length) throws IOException {

}

@Override
public int readEnum() throws IOException {
// enum type is first column, validate numRows first.
validate();
return 0;
}

@Override
public long readArrayStart() throws IOException {
return 5;
}

@Override
public long arrayNext() throws IOException {
return 0;
}

@Override
public long skipArray() throws IOException {
return 0;
}

@Override
public long readMapStart() throws IOException {
return 5;
}

@Override
public long mapNext() throws IOException {
return 0;
}

@Override
public long skipMap() throws IOException {
return 0;
}

@Override
public int readIndex() throws IOException {
return 0;
}
}
}
75 changes: 75 additions & 0 deletions java/adapter/avro/src/test/resources/schema/test_large_data.avsc
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

{
"namespace": "org.apache.arrow.avro",
"type": "record",
"name": "testLargeData",
"fields": [
{
"name": "f0",
"type": {
"name" : "f0",
"type" : "enum",
"symbols" : ["value1", "value2", "value3", "value4", "value5"]
}
},
{
"name" : "f1",
"type" : {
"type" : "record",
"name" : "nestedRecord",
"fields": [
{"name": "f1_0", "type": "string"},
{"name": "f1_1", "type": "int"}
]
}
},

{"name": "f2", "type": "string"},
{"name": "f3", "type": "int"},
{"name": "f4", "type": "boolean"},
{"name": "f5", "type": "float"},
{"name": "f6", "type": "double"},
{"name": "f7", "type": "bytes"},
{"name": "f8", "type": ["string", "int"]},
{
"name": "f9",
"type": {
"name" : "f9",
"type" : "array",
"items" : "string"
}
},
{
"name": "f10",
"type": {
"name" : "f10",
"type" : "map",
"values" : "string"
}
},
{
"name": "f11",
"type": {
"type" : "fixed",
"name" : "f11",
"size" : 5
}
}
]
}
10 changes: 10 additions & 0 deletions java/performance/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,16 @@
<artifactId>arrow-memory</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
<version>1.9.0</version>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-avro</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>

<properties>
Expand Down
Loading

0 comments on commit 40eddfe

Please sign in to comment.