Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Fix readback of parquet files with missing dictionary page offset #5810

Merged
merged 1 commit into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -176,21 +176,14 @@ public Function<SeekableChannelContext, Dictionary> getDictionarySupplier() {

@NotNull
@NotNull
private Dictionary getDictionary(final SeekableChannelContext channelContext) {
    final ColumnMetaData chunkMeta = columnChunk.getMeta_data();
    // If the dictionary page offset is set, use it, otherwise inspect the first data page -- it might be the
    // dictionary page. Fallback, inspired by
    // https://stackoverflow.com/questions/55225108/why-is-dictionary-page-offset-0-for-plain-dictionary-encoding
    final long dictionaryPageOffset = chunkMeta.isSetDictionary_page_offset()
            ? chunkMeta.getDictionary_page_offset()
            : chunkMeta.getData_page_offset();

    return readDictionary(dictionaryPageOffset, channelContext);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import io.deephaven.parquet.table.layout.ParquetKeyValuePartitionedLayout;
import io.deephaven.stringset.HashStringSet;
import io.deephaven.stringset.StringSet;
import io.deephaven.time.DateTimeUtils;
import io.deephaven.vector.*;
import junit.framework.TestCase;
import org.junit.*;
Expand All @@ -28,6 +29,7 @@
import java.io.IOException;
import java.lang.reflect.Proxy;
import java.nio.file.Files;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
Expand All @@ -39,7 +41,9 @@
import static io.deephaven.engine.testutil.TstUtils.assertTableEquals;
import static io.deephaven.engine.testutil.TstUtils.tableRangesAreEqual;
import static io.deephaven.engine.util.TableTools.*;
import static org.junit.Assert.*;
import static org.junit.Assert.assertTrue;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

/**
* Tests for {@link ParquetTools}.
Expand Down Expand Up @@ -527,6 +531,8 @@ private void testWriteRead(Table source, Function<Table, Table> transform) {
assertTableEquals(transform.apply(source), transform.apply(readBack));
}

// This method is used in a formula. Do not remove
@SuppressWarnings("unused")
public static DoubleVector generateDoubles(int howMany) {
final double[] yarr = new double[howMany];
for (int ii = 0; ii < howMany; ii++) {
Expand All @@ -535,6 +541,8 @@ public static DoubleVector generateDoubles(int howMany) {
return new DoubleVectorDirect(yarr);
}

// This method is used in a formula. Do not remove
@SuppressWarnings("unused")
public static FloatVector generateFloats(int howMany) {
final float[] yarr = new float[howMany];
for (int ii = 0; ii < howMany; ii++) {
Expand All @@ -543,11 +551,33 @@ public static FloatVector generateFloats(int howMany) {
return new FloatVectorDirect(yarr);
}

// This method is used in a formula. Do not remove
@SuppressWarnings("unused")
public static ObjectVector<String> makeSillyStringArray(int howMany) {
    // Build zero-padded four-digit strings: "0000", "0001", ...
    final String[] values = new String[howMany];
    for (int idx = 0; idx < howMany; ++idx) {
        values[idx] = String.format("%04d", idx);
    }
    return new ObjectVectorDirect<>(values);
}

/**
 * This test checks that we can read old parquet files that don't properly have dictionary page offset set, and also
 * that we can read int96 columns with null values.
 */
@Test
public void testNoDictionaryOffset() {
    // dh17395.parquet lacks a dictionary page offset and contains int96 nulls; dh17395_clean.parquet is the
    // same data written correctly.
    final String dirtyPath =
            TestParquetTools.class.getResource("/dictOffset/dh17395.parquet").getFile();
    final String cleanPath =
            TestParquetTools.class.getResource("/dictOffset/dh17395_clean.parquet").getFile();
    final Table withNullsAndMissingOffsets = ParquetTools.readTable(dirtyPath);
    final Table clean = ParquetTools.readTable(cleanPath);

    // Spot-check individual cells of the problematic file.
    assertEquals("D2442F78-AEFB-49F7-85A0-00506BBE74D4",
            withNullsAndMissingOffsets.getColumnSource("LISTID").get(0));
    assertNull(withNullsAndMissingOffsets.getColumnSource("TIMESTAMP").get(0));
    assertNull(withNullsAndMissingOffsets.getColumnSource("SETTLEMENT_DATE").get(0));
    assertEquals(1698672050703000000L,
            DateTimeUtils.epochNanos((Instant) withNullsAndMissingOffsets.getColumnSource("CREATE_DATE").get(0)));
    // Finally, the whole table must match the cleanly-written reference file.
    assertTableEquals(withNullsAndMissingOffsets, clean);
}
}
Git LFS file not shown
Git LFS file not shown
Loading