Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Fix readback of parquet files with missing dictionary page offset #5810

Merged
merged 1 commit into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -176,21 +176,14 @@ public Function<SeekableChannelContext, Dictionary> getDictionarySupplier() {

@NotNull
@NotNull
private Dictionary getDictionary(final SeekableChannelContext channelContext) {
    final ColumnMetaData chunkMeta = columnChunk.getMeta_data();
    // If the dictionary page offset is set, use it, otherwise inspect the first data page -- it might be the
    // dictionary page. Fallback, inspired by
    // https://stackoverflow.com/questions/55225108/why-is-dictionary-page-offset-0-for-plain-dictionary-encoding
    final long dictionaryPageOffset = chunkMeta.isSetDictionary_page_offset()
            ? chunkMeta.getDictionary_page_offset()
            : chunkMeta.getData_page_offset();

    return readDictionary(dictionaryPageOffset, channelContext);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import io.deephaven.parquet.table.layout.ParquetKeyValuePartitionedLayout;
import io.deephaven.stringset.HashStringSet;
import io.deephaven.stringset.StringSet;
import io.deephaven.time.DateTimeUtils;
import io.deephaven.vector.*;
import junit.framework.TestCase;
import org.junit.*;
Expand All @@ -28,6 +29,7 @@
import java.io.IOException;
import java.lang.reflect.Proxy;
import java.nio.file.Files;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
Expand All @@ -39,7 +41,9 @@
import static io.deephaven.engine.testutil.TstUtils.assertTableEquals;
import static io.deephaven.engine.testutil.TstUtils.tableRangesAreEqual;
import static io.deephaven.engine.util.TableTools.*;
import static org.junit.Assert.*;
import static org.junit.Assert.assertTrue;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

/**
* Tests for {@link ParquetTools}.
Expand Down Expand Up @@ -527,6 +531,8 @@ private void testWriteRead(Table source, Function<Table, Table> transform) {
assertTableEquals(transform.apply(source), transform.apply(readBack));
}

// This method is used in a formula. Do not remove
@SuppressWarnings("unused")
public static DoubleVector generateDoubles(int howMany) {
final double[] yarr = new double[howMany];
for (int ii = 0; ii < howMany; ii++) {
Expand All @@ -535,6 +541,8 @@ public static DoubleVector generateDoubles(int howMany) {
return new DoubleVectorDirect(yarr);
}

// This method is used in a formula. Do not remove
@SuppressWarnings("unused")
public static FloatVector generateFloats(int howMany) {
final float[] yarr = new float[howMany];
for (int ii = 0; ii < howMany; ii++) {
Expand All @@ -543,11 +551,33 @@ public static FloatVector generateFloats(int howMany) {
return new FloatVectorDirect(yarr);
}

// This method is used in a formula. Do not remove
@SuppressWarnings("unused")
public static ObjectVector<String> makeSillyStringArray(int howMany) {
    // Build zero-padded four-digit strings: "0000", "0001", ...
    final String[] values = new String[howMany];
    for (int idx = 0; idx < howMany; ++idx) {
        values[idx] = String.format("%04d", idx);
    }
    return new ObjectVectorDirect<>(values);
}

/**
 * This test checks that we can read old parquet files that don't properly have dictionary page offset set, and also
 * that we can read int96 columns with null values.
 */
@Test
public void testNoDictionaryOffset() {
    // dh17395.parquet lacks a dictionary page offset and contains int96 nulls; dh17395_clean.parquet is the
    // same data written correctly.
    final String dirtyPath =
            TestParquetTools.class.getResource("/dictOffset/dh17395.parquet").getFile();
    final String cleanPath =
            TestParquetTools.class.getResource("/dictOffset/dh17395_clean.parquet").getFile();
    final Table withNullsAndMissingOffsets = ParquetTools.readTable(dirtyPath);
    final Table clean = ParquetTools.readTable(cleanPath);

    // Spot-check individual cells of the problematic file.
    assertEquals("D2442F78-AEFB-49F7-85A0-00506BBE74D4",
            withNullsAndMissingOffsets.getColumnSource("LISTID").get(0));
    assertNull(withNullsAndMissingOffsets.getColumnSource("TIMESTAMP").get(0));
    assertNull(withNullsAndMissingOffsets.getColumnSource("SETTLEMENT_DATE").get(0));
    assertEquals(1698672050703000000L,
            DateTimeUtils.epochNanos((Instant) withNullsAndMissingOffsets.getColumnSource("CREATE_DATE").get(0)));
    // Finally, the whole table must match the cleanly-written reference file.
    assertTableEquals(withNullsAndMissingOffsets, clean);
}
}
Git LFS file not shown
Git LFS file not shown
Loading