apache · Jackie-Jiang · May 17, 2024 · Mar 2, 2024 · Mar 2, 2024 · Mar 21, 2024
diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/HashUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/HashUtils.java
@@ -19,6 +19,10 @@
 package org.apache.pinot.segment.local.utils;
 
 import com.google.common.hash.Hashing;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.charset.StandardCharsets;
+import java.util.UUID;
 import org.apache.pinot.spi.config.table.HashFunction;
 import org.apache.pinot.spi.data.readers.PrimaryKey;
 import org.apache.pinot.spi.utils.ByteArray;
@@ -36,6 +40,35 @@ public static byte[] hashMD5(byte[] bytes) {
     return Hashing.md5().hashBytes(bytes).asBytes();
   }
 
+  /**
+   * Compacts a key made of concatenated 36-character UUID strings into 16 bytes per UUID (msb then lsb, big-endian).
+   * Any input that is not entirely canonical UUID text is returned as is (same array), i.e. the hash is a no-op.
+   * {@link UUID#fromString} alone accepts non-canonical text that would alias other keys, so it is round-tripped.
+   */
+  public static byte[] hashUUIDv4(byte[] bytes) {
+    if (bytes.length % 36 != 0) {
+      return bytes;
+    }
+    byte[] resultBytes = new byte[(bytes.length / 36) * 16];
+    ByteBuffer byteBuffer = ByteBuffer.wrap(resultBytes).order(ByteOrder.BIG_ENDIAN);
+    for (int chunk = 0; chunk < bytes.length; chunk += 36) {
+      String text = new String(bytes, chunk, 36, StandardCharsets.UTF_8);
+      UUID uuid;
+      try {
+        uuid = UUID.fromString(text);
+      } catch (IllegalArgumentException e) {
+        // In case of failures, make the hash no-op.
+        return bytes;
+      }
+      if (!uuid.toString().equalsIgnoreCase(text)) {
+        // Non-canonical text (e.g. a leading '+') would alias another key's 16 bytes; make the hash no-op.
+        return bytes;
+      }
+      byteBuffer.putLong(uuid.getMostSignificantBits()).putLong(uuid.getLeastSignificantBits());
+    }
+    return resultBytes;
+  }
+
   public static Object hashPrimaryKey(PrimaryKey primaryKey, HashFunction hashFunction) {
     switch (hashFunction) {
       case NONE:
@@ -44,6 +77,8 @@ public static Object hashPrimaryKey(PrimaryKey primaryKey, HashFunction hashFunc
         return new ByteArray(HashUtils.hashMD5(primaryKey.asBytes()));
       case MURMUR3:
         return new ByteArray(HashUtils.hashMurmur3(primaryKey.asBytes()));
+      case UUID_V4:
+        return new ByteArray(HashUtils.hashUUIDv4(primaryKey.asBytes()));
       default:
         throw new IllegalArgumentException(String.format("Unrecognized hash function %s", hashFunction));
     }

diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/HashUtilsTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/HashUtilsTest.java
@@ -18,6 +18,8 @@
  */
 package org.apache.pinot.segment.local.utils;
 
+import java.nio.charset.StandardCharsets;
+import java.util.UUID;
 import org.apache.pinot.spi.utils.BytesUtils;
 import org.testng.Assert;
 import org.testng.annotations.Test;
@@ -31,4 +33,45 @@ public void testHashPlainValues() {
     Assert.assertEquals(BytesUtils.toHexString(HashUtils.hashMurmur3("hello world".getBytes())),
         "0e617feb46603f53b163eb607d4697ab");
   }
+
+  @Test
+  public void testHashUUIDv4() {
+    testHashUUIDv4(new UUID[]{UUID.randomUUID()});
+    testHashUUIDv4(new UUID[]{UUID.randomUUID(), UUID.randomUUID(), UUID.randomUUID()});
+
+    // Failure scenarios: wrong length, then 36 bytes that do not parse. Both must come back as the input array.
+    byte[] wrongLength = {0x10, 0, 0, 0, 0, 0, 0, 0};
+    Assert.assertEquals(HashUtils.hashUUIDv4(wrongLength), wrongLength);
+    byte[] notHex = "zzzzzzzz-zzzz-zzzz-zzzz-zzzzzzzzzzzz".getBytes(StandardCharsets.UTF_8);
+    Assert.assertEquals(HashUtils.hashUUIDv4(notHex), notHex);
+  }
+
+  // Hashes the concatenated text of the given UUIDs and verifies each UUID is recoverable from the output.
+  private void testHashUUIDv4(UUID[] uuids) {
+    // Build the input key: the 36-character text form of every UUID, back to back.
+    StringBuilder joined = new StringBuilder();
+    for (int i = 0; i < uuids.length; i++) {
+      joined.append(uuids[i]);
+    }
+    byte[] input = joined.toString().getBytes(StandardCharsets.UTF_8);
+    // Sanity-check the fixture: 36 bytes per UUID going in.
+    Assert.assertEquals(input.length, 36 * uuids.length);
+    byte[] hashed = HashUtils.hashUUIDv4(input);
+    // 16 bytes per UUID coming out.
+    Assert.assertEquals(hashed.length, 16 * uuids.length);
+    // Rebuild each UUID from its 16-byte slice of the output and compare it with the input UUID at the same
+    // position. Each slice holds the most significant long followed by the least significant long, and each
+    // long is read most significant byte first.
+    for (int n = 0; n < uuids.length; n++) {
+      int base = 16 * n;
+      long high = 0;
+      long low = 0;
+      for (int b = 0; b < 8; b++) {
+        high = (high << 8) | (hashed[base + b] & 0xFF);
+        low = (low << 8) | (hashed[base + 8 + b] & 0xFF);
+      }
+      UUID rebuilt = new UUID(high, low);
+      Assert.assertEquals(rebuilt, uuids[n]);
+    }
+  }
 }
diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/HashFunction.java b/pinot-spi/src/main/java/org/apache/pinot/spi/config/table/HashFunction.java
@@ -19,5 +19,5 @@
 package org.apache.pinot.spi.config.table;
 
 public enum HashFunction {
-  NONE, MD5, MURMUR3
+  NONE, MD5, MURMUR3, UUID_V4 // UUID_V4 compacts 36-character UUID text to 16 bytes (HashUtils.hashUUIDv4)
 }