Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hash Function for UUID Primary Keys #12538

Merged
merged 9 commits into from
May 17, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
package org.apache.pinot.segment.local.utils;

import com.google.common.hash.Hashing;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.util.UUID;
import org.apache.pinot.spi.config.table.HashFunction;
import org.apache.pinot.spi.data.readers.PrimaryKey;
import org.apache.pinot.spi.utils.ByteArray;
Expand All @@ -36,6 +40,35 @@ public static byte[] hashMD5(byte[] bytes) {
return Hashing.md5().hashBytes(bytes).asBytes();
}

/**
* For use-cases where the primary-key is set to columns that are guaranteed to be type-4 UUIDs, this hash-function
* will reduce the number of bytes required from 36 to 16 for each UUID, without losing any precision. This leverages
* the fact that a type-4 UUID is essentially a 16-byte value.
*/
public static byte[] hashUUIDv4(byte[] bytes) {
ankitsultana marked this conversation as resolved.
Show resolved Hide resolved
if (bytes.length % 36 != 0) {
return bytes;
}
byte[] resultBytes = new byte[(bytes.length / 36) * 16];
ByteBuffer byteBuffer = ByteBuffer.wrap(resultBytes).order(ByteOrder.BIG_ENDIAN);
for (int chunk = 0; chunk < bytes.length; chunk += 36) {
byte[] tempBytes = new byte[36];
System.arraycopy(bytes, chunk, tempBytes, 0, tempBytes.length);
UUID uuid;
try {
uuid = UUID.fromString(new String(tempBytes, StandardCharsets.UTF_8));
} catch (Exception e) {
// In case of failures, make the hash no-op.
return bytes;
ankitsultana marked this conversation as resolved.
Show resolved Hide resolved
}
long lsb = uuid.getLeastSignificantBits();
long msb = uuid.getMostSignificantBits();
byteBuffer.putLong(msb);
byteBuffer.putLong(lsb);
}
return resultBytes;
}

public static Object hashPrimaryKey(PrimaryKey primaryKey, HashFunction hashFunction) {
switch (hashFunction) {
case NONE:
Expand All @@ -44,6 +77,8 @@ public static Object hashPrimaryKey(PrimaryKey primaryKey, HashFunction hashFunc
return new ByteArray(HashUtils.hashMD5(primaryKey.asBytes()));
case MURMUR3:
return new ByteArray(HashUtils.hashMurmur3(primaryKey.asBytes()));
case UUID_V4:
return new ByteArray(HashUtils.hashUUIDv4(primaryKey.asBytes()));
ankitsultana marked this conversation as resolved.
Show resolved Hide resolved
default:
throw new IllegalArgumentException(String.format("Unrecognized hash function %s", hashFunction));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
*/
package org.apache.pinot.segment.local.utils;

import java.nio.charset.StandardCharsets;
import java.util.UUID;
import org.apache.pinot.spi.utils.BytesUtils;
import org.testng.Assert;
import org.testng.annotations.Test;
Expand All @@ -31,4 +33,45 @@ public void testHashPlainValues() {
Assert.assertEquals(BytesUtils.toHexString(HashUtils.hashMurmur3("hello world".getBytes())),
"0e617feb46603f53b163eb607d4697ab");
}

/**
 * Verifies the UUID hash round-trips for one and for several concatenated UUIDs, and that it degrades to a no-op
 * (returning the very same array) for inputs it cannot interpret.
 */
@Test
public void testHashUUIDv4() {
  testHashUUIDv4(new UUID[]{UUID.randomUUID()});
  testHashUUIDv4(new UUID[]{UUID.randomUUID(), UUID.randomUUID(), UUID.randomUUID()});

  // Failure scenario 1: length is not a multiple of 36, so the input array must be returned as is.
  byte[] invalidLength = new byte[8];
  // Set byte 0 to an arbitrary value.
  invalidLength[0] = 0x10;
  Assert.assertSame(HashUtils.hashUUIDv4(invalidLength), invalidLength);

  // Failure scenario 2: length is exactly 36 but the content is not a UUID, so parsing fails and the input array
  // must again be returned as is.
  byte[] unparseable = "not-a-uuid-not-a-uuid-not-a-uuid-123".getBytes(StandardCharsets.UTF_8);
  Assert.assertEquals(unparseable.length, 36);
  Assert.assertSame(HashUtils.hashUUIDv4(unparseable), unparseable);
}

/**
 * Hashes the concatenation of the given UUIDs and checks that every UUID can be rebuilt from its 16-byte slice of
 * the result.
 */
private void testHashUUIDv4(UUID[] uuids) {
  // The primary-key payload is the textual form of every UUID, back to back.
  StringBuilder joined = new StringBuilder();
  for (UUID uuid : uuids) {
    joined.append(uuid);
  }
  byte[] rawKey = joined.toString().getBytes(StandardCharsets.UTF_8);
  // Sanity-check the fixture: 36 bytes per UUID string.
  Assert.assertEquals(rawKey.length, 36 * uuids.length);

  byte[] packed = HashUtils.hashUUIDv4(rawKey);
  // The hashed form uses 16 bytes per UUID.
  Assert.assertEquals(packed.length, 16 * uuids.length);

  // Walk the output in 16-byte steps; the first 8 bytes are the high half, the next 8 the low half (big-endian).
  for (int offset = 0; offset < packed.length; offset += 16) {
    long high = 0;
    long low = 0;
    for (int i = 0; i < 8; i++) {
      high = (high << 8) | (packed[offset + i] & 0xFF);
    }
    for (int i = 8; i < 16; i++) {
      low = (low << 8) | (packed[offset + i] & 0xFF);
    }
    Assert.assertEquals(new UUID(high, low), uuids[offset / 16]);
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@
package org.apache.pinot.spi.config.table;

/**
 * Hash functions that can be applied to a primary key. {@code UUID_V4} is intended for primary keys whose columns
 * are guaranteed to be type-4 UUIDs.
 */
public enum HashFunction {
  NONE, MD5, MURMUR3, UUID_V4
}
Loading