Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hash Function for UUID Primary Keys #12538

Merged
merged 9 commits into from
May 17, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
package org.apache.pinot.segment.local.utils;

import com.google.common.hash.Hashing;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.UUID;
import org.apache.pinot.spi.config.table.HashFunction;
import org.apache.pinot.spi.data.readers.PrimaryKey;
import org.apache.pinot.spi.utils.ByteArray;
Expand All @@ -36,6 +39,30 @@ public static byte[] hashMD5(byte[] bytes) {
return Hashing.md5().hashBytes(bytes).asBytes();
}

/**
* Returns a byte array that is a concatenation of the binary representation of each of the passed UUID values.
* If any of the values is not a valid UUID, then we return the result of {@link PrimaryKey#asBytes()}.
*/
public static byte[] hashUUID(PrimaryKey primaryKey) {
Object[] values = primaryKey.getValues();
byte[] result = new byte[values.length * 16];
ByteBuffer byteBuffer = ByteBuffer.wrap(result).order(ByteOrder.BIG_ENDIAN);
for (Object value : values) {
if (value == null) {
throw new IllegalArgumentException("Found null value in primary key");
}
UUID uuid;
try {
uuid = UUID.fromString(value.toString());
} catch (Throwable t) {
ankitsultana marked this conversation as resolved.
Show resolved Hide resolved
return primaryKey.asBytes();
}
byteBuffer.putLong(uuid.getMostSignificantBits());
byteBuffer.putLong(uuid.getLeastSignificantBits());
}
return result;
}

public static Object hashPrimaryKey(PrimaryKey primaryKey, HashFunction hashFunction) {
switch (hashFunction) {
case NONE:
Expand All @@ -44,6 +71,8 @@ public static Object hashPrimaryKey(PrimaryKey primaryKey, HashFunction hashFunc
return new ByteArray(HashUtils.hashMD5(primaryKey.asBytes()));
case MURMUR3:
return new ByteArray(HashUtils.hashMurmur3(primaryKey.asBytes()));
case UUID:
return new ByteArray(HashUtils.hashUUID(primaryKey));
default:
throw new IllegalArgumentException(String.format("Unrecognized hash function %s", hashFunction));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,64 @@
*/
package org.apache.pinot.segment.local.utils;

import java.util.UUID;
import org.apache.pinot.spi.data.readers.PrimaryKey;
import org.apache.pinot.spi.utils.BytesUtils;
import org.testng.Assert;
import org.testng.annotations.Test;

import static org.testng.Assert.*;


public class HashUtilsTest {
@Test
public void testHashPlainValues() {
Assert.assertEquals(BytesUtils.toHexString(HashUtils.hashMD5("hello world".getBytes())),
assertEquals(BytesUtils.toHexString(HashUtils.hashMD5("hello world".getBytes())),
"5eb63bbbe01eeed093cb22bb8f5acdc3");
Assert.assertEquals(BytesUtils.toHexString(HashUtils.hashMurmur3("hello world".getBytes())),
assertEquals(BytesUtils.toHexString(HashUtils.hashMurmur3("hello world".getBytes())),
"0e617feb46603f53b163eb607d4697ab");
}

@Test
public void testHashUUID() {
// Test happy cases: when all UUID values are valid
testHashUUID(new UUID[]{UUID.randomUUID()});
testHashUUID(new UUID[]{UUID.randomUUID(), UUID.randomUUID(), UUID.randomUUID()});

// Test failure scenario when there's a non-null invalid uuid value
PrimaryKey invalidUUIDs = new PrimaryKey(new String[]{"some-random-string"});
byte[] hashResult = HashUtils.hashUUID(invalidUUIDs);
// In case of failures, each element is prepended with length
byte[] expectedResult = invalidUUIDs.asBytes();
assertEquals(hashResult, expectedResult);
// Test failure scenario when one of the values is null
try {
PrimaryKey pKeyWithNull = new PrimaryKey(new String[]{UUID.randomUUID().toString(), null});
HashUtils.hashUUID(pKeyWithNull);
fail("Should have thrown an exception");
} catch (IllegalArgumentException e) {
assertTrue(e.getMessage().contains("Found null value"));
}
}

private void testHashUUID(UUID[] uuids) {
byte[] convertedBytes = HashUtils.hashUUID(new PrimaryKey(uuids));
// After hashing, each UUID should take 16 bytes.
assertEquals(convertedBytes.length, 16 * uuids.length);
// Below we reconstruct each UUID from the reduced 16-byte representation, and ensure it is the same as the input.
int convertedByteIndex = 0;
int uuidIndex = 0;
while (convertedByteIndex < convertedBytes.length) {
long msb = 0;
long lsb = 0;
for (int i = 0; i < 8; i++, convertedByteIndex++) {
msb = (msb << 8) | (convertedBytes[convertedByteIndex] & 0xFF);
}
for (int i = 0; i < 8; i++, convertedByteIndex++) {
lsb = (lsb << 8) | (convertedBytes[convertedByteIndex] & 0xFF);
}
UUID reconstructedUUID = new UUID(msb, lsb);
assertEquals(reconstructedUUID, uuids[uuidIndex]);
uuidIndex++;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@
package org.apache.pinot.spi.config.table;

/**
 * Hash functions that can be selected for hashing a primary key.
 *
 * <ul>
 *   <li>{@code NONE} - presumably leaves the primary key unhashed (NOTE(review): confirm against the caller).</li>
 *   <li>{@code MD5} - MD5 digest of the serialized primary key bytes.</li>
 *   <li>{@code MURMUR3} - Murmur3 hash of the serialized primary key bytes.</li>
 *   <li>{@code UUID} - compact 16-bytes-per-value encoding for primary keys whose values are UUIDs.</li>
 * </ul>
 *
 * <p>New constants are appended at the end so the ordinals of existing constants do not change.
 */
public enum HashFunction {
  NONE, MD5, MURMUR3, UUID
}
Loading