Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hash Function for UUID Primary Keys #12538

Merged
merged 9 commits into from
May 17, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
package org.apache.pinot.segment.local.utils;

import com.google.common.hash.Hashing;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.util.UUID;
import org.apache.pinot.spi.config.table.HashFunction;
import org.apache.pinot.spi.data.readers.PrimaryKey;
import org.apache.pinot.spi.utils.ByteArray;
Expand All @@ -36,6 +40,33 @@ public static byte[] hashMD5(byte[] bytes) {
return Hashing.md5().hashBytes(bytes).asBytes();
}

/**
* Returns a byte array that is a concatenation of the binary representation of each of the passed UUID values. This
* is done by getting a String from each value by calling {@link Object#toString()}, which is then used to create a
* {@link UUID} object. The 16 bytes of each UUID are appended to a buffer which is then returned in the result.
* If any of the value is not a valid UUID, then this function appends the UTF-8 string bytes of all elements and
* returns that in the result.
*/
public static byte[] hashUUID(Object[] values) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For clarification, we are not really hashing the values, but compress it instead right?

byte[] result = new byte[values.length * 16];
ByteBuffer byteBuffer = ByteBuffer.wrap(result).order(ByteOrder.BIG_ENDIAN);
for (Object value : values) {
if (value == null) {
return concatenate(values);
ankitsultana marked this conversation as resolved.
Show resolved Hide resolved
}
String uuidString = value.toString();
ankitsultana marked this conversation as resolved.
Show resolved Hide resolved
UUID uuid;
try {
uuid = UUID.fromString(uuidString);
} catch (Throwable t) {
ankitsultana marked this conversation as resolved.
Show resolved Hide resolved
return concatenate(values);
}
byteBuffer.putLong(uuid.getMostSignificantBits());
byteBuffer.putLong(uuid.getLeastSignificantBits());
}
return result;
}

public static Object hashPrimaryKey(PrimaryKey primaryKey, HashFunction hashFunction) {
switch (hashFunction) {
case NONE:
Expand All @@ -44,8 +75,26 @@ public static Object hashPrimaryKey(PrimaryKey primaryKey, HashFunction hashFunc
return new ByteArray(HashUtils.hashMD5(primaryKey.asBytes()));
case MURMUR3:
return new ByteArray(HashUtils.hashMurmur3(primaryKey.asBytes()));
case UUID:
return new ByteArray(HashUtils.hashUUID(primaryKey.getValues()));
default:
throw new IllegalArgumentException(String.format("Unrecognized hash function %s", hashFunction));
}
}

private static byte[] concatenate(Object[] values) {
ankitsultana marked this conversation as resolved.
Show resolved Hide resolved
ankitsultana marked this conversation as resolved.
Show resolved Hide resolved
byte[][] allValueBytes = new byte[values.length][];
int totalLen = 0;
for (int j = 0; j < allValueBytes.length; j++) {
allValueBytes[j] = values[j] == null ? "null".getBytes(StandardCharsets.UTF_8)
ankitsultana marked this conversation as resolved.
Show resolved Hide resolved
: values[j].toString().getBytes(StandardCharsets.UTF_8);
totalLen += allValueBytes[j].length;
}
byte[] result = new byte[totalLen];
for (int j = 0, offset = 0; j < allValueBytes.length; j++) {
System.arraycopy(allValueBytes[j], 0, result, offset, allValueBytes[j].length);
offset += allValueBytes[j].length;
}
return result;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
*/
package org.apache.pinot.segment.local.utils;

import java.nio.charset.StandardCharsets;
import java.util.UUID;
import org.apache.pinot.spi.utils.BytesUtils;
import org.testng.Assert;
import org.testng.annotations.Test;
Expand All @@ -31,4 +33,43 @@ public void testHashPlainValues() {
Assert.assertEquals(BytesUtils.toHexString(HashUtils.hashMurmur3("hello world".getBytes())),
"0e617feb46603f53b163eb607d4697ab");
}

@Test
public void testHashUUID() {
  // Valid input: one UUID and several UUIDs must round-trip through the 16-bytes-per-value encoding.
  testHashUUID(new UUID[]{UUID.randomUUID()});
  testHashUUID(new UUID[]{UUID.randomUUID(), UUID.randomUUID(), UUID.randomUUID()});

  // A value that is not a UUID: the result is the UTF-8 bytes of the input string.
  String notAUuid = "some-random-string";
  Assert.assertEquals(HashUtils.hashUUID(new String[]{notAUuid}), notAUuid.getBytes(StandardCharsets.UTF_8));

  // A null value: the result must exist and end with the text "null".
  byte[] withNull = HashUtils.hashUUID(new String[]{UUID.randomUUID().toString(), null});
  Assert.assertNotNull(withNull);
  byte[] tail = new byte[4];
  System.arraycopy(withNull, withNull.length - tail.length, tail, 0, tail.length);
  Assert.assertEquals(new String(tail, StandardCharsets.UTF_8), "null");
}

/**
 * Verifies that {@code HashUtils.hashUUID} emits exactly 16 bytes per UUID and that every input UUID can be rebuilt
 * from its 16-byte slice (8 bytes of most significant bits followed by 8 bytes of least significant bits, each read
 * most-significant byte first).
 */
private void testHashUUID(UUID[] uuids) {
  byte[] packed = HashUtils.hashUUID(uuids);
  // Every UUID must occupy exactly 16 bytes of the output.
  Assert.assertEquals(packed.length, 16 * uuids.length);

  int cursor = 0;
  for (int uuidIndex = 0; cursor < packed.length; uuidIndex++) {
    // First 8 bytes of the slice: most significant bits.
    long high = 0;
    for (int end = cursor + 8; cursor < end; cursor++) {
      high = (high << 8) | (packed[cursor] & 0xFF);
    }
    // Next 8 bytes of the slice: least significant bits.
    long low = 0;
    for (int end = cursor + 8; cursor < end; cursor++) {
      low = (low << 8) | (packed[cursor] & 0xFF);
    }
    Assert.assertEquals(new UUID(high, low), uuids[uuidIndex]);
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@
package org.apache.pinot.spi.config.table;

/**
 * Selects how a primary key is transformed by {@code HashUtils.hashPrimaryKey}, which switches over these
 * constants. {@code UUID} routes the key's values through {@code HashUtils.hashUUID}.
 */
public enum HashFunction {
NONE, MD5, MURMUR3
NONE, MD5, MURMUR3, UUID
}
Loading