From e846bbaf60dd1a94c1f443aef2da8a41cdb88659 Mon Sep 17 00:00:00 2001 From: Prashant Singh Date: Wed, 15 Mar 2023 14:48:54 -0700 Subject: [PATCH 1/7] Core: Optimize S3 layout --- .../org/apache/iceberg/LocationProviders.java | 12 ++--- .../org/apache/iceberg/util/HashUtils.java | 52 +++++++++++++++++++ 2 files changed, 56 insertions(+), 8 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/util/HashUtils.java diff --git a/core/src/main/java/org/apache/iceberg/LocationProviders.java b/core/src/main/java/org/apache/iceberg/LocationProviders.java index 61ad1c2a5727..661ae6f86da1 100644 --- a/core/src/main/java/org/apache/iceberg/LocationProviders.java +++ b/core/src/main/java/org/apache/iceberg/LocationProviders.java @@ -19,13 +19,11 @@ package org.apache.iceberg; import java.util.Map; -import java.util.function.Function; import org.apache.hadoop.fs.Path; import org.apache.iceberg.common.DynConstructors; import org.apache.iceberg.io.LocationProvider; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.transforms.Transforms; -import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.HashUtils; import org.apache.iceberg.util.LocationUtil; import org.apache.iceberg.util.PropertyUtil; @@ -104,8 +102,6 @@ public String newDataLocation(String filename) { } static class ObjectStoreLocationProvider implements LocationProvider { - private static final Function HASH_FUNC = - Transforms.bucket(Integer.MAX_VALUE).bind(Types.StringType.get()); private final String storageLocation; private final String context; @@ -143,11 +139,11 @@ public String newDataLocation(PartitionSpec spec, StructLike partitionData, Stri @Override public String newDataLocation(String filename) { - int hash = HASH_FUNC.apply(filename); + String hash = HashUtils.computeHash(filename); if (context != null) { - return String.format("%s/%08x/%s/%s", storageLocation, hash, context, filename); + return String.format("%s/%s/%s/%s", storageLocation, hash, context, filename); } else { - return String.format("%s/%08x/%s", storageLocation, hash, filename); + return String.format("%s/%s/%s", storageLocation, hash, filename); } } diff --git a/core/src/main/java/org/apache/iceberg/util/HashUtils.java b/core/src/main/java/org/apache/iceberg/util/HashUtils.java new file mode 100644 index 000000000000..d774c18af633 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/util/HashUtils.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.util; + +import java.nio.charset.StandardCharsets; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.hash.HashFunction; +import org.apache.iceberg.relocated.com.google.common.hash.Hashing; + +public class HashUtils { + + private HashUtils() {} + + private static final HashFunction hashFunction = Hashing.sha1(); + private static final String allChars = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + + /** + * Compute hash containing alphanumeric characters + * + * @param fileName name of the file + * @return hash generated + */ + public static String computeHash(String fileName) { + Preconditions.checkState(fileName != null, "fileName cannot be null"); + byte[] messageDigest = + hashFunction.hashBytes(fileName.getBytes(StandardCharsets.UTF_8)).asBytes(); + + StringBuilder hash = new StringBuilder(); + for (int i = 0; i < 8; ++i) { + hash.append(allChars.charAt((messageDigest[i] % 62 + 62) % 62)); + } + + return hash.toString(); + } +} From 7a04d655aa3ab99d04f4c7c06577a5aff33525e3 Mon Sep 17 00:00:00 2001 From: Prashant Singh Date: Thu, 16 Mar 2023 18:09:08 -0700 Subject: [PATCH 2/7] address review feedback --- .../org/apache/iceberg/LocationProviders.java | 23 +++++++- .../org/apache/iceberg/util/HashUtils.java | 52 ------------------- 2 files changed, 21 insertions(+), 54 deletions(-) delete mode 100644 core/src/main/java/org/apache/iceberg/util/HashUtils.java diff --git a/core/src/main/java/org/apache/iceberg/LocationProviders.java b/core/src/main/java/org/apache/iceberg/LocationProviders.java index 661ae6f86da1..0f649457d69e 100644 --- a/core/src/main/java/org/apache/iceberg/LocationProviders.java +++ b/core/src/main/java/org/apache/iceberg/LocationProviders.java @@ -18,12 +18,14 @@ */ package org.apache.iceberg; +import java.nio.charset.StandardCharsets; import java.util.Map; import org.apache.hadoop.fs.Path; import org.apache.iceberg.common.DynConstructors; import org.apache.iceberg.io.LocationProvider; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.HashUtils; +import org.apache.iceberg.relocated.com.google.common.hash.HashFunction; +import org.apache.iceberg.relocated.com.google.common.hash.Hashing; import org.apache.iceberg.util.LocationUtil; import org.apache.iceberg.util.PropertyUtil; @@ -103,6 +105,10 @@ public String newDataLocation(String filename) { static class ObjectStoreLocationProvider implements LocationProvider { + private static final HashFunction HASH_FUNC = Hashing.sha1(); + + private static final String allChars = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; private final String storageLocation; private final String context; @@ -139,7 +145,7 @@ public String newDataLocation(PartitionSpec spec, StructLike partitionData, Stri @Override public String newDataLocation(String filename) { - String hash = HashUtils.computeHash(filename); + String hash = computeHash(filename); if (context != null) { return String.format("%s/%s/%s/%s", storageLocation, hash, context, filename); } else { @@ -163,5 +169,18 @@ private static String pathContext(String tableLocation) { return resolvedContext; } + + private static String computeHash(String fileName) { + Preconditions.checkState(fileName != null, "fileName cannot be null"); + byte[] messageDigest = + HASH_FUNC.hashBytes(fileName.getBytes(StandardCharsets.UTF_8)).asBytes(); + + StringBuilder hash = new StringBuilder(); + for (int i = 0; i < 8; ++i) { + hash.append(allChars.charAt((messageDigest[i] % 62 + 62) % 62)); + } + + return hash.toString(); + } } } diff --git a/core/src/main/java/org/apache/iceberg/util/HashUtils.java b/core/src/main/java/org/apache/iceberg/util/HashUtils.java deleted file mode 100644 index d774c18af633..000000000000 --- a/core/src/main/java/org/apache/iceberg/util/HashUtils.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.util; - -import java.nio.charset.StandardCharsets; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.hash.HashFunction; -import org.apache.iceberg.relocated.com.google.common.hash.Hashing; - -public class HashUtils { - - private HashUtils() {} - - private static final HashFunction hashFunction = Hashing.sha1(); - private static final String allChars = - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - - /** - * Compute hash containing alphanumeric characters - * - * @param fileName name of the file - * @return hash generated - */ - public static String computeHash(String fileName) { - Preconditions.checkState(fileName != null, "fileName cannot be null"); - byte[] messageDigest = - hashFunction.hashBytes(fileName.getBytes(StandardCharsets.UTF_8)).asBytes(); - - StringBuilder hash = new StringBuilder(); - for (int i = 0; i < 8; ++i) { - hash.append(allChars.charAt((messageDigest[i] % 62 + 62) % 62)); - } - - return hash.toString(); - } -} From 6757c8a455a491a17c4fb826361b1ac2ada5fb7a Mon Sep 17 00:00:00 2001 From: Prashant Singh Date: Sun, 19 Mar 2023 14:46:31 -0700 Subject: [PATCH 3/7] Address review feedback --- .../java/org/apache/iceberg/LocationProviders.java | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/LocationProviders.java b/core/src/main/java/org/apache/iceberg/LocationProviders.java index 0f649457d69e..6182b9008f8e 100644 --- a/core/src/main/java/org/apache/iceberg/LocationProviders.java +++ b/core/src/main/java/org/apache/iceberg/LocationProviders.java @@ -19,6 +19,7 @@ package org.apache.iceberg; import java.nio.charset.StandardCharsets; +import java.util.Base64; import java.util.Map; import org.apache.hadoop.fs.Path; import org.apache.iceberg.common.DynConstructors; @@ -105,10 +106,7 @@ public String newDataLocation(String filename) { static class ObjectStoreLocationProvider implements LocationProvider { - private static final HashFunction HASH_FUNC = Hashing.sha1(); - - private static final String allChars = - "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + private static final HashFunction HASH_FUNC = Hashing.murmur3_32_fixed(); private final String storageLocation; private final String context; @@ -174,13 +172,9 @@ private static String computeHash(String fileName) { Preconditions.checkState(fileName != null, "fileName cannot be null"); byte[] messageDigest = HASH_FUNC.hashBytes(fileName.getBytes(StandardCharsets.UTF_8)).asBytes(); + String hash = Base64.getUrlEncoder().encodeToString(messageDigest); - StringBuilder hash = new StringBuilder(); - for (int i = 0; i < 8; ++i) { - hash.append(allChars.charAt((messageDigest[i] % 62 + 62) % 62)); - } - - return hash.toString(); + return hash.substring(0, 8); } } } From e70c81a9773cc2e4d54ff16316c5dc6c46fd0b8b Mon Sep 17 00:00:00 2001 From: Prashant Singh Date: Mon, 20 Mar 2023 16:22:59 -0700 Subject: [PATCH 4/7] use the suggested hash function --- .../main/java/org/apache/iceberg/LocationProviders.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/iceberg/LocationProviders.java b/core/src/main/java/org/apache/iceberg/LocationProviders.java index 6182b9008f8e..83c7777d4e91 100644 --- a/core/src/main/java/org/apache/iceberg/LocationProviders.java +++ b/core/src/main/java/org/apache/iceberg/LocationProviders.java @@ -21,6 +21,7 @@ import java.nio.charset.StandardCharsets; import java.util.Base64; import java.util.Map; +import java.util.Random; import org.apache.hadoop.fs.Path; import org.apache.iceberg.common.DynConstructors; import org.apache.iceberg.io.LocationProvider; @@ -107,6 +108,7 @@ public String newDataLocation(String filename) { static class ObjectStoreLocationProvider implements LocationProvider { private static final HashFunction HASH_FUNC = Hashing.murmur3_32_fixed(); + private static final Random random = new Random(1); private final String storageLocation; private final String context; @@ -170,9 +172,12 @@ private static String pathContext(String tableLocation) { private static String computeHash(String fileName) { Preconditions.checkState(fileName != null, "fileName cannot be null"); + byte[] hashBuffer = new byte[6]; + random.nextBytes(hashBuffer); byte[] messageDigest = HASH_FUNC.hashBytes(fileName.getBytes(StandardCharsets.UTF_8)).asBytes(); - String hash = Base64.getUrlEncoder().encodeToString(messageDigest); + System.arraycopy(messageDigest, 0, hashBuffer, 0, 4); + String hash = Base64.getUrlEncoder().encodeToString(hashBuffer); return hash.substring(0, 8); } From 42aad4eb278cab2d34488daceb8b6215b023fb97 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Sun, 2 Apr 2023 16:45:13 -0700 Subject: [PATCH 5/7] Core: Improve bit density in object storage layout. --- .../org/apache/iceberg/LocationProviders.java | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/LocationProviders.java b/core/src/main/java/org/apache/iceberg/LocationProviders.java index 83c7777d4e91..2790a74c580a 100644 --- a/core/src/main/java/org/apache/iceberg/LocationProviders.java +++ b/core/src/main/java/org/apache/iceberg/LocationProviders.java @@ -19,15 +19,15 @@ package org.apache.iceberg; import java.nio.charset.StandardCharsets; -import java.util.Base64; import java.util.Map; -import java.util.Random; import org.apache.hadoop.fs.Path; import org.apache.iceberg.common.DynConstructors; import org.apache.iceberg.io.LocationProvider; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.hash.HashCode; import org.apache.iceberg.relocated.com.google.common.hash.HashFunction; import org.apache.iceberg.relocated.com.google.common.hash.Hashing; +import org.apache.iceberg.relocated.com.google.common.io.BaseEncoding; import org.apache.iceberg.util.LocationUtil; import org.apache.iceberg.util.PropertyUtil; @@ -108,7 +108,8 @@ public String newDataLocation(String filename) { static class ObjectStoreLocationProvider implements LocationProvider { private static final HashFunction HASH_FUNC = Hashing.murmur3_32_fixed(); - private static final Random random = new Random(1); + private static final BaseEncoding BASE64_ENCODER = BaseEncoding.base64Url().lowerCase().omitPadding(); + private final ThreadLocal temp = ThreadLocal.withInitial(() -> new byte[4]); private final String storageLocation; private final String context; @@ -170,16 +171,11 @@ private static String pathContext(String tableLocation) { return resolvedContext; } - private static String computeHash(String fileName) { - Preconditions.checkState(fileName != null, "fileName cannot be null"); - byte[] hashBuffer = new byte[6]; - random.nextBytes(hashBuffer); - byte[] messageDigest = - HASH_FUNC.hashBytes(fileName.getBytes(StandardCharsets.UTF_8)).asBytes(); - System.arraycopy(messageDigest, 0, hashBuffer, 0, 4); - String hash = Base64.getUrlEncoder().encodeToString(hashBuffer); - - return hash.substring(0, 8); + private String computeHash(String fileName) { + byte[] bytes = temp.get(); + HashCode hash = HASH_FUNC.hashString(fileName, StandardCharsets.UTF_8); + hash.writeBytesTo(bytes, 0, 4); + return BASE64_ENCODER.encode(bytes); } } } From b0eb42ae4027c62481984fc7c1447b28151ff23b Mon Sep 17 00:00:00 2001 From: Prashant Singh Date: Sun, 2 Apr 2023 23:34:36 -0700 Subject: [PATCH 6/7] fix style checks --- core/src/main/java/org/apache/iceberg/LocationProviders.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/iceberg/LocationProviders.java b/core/src/main/java/org/apache/iceberg/LocationProviders.java index 2790a74c580a..74223c267938 100644 --- a/core/src/main/java/org/apache/iceberg/LocationProviders.java +++ b/core/src/main/java/org/apache/iceberg/LocationProviders.java @@ -108,7 +108,8 @@ public String newDataLocation(String filename) { static class ObjectStoreLocationProvider implements LocationProvider { private static final HashFunction HASH_FUNC = Hashing.murmur3_32_fixed(); - private static final BaseEncoding BASE64_ENCODER = BaseEncoding.base64Url().lowerCase().omitPadding(); + private static final BaseEncoding BASE64_ENCODER = + BaseEncoding.base64Url().lowerCase().omitPadding(); private final ThreadLocal temp = ThreadLocal.withInitial(() -> new byte[4]); private final String storageLocation; private final String context; From 024440c2224e0813b91a9b74e283fe999322f1df Mon Sep 17 00:00:00 2001 From: Prashant Singh Date: Mon, 3 Apr 2023 08:34:23 -0700 Subject: [PATCH 7/7] remove lowe-case requriement from base64 encoder --- core/src/main/java/org/apache/iceberg/LocationProviders.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/LocationProviders.java b/core/src/main/java/org/apache/iceberg/LocationProviders.java index 74223c267938..8a37b508eb93 100644 --- a/core/src/main/java/org/apache/iceberg/LocationProviders.java +++ b/core/src/main/java/org/apache/iceberg/LocationProviders.java @@ -108,8 +108,7 @@ public String newDataLocation(String filename) { static class ObjectStoreLocationProvider implements LocationProvider { private static final HashFunction HASH_FUNC = Hashing.murmur3_32_fixed(); - private static final BaseEncoding BASE64_ENCODER = - BaseEncoding.base64Url().lowerCase().omitPadding(); + private static final BaseEncoding BASE64_ENCODER = BaseEncoding.base64Url().omitPadding(); private final ThreadLocal temp = ThreadLocal.withInitial(() -> new byte[4]); private final String storageLocation; private final String context;