Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[#4528] improvement(hive-catalog): reduce hive catalog libs size from 146MB to 43MB #4531

Merged
merged 5 commits into from
Aug 30, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions authorizations/authorization-ranger/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,17 @@ dependencies {
implementation(project(":core")) {
exclude(group = "*")
}

implementation(libs.bundles.log4j)
implementation(libs.commons.lang3)
implementation(libs.guava)

implementation(libs.javax.jaxb.api) {
exclude("*")
}
implementation(libs.javax.ws.rs.api)
implementation(libs.jettison)
compileOnly(libs.lombok)
implementation(libs.mail)
implementation(libs.ranger.intg) {
exclude("org.apache.hadoop", "hadoop-common")
exclude("org.apache.hive", "hive-storage-api")
Expand All @@ -50,11 +56,9 @@ dependencies {
exclude("org.apache.ranger", "ranger-plugin-classloader")
exclude("net.java.dev.jna")
exclude("javax.ws.rs")
exclude("org.eclipse.jetty")
}
implementation(libs.javax.ws.rs.api)
implementation(libs.javax.jaxb.api) {
exclude("*")
}
implementation(libs.rome)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to introduce some dependencies like this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because I excluded some Hive catalog dependencies, RangerHiveIT encountered a ClassNotFoundException, so I introduced these dependencies to fix it.

I think the root cause is that the AuthRanger plugin uses the same class loader as the Hive catalog. Previously, the Ranger plugin used dependencies of the Hive catalog.


testImplementation(project(":common"))
testImplementation(project(":clients:client-java"))
Expand All @@ -70,6 +74,7 @@ dependencies {
exclude("org.apache.lucene")
exclude("org.apache.solr")
exclude("org.apache.kafka")
exclude("org.eclipse.jetty")
exclude("org.elasticsearch")
exclude("org.elasticsearch.client")
exclude("org.elasticsearch.plugin")
Expand All @@ -78,6 +83,7 @@ dependencies {
}
testImplementation(libs.hive2.jdbc) {
exclude("org.slf4j")
exclude("org.eclipse.jetty.aggregate")
}
testImplementation(libs.mysql.driver)
}
Expand Down
35 changes: 31 additions & 4 deletions catalogs/catalog-hive/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -30,32 +30,52 @@ val icebergVersion: String = libs.versions.iceberg.get()
val scalaCollectionCompatVersion: String = libs.versions.scala.collection.compat.get()

dependencies {
implementation(project(":api"))
implementation(project(":catalogs:catalog-common"))
implementation(project(":core"))
implementation(project(":api")) {
exclude("*")
}
implementation(project(":catalogs:catalog-common")) {
exclude("*")
}
implementation(project(":core")) {
exclude("*")
}

implementation(libs.caffeine)
implementation(libs.commons.collections3)
implementation(libs.commons.configuration1)
implementation(libs.htrace.core4)
implementation(libs.guava)
implementation(libs.hadoop2.auth) {
exclude("*")
}
implementation(libs.hive2.exec) {
artifact {
classifier = "core"
}
exclude("com.google.code.findbugs", "jsr305")
exclude("com.google.protobuf")
exclude("org.apache.avro")
exclude("org.apache.ant")
exclude("org.apache.calcite")
exclude("org.apache.calcite.avatica")
exclude("org.apache.curator")
exclude("org.apache.derby")
exclude("org.apache.hadoop", "hadoop-yarn-server-resourcemanager")
exclude("org.apache.hive", "hive-llap-tez")
exclude("org.apache.ivy")
exclude("org.apache.logging.log4j")
exclude("org.apache.zookeeper")
exclude("org.codehaus.groovy", "groovy-all")
exclude("org.datanucleus", "datanucleus-core")
exclude("org.eclipse.jetty.aggregate", "jetty-all")
exclude("org.eclipse.jetty.orbit", "javax.servlet")
exclude("org.openjdk.jol")
exclude("org.pentaho")
exclude("org.slf4j")
}
implementation(libs.woodstox.core)
implementation(libs.hive2.metastore) {
exclude("ant")
exclude("co.cask.tephra")
exclude("com.github.joshelser")
exclude("com.google.code.findbugs", "jsr305")
Expand All @@ -64,13 +84,16 @@ dependencies {
exclude("com.zaxxer", "HikariCP")
exclude("io.dropwizard.metricss")
exclude("javax.transaction", "transaction-api")
exclude("org.apache.ant")
exclude("org.apache.avro")
exclude("org.apache.curator")
exclude("org.apache.derby")
exclude("org.apache.hadoop", "hadoop-yarn-server-resourcemanager")
exclude("org.apache.hbase")
exclude("org.apache.logging.log4j")
exclude("org.apache.parquet", "parquet-hadoop-bundle")
exclude("org.apache.zookeeper")
exclude("org.datanucleus")
exclude("org.eclipse.jetty.aggregate", "jetty-all")
exclude("org.eclipse.jetty.orbit", "javax.servlet")
exclude("org.openjdk.jol")
Expand Down Expand Up @@ -135,7 +158,11 @@ tasks {

val copyCatalogLibs by registering(Copy::class) {
dependsOn("jar", "runtimeJars")
from("build/libs")
from("build/libs") {
exclude("guava-*.jar")
exclude("log4j-*.jar")
exclude("slf4j-*.jar")
}
into("$rootDir/distribution/package/catalogs/hive/libs")
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;
import lombok.ToString;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.gravitino.catalog.hive.converter.HiveDataTypeConverter;
import org.apache.gravitino.connector.BaseTable;
Expand Down Expand Up @@ -87,7 +86,7 @@ public static HiveTable.Builder fromHiveTable(Table table) {

StorageDescriptor sd = table.getSd();
Distribution distribution = Distributions.NONE;
if (CollectionUtils.isNotEmpty(sd.getBucketCols())) {
if (sd.getBucketCols() != null && !sd.getBucketCols().isEmpty()) {
// Hive table use hash strategy as bucketing strategy
distribution =
Distributions.hash(
Expand All @@ -96,7 +95,7 @@ public static HiveTable.Builder fromHiveTable(Table table) {
}

SortOrder[] sortOrders = new SortOrder[0];
if (CollectionUtils.isNotEmpty(sd.getSortCols())) {
if (sd.getSortCols() != null && !sd.getSortCols().isEmpty()) {
sortOrders =
sd.getSortCols().stream()
.map(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.metastore.api.UnknownTableException;
import org.apache.parquet.Strings;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -163,19 +162,19 @@ public Partition addPartition(Partition partition) throws PartitionAlreadyExists
Preconditions.checkArgument(
transformFields.size() == identityPartition.fieldNames().length,
"Hive partition field names must be the same as table partitioning field names: %s, but got %s",
Strings.join(transformFields, ","),
Strings.join(
String.join(",", transformFields),
String.join(
",",
Arrays.stream(identityPartition.fieldNames())
.map(f -> Strings.join(f, "."))
.collect(Collectors.toList()),
","));
.map(f -> String.join(".", f))
.collect(Collectors.toList())));
Arrays.stream(identityPartition.fieldNames())
.forEach(
f ->
Preconditions.checkArgument(
transformFields.contains(f[0]),
"Hive partition field name must be in table partitioning field names: %s, but got %s",
Strings.join(transformFields, ","),
String.join(",", transformFields),
f[0]));

try {
Expand Down
23 changes: 15 additions & 8 deletions gradle/libs.versions.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,19 @@ jetty = "9.4.51.v20230217"
jersey = "2.41"
mockito = "4.11.0"
airlift-json = "237"
airlift-log = "231"
airlift-resolver = "1.6"
airlift-units = "1.8"
hive2 = "2.3.9"
hadoop2 = "2.10.2"
hadoop3 = "3.1.0"
hadoop-minikdc = "3.3.6"
htrace-core4 = "4.1.0-incubating"
httpclient5 = "5.2.1"
mockserver = "5.15.0"
commons-lang3 = "3.14.0"
commons-io = "2.15.0"
commons-collections4 = "4.4"
commons-collections3 = "3.2.2"
commons-configuration1 = "1.6"
commons-dbcp2 = "2.11.0"
caffeine = "2.9.3"
rocksdbjni = "7.10.2"
Expand All @@ -62,7 +63,6 @@ jline = "3.21.0"
okhttp3 = "4.11.0"
metrics = "4.2.25"
prometheus = "0.16.0"
jsqlparser = "4.2"
mysql = "8.0.23"
postgresql = "42.6.0"
immutables-value = "2.10.0"
Expand Down Expand Up @@ -91,6 +91,10 @@ node-plugin = "7.0.1"
commons-cli = "1.2"
sun-activation-version = "1.2.0"
error-prone = "3.1.0"
woodstox-core = "5.3.0"
mail = "1.4.1"
rome = "1.0"
jettison = "1.1"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we need to add these new dependencies, we should also update the license.bin file.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added


[libraries]
protobuf-java = { group = "com.google.protobuf", name = "protobuf-java", version.ref = "protoc" }
Expand Down Expand Up @@ -130,18 +134,17 @@ hive2-metastore = { group = "org.apache.hive", name = "hive-metastore", version.
hive2-exec = { group = "org.apache.hive", name = "hive-exec", version.ref = "hive2"}
hive2-common = { group = "org.apache.hive", name = "hive-common", version.ref = "hive2"}
hive2-jdbc = { group = "org.apache.hive", name = "hive-jdbc", version.ref = "hive2"}
hadoop2-auth = { group = "org.apache.hadoop", name = "hadoop-auth", version.ref = "hadoop2" }
hadoop2-hdfs = { group = "org.apache.hadoop", name = "hadoop-hdfs", version.ref = "hadoop2" }
hadoop2-common = { group = "org.apache.hadoop", name = "hadoop-common", version.ref = "hadoop2"}
hadoop2-mapreduce-client-core = { group = "org.apache.hadoop", name = "hadoop-mapreduce-client-core", version.ref = "hadoop2"}
hadoop3-hdfs = { group = "org.apache.hadoop", name = "hadoop-hdfs", version.ref = "hadoop3" }
hadoop3-common = { group = "org.apache.hadoop", name = "hadoop-common", version.ref = "hadoop3"}
hadoop3-client = { group = "org.apache.hadoop", name = "hadoop-client", version.ref = "hadoop3"}
hadoop3-mapreduce-client-core = { group = "org.apache.hadoop", name = "hadoop-mapreduce-client-core", version.ref = "hadoop3"}
hadoop3-minicluster = { group = "org.apache.hadoop", name = "hadoop-minicluster", version.ref = "hadoop-minikdc"}
htrace-core4 = { group = "org.apache.htrace", name = "htrace-core4", version.ref = "htrace-core4" }
airlift-json = { group = "io.airlift", name = "json", version.ref = "airlift-json"}
airlift-resolver = { group = "io.airlift.resolver", name = "resolver", version.ref = "airlift-resolver"}
airlift-units = { group = "io.airlift", name = "units", version.ref = "airlift-units"}
airlift-log = { group = "io.airlift", name = "log", version.ref = "airlift-log"}
httpclient5 = { group = "org.apache.httpcomponents.client5", name = "httpclient5", version.ref = "httpclient5" }
mockserver-netty = { group = "org.mock-server", name = "mockserver-netty", version.ref = "mockserver" }
mockserver-client-java = { group = "org.mock-server", name = "mockserver-client-java", version.ref = "mockserver" }
Expand All @@ -150,6 +153,8 @@ commons-io = { group = "commons-io", name = "commons-io", version.ref = "commons
caffeine = { group = "com.github.ben-manes.caffeine", name = "caffeine", version.ref = "caffeine" }
rocksdbjni = { group = "org.rocksdb", name = "rocksdbjni", version.ref = "rocksdbjni" }
commons-collections4 = { group = "org.apache.commons", name = "commons-collections4", version.ref = "commons-collections4" }
commons-collections3 = { group = "commons-collections", name = "commons-collections", version.ref = "commons-collections3" }
commons-configuration1 = { group = "commons-configuration", name = "commons-configuration", version.ref = "commons-configuration1" }
iceberg-aws = { group = "org.apache.iceberg", name = "iceberg-aws", version.ref = "iceberg" }
iceberg-core = { group = "org.apache.iceberg", name = "iceberg-core", version.ref = "iceberg" }
iceberg-api = { group = "org.apache.iceberg", name = "iceberg-api", version.ref = "iceberg" }
Expand All @@ -158,7 +163,6 @@ paimon-core = { group = "org.apache.paimon", name = "paimon-core", version.ref =
paimon-format = { group = "org.apache.paimon", name = "paimon-format", version.ref = "paimon" }
paimon-hive-catalog = { group = "org.apache.paimon", name = "paimon-hive-catalog", version.ref = "paimon" }
trino-spi= { group = "io.trino", name = "trino-spi", version.ref = "trino" }
trino-toolkit= { group = "io.trino", name = "trino-plugin-toolkit", version.ref = "trino" }
trino-testing= { group = "io.trino", name = "trino-testing", version.ref = "trino" }
trino-memory= { group = "io.trino", name = "trino-memory", version.ref = "trino" }
trino-cli= { group = "io.trino", name = "trino-cli", version.ref = "trino" }
Expand All @@ -183,7 +187,6 @@ metrics-servlets = { group = "io.dropwizard.metrics", name = "metrics-servlets",
prometheus-client = { group = "io.prometheus", name = "simpleclient", version.ref = "prometheus" }
prometheus-dropwizard = { group = "io.prometheus", name = "simpleclient_dropwizard", version.ref = "prometheus" }
prometheus-servlet = { group = "io.prometheus", name = "simpleclient_servlet", version.ref = "prometheus" }
jsqlparser = { group = "com.github.jsqlparser", name = "jsqlparser", version.ref = "jsqlparser" }
mysql-driver = { group = "mysql", name = "mysql-connector-java", version.ref = "mysql" }
postgresql-driver = { group = "org.postgresql", name = "postgresql", version.ref = "postgresql" }
minikdc = { group = "org.apache.hadoop", name = "hadoop-minikdc", version.ref = "hadoop-minikdc"}
Expand All @@ -194,6 +197,7 @@ kafka-clients = { group = "org.apache.kafka", name = "kafka-clients", version.re
kafka = { group = "org.apache.kafka", name = "kafka_2.12", version.ref = "kafka" }
curator-test = { group = "org.apache.curator", name = "curator-test", version.ref = "curator"}
cglib = { group = "cglib", name = "cglib", version.ref = "cglib"}
woodstox-core = { group = "com.fasterxml.woodstox", name = "woodstox-core", version.ref = "woodstox-core"}

ranger-intg = { group = "org.apache.ranger", name = "ranger-intg", version.ref = "ranger" }
javax-jaxb-api = { group = "javax.xml.bind", name = "jaxb-api", version.ref = "javax-jaxb-api" }
Expand All @@ -204,6 +208,9 @@ mybatis = { group = "org.mybatis", name = "mybatis", version.ref = "mybatis"}
h2db = { group = "com.h2database", name = "h2", version.ref = "h2db"}
awaitility = { group = "org.awaitility", name = "awaitility", version.ref = "awaitility" }
servlet = { group = "javax.servlet", name = "javax.servlet-api", version.ref = "servlet" }
mail = { group = "javax.mail", name = "mail", version.ref = "mail" }
rome = { group = "rome", name = "rome", version.ref = "rome" }
jettison = { group = "org.codehaus.jettison", name = "jettison", version.ref = "jettison" }

[bundles]
log4j = ["slf4j-api", "log4j-slf4j2-impl", "log4j-api", "log4j-core", "log4j-12-api"]
Expand Down
Loading