diff --git a/docker/datahub-gms/env/docker-without-neo4j.env b/docker/datahub-gms/env/docker-without-neo4j.env index cc0dd6b4278b5..37b7ba1797af5 100644 --- a/docker/datahub-gms/env/docker-without-neo4j.env +++ b/docker/datahub-gms/env/docker-without-neo4j.env @@ -23,6 +23,8 @@ PE_CONSUMER_ENABLED=true UI_INGESTION_ENABLED=true ENTITY_SERVICE_ENABLE_RETENTION=true +ELASTIC_ID_HASH_ALGO=MD5 + # Uncomment to disable persistence of client-side analytics events # DATAHUB_ANALYTICS_ENABLED=false diff --git a/docker/datahub-gms/env/docker.env b/docker/datahub-gms/env/docker.env index 59fc4bdde02ff..0ecaa32c4cb12 100644 --- a/docker/datahub-gms/env/docker.env +++ b/docker/datahub-gms/env/docker.env @@ -27,6 +27,8 @@ MCE_CONSUMER_ENABLED=true PE_CONSUMER_ENABLED=true UI_INGESTION_ENABLED=true +ELASTIC_ID_HASH_ALGO=MD5 + # Uncomment to enable Metadata Service Authentication METADATA_SERVICE_AUTH_ENABLED=false diff --git a/docker/datahub-mae-consumer/env/docker-without-neo4j.env b/docker/datahub-mae-consumer/env/docker-without-neo4j.env index b6899f7e6d63b..6a82f235b2971 100644 --- a/docker/datahub-mae-consumer/env/docker-without-neo4j.env +++ b/docker/datahub-mae-consumer/env/docker-without-neo4j.env @@ -13,6 +13,8 @@ ES_BULK_REFRESH_POLICY=WAIT_UNTIL GRAPH_SERVICE_IMPL=elasticsearch ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml +ELASTIC_ID_HASH_ALGO=MD5 + # Uncomment to disable persistence of client-side analytics events # DATAHUB_ANALYTICS_ENABLED=false diff --git a/docker/datahub-mae-consumer/env/docker.env b/docker/datahub-mae-consumer/env/docker.env index 5a6daa6eaeaed..1f0ee4b05b382 100644 --- a/docker/datahub-mae-consumer/env/docker.env +++ b/docker/datahub-mae-consumer/env/docker.env @@ -17,6 +17,8 @@ NEO4J_PASSWORD=datahub GRAPH_SERVICE_IMPL=neo4j ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml +ELASTIC_ID_HASH_ALGO=MD5 + # Uncomment to disable persistence of client-side analytics events # DATAHUB_ANALYTICS_ENABLED=false diff --git a/docker/datahub-mce-consumer/env/docker-without-neo4j.env b/docker/datahub-mce-consumer/env/docker-without-neo4j.env index e7be7d8ed4ddc..b0edfc0a75b66 100644 --- a/docker/datahub-mce-consumer/env/docker-without-neo4j.env +++ b/docker/datahub-mce-consumer/env/docker-without-neo4j.env @@ -24,6 +24,8 @@ MAE_CONSUMER_ENABLED=false PE_CONSUMER_ENABLED=false UI_INGESTION_ENABLED=false +ELASTIC_ID_HASH_ALGO=MD5 + # Uncomment to configure kafka topic names # Make sure these names are consistent across the whole deployment # METADATA_CHANGE_PROPOSAL_TOPIC_NAME=MetadataChangeProposal_v1 diff --git a/docker/datahub-mce-consumer/env/docker.env b/docker/datahub-mce-consumer/env/docker.env index 8618f3f5f7af7..c0f85ef667546 100644 --- a/docker/datahub-mce-consumer/env/docker.env +++ b/docker/datahub-mce-consumer/env/docker.env @@ -24,6 +24,8 @@ MAE_CONSUMER_ENABLED=false PE_CONSUMER_ENABLED=false UI_INGESTION_ENABLED=false +ELASTIC_ID_HASH_ALGO=MD5 + # Uncomment to configure kafka topic names # Make sure these names are consistent across the whole deployment # METADATA_CHANGE_PROPOSAL_TOPIC_NAME=MetadataChangeProposal_v1 diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 834d55096468f..a0f60d23710a0 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -86,6 +86,7 @@ services: - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_PORT=9200 + - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 47fb50f78e4f0..11e33a9950ba9 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -86,6 +86,7 @@ services: - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_PORT=9200 + - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 3fa13a9e56b42..2efa895983418 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -86,6 +86,7 @@ services: - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_PORT=9200 + - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml index a4211acedcf10..4f47a3da24eb1 100644 --- a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml @@ -19,6 +19,7 @@ services: - ES_BULK_REFRESH_POLICY=WAIT_UNTIL - GRAPH_SERVICE_IMPL=elasticsearch - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml + - ELASTIC_ID_HASH_ALGO=MD5 hostname: datahub-mae-consumer image: ${DATAHUB_MAE_CONSUMER_IMAGE:-acryldata/datahub-mae-consumer}:${DATAHUB_VERSION:-head} ports: @@ -37,6 +38,7 @@ services: - EBEAN_DATASOURCE_USERNAME=datahub - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 + - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mce-consumer/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose.consumers.quickstart.yml b/docker/quickstart/docker-compose.consumers.quickstart.yml index e7571e4baf8b4..7dd7388b93988 100644 --- a/docker/quickstart/docker-compose.consumers.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers.quickstart.yml @@ -26,6 +26,7 @@ services: - NEO4J_PASSWORD=datahub - GRAPH_SERVICE_IMPL=neo4j - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml + - ELASTIC_ID_HASH_ALGO=MD5 hostname: datahub-mae-consumer image: ${DATAHUB_MAE_CONSUMER_IMAGE:-acryldata/datahub-mae-consumer}:${DATAHUB_VERSION:-head} ports: @@ -47,6 +48,7 @@ services: - EBEAN_DATASOURCE_USERNAME=datahub - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 + - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mce-consumer/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index c63b6d1d61b03..f42ed1f40c246 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -86,6 +86,7 @@ services: - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_PORT=9200 + - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/models/graph/Edge.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/models/graph/Edge.java index ca5e6d0110384..3de09e599d99e 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/models/graph/Edge.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/models/graph/Edge.java @@ -72,8 +72,9 @@ public String toDocId() { } try { + String hashAlgo = System.getenv("ELASTIC_ID_HASH_ALGO"); byte[] bytesOfRawDocID = rawDocId.toString().getBytes(StandardCharsets.UTF_8); - MessageDigest md = MessageDigest.getInstance("MD5"); + MessageDigest md = MessageDigest.getInstance(hashAlgo); byte[] thedigest = md.digest(bytesOfRawDocID); return Base64.getEncoder().encodeToString(thedigest); } catch (NoSuchAlgorithmException e) { diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index ff29cb5fff47d..17d9cb8cd14fe 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -130,6 +130,7 @@ test { // override, testng controlling parallelization // increasing >1 will merely run all tests extra times maxParallelForks = 1 + environment "ELASTIC_ID_HASH_ALGO", "MD5" } useTestNG() { suites 'src/test/resources/testng.xml' diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java index 57002a3bfc59d..cdfc4e985293f 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java @@ -86,10 +86,10 @@ private String toDocument(SystemMetadata systemMetadata, String urn, String aspe private String toDocId(@Nonnull final String urn, @Nonnull final String aspect) { String rawDocId = urn + DOC_DELIMETER + aspect; - + String hashAlgo = System.getenv("ELASTIC_ID_HASH_ALGO"); try { byte[] bytesOfRawDocID = rawDocId.getBytes(StandardCharsets.UTF_8); - MessageDigest md = MessageDigest.getInstance("MD5"); + MessageDigest md = MessageDigest.getInstance(hashAlgo); byte[] thedigest = md.digest(bytesOfRawDocID); return Base64.getEncoder().encodeToString(thedigest); } catch (NoSuchAlgorithmException e) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/transformer/TimeseriesAspectTransformer.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/transformer/TimeseriesAspectTransformer.java index c0f66acaaca5a..cf0a3f1466d25 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/transformer/TimeseriesAspectTransformer.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/transformer/TimeseriesAspectTransformer.java @@ -257,7 +257,9 @@ private static Pair getTimeseriesFieldCollectionDocument( finalDocument); } - private static String getDocId(@Nonnull JsonNode document, String collectionId) { + private static String getDocId(@Nonnull JsonNode document, String collectionId) + throws IllegalArgumentException { + String hashAlgo = System.getenv("ELASTIC_ID_HASH_ALGO"); String docId = document.get(MappingsBuilder.TIMESTAMP_MILLIS_FIELD).toString(); JsonNode eventGranularity = document.get(MappingsBuilder.EVENT_GRANULARITY); if (eventGranularity != null) { @@ -276,6 +278,11 @@ private static String getDocId(@Nonnull JsonNode document, String collectionId) docId += partitionSpec.toString(); } - return DigestUtils.md5Hex(docId); + if (hashAlgo.equalsIgnoreCase("SHA-256")) { + return DigestUtils.sha256Hex(docId); + } else if (hashAlgo.equalsIgnoreCase("MD5")) { + return DigestUtils.md5Hex(docId); + } + throw new IllegalArgumentException("Hash function not handled !"); } } diff --git a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 921c2b43dc36c..2514060ff2d61 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -174,6 +174,7 @@ elasticsearch: opensearchUseAwsIamAuth: ${OPENSEARCH_USE_AWS_IAM_AUTH:false} region: ${AWS_REGION:#{null}} implementation: ${ELASTICSEARCH_IMPLEMENTATION:elasticsearch} # elasticsearch or opensearch, for handling divergent cases + idHashAlgo: ${ELASTIC_ID_HASH_ALGO:MD5} sslContext: # Required if useSSL is true protocol: ${ELASTICSEARCH_SSL_PROTOCOL:#{null}} secureRandomImplementation: ${ELASTICSEARCH_SSL_SECURE_RANDOM_IMPL:#{null}} diff --git a/smoke-test/build.gradle b/smoke-test/build.gradle index 9800cf65fc452..95f3ba8ed56d6 100644 --- a/smoke-test/build.gradle +++ b/smoke-test/build.gradle @@ -89,6 +89,7 @@ task noCypressSuite0(type: Exec, dependsOn: [installDev, ':metadata-ingestion:in environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'TEST_STRATEGY', 'no_cypress_suite0' + environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -101,6 +102,7 @@ task noCypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:in environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'TEST_STRATEGY', 'no_cypress_suite1' + environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -113,6 +115,7 @@ task cypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:inst environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'TEST_STRATEGY', 'cypress_suite1' + environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -125,6 +128,7 @@ task cypressRest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:instal environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'TEST_STRATEGY', 'cypress_rest' + environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -139,6 +143,7 @@ task cypressDev(type: Exec, dependsOn: [installDev, ':metadata-ingestion:install environment 'RUN_QUICKSTART', 'false' environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' + environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -154,6 +159,7 @@ task cypressData(type: Exec, dependsOn: [installDev, ':metadata-ingestion:instal environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'RUN_UI', 'false' + environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', diff --git a/smoke-test/cypress-dev.sh b/smoke-test/cypress-dev.sh index 2b31c574d0578..59346b2606905 100755 --- a/smoke-test/cypress-dev.sh +++ b/smoke-test/cypress-dev.sh @@ -12,6 +12,7 @@ source venv/bin/activate export KAFKA_BROKER_CONTAINER="datahub-kafka-broker-1" export KAFKA_BOOTSTRAP_SERVER="broker:9092" +export ELASTIC_ID_HASH_ALGO="MD5" python -c 'from tests.cypress.integration_test import ingest_data; ingest_data()' cd tests/cypress diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh index 05c321566d54a..1923d42eb5e93 100755 --- a/smoke-test/run-quickstart.sh +++ b/smoke-test/run-quickstart.sh @@ -15,6 +15,7 @@ DATAHUB_SEARCH_TAG="${DATAHUB_SEARCH_TAG:=2.9.0}" XPACK_SECURITY_ENABLED="${XPACK_SECURITY_ENABLED:=plugins.security.disabled=true}" ELASTICSEARCH_USE_SSL="${ELASTICSEARCH_USE_SSL:=false}" USE_AWS_ELASTICSEARCH="${USE_AWS_ELASTICSEARCH:=true}" +ELASTIC_ID_HASH_ALGO="${ELASTIC_ID_HASH_ALGO:=MD5}" echo "DATAHUB_VERSION = $DATAHUB_VERSION" DATAHUB_TELEMETRY_ENABLED=false \ diff --git a/smoke-test/smoke.sh b/smoke-test/smoke.sh index fafb2076fe699..c16865fe1e71e 100755 --- a/smoke-test/smoke.sh +++ b/smoke-test/smoke.sh @@ -25,6 +25,7 @@ source venv/bin/activate source ./set-cypress-creds.sh export DATAHUB_GMS_URL=http://localhost:8080 +export ELASTIC_ID_HASH_ALGO="MD5" # no_cypress_suite0, no_cypress_suite1, cypress_suite1, cypress_rest if [[ -z "${TEST_STRATEGY}" ]]; then