diff --git a/build.gradle b/build.gradle
index 04c1b5de19cb3..56b002d221af5 100644
--- a/build.gradle
+++ b/build.gradle
@@ -149,6 +149,7 @@ project.ext.externalDependency = [
   'springJdbc': "org.springframework:spring-jdbc:$springVersion",
   'springWeb': "org.springframework:spring-web:$springVersion",
   'springWebMVC': "org.springframework:spring-webmvc:$springVersion",
+  'springBootTest': "org.springframework.boot:spring-boot-starter-test:$springBootVersion",
   'springBoot': "org.springframework.boot:spring-boot:$springBootVersion",
   'springBootAutoconfigure': "org.springframework.boot:spring-boot-autoconfigure:$springBootVersion",
   'springBootStarterWeb': "org.springframework.boot:spring-boot-starter-web:$springBootVersion",
diff --git a/datahub-web-react/src/graphql/dataset.graphql b/datahub-web-react/src/graphql/dataset.graphql
index e62ceded53645..2da35ffe47b28 100644
--- a/datahub-web-react/src/graphql/dataset.graphql
+++ b/datahub-web-react/src/graphql/dataset.graphql
@@ -1,38 +1,3 @@
-fragment simplifiedGlossaryTerms on GlossaryTerms {
-    terms {
-        term {
-            urn
-            name
-            type
-            hierarchicalName
-            properties {
-                name
-                description
-                definition
-                termSource
-                customProperties {
-                    key
-                    value
-                }
-            }
-            ownership {
-                ...ownershipFields
-            }
-            parentNodes {
-                count
-                nodes {
-                    urn
-                    type
-                    properties {
-                        name
-                    }
-                }
-            }
-        }
-        associatedUrn
-    }
-}
-
 query getDataProfiles($urn: String!, $limit: Int, $startTime: Long, $endTime: Long) {
     dataset(urn: $urn) {
         urn
@@ -84,7 +49,7 @@ fragment nonSiblingDatasetFields on Dataset {
                 ...globalTagsFields
             }
             glossaryTerms {
-                ...simplifiedGlossaryTerms
+                ...glossaryTerms
             }
         }
     }
@@ -98,7 +63,7 @@ fragment nonSiblingDatasetFields on Dataset {
                 ...globalTagsFields
             }
             glossaryTerms {
-                ...simplifiedGlossaryTerms
+                ...glossaryTerms
            }
            subTypes {
                typeNames
diff --git a/datahub-web-react/src/graphql/fragments.graphql b/datahub-web-react/src/graphql/fragments.graphql
index 494d7065ce47f..3e1eaa3ff72be 100644
--- a/datahub-web-react/src/graphql/fragments.graphql
+++ b/datahub-web-react/src/graphql/fragments.graphql
@@ -73,7 +73,11 @@ fragment parentContainersFields on ParentContainersResult {
 fragment parentNodesFields on ParentNodesResult {
     count
     nodes {
-        ...glossaryNode
+        urn
+        type
+        properties {
+            name
+        }
     }
 }
diff --git a/docker/datahub-gms/env/docker-without-neo4j.env b/docker/datahub-gms/env/docker-without-neo4j.env
index 6a085266d5c88..1894f79318f79 100644
--- a/docker/datahub-gms/env/docker-without-neo4j.env
+++ b/docker/datahub-gms/env/docker-without-neo4j.env
@@ -8,6 +8,8 @@ KAFKA_BOOTSTRAP_SERVER=broker:29092
 KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
 ELASTICSEARCH_HOST=elasticsearch
 ELASTICSEARCH_PORT=9200
+ES_BULK_REFRESH_POLICY=WAIT_UNTIL
+GRAPH_SERVICE_DIFF_MODE_ENABLED=true
 GRAPH_SERVICE_IMPL=elasticsearch
 JAVA_OPTS=-Xms1g -Xmx1g
 ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
diff --git a/docker/datahub-gms/env/docker.env b/docker/datahub-gms/env/docker.env
index 7c0297a3cd8ba..91055a622ea74 100644
--- a/docker/datahub-gms/env/docker.env
+++ b/docker/datahub-gms/env/docker.env
@@ -8,11 +8,14 @@ KAFKA_BOOTSTRAP_SERVER=broker:29092
 KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
 ELASTICSEARCH_HOST=elasticsearch
 ELASTICSEARCH_PORT=9200
+ES_BULK_REFRESH_POLICY=WAIT_UNTIL
 NEO4J_HOST=http://neo4j:7474
 NEO4J_URI=bolt://neo4j
 NEO4J_USERNAME=neo4j
 NEO4J_PASSWORD=datahub
 JAVA_OPTS=-Xms1g -Xmx1g
+ES_BULK_REFRESH_POLICY=WAIT_UNTIL
+GRAPH_SERVICE_DIFF_MODE_ENABLED=true
 GRAPH_SERVICE_IMPL=neo4j
 ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
 ENTITY_SERVICE_ENABLE_RETENTION=true
diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
index 37c80cda2ef7a..bcab7a2bca683 100644
--- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
+++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
@@ -73,6 +73,8 @@ services:
       - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
       - ELASTICSEARCH_HOST=elasticsearch
       - ELASTICSEARCH_PORT=9200
+      - ES_BULK_REFRESH_POLICY=WAIT_UNTIL
+      - GRAPH_SERVICE_DIFF_MODE_ENABLED=true
       - GRAPH_SERVICE_IMPL=elasticsearch
       - JAVA_OPTS=-Xms1g -Xmx1g
       - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml
index b007b59d52841..80464c98e8cd0 100644
--- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml
+++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml
@@ -75,6 +75,8 @@ services:
       - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
       - ELASTICSEARCH_HOST=elasticsearch
       - ELASTICSEARCH_PORT=9200
+      - ES_BULK_REFRESH_POLICY=WAIT_UNTIL
+      - GRAPH_SERVICE_DIFF_MODE_ENABLED=true
       - GRAPH_SERVICE_IMPL=elasticsearch
       - JAVA_OPTS=-Xms1g -Xmx1g
       - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml
index 6c0772206a2be..2659099b960fe 100644
--- a/docker/quickstart/docker-compose.quickstart.yml
+++ b/docker/quickstart/docker-compose.quickstart.yml
@@ -77,11 +77,13 @@ services:
       - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
       - ELASTICSEARCH_HOST=elasticsearch
       - ELASTICSEARCH_PORT=9200
+      - ES_BULK_REFRESH_POLICY=WAIT_UNTIL
      - NEO4J_HOST=http://neo4j:7474
      - NEO4J_URI=bolt://neo4j
      - NEO4J_USERNAME=neo4j
      - NEO4J_PASSWORD=datahub
      - JAVA_OPTS=-Xms1g -Xmx1g
+      - GRAPH_SERVICE_DIFF_MODE_ENABLED=true
      - GRAPH_SERVICE_IMPL=neo4j
      - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
      - ENTITY_SERVICE_ENABLE_RETENTION=true
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/extractor/FieldExtractor.java b/entity-registry/src/main/java/com/linkedin/metadata/models/extractor/FieldExtractor.java
index bac2ec5698a11..ae3f2c9fb2297 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/models/extractor/FieldExtractor.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/models/extractor/FieldExtractor.java
@@ -14,6 +14,7 @@
 import java.util.Optional;
 import java.util.function.Function;
 import java.util.stream.Collectors;
+import javax.annotation.Nonnull;
 
 
 /**
@@ -32,7 +33,7 @@ private static long getNumArrayWildcards(PathSpec pathSpec) {
   }
 
   // Extract the value of each field in the field specs from the input record
-  public static <T extends FieldSpec> Map<T, List<Object>> extractFields(RecordTemplate record, List<T> fieldSpecs) {
+  public static <T extends FieldSpec> Map<T, List<Object>> extractFields(@Nonnull RecordTemplate record, List<T> fieldSpecs) {
     final Map<T, List<Object>> extractedFields = new HashMap<>();
     for (T fieldSpec : fieldSpecs) {
       Optional<Object> value = RecordUtils.getFieldValue(record, fieldSpec.getPath());
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/MergedEntityRegistry.java b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/MergedEntityRegistry.java
index faae7bc6e4fc1..91439f1dca9a3 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/models/registry/MergedEntityRegistry.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/models/registry/MergedEntityRegistry.java
@@ -31,8 +31,10 @@ public class MergedEntityRegistry implements EntityRegistry {
   private final Map<String, AspectSpec> _aspectNameToSpec;
 
   public MergedEntityRegistry(EntityRegistry baseEntityRegistry) {
-    entityNameToSpec = baseEntityRegistry.getEntitySpecs() != null ? baseEntityRegistry.getEntitySpecs() : new HashMap<>();
-    eventNameToSpec = baseEntityRegistry.getEventSpecs() != null ? baseEntityRegistry.getEventSpecs() : new HashMap<>();
+    // baseEntityRegistry.get*Specs() can return immutable Collections.emptyMap() which fails
+    // when this class attempts .put* operations on it.
+    entityNameToSpec = baseEntityRegistry.getEntitySpecs() != null ? new HashMap<>(baseEntityRegistry.getEntitySpecs()) : new HashMap<>();
+    eventNameToSpec = baseEntityRegistry.getEventSpecs() != null ? new HashMap<>(baseEntityRegistry.getEventSpecs()) : new HashMap<>();
     baseEntityRegistry.getAspectTemplateEngine();
     _aspectTemplateEngine = baseEntityRegistry.getAspectTemplateEngine();
     _aspectNameToSpec = baseEntityRegistry.getAspectSpecs();
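[Editor's note: for readers unfamiliar with the pitfall the MergedEntityRegistry comment above describes, here is a minimal, self-contained sketch; the map contents are illustrative and not from the PR.]

```java
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

class ImmutableMapPitfall {
  public static void main(String[] args) {
    // A base registry may hand back Collections.emptyMap(), which is immutable.
    Map<String, String> fromBase = Collections.emptyMap();
    // fromBase.put("dataset", "spec"); // would throw UnsupportedOperationException

    // The fix above: take a defensive, mutable copy before merging into it.
    Map<String, String> merged = new HashMap<>(fromBase);
    merged.put("dataset", "spec"); // safe
    System.out.println(merged);
  }
}
```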
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index f1b31a96e5afc..2810d570fbd8c 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -57,6 +57,7 @@ def get_long_description():
 
 rest_common = {
     "requests",
+    "requests_file"
 }
 
 kafka_common = {
diff --git a/metadata-ingestion/src/datahub/cli/docker_cli.py b/metadata-ingestion/src/datahub/cli/docker_cli.py
index 0aa66b1a2efbb..e89a864febf0a 100644
--- a/metadata-ingestion/src/datahub/cli/docker_cli.py
+++ b/metadata-ingestion/src/datahub/cli/docker_cli.py
@@ -17,6 +17,7 @@
 import pydantic
 import requests
 from expandvars import expandvars
+from requests_file import FileAdapter
 
 from datahub.cli.cli_utils import DATAHUB_ROOT_FOLDER
 from datahub.cli.docker_check import (
@@ -47,16 +48,19 @@
 
 BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json"
 
-GITHUB_BASE_URL = "https://raw.githubusercontent.com/datahub-project/datahub/master"
+DOCKER_COMPOSE_BASE = os.getenv(
+    "DOCKER_COMPOSE_BASE",
+    "https://raw.githubusercontent.com/datahub-project/datahub/master",
+)
 
-GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL = (
-    f"{GITHUB_BASE_URL}/{NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE}"
+NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL = (
+    f"{DOCKER_COMPOSE_BASE}/{NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE}"
 )
-GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL = (
-    f"{GITHUB_BASE_URL}/{ELASTIC_QUICKSTART_COMPOSE_FILE}"
+ELASTIC_QUICKSTART_COMPOSE_URL = (
+    f"{DOCKER_COMPOSE_BASE}/{ELASTIC_QUICKSTART_COMPOSE_FILE}"
 )
-GITHUB_M1_QUICKSTART_COMPOSE_URL = f"{GITHUB_BASE_URL}/{M1_QUICKSTART_COMPOSE_FILE}"
-GITHUB_BOOTSTRAP_MCES_URL = f"{GITHUB_BASE_URL}/{BOOTSTRAP_MCES_FILE}"
+M1_QUICKSTART_COMPOSE_URL = f"{DOCKER_COMPOSE_BASE}/{M1_QUICKSTART_COMPOSE_FILE}"
+BOOTSTRAP_MCES_URL = f"{DOCKER_COMPOSE_BASE}/{BOOTSTRAP_MCES_FILE}"
 
 
 class Architectures(Enum):
@@ -630,13 +634,17 @@ def quickstart(
             fg="red",
         )
     github_file = (
-        GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL
+        NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL
         if should_use_neo4j and not is_arch_m1(quickstart_arch)
-        else GITHUB_ELASTIC_QUICKSTART_COMPOSE_URL
+        else ELASTIC_QUICKSTART_COMPOSE_URL
         if not is_arch_m1(quickstart_arch)
-        else GITHUB_M1_QUICKSTART_COMPOSE_URL
+        else M1_QUICKSTART_COMPOSE_URL
     )
 
+    # also allow local files
+    request_session = requests.Session()
+    request_session.mount("file://", FileAdapter())
+
     with open(
         default_quickstart_compose_file, "wb"
     ) if default_quickstart_compose_file else tempfile.NamedTemporaryFile(
@@ -646,16 +654,16 @@ def quickstart(
         quickstart_compose_file.append(path)
         click.echo(f"Fetching docker-compose file {github_file} from GitHub")
         # Download the quickstart docker-compose file from GitHub.
-        quickstart_download_response = requests.get(github_file)
+        quickstart_download_response = request_session.get(github_file)
         quickstart_download_response.raise_for_status()
         tmp_file.write(quickstart_download_response.content)
         logger.debug(f"Copied to {path}")
 
     if standalone_consumers:
         consumer_github_file = (
-            f"{GITHUB_BASE_URL}/{CONSUMERS_QUICKSTART_COMPOSE_FILE}"
+            f"{DOCKER_COMPOSE_BASE}/{CONSUMERS_QUICKSTART_COMPOSE_FILE}"
             if should_use_neo4j
-            else f"{GITHUB_BASE_URL}/{ELASTIC_CONSUMERS_QUICKSTART_COMPOSE_FILE}"
+            else f"{DOCKER_COMPOSE_BASE}/{ELASTIC_CONSUMERS_QUICKSTART_COMPOSE_FILE}"
         )
 
         default_consumer_compose_file = (
@@ -672,7 +680,7 @@ def quickstart(
                 f"Fetching consumer docker-compose file {consumer_github_file} from GitHub"
             )
             # Download the quickstart docker-compose file from GitHub.
-            quickstart_download_response = requests.get(consumer_github_file)
+            quickstart_download_response = request_session.get(consumer_github_file)
             quickstart_download_response.raise_for_status()
             tmp_file.write(quickstart_download_response.content)
             logger.debug(f"Copied to {path}")
@@ -839,7 +847,7 @@ def ingest_sample_data(path: Optional[str], token: Optional[str]) -> None:
         path = str(pathlib.Path(tmp_file.name))
 
         # Download the bootstrap MCE file from GitHub.
-        mce_json_download_response = requests.get(GITHUB_BOOTSTRAP_MCES_URL)
+        mce_json_download_response = requests.get(BOOTSTRAP_MCES_URL)
         mce_json_download_response.raise_for_status()
         tmp_file.write(mce_json_download_response.content)
         click.echo(f"Downloaded to {path}")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py
index 60e0b3874d081..c0d4408abe230 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py
@@ -194,7 +194,7 @@ def _map_top_sql_queries(self, top_sql_queries: Dict) -> List[str]:
             [
                 trim_query(format_sql_query(query), budget_per_query)
                 if self.config.format_sql_queries
-                else query
+                else trim_query(query, budget_per_query)
                 for query in top_sql_queries
             ]
         )
{e}") + logger.debug( + f"Failed to mount output converter for MSSQL data type -150 due to {e}" + ) def _populate_table_descriptions(self, conn: Connection, db_name: str) -> None: # see https://stackoverflow.com/questions/5953330/how-do-i-map-the-id-in-sys-extended-properties-to-an-object-name diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py b/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py index 7c233a3a39da7..0df8c9ddce99c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/checkpoint.py @@ -118,7 +118,7 @@ def create_from_checkpoint_aspect( except Exception as e: # Failure to load config is probably okay...config structure has changed. logger.warning( - "Failed to construct checkpoint's config from checkpoint aspect.", e + "Failed to construct checkpoint's config from checkpoint aspect. %s", e ) else: try: diff --git a/metadata-integration/java/datahub-client/src/test/java/datahub/client/kafka/KafkaEmitterTest.java b/metadata-integration/java/datahub-client/src/test/java/datahub/client/kafka/KafkaEmitterTest.java index 8940d951bfc10..213e987e74d88 100644 --- a/metadata-integration/java/datahub-client/src/test/java/datahub/client/kafka/KafkaEmitterTest.java +++ b/metadata-integration/java/datahub-client/src/test/java/datahub/client/kafka/KafkaEmitterTest.java @@ -15,7 +15,6 @@ import org.apache.kafka.clients.admin.KafkaAdminClient; import org.apache.kafka.clients.admin.NewTopic; import org.apache.kafka.clients.producer.ProducerConfig; -import org.apache.kafka.common.errors.TimeoutException; import org.junit.BeforeClass; import org.junit.Test; import org.testcontainers.containers.Network; @@ -106,7 +105,7 @@ private static String createTopics(Stream bootstraps) { try { createAdminClient(bootstrap).createTopics(singletonList(new NewTopic(TOPIC, partitions, replicationFactor))).all().get(); return bootstrap; - } catch (TimeoutException | InterruptedException | ExecutionException ex) { + } catch (RuntimeException | InterruptedException | ExecutionException ex) { return null; } }).filter(Objects::nonNull).findFirst().get(); diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HdfsOut2.json b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HdfsOut2.json index bc73d93364752..3d407c1e03546 100644 --- a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HdfsOut2.json +++ b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHdfsIn2HdfsOut2.json @@ -116,11 +116,11 @@ { "com.linkedin.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut2/out.csv,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/JavaHdfsIn2HdfsOut2/out.csv,PROD)" ] } }, diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHiveInHiveOut.json 
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHiveInHiveOut.json b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHiveInHiveOut.json
index 8cbf34dae1904..b9a00de36321b 100644
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHiveInHiveOut.json
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/JavaHiveInHiveOut.json
@@ -200,42 +200,6 @@
         "jobId": "QueryExecId_11",
         "flow": "urn:li:dataFlow:(spark,JavaHiveInHiveOut,spark_spark-master_7077)"
       }
-    },
-    {
-      "com.linkedin.common.DataPlatformInstance": {
-        "platform": "urn:li:dataPlatform:spark"
-      }
-    },
-    {
-      "com.linkedin.datajob.DataJobInfo": {
-        "name": "insertInto at HiveInHiveOut.java:44",
-        "type": {
-          "string": "sparkJob"
-        },
-        "customProperties": {
-          "SQLQueryId": "11",
-          "appName": "JavaHiveInHiveOut",
-          "description": "insertInto at HiveInHiveOut.java:44",
-          "queryPlan": "InsertIntoHiveTable `javahiveinhiveout`.`hivetab`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, false, false, [a, b, c, d]\n+- HiveTableRelation `javahiveinhiveout`.`foo5`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#90, b#91, c#92, d#93]\n"
-        }
-      }
-    },
-    {
-      "com.linkedin.datajob.DataJobInputOutput": {
-        "inputDatasets": [
-          "urn:li:dataset:(urn:li:dataPlatform:hive,javahiveinhiveout.foo5,PROD)"
-        ],
-        "outputDatasets": [
-          "urn:li:dataset:(urn:li:dataPlatform:hive,javahiveinhiveout.hivetab,PROD)"
-        ]
-      }
-    },
-    {
-      "com.linkedin.common.BrowsePaths": {
-        "paths": [
-          "/spark/spark_spark-master_7077"
-        ]
-      }
     }
   ]
 }
diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HdfsOut2.json b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HdfsOut2.json
index 2c0b699503cb7..33887f266ac24 100644
--- a/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HdfsOut2.json
+++ b/metadata-integration/java/spark-lineage/spark-smoke-test/golden_json/PythonHdfsIn2HdfsOut2.json
@@ -50,11 +50,11 @@
     {
       "com.linkedin.datajob.DataJobInputOutput": {
         "inputDatasets": [
-          "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in1.csv,PROD)",
-          "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/in2.csv,PROD)"
+          "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in1.csv,PROD)",
+          "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/in2.csv,PROD)"
         ],
         "outputDatasets": [
-          "urn:li:dataset:(urn:li:dataPlatform:hdfs,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut2/out1.csv,PROD)"
+          "urn:li:dataset:(urn:li:dataPlatform:file,file:/opt/workspace/resources/data/PythonHdfsIn2HdfsOut2/out1.csv,PROD)"
         ]
       }
     },
"urn:li:dataset:(urn:li:dataPlatform:hive,pythonhiveinhiveout.hivetab,PROD)" - ] - } - }, - { - "com.linkedin.common.BrowsePaths": { - "paths": [ - "/spark/spark_spark-master_7077" - ] - } - }, - { - "com.linkedin.datajob.DataJobInfo": { - "name": "insertInto at NativeMethodAccessorImpl.java:0", - "type": { - "string": "sparkJob" - }, - "customProperties": { - "SQLQueryId": "11", - "appName": "PythonHiveInHiveOut", - "description": "insertInto at NativeMethodAccessorImpl.java:0", - "queryPlan": "InsertIntoHiveTable `pythonhiveinhiveout`.`hivetab`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, false, false, [a, b, c, d]\n+- HiveTableRelation `pythonhiveinhiveout`.`foo5`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, [a#90, b#91, c#92, d#93]\n" - } - } } ] } diff --git a/metadata-integration/java/spark-lineage/spark-smoke-test/test_e2e.py b/metadata-integration/java/spark-lineage/spark-smoke-test/test_e2e.py index e5d83279d2cfe..412982db45497 100644 --- a/metadata-integration/java/spark-lineage/spark-smoke-test/test_e2e.py +++ b/metadata-integration/java/spark-lineage/spark-smoke-test/test_e2e.py @@ -1,19 +1,10 @@ import json -import time -# import urllib -# from typing import Any, Dict, Optional, cast import pytest import requests import os from jsoncomparison import Compare, NO_DIFF -# from datahub.ingestion.run.pipeline import Pipeline -# from datahub.ingestion.source.sql.mysql import MySQLConfig, MySQLSource -# from datahub.ingestion.source.sql.sql_common import BaseSQLAlchemyCheckpointState -# from datahub.ingestion.source.state.checkpoint import Checkpoint -# from tests.utils import ingest_file_via_rest - GMS_ENDPOINT = "http://localhost:8080" GOLDEN_FILES_PATH = "./spark-smoke-test/golden_json/" golden_files = os.listdir(GOLDEN_FILES_PATH) @@ -23,7 +14,6 @@ restli_default_headers = { "X-RestLi-Protocol-Version": "2.0.0", } -kafka_post_ingestion_wait_sec = 60 JSONDIFF_CONFIG = { 'output': { diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index 131c457293cb4..3c0edff7ac422 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -55,6 +55,7 @@ dependencies { testCompile externalDependency.testContainersCassandra testCompile externalDependency.lombok testCompile project(':test-models') + testImplementation externalDependency.springBootTest testAnnotationProcessor externalDependency.lombok diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/GraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/GraphService.java index bc8815c35919a..c5dbbf4e25054 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/GraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/GraphService.java @@ -32,6 +32,12 @@ public interface GraphService { */ void addEdge(final Edge edge); + /** + * Remove an edge from the graph. + * @param edge the edge to delete + */ + void removeEdge(final Edge edge); + /** * Find related entities (nodes) connected to a source entity via edges of given relationship types. 
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/GraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/GraphService.java
index bc8815c35919a..c5dbbf4e25054 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/graph/GraphService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/GraphService.java
@@ -32,6 +32,12 @@ public interface GraphService {
    */
   void addEdge(final Edge edge);
 
+  /**
+   * Remove an edge from the graph.
+   * @param edge the edge to delete
+   */
+  void removeEdge(final Edge edge);
+
   /**
    * Find related entities (nodes) connected to a source entity via edges of given relationship types. Related entities
    * can be filtered by source and destination type (use `null` for any type), by source and destination entity filter
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/dgraph/DgraphGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/dgraph/DgraphGraphService.java
index e532cc6028733..756138d3da46a 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/graph/dgraph/DgraphGraphService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/dgraph/DgraphGraphService.java
@@ -385,6 +385,11 @@ protected static String getQueryForRelatedEntities(@Nullable List<String> sourc
             relationships);
   }
 
+  @Override
+  public void removeEdge(final Edge edge) {
+    throw new UnsupportedOperationException("Remove edge not supported by DgraphGraphService at this time.");
+  }
+
   @Nonnull
   @Override
   public RelatedEntitiesResult findRelatedEntities(@Nullable List<String> sourceTypes,
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java
index ba1342e33f53f..90f7193798bde 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphQueryDAO.java
@@ -71,6 +71,7 @@ public class ESGraphQueryDAO {
   private static final String SOURCE = "source";
   private static final String DESTINATION = "destination";
   private static final String RELATIONSHIP_TYPE = "relationshipType";
+  private static final String SEARCH_EXECUTIONS_METRIC = "num_elasticSearch_reads";
 
   @Nonnull
   public static void addFilterToQueryBuilder(@Nonnull Filter filter, String node, BoolQueryBuilder rootQuery) {
@@ -103,6 +104,7 @@ private SearchResponse executeSearchQuery(@Nonnull final QueryBuilder query, fin
     searchRequest.indices(indexConvention.getIndexName(INDEX_NAME));
 
     try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "esQuery").time()) {
+      MetricUtils.counter(this.getClass(), SEARCH_EXECUTIONS_METRIC).inc();
       return client.search(searchRequest, RequestOptions.DEFAULT);
     } catch (Exception e) {
       log.error("Search query failed", e);
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java
index ab77ca8e2d205..a5c41c482da25 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ESGraphWriteDAO.java
@@ -3,22 +3,19 @@
 import com.google.common.collect.ImmutableList;
 import com.linkedin.metadata.query.filter.Filter;
 import com.linkedin.metadata.query.filter.RelationshipFilter;
+import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor;
 import com.linkedin.metadata.utils.elasticsearch.IndexConvention;
-import java.io.IOException;
 import java.util.List;
 import javax.annotation.Nonnull;
 import javax.annotation.Nullable;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
-import org.elasticsearch.action.bulk.BulkProcessor;
+import org.elasticsearch.action.delete.DeleteRequest;
 import org.elasticsearch.action.index.IndexRequest;
 import org.elasticsearch.action.update.UpdateRequest;
-import org.elasticsearch.client.RequestOptions;
-import org.elasticsearch.client.RestHighLevelClient;
 import org.elasticsearch.common.xcontent.XContentType;
 import org.elasticsearch.index.query.BoolQueryBuilder;
 import org.elasticsearch.index.reindex.BulkByScrollResponse;
-import org.elasticsearch.index.reindex.DeleteByQueryRequest;
 
 import static com.linkedin.metadata.graph.elastic.ESGraphQueryDAO.buildQuery;
 import static com.linkedin.metadata.graph.elastic.ElasticSearchGraphService.INDEX_NAME;
@@ -27,9 +24,11 @@
 @Slf4j
 @RequiredArgsConstructor
 public class ESGraphWriteDAO {
-  private final RestHighLevelClient client;
   private final IndexConvention indexConvention;
-  private final BulkProcessor bulkProcessor;
+  private final ESBulkProcessor bulkProcessor;
+  private final int numRetries;
+
+  private static final String ES_WRITES_METRIC = "num_elasticSearch_writes";
 
   /**
    * Updates or inserts the given search document.
@@ -42,11 +41,23 @@ public void upsertDocument(@Nonnull String docId, @Nonnull String document) {
         new IndexRequest(indexConvention.getIndexName(INDEX_NAME)).id(docId).source(document, XContentType.JSON);
     final UpdateRequest updateRequest =
         new UpdateRequest(indexConvention.getIndexName(INDEX_NAME), docId).doc(document, XContentType.JSON)
-        .detectNoop(false)
-        .upsert(indexRequest);
+            .detectNoop(false)
+            .retryOnConflict(numRetries)
+            .upsert(indexRequest);
     bulkProcessor.add(updateRequest);
   }
 
+  /**
+   * Deletes the given search document.
+   *
+   * @param docId the ID of the document
+   */
+  public void deleteDocument(@Nonnull String docId) {
+    final DeleteRequest deleteRequest =
+        new DeleteRequest(indexConvention.getIndexName(INDEX_NAME)).id(docId);
+    bulkProcessor.add(deleteRequest);
+  }
+
   public BulkByScrollResponse deleteByQuery(@Nullable final String sourceType, @Nonnull final Filter sourceEntityFilter,
       @Nullable final String destinationType, @Nonnull final Filter destinationEntityFilter,
       @Nonnull final List<String> relationshipTypes, @Nonnull final RelationshipFilter relationshipFilter) {
@@ -55,19 +66,7 @@ public BulkByScrollResponse deleteByQuery(@Nullable final String sourceType, @No
         destinationType == null ? ImmutableList.of() : ImmutableList.of(destinationType), destinationEntityFilter,
         relationshipTypes, relationshipFilter);
 
-    DeleteByQueryRequest deleteByQueryRequest = new DeleteByQueryRequest();
-
-    deleteByQueryRequest.setQuery(finalQuery);
-
-    deleteByQueryRequest.indices(indexConvention.getIndexName(INDEX_NAME));
-
-    try {
-      final BulkByScrollResponse deleteResponse = client.deleteByQuery(deleteByQueryRequest, RequestOptions.DEFAULT);
-      return deleteResponse;
-    } catch (IOException e) {
-      log.error("ERROR: Failed to delete by query. See stacktrace for a more detailed error:");
-      e.printStackTrace();
-    }
-    return null;
+    return bulkProcessor.deleteByQuery(finalQuery, indexConvention.getIndexName(INDEX_NAME))
+        .orElse(null);
   }
 }
See stacktrace for a more detailed error:"); - e.printStackTrace(); - } - return null; + return bulkProcessor.deleteByQuery(finalQuery, indexConvention.getIndexName(INDEX_NAME)) + .orElse(null); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java index 41787d5dd9b76..a32ff7e8efb2f 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java @@ -2,6 +2,7 @@ import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; import com.linkedin.metadata.graph.Edge; @@ -22,6 +23,7 @@ import com.linkedin.metadata.query.filter.RelationshipDirection; import com.linkedin.metadata.query.filter.RelationshipFilter; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import io.opentelemetry.extension.annotations.WithSpan; import java.io.IOException; @@ -42,10 +44,7 @@ import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.index.reindex.DeleteByQueryRequest; @Slf4j @@ -53,7 +52,7 @@ public class ElasticSearchGraphService implements GraphService { private final LineageRegistry _lineageRegistry; - private final RestHighLevelClient _searchClient; + private final ESBulkProcessor _esBulkProcessor; private final IndexConvention _indexConvention; private final ESGraphWriteDAO _graphWriteDAO; private final ESGraphQueryDAO _graphReadDAO; @@ -101,12 +100,19 @@ public LineageRegistry getLineageRegistry() { return _lineageRegistry; } + @Override public void addEdge(@Nonnull final Edge edge) { String docId = toDocId(edge); String edgeDocument = toDocument(edge); _graphWriteDAO.upsertDocument(docId, edgeDocument); } + @Override + public void removeEdge(@Nonnull final Edge edge) { + String docId = toDocId(edge); + _graphWriteDAO.deleteDocument(docId); + } + @Nonnull public RelatedEntitiesResult findRelatedEntities( @Nullable final List sourceTypes, @@ -248,15 +254,10 @@ public void configure() { } } + @VisibleForTesting @Override public void clear() { - DeleteByQueryRequest deleteRequest = - new DeleteByQueryRequest(_indexConvention.getIndexName(INDEX_NAME)).setQuery(QueryBuilders.matchAllQuery()); - try { - _searchClient.deleteByQuery(deleteRequest, RequestOptions.DEFAULT); - } catch (Exception e) { - log.error("Failed to clear graph service: {}", e.toString()); - } + _esBulkProcessor.deleteByQuery(QueryBuilders.matchAllQuery(), true, _indexConvention.getIndexName(INDEX_NAME)); } @Override diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java index 9b87c59d1e44c..39cedb86f843b 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java +++ 
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java
index 9b87c59d1e44c..39cedb86f843b 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphService.java
@@ -74,6 +74,7 @@ public LineageRegistry getLineageRegistry() {
     return _lineageRegistry;
   }
 
+  @Override
   public void addEdge(@Nonnull final Edge edge) {
 
     log.debug(String.format("Adding Edge source: %s, destination: %s, type: %s",
@@ -106,6 +107,11 @@ public void addEdge(@Nonnull final Edge edge) {
     executeStatements(statements);
   }
 
+  @Override
+  public void removeEdge(final Edge edge) {
+    throw new UnsupportedOperationException("Remove edge not supported by Neo4JGraphService at this time.");
+  }
+
   @Nonnull
   @Override
   public EntityLineageResult getLineage(@Nonnull Urn entityUrn, @Nonnull LineageDirection direction,
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java
index 46c8c8334028b..2bd55f9edc467 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilder.java
@@ -1,5 +1,6 @@
 package com.linkedin.metadata.search.elasticsearch.indexbuilder;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.MapDifference;
@@ -10,7 +11,11 @@
 import java.util.Map;
 import java.util.Objects;
 import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import javax.annotation.Nonnull;
+
+import lombok.Getter;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
 import org.elasticsearch.action.admin.cluster.node.tasks.list.ListTasksRequest;
@@ -32,6 +37,7 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.reindex.ReindexRequest;
+import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest;
 
 
 @Slf4j
@@ -39,11 +45,35 @@
 public class ESIndexBuilder {
 
   private final RestHighLevelClient searchClient;
+
+  @Getter
   private final int numShards;
+
+  @Getter
   private final int numReplicas;
+
+  @Getter
   private final int numRetries;
 
-  private static final List<String> SETTINGS_TO_COMPARE = ImmutableList.of("number_of_shards", "number_of_replicas");
+  @Getter
+  private final int refreshIntervalSeconds;
+
+  @Getter
+  private final Map<String, Map<String, String>> indexSettingOverrides;
+
+  @Getter
+  private final boolean enableIndexSettingsReindex;
+
+  private final static ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+  /*
+    Most index settings are default values and populated by Elastic. This list is an include list to determine which
+    settings we care about when a difference is present.
+   */
+  private static final List<String> SETTINGS_DYNAMIC = ImmutableList.of("number_of_replicas", "refresh_interval");
+  // These settings require a reindex
+  private static final List<String> SETTINGS_STATIC = ImmutableList.of("number_of_shards");
+  private static final List<String> SETTINGS = Stream.concat(
+      SETTINGS_DYNAMIC.stream(), SETTINGS_STATIC.stream()).collect(Collectors.toList());
 
   public void buildIndex(String indexName, Map<String, Object> mappings, Map<String, Object> settings)
       throws IOException {
@@ -53,6 +83,8 @@ public void buildIndex(String indexName, Map<String, Object> mappings, Map<Stri
     Map<String, Object> baseSettings = new HashMap<>(settings);
     baseSettings.put("number_of_shards", numShards);
     baseSettings.put("number_of_replicas", numReplicas);
+    baseSettings.put("refresh_interval", String.format("%ss", refreshIntervalSeconds));
+    baseSettings.putAll(indexSettingOverrides.getOrDefault(indexName, Map.of()));
     Map<String, Object> finalSettings = ImmutableMap.of("index", baseSettings);
 
     // If index doesn't exist, create index
@@ -70,26 +102,30 @@ public void buildIndex(String indexName, Map<String, Object> mappings, Map<Stri
 
-    MapDifference<String, Object> mappingsDiff = Maps.difference((Map<String, Object>) oldMappings.get("properties"),
-        (Map<String, Object>) mappings.get("properties"));
+    MapDifference<String, Object> mappingsDiff = Maps.difference(
+        (Map<String, Object>) oldMappings.getOrDefault("properties", Map.of()),
+        (Map<String, Object>) mappings.getOrDefault("properties", Map.of()));
 
     Settings oldSettings = searchClient.indices()
         .getSettings(new GetSettingsRequest().indices(indexName), RequestOptions.DEFAULT)
         .getIndexToSettings()
         .valuesIt()
         .next();
-    boolean isSettingsEqual = equals(finalSettings, oldSettings);
+
+    final boolean isAnalysisEqual = isAnalysisEqual(finalSettings, oldSettings);
+    final boolean isSettingsEqual = isSettingsEqual(finalSettings, oldSettings);
+    final boolean isSettingsReindexRequired = isSettingsReindexRequired(finalSettings, oldSettings);
 
     // If there are no updates to mappings and settings, return
-    if (mappingsDiff.areEqual() && isSettingsEqual) {
+    if (mappingsDiff.areEqual() && isAnalysisEqual && isSettingsEqual) {
       log.info("No updates to index {}", indexName);
       return;
     }
 
     // If there are no updates to settings, and there are only pure additions to mappings (no updates to existing fields),
     // there is no need to reindex. Just update mappings
-    if (isSettingsEqual && isPureAddition(mappingsDiff)) {
-      log.info("New fields have been added to index {}. Updating index in place", indexName);
+    if (isAnalysisEqual && isPureAddition(mappingsDiff) && isSettingsEqual) {
+      log.info("New fields have been added to index {}. Updating index in place. Adding: {}", indexName, mappingsDiff);
       PutMappingRequest request = new PutMappingRequest(indexName).source(mappings);
       searchClient.indices().putMapping(request, RequestOptions.DEFAULT);
       log.info("Updated index {} with new mappings", indexName);
Settings: {}, Acknowledged: {}", indexName, + OBJECT_MAPPER.writeValueAsString(indexSettings), ack); + } + } } + } + private void reindex(String indexName, Map mappings, Map finalSettings) + throws IOException { String tempIndexName = indexName + "_" + System.currentTimeMillis(); createIndex(tempIndexName, mappings, finalSettings); try { @@ -205,12 +279,12 @@ private void createIndex(String indexName, Map mappings, Map mapDifference) { + private static boolean isPureAddition(MapDifference mapDifference) { return !mapDifference.areEqual() && mapDifference.entriesDiffering().isEmpty() && !mapDifference.entriesOnlyOnRight().isEmpty(); } - private boolean equals(Map newSettings, Settings oldSettings) { + private static boolean isAnalysisEqual(Map newSettings, Settings oldSettings) { if (!newSettings.containsKey("index")) { return true; } @@ -221,15 +295,34 @@ private boolean equals(Map newSettings, Settings oldSettings) { // Compare analysis section Map newAnalysis = (Map) indexSettings.get("analysis"); Settings oldAnalysis = oldSettings.getByPrefix("index.analysis."); - if (!equalsGroup(newAnalysis, oldAnalysis)) { + return equalsGroup(newAnalysis, oldAnalysis); + } + + private static boolean isSettingsEqual(Map newSettings, Settings oldSettings) { + if (!newSettings.containsKey("index")) { + return true; + } + Map indexSettings = (Map) newSettings.get("index"); + return SETTINGS.stream() + .allMatch(settingKey -> Objects.equals(indexSettings.get(settingKey).toString(), oldSettings.get("index." + settingKey))); + } + + private static boolean isSettingsReindexRequired(Map newSettings, Settings oldSettings) { + if (!newSettings.containsKey("index")) { return false; } - // Compare remaining settings - return SETTINGS_TO_COMPARE.stream() - .noneMatch(settingKey -> Objects.equals(indexSettings.get(settingKey), oldSettings.get("index." + settingKey))); + Map indexSettings = (Map) newSettings.get("index"); + + if (SETTINGS_STATIC.stream().anyMatch(settingKey -> + !Objects.equals(indexSettings.get(settingKey).toString(), oldSettings.get("index." 
+    if (SETTINGS_STATIC.stream().anyMatch(settingKey ->
+        !Objects.equals(indexSettings.get(settingKey).toString(), oldSettings.get("index." + settingKey)))) {
+      return true;
+    }
+
+    return indexSettings.containsKey("analysis")
+        && !equalsGroup((Map<String, Object>) indexSettings.get("analysis"), oldSettings.getByPrefix("index.analysis."));
   }
 
-  private boolean equalsGroup(Map<String, Object> newSettings, Settings oldSettings) {
+  private static boolean equalsGroup(Map<String, Object> newSettings, Settings oldSettings) {
     if (!newSettings.keySet().equals(oldSettings.names())) {
       return false;
     }
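[Editor's note: a sketch of the dynamic/static distinction that SETTINGS_DYNAMIC and SETTINGS_STATIC encode above. Dynamic settings can be applied to a live index through the update-settings API; static ones such as number_of_shards require a reindex. The index name and values are illustrative.]

```java
import java.util.Map;
import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;

class DynamicSettingsSketch {
  static void updateDynamicSettings(RestHighLevelClient client) throws java.io.IOException {
    UpdateSettingsRequest request = new UpdateSettingsRequest("datasetindex_v2");
    // "number_of_replicas" and "refresh_interval" are dynamic: applied in place.
    request.settings(Map.of(
        "index.number_of_replicas", 1,
        "index.refresh_interval", "3s"));
    // "index.number_of_shards" is static: including it here would be rejected;
    // changing it means creating a new index and reindexing, as in reindex() above.
    boolean ack = client.indices().putSettings(request, RequestOptions.DEFAULT).isAcknowledged();
    System.out.println("acknowledged=" + ack);
  }
}
```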
Request: {}", buildBulkRequestSummary(request), failure); + incrementMetrics(request, failure); + } + + private static void incrementMetrics(BulkResponse response) { + Arrays.stream(response.getItems()) + .map(req -> buildMetricName(req.getOpType(), req.status().name())) + .forEach(metricName -> MetricUtils.counter(BulkListener.class, metricName).inc()); + } + + private static void incrementMetrics(BulkRequest request, Throwable failure) { + request.requests().stream() + .map(req -> buildMetricName(req.opType(), "exception")) + .forEach(metricName -> MetricUtils.exceptionCounter(BulkListener.class, metricName, failure)); + } + + private static String buildMetricName(DocWriteRequest.OpType opType, String status) { + return opType.getLowercase() + MetricUtils.DELIMITER + status.toLowerCase(); + } + + private static String buildBulkRequestSummary(BulkRequest request) { + return request.requests().stream().map(req -> String.format( + "Failed to perform bulk request: index [%s], optype: [%s], type [%s], id [%s]", + req.index(), req.opType(), req.type(), req.id()) + ).collect(Collectors.joining(";")); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESBulkProcessor.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESBulkProcessor.java new file mode 100644 index 0000000000000..4e4c3dd9e6f12 --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESBulkProcessor.java @@ -0,0 +1,144 @@ +package com.linkedin.metadata.search.elasticsearch.update; + +import com.linkedin.metadata.utils.metrics.MetricUtils; +import lombok.AccessLevel; +import lombok.Builder; +import lombok.Getter; +import lombok.NonNull; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; +import org.elasticsearch.action.DocWriteRequest; +import org.elasticsearch.action.bulk.BackoffPolicy; +import org.elasticsearch.action.bulk.BulkProcessor; +import org.elasticsearch.action.bulk.BulkResponse; +import org.elasticsearch.action.support.WriteRequest; +import org.elasticsearch.client.RequestOptions; +import org.elasticsearch.client.RestHighLevelClient; +import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.index.query.QueryBuilder; +import org.elasticsearch.index.reindex.BulkByScrollResponse; +import org.elasticsearch.index.reindex.DeleteByQueryRequest; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Optional; + +@Slf4j +@Builder(builderMethodName = "hiddenBuilder") +public class ESBulkProcessor implements Closeable { + private static final String ES_WRITES_METRIC = "num_elasticSearch_writes"; + private static final String ES_DELETE_EXCEPTION_METRIC = "delete_by_query"; + + public static ESBulkProcessor.ESBulkProcessorBuilder builder(RestHighLevelClient searchClient) { + return hiddenBuilder().searchClient(searchClient); + } + + @NonNull + private final RestHighLevelClient searchClient; + @Builder.Default + @NonNull + private Boolean async = false; + @Builder.Default + private Integer bulkRequestsLimit = 500; + @Builder.Default + private Integer bulkFlushPeriod = 1; + @Builder.Default + private Integer numRetries = 3; + @Builder.Default + private Long retryInterval = 1L; + @Builder.Default + private TimeValue defaultTimeout = TimeValue.timeValueMinutes(1); + @Getter + private final WriteRequest.RefreshPolicy writeRequestRefreshPolicy; + @Setter(AccessLevel.NONE) + @Getter(AccessLevel.NONE) + private final BulkProcessor bulkProcessor; + + private ESBulkProcessor(@NonNull 
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESBulkProcessor.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESBulkProcessor.java
new file mode 100644
index 0000000000000..4e4c3dd9e6f12
--- /dev/null
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESBulkProcessor.java
@@ -0,0 +1,144 @@
+package com.linkedin.metadata.search.elasticsearch.update;
+
+import com.linkedin.metadata.utils.metrics.MetricUtils;
+import lombok.AccessLevel;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NonNull;
+import lombok.Setter;
+import lombok.extern.slf4j.Slf4j;
+import org.elasticsearch.action.DocWriteRequest;
+import org.elasticsearch.action.bulk.BackoffPolicy;
+import org.elasticsearch.action.bulk.BulkProcessor;
+import org.elasticsearch.action.bulk.BulkResponse;
+import org.elasticsearch.action.support.WriteRequest;
+import org.elasticsearch.client.RequestOptions;
+import org.elasticsearch.client.RestHighLevelClient;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.index.query.QueryBuilder;
+import org.elasticsearch.index.reindex.BulkByScrollResponse;
+import org.elasticsearch.index.reindex.DeleteByQueryRequest;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Optional;
+
+@Slf4j
+@Builder(builderMethodName = "hiddenBuilder")
+public class ESBulkProcessor implements Closeable {
+  private static final String ES_WRITES_METRIC = "num_elasticSearch_writes";
+  private static final String ES_DELETE_EXCEPTION_METRIC = "delete_by_query";
+
+  public static ESBulkProcessor.ESBulkProcessorBuilder builder(RestHighLevelClient searchClient) {
+    return hiddenBuilder().searchClient(searchClient);
+  }
+
+  @NonNull
+  private final RestHighLevelClient searchClient;
+  @Builder.Default
+  @NonNull
+  private Boolean async = false;
+  @Builder.Default
+  private Integer bulkRequestsLimit = 500;
+  @Builder.Default
+  private Integer bulkFlushPeriod = 1;
+  @Builder.Default
+  private Integer numRetries = 3;
+  @Builder.Default
+  private Long retryInterval = 1L;
+  @Builder.Default
+  private TimeValue defaultTimeout = TimeValue.timeValueMinutes(1);
+  @Getter
+  private final WriteRequest.RefreshPolicy writeRequestRefreshPolicy;
+  @Setter(AccessLevel.NONE)
+  @Getter(AccessLevel.NONE)
+  private final BulkProcessor bulkProcessor;
+
+  private ESBulkProcessor(@NonNull RestHighLevelClient searchClient, @NonNull Boolean async, Integer bulkRequestsLimit,
+                          Integer bulkFlushPeriod, Integer numRetries, Long retryInterval,
+                          TimeValue defaultTimeout, WriteRequest.RefreshPolicy writeRequestRefreshPolicy,
+                          BulkProcessor ignored) {
+    this.searchClient = searchClient;
+    this.async = async;
+    this.bulkRequestsLimit = bulkRequestsLimit;
+    this.bulkFlushPeriod = bulkFlushPeriod;
+    this.numRetries = numRetries;
+    this.retryInterval = retryInterval;
+    this.defaultTimeout = defaultTimeout;
+    this.writeRequestRefreshPolicy = writeRequestRefreshPolicy;
+    this.bulkProcessor = async ? toAsyncBulkProcessor() : toBulkProcessor();
+  }
+
+  public ESBulkProcessor add(DocWriteRequest<?> request) {
+    MetricUtils.counter(this.getClass(), ES_WRITES_METRIC).inc();
+    bulkProcessor.add(request);
+    return this;
+  }
+
+  public Optional<BulkByScrollResponse> deleteByQuery(QueryBuilder queryBuilder, String... indices) {
+    return deleteByQuery(queryBuilder, true, bulkRequestsLimit, defaultTimeout, indices);
+  }
+
+  public Optional<BulkByScrollResponse> deleteByQuery(QueryBuilder queryBuilder, boolean refresh, String... indices) {
+    return deleteByQuery(queryBuilder, refresh, bulkRequestsLimit, defaultTimeout, indices);
+  }
+
+  public Optional<BulkByScrollResponse> deleteByQuery(QueryBuilder queryBuilder, boolean refresh,
+      int limit, TimeValue timeout, String... indices) {
+    DeleteByQueryRequest deleteByQueryRequest = new DeleteByQueryRequest()
+        .setQuery(queryBuilder)
+        .setBatchSize(limit)
+        .setMaxRetries(numRetries)
+        .setRetryBackoffInitialTime(TimeValue.timeValueSeconds(retryInterval))
+        .setTimeout(timeout)
+        .setRefresh(refresh);
+    deleteByQueryRequest.indices(indices);
+
+    try {
+      // flush pending writes
+      bulkProcessor.flush();
+      // perform delete after local flush
+      final BulkByScrollResponse deleteResponse = searchClient.deleteByQuery(deleteByQueryRequest, RequestOptions.DEFAULT);
+      MetricUtils.counter(this.getClass(), ES_WRITES_METRIC).inc(deleteResponse.getTotal());
+      return Optional.of(deleteResponse);
+    } catch (Exception e) {
+      log.error("ERROR: Failed to delete by query. See stacktrace for a more detailed error:", e);
+      MetricUtils.exceptionCounter(ESBulkProcessor.class, ES_DELETE_EXCEPTION_METRIC, e);
+    }
+
+    return Optional.empty();
+  }
+
+  private BulkProcessor toBulkProcessor() {
+    return BulkProcessor.builder((request, bulkListener) -> {
+          try {
+            BulkResponse response = searchClient.bulk(request, RequestOptions.DEFAULT);
+            bulkListener.onResponse(response);
+          } catch (IOException e) {
+            bulkListener.onFailure(e);
+            throw new RuntimeException(e);
+          }
+        }, BulkListener.getInstance(writeRequestRefreshPolicy))
+        .setBulkActions(bulkRequestsLimit)
+        .setFlushInterval(TimeValue.timeValueSeconds(bulkFlushPeriod))
+        // This retry is ONLY for "resource constraints", i.e. 429 errors (each request has other retry methods)
+        .setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(retryInterval), numRetries))
+        .build();
+  }
+
+  private BulkProcessor toAsyncBulkProcessor() {
+    return BulkProcessor.builder((request, bulkListener) -> {
+          searchClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener);
+        }, BulkListener.getInstance(writeRequestRefreshPolicy))
+        .setBulkActions(bulkRequestsLimit)
+        .setFlushInterval(TimeValue.timeValueSeconds(bulkFlushPeriod))
+        // This retry is ONLY for "resource constraints", i.e. 429 errors (each request has other retry methods)
+        .setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(retryInterval), numRetries))
+        .build();
+  }
+
+  @Override
+  public void close() throws IOException {
+    bulkProcessor.close();
+  }
+}
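[Editor's note: a usage sketch for the ESBulkProcessor defined above, using the Lombok-generated builder methods for the fields it declares; the values and document ID are illustrative.]

```java
// Assuming an existing RestHighLevelClient `searchClient`:
ESBulkProcessor bulkProcessor = ESBulkProcessor.builder(searchClient)
    .async(true)                          // use bulkAsync under the hood
    .bulkRequestsLimit(1000)              // flush after 1000 queued requests
    .bulkFlushPeriod(1)                   // ...or after 1 second
    .numRetries(3)
    .writeRequestRefreshPolicy(WriteRequest.RefreshPolicy.WAIT_UNTIL)
    .build();

bulkProcessor.add(new DeleteRequest("graph_service_v1").id("someDocId"));
bulkProcessor.close(); // flushes remaining requests and shuts down the underlying BulkProcessor
```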
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESWriteDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESWriteDAO.java
index cda77412baeba..372d4248e5b22 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESWriteDAO.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/update/ESWriteDAO.java
@@ -6,7 +6,6 @@
 import javax.annotation.Nonnull;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
-import org.elasticsearch.action.bulk.BulkProcessor;
 import org.elasticsearch.action.delete.DeleteRequest;
 import org.elasticsearch.action.index.IndexRequest;
 import org.elasticsearch.action.update.UpdateRequest;
@@ -16,7 +15,6 @@
 import org.elasticsearch.client.indices.GetIndexResponse;
 import org.elasticsearch.common.xcontent.XContentType;
 import org.elasticsearch.index.query.QueryBuilders;
-import org.elasticsearch.index.reindex.DeleteByQueryRequest;
 import org.elasticsearch.script.Script;
 
 
@@ -27,7 +25,8 @@ public class ESWriteDAO {
   private final EntityRegistry entityRegistry;
   private final RestHighLevelClient searchClient;
   private final IndexConvention indexConvention;
-  private final BulkProcessor bulkProcessor;
+  private final ESBulkProcessor bulkProcessor;
+  private final int numRetries;
 
   /**
    * Updates or inserts the given search document.
@@ -40,7 +39,7 @@ public void upsertDocument(@Nonnull String entityName, @Nonnull String document,
     final String indexName = indexConvention.getIndexName(entityRegistry.getEntitySpec(entityName));
     final IndexRequest indexRequest = new IndexRequest(indexName).id(docId).source(document, XContentType.JSON);
     final UpdateRequest updateRequest =
-        new UpdateRequest(indexName, docId).doc(document, XContentType.JSON).detectNoop(false).upsert(indexRequest);
+        new UpdateRequest(indexName, docId).doc(document, XContentType.JSON)
+            .detectNoop(false)
+            .retryOnConflict(numRetries)
+            .upsert(indexRequest);
     bulkProcessor.add(updateRequest);
   }
 
@@ -60,7 +62,7 @@ public void deleteDocument(@Nonnull String entityName, @Nonnull String docId) {
    */
   public void applyScriptUpdate(@Nonnull String entityName, @Nonnull String docId, @Nonnull String script) {
     final String indexName = indexConvention.getIndexName(entityRegistry.getEntitySpec(entityName));
-    bulkProcessor.add(new UpdateRequest(indexName, docId).script(new Script(script)));
+    bulkProcessor.add(new UpdateRequest(indexName, docId).retryOnConflict(numRetries).script(new Script(script)));
   }
 
   /**
@@ -68,12 +70,7 @@ public void applyScriptUpdate(@Nonnull String entityName, @Nonnull String docId,
    */
   public void clear() {
     String[] indices = getIndices(indexConvention.getAllEntityIndicesPattern());
-    DeleteByQueryRequest deleteRequest = new DeleteByQueryRequest(indices).setQuery(QueryBuilders.matchAllQuery());
-    try {
-      searchClient.deleteByQuery(deleteRequest, RequestOptions.DEFAULT);
-    } catch (Exception e) {
-      log.error("Failed to delete content of search indices: {}", e.toString());
-    }
+    bulkProcessor.deleteByQuery(QueryBuilders.matchAllQuery(), indices);
   }
 
   private String[] getIndices(String pattern) {
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java
index c5a01278c546b..67703fe70a71a 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ESSystemMetadataDAO.java
@@ -1,15 +1,16 @@
 package com.linkedin.metadata.systemmetadata;
 
 import com.google.common.collect.ImmutableList;
+import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor;
 import com.linkedin.metadata.utils.elasticsearch.IndexConvention;
 import java.io.IOException;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Optional;
 import javax.annotation.Nonnull;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
-import org.elasticsearch.action.bulk.BulkProcessor;
 import org.elasticsearch.action.delete.DeleteRequest;
 import org.elasticsearch.action.delete.DeleteResponse;
 import org.elasticsearch.action.index.IndexRequest;
@@ -22,7 +23,6 @@
 import org.elasticsearch.index.query.BoolQueryBuilder;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.reindex.BulkByScrollResponse;
-import org.elasticsearch.index.reindex.DeleteByQueryRequest;
 import org.elasticsearch.search.aggregations.AggregationBuilders;
 import org.elasticsearch.search.aggregations.PipelineAggregatorBuilders;
 import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
@@ -39,7 +39,8 @@ public class ESSystemMetadataDAO {
   private final RestHighLevelClient client;
   private final IndexConvention indexConvention;
-  private final BulkProcessor bulkProcessor;
+  private final ESBulkProcessor bulkProcessor;
+  private final int numRetries;
 
   /**
    * Updates or inserts the given search document.
@@ -52,8 +53,9 @@ public void upsertDocument(@Nonnull String docId, @Nonnull String document) {
         new IndexRequest(indexConvention.getIndexName(INDEX_NAME)).id(docId).source(document, XContentType.JSON);
     final UpdateRequest updateRequest =
         new UpdateRequest(indexConvention.getIndexName(INDEX_NAME), docId).doc(document, XContentType.JSON)
-        .detectNoop(false)
-        .upsert(indexRequest);
+            .detectNoop(false)
+            .retryOnConflict(numRetries)
+            .upsert(indexRequest);
     bulkProcessor.add(updateRequest);
   }
 
@@ -74,20 +76,10 @@ public BulkByScrollResponse deleteByUrn(@Nonnull final String urn) {
     BoolQueryBuilder finalQuery = QueryBuilders.boolQuery();
     finalQuery.must(QueryBuilders.termQuery("urn", urn));
 
-    DeleteByQueryRequest deleteByQueryRequest = new DeleteByQueryRequest();
+    final Optional<BulkByScrollResponse> deleteResponse = bulkProcessor.deleteByQuery(finalQuery,
+        indexConvention.getIndexName(INDEX_NAME));
 
-    deleteByQueryRequest.setQuery(finalQuery);
-
-    deleteByQueryRequest.indices(indexConvention.getIndexName(INDEX_NAME));
-
-    try {
-      final BulkByScrollResponse deleteResponse = client.deleteByQuery(deleteByQueryRequest, RequestOptions.DEFAULT);
-      return deleteResponse;
-    } catch (IOException e) {
-      log.error("ERROR: Failed to delete by query. See stacktrace for a more detailed error:");
-      e.printStackTrace();
-    }
-    return null;
+    return deleteResponse.orElse(null);
  }

  public BulkByScrollResponse deleteByUrnAspect(@Nonnull final String urn, @Nonnull final String aspect) {
@@ -95,20 +87,10 @@ public BulkByScrollResponse deleteByUrnAspect(@Nonnull final String urn, @Nonnul
     finalQuery.must(QueryBuilders.termQuery("urn", urn));
     finalQuery.must(QueryBuilders.termQuery("aspect", aspect));
 
-    DeleteByQueryRequest deleteByQueryRequest = new DeleteByQueryRequest();
+    final Optional<BulkByScrollResponse> deleteResponse = bulkProcessor.deleteByQuery(finalQuery,
+        indexConvention.getIndexName(INDEX_NAME));
 
-    deleteByQueryRequest.setQuery(finalQuery);
-
-    deleteByQueryRequest.indices(indexConvention.getIndexName(INDEX_NAME));
-
-    try {
-      final BulkByScrollResponse deleteResponse = client.deleteByQuery(deleteByQueryRequest, RequestOptions.DEFAULT);
-      return deleteResponse;
-    } catch (IOException e) {
-      log.error("ERROR: Failed to delete by query. See stacktrace for a more detailed error:");
-      e.printStackTrace();
-    }
-    return null;
+    return deleteResponse.orElse(null);
  }

  public SearchResponse findByParams(Map<String, String> searchParams, boolean includeSoftDeleted, int from, int size) {
See stacktrace for a more detailed error:"); - e.printStackTrace(); - } - return null; + return deleteResponse.orElse(null); } public BulkByScrollResponse deleteByUrnAspect(@Nonnull final String urn, @Nonnull final String aspect) { @@ -95,20 +87,10 @@ public BulkByScrollResponse deleteByUrnAspect(@Nonnull final String urn, @Nonnul finalQuery.must(QueryBuilders.termQuery("urn", urn)); finalQuery.must(QueryBuilders.termQuery("aspect", aspect)); - DeleteByQueryRequest deleteByQueryRequest = new DeleteByQueryRequest(); + final Optional deleteResponse = bulkProcessor.deleteByQuery(finalQuery, + indexConvention.getIndexName(INDEX_NAME)); - deleteByQueryRequest.setQuery(finalQuery); - - deleteByQueryRequest.indices(indexConvention.getIndexName(INDEX_NAME)); - - try { - final BulkByScrollResponse deleteResponse = client.deleteByQuery(deleteByQueryRequest, RequestOptions.DEFAULT); - return deleteResponse; - } catch (IOException e) { - log.error("ERROR: Failed to delete by query. See stacktrace for a more detailed error:"); - e.printStackTrace(); - } - return null; + return deleteResponse.orElse(null); } public SearchResponse findByParams(Map searchParams, boolean includeSoftDeleted, int from, int size) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java index 6f6c625666cf8..7390b49b3fc1a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java @@ -2,10 +2,12 @@ import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableMap; import com.linkedin.metadata.run.AspectRowSummary; import com.linkedin.metadata.run.IngestionRunSummary; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.mxe.SystemMetadata; @@ -27,10 +29,7 @@ import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.elasticsearch.action.search.SearchResponse; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.index.reindex.DeleteByQueryRequest; import org.elasticsearch.search.SearchHits; import org.elasticsearch.search.aggregations.bucket.filter.ParsedFilter; import org.elasticsearch.search.aggregations.bucket.terms.ParsedStringTerms; @@ -42,7 +41,7 @@ @RequiredArgsConstructor public class ElasticSearchSystemMetadataService implements SystemMetadataService { - private final RestHighLevelClient _searchClient; + private final ESBulkProcessor _esBulkProcessor; private final IndexConvention _indexConvention; private final ESSystemMetadataDAO _esDAO; private final ESIndexBuilder _indexBuilder; @@ -204,14 +203,9 @@ public void configure() { } } + @VisibleForTesting @Override public void clear() { - DeleteByQueryRequest deleteRequest = - new DeleteByQueryRequest(_indexConvention.getIndexName(INDEX_NAME)).setQuery(QueryBuilders.matchAllQuery()); - try { - 
_searchClient.deleteByQuery(deleteRequest, RequestOptions.DEFAULT); - } catch (Exception e) { - log.error("Failed to clear system metadata service: {}", e.toString()); - } + _esBulkProcessor.deleteByQuery(QueryBuilders.matchAllQuery(), true, _indexConvention.getIndexName(INDEX_NAME)); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java index 0dc00f6408382..c0ac0a28e1c63 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java @@ -15,6 +15,7 @@ import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.query.filter.Filter; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.search.utils.QueryUtils; import com.linkedin.metadata.timeseries.TimeseriesAspectService; @@ -29,16 +30,15 @@ import com.linkedin.timeseries.DeleteAspectValuesResult; import com.linkedin.timeseries.GenericTable; import com.linkedin.timeseries.GroupingBucket; -import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.action.bulk.BulkProcessor; import org.elasticsearch.action.index.IndexRequest; import org.elasticsearch.action.search.SearchRequest; import org.elasticsearch.action.search.SearchResponse; @@ -49,8 +49,6 @@ import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.QueryBuilders; -import org.elasticsearch.index.reindex.BulkByScrollResponse; -import org.elasticsearch.index.reindex.DeleteByQueryRequest; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.SearchHits; import org.elasticsearch.search.builder.SearchSourceBuilder; @@ -66,7 +64,8 @@ public class ElasticSearchTimeseriesAspectService implements TimeseriesAspectSer private static final Integer DEFAULT_LIMIT = 10000; private final IndexConvention _indexConvention; - private final BulkProcessor _bulkProcessor; + private final ESBulkProcessor _bulkProcessor; + private final int _numRetries; private final TimeseriesAspectIndexBuilders _indexBuilders; private final RestHighLevelClient _searchClient; private final ESAggregatedStatsDAO _esAggregatedStatsDAO; @@ -74,12 +73,13 @@ public class ElasticSearchTimeseriesAspectService implements TimeseriesAspectSer public ElasticSearchTimeseriesAspectService(@Nonnull RestHighLevelClient searchClient, @Nonnull IndexConvention indexConvention, @Nonnull TimeseriesAspectIndexBuilders indexBuilders, - @Nonnull EntityRegistry entityRegistry, @Nonnull BulkProcessor bulkProcessor) { + @Nonnull EntityRegistry entityRegistry, @Nonnull ESBulkProcessor bulkProcessor, int numRetries) { _indexConvention = indexConvention; _indexBuilders = indexBuilders; _searchClient = searchClient; _bulkProcessor = bulkProcessor; _entityRegistry = entityRegistry; + _numRetries = numRetries; _esAggregatedStatsDAO = new 
ESAggregatedStatsDAO(indexConvention, searchClient, entityRegistry); } @@ -122,8 +122,9 @@ public void upsertDocument(@Nonnull String entityName, @Nonnull String aspectNam final IndexRequest indexRequest = new IndexRequest(indexName).id(docId).source(document.toString(), XContentType.JSON); final UpdateRequest updateRequest = new UpdateRequest(indexName, docId).doc(document.toString(), XContentType.JSON) - .detectNoop(false) - .upsert(indexRequest); + .detectNoop(false) + .retryOnConflict(_numRetries) + .upsert(indexRequest); _bulkProcessor.add(updateRequest); } @@ -202,16 +203,16 @@ public DeleteAspectValuesResult deleteAspectValues(@Nonnull String entityName, @ @Nonnull Filter filter) { final String indexName = _indexConvention.getTimeseriesAspectIndexName(entityName, aspectName); final BoolQueryBuilder filterQueryBuilder = ESUtils.buildFilterQuery(filter); - final DeleteByQueryRequest deleteByQueryRequest = new DeleteByQueryRequest(indexName).setQuery(filterQueryBuilder) - .setBatchSize(DEFAULT_LIMIT) - .setRefresh(true) - .setTimeout(TimeValue.timeValueMinutes(10)); - try { - final BulkByScrollResponse response = _searchClient.deleteByQuery(deleteByQueryRequest, RequestOptions.DEFAULT); - return new DeleteAspectValuesResult().setNumDocsDeleted(response.getDeleted()); - } catch (IOException e) { - log.error("Delete query failed:", e); - throw new ESQueryException("Delete query failed:", e); + + final Optional<DeleteAspectValuesResult> result = _bulkProcessor + .deleteByQuery(filterQueryBuilder, false, DEFAULT_LIMIT, TimeValue.timeValueMinutes(10), indexName) + .map(response -> new DeleteAspectValuesResult().setNumDocsDeleted(response.getDeleted())); + + if (result.isPresent()) { + return result.get(); + } else { + log.error("Delete query failed"); + throw new ESQueryException("Delete query failed"); } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/CassandraTestUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/CassandraTestUtils.java index 979616d11da37..5645573917f00 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/CassandraTestUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/CassandraTestUtils.java @@ -12,6 +12,7 @@ import javax.annotation.Nonnull; import javax.net.ssl.SSLContext; import java.net.InetSocketAddress; +import java.time.Duration; import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -34,8 +35,9 @@ public static CassandraContainer setupContainer() { .asCompatibleSubstituteFor("cassandra"); CassandraContainer container = new CassandraContainer(imageName); - container.withEnv("JVM_OPTS", "-Xms64M -Xmx64M"); - container.start(); + container.withEnv("JVM_OPTS", "-Xms64M -Xmx96M") + .withStartupTimeout(Duration.ofMinutes(5)) // usually < 1min + .start(); try (Session session = container.getCluster().connect()) { session.execute(String.format("CREATE KEYSPACE IF NOT EXISTS %s WITH replication = \n" diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ElasticSearchTestConfiguration.java b/metadata-io/src/test/java/com/linkedin/metadata/ElasticSearchTestConfiguration.java new file mode 100644 index 0000000000000..334dcee784a03 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/ElasticSearchTestConfiguration.java @@ -0,0 +1,88 @@ +package com.linkedin.metadata; + +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import org.apache.http.HttpHost; +import org.apache.http.impl.nio.reactor.IOReactorConfig;
+import org.elasticsearch.action.support.WriteRequest; +import org.elasticsearch.client.RestClient; +import org.elasticsearch.client.RestClientBuilder; +import org.elasticsearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.boot.test.context.TestConfiguration; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Primary; +import org.springframework.context.annotation.Scope; +import org.testcontainers.elasticsearch.ElasticsearchContainer; + +import javax.annotation.Nonnull; + +import java.util.Map; + + +@TestConfiguration +public class ElasticSearchTestConfiguration { + private static final int HTTP_PORT = 9200; + private static final int REFRESH_INTERVAL_SECONDS = 1; + + public static void syncAfterWrite() throws InterruptedException { + Thread.sleep(REFRESH_INTERVAL_SECONDS * 1000); + } + + @Scope("singleton") + @Bean(name = "testElasticsearchContainer") + @Nonnull + public ElasticsearchContainer elasticsearchContainer() { + ElasticTestUtils.ES_CONTAINER.start(); + return ElasticTestUtils.ES_CONTAINER; + } + + @Primary + @Scope("singleton") + @Bean(name = "elasticSearchRestHighLevelClient") + @Nonnull + public RestHighLevelClient getElasticsearchClient(@Qualifier("testElasticsearchContainer") ElasticsearchContainer esContainer) { + // Construct a standard REST client for Elasticsearch, pointed at the test container's mapped port. + final RestClientBuilder builder = + RestClient.builder(new HttpHost( + "localhost", + esContainer.getMappedPort(HTTP_PORT), "http") + ).setHttpClientConfigCallback(httpAsyncClientBuilder -> + httpAsyncClientBuilder.setDefaultIOReactorConfig(IOReactorConfig.custom().setIoThreadCount(1).build())); + + builder.setRequestConfigCallback(requestConfigBuilder -> requestConfigBuilder. + setConnectionRequestTimeout(30000)); + + return new RestHighLevelClient(builder); + } + + /* + Cannot use the factory class here without creating a circular dependency. + */ + @Primary + @Bean(name = "elasticSearchBulkProcessor") + @Nonnull + public ESBulkProcessor getBulkProcessor(@Qualifier("elasticSearchRestHighLevelClient") RestHighLevelClient searchClient) { + return ESBulkProcessor.builder(searchClient) + .async(true) + /* + * Force a refresh as part of this request. This refresh policy does not scale for high indexing or search throughput but is useful + * to present a consistent view for indices with very low traffic. And it is wonderful for tests!
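+ * (Refresh-policy background, per Elasticsearch's WriteRequest.RefreshPolicy javadoc: IMMEDIATE forces the refresh itself,
+ * WAIT_UNTIL parks the request until a scheduled refresh has made the writes searchable, and NONE returns without waiting.)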
+ */ + .writeRequestRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) + .bulkRequestsLimit(1000) + .bulkFlushPeriod(REFRESH_INTERVAL_SECONDS - 1) + .retryInterval(1L) + .numRetries(1) + .build(); + } + + @Primary + @Bean(name = "elasticSearchIndexBuilder") + @Nonnull + public ESIndexBuilder getIndexBuilder(@Qualifier("elasticSearchRestHighLevelClient") RestHighLevelClient searchClient) { + return new ESIndexBuilder(searchClient, 1, 1, 3, 1, Map.of(), false); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ElasticSearchTestUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/ElasticSearchTestUtils.java deleted file mode 100644 index c8db02ab3ea10..0000000000000 --- a/metadata-io/src/test/java/com/linkedin/metadata/ElasticSearchTestUtils.java +++ /dev/null @@ -1,116 +0,0 @@ -package com.linkedin.metadata; - -import io.github.resilience4j.retry.Retry; -import io.github.resilience4j.retry.RetryConfig; -import org.elasticsearch.ElasticsearchStatusException; -import org.elasticsearch.action.admin.indices.flush.FlushRequest; -import org.elasticsearch.action.admin.indices.flush.FlushResponse; -import org.elasticsearch.action.admin.indices.refresh.RefreshRequest; -import org.elasticsearch.action.admin.indices.refresh.RefreshResponse; -import org.elasticsearch.action.delete.DeleteRequest; -import org.elasticsearch.action.get.GetRequest; -import org.elasticsearch.action.get.GetResponse; -import org.elasticsearch.action.index.IndexRequest; -import org.elasticsearch.action.update.UpdateRequest; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.common.xcontent.XContentType; -import org.testng.TestException; - -import java.net.SocketTimeoutException; -import java.util.UUID; -import java.util.concurrent.TimeUnit; - -public class ElasticSearchTestUtils { - - // request options for all requests - private static final RequestOptions OPTIONS = RequestOptions.DEFAULT; - - // retry logic for ES requests - private static final Retry RETRY = Retry.of("ElasticSearchTestUtils", RetryConfig.custom() - .retryExceptions(SocketTimeoutException.class, ElasticsearchStatusException.class) - .failAfterMaxAttempts(true) - .maxAttempts(3) - .build() - ); - - // allow for Supplier that throw exceptions - private interface ThrowingSupplier { - T get() throws E; - } - - // We are retrying requests, otherwise concurrency tests will see exceptions like these: - // java.net.SocketTimeoutException: 30,000 milliseconds timeout on connection http-outgoing-1 [ACTIVE] - private static T retry(ThrowingSupplier func) { - return RETRY.executeSupplier(() -> { - try { - return func.get(); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); - } - - private ElasticSearchTestUtils() { - } - - public static void syncAfterWrite(RestHighLevelClient client) throws Exception { - syncAfterWrite(client, "test-sync-flag"); - } - - public static void syncAfterWrite(RestHighLevelClient searchClient, String indexName) throws Exception { - // we add some more data (a sync flag) and wait for it to appear - // we pick a random flag so that this can be used concurrently - String syncFlag = UUID.randomUUID().toString(); - - // add the flag and wait for it to appear, preferably to the indexed modified outside - addSyncFlag(searchClient, syncFlag, indexName); - waitForSyncFlag(searchClient, syncFlag, indexName, true); - - // flush changes for all indices in ES to disk - FlushResponse fResponse = retry(() -> 
searchClient.indices().flush(new FlushRequest(), OPTIONS)); - if (fResponse.getFailedShards() > 0) { - throw new RuntimeException("Failed to flush " + fResponse.getFailedShards() + " of " + fResponse.getTotalShards() + " shards"); - } - - // wait for all indices to be refreshed - RefreshResponse rResponse = retry(() -> searchClient.indices().refresh(new RefreshRequest(), OPTIONS)); - if (rResponse.getFailedShards() > 0) { - throw new RuntimeException("Failed to refresh " + rResponse.getFailedShards() + " of " + rResponse.getTotalShards() + " shards"); - } - - // remove the flag again and wait for it to disappear - removeSyncFlag(searchClient, syncFlag, indexName); - waitForSyncFlag(searchClient, syncFlag, indexName, false); - } - - private static void addSyncFlag(RestHighLevelClient searchClient, String docId, String indexName) { - String document = "{ }"; - final IndexRequest indexRequest = new IndexRequest(indexName).id(docId).source(document, XContentType.JSON); - final UpdateRequest updateRequest = new UpdateRequest(indexName, docId).doc(document, XContentType.JSON) - .detectNoop(false) - .retryOnConflict(3) - .upsert(indexRequest); - retry(() -> searchClient.update(updateRequest, OPTIONS)); - } - - private static void removeSyncFlag(RestHighLevelClient searchClient, String docId, String indexName) { - final DeleteRequest deleteRequest = new DeleteRequest(indexName).id(docId); - retry(() -> searchClient.delete(deleteRequest, OPTIONS)); - } - - private static void waitForSyncFlag(RestHighLevelClient searchClient, String docId, String indexName, boolean toExist) - throws InterruptedException { - GetRequest request = new GetRequest(indexName).id(docId); - long timeout = System.currentTimeMillis() + TimeUnit.MILLISECONDS.convert(10, TimeUnit.SECONDS); - while (System.currentTimeMillis() < timeout) { - GetResponse response = retry(() -> searchClient.get(request, OPTIONS)); - if (response.isExists() == toExist) { - return; - } - TimeUnit.MILLISECONDS.sleep(50); - } - throw new TestException("Waiting for sync timed out"); - } - -} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ElasticTestUtils.java b/metadata-io/src/test/java/com/linkedin/metadata/ElasticTestUtils.java index 9f2c2f82d634c..2c9491c9660f4 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ElasticTestUtils.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ElasticTestUtils.java @@ -1,14 +1,11 @@ package com.linkedin.metadata; -import org.apache.http.HttpHost; -import org.apache.http.impl.nio.reactor.IOReactorConfig; -import org.elasticsearch.client.RestClient; -import org.elasticsearch.client.RestClientBuilder; -import org.elasticsearch.client.RestHighLevelClient; import org.testcontainers.elasticsearch.ElasticsearchContainer; import org.testcontainers.utility.DockerImageName; -import javax.annotation.Nonnull; +import java.time.Duration; + +import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; public class ElasticTestUtils { private ElasticTestUtils() { @@ -22,27 +19,14 @@ private ElasticTestUtils() { private static final DockerImageName DOCKER_IMAGE_NAME = DockerImageName.parse(ELASTIC_IMAGE_FULL_NAME) .asCompatibleSubstituteFor(ELASTIC_IMAGE_NAME); - private static final int HTTP_PORT = 9200; + public static final ElasticsearchContainer ES_CONTAINER; // A helper method to create an ElasticseachContainer defaulting to the current image and version, with the ability // within firewalled environments to override with an environment variable to point to the offline repository. 
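+ // The container is created once per JVM and shared by all Elasticsearch-backed tests via ElasticSearchTestConfiguration;
+ // nothing stops it explicitly, so cleanup is left to Testcontainers' default reaper (Ryuk) when the test JVM exits.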
- @Nonnull - public static final ElasticsearchContainer getNewElasticsearchContainer() { - return new ElasticsearchContainer(DOCKER_IMAGE_NAME); - } - - // A helper method to construct a standard rest client for Elastic search. - @Nonnull - public static RestHighLevelClient buildRestClient(ElasticsearchContainer elasticsearchContainer) { - final RestClientBuilder builder = - RestClient.builder(new HttpHost("localhost", elasticsearchContainer.getMappedPort(HTTP_PORT), "http")) - .setHttpClientConfigCallback(httpAsyncClientBuilder -> httpAsyncClientBuilder.setDefaultIOReactorConfig( - IOReactorConfig.custom().setIoThreadCount(1).build())); - - builder.setRequestConfigCallback(requestConfigBuilder -> requestConfigBuilder. - setConnectionRequestTimeout(3000)); - - return new RestHighLevelClient(builder); + static { + ES_CONTAINER = new ElasticsearchContainer(DOCKER_IMAGE_NAME); + checkContainerEngine(ES_CONTAINER.getDockerClient()); + ES_CONTAINER.withEnv("ES_JAVA_OPTS", "-Xms64m -Xmx128m -XX:MaxDirectMemorySize=134217728") + .withStartupTimeout(Duration.ofMinutes(5)); // usually < 1min } - -} +} \ No newline at end of file diff --git a/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/BulkListenerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/BulkListenerTest.java new file mode 100644 index 0000000000000..154131ceb6fee --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/BulkListenerTest.java @@ -0,0 +1,39 @@ +package com.linkedin.metadata.elasticsearch.update; + +import com.linkedin.metadata.search.elasticsearch.update.BulkListener; +import org.elasticsearch.action.bulk.BulkRequest; +import org.elasticsearch.action.support.WriteRequest; +import org.mockito.Mockito; +import org.testng.annotations.Test; + +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.ArgumentMatchers.any; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotEquals; + +public class BulkListenerTest { + + @Test + public void testConstructor() { + BulkListener test = BulkListener.getInstance(); + assertNotNull(test); + assertEquals(test, BulkListener.getInstance()); + assertNotEquals(test, BulkListener.getInstance(WriteRequest.RefreshPolicy.IMMEDIATE)); + } + + @Test + public void testDefaultPolicy() { + BulkListener test = BulkListener.getInstance(); + + BulkRequest mockRequest1 = Mockito.mock(BulkRequest.class); + test.beforeBulk(0L, mockRequest1); + verify(mockRequest1, times(0)).setRefreshPolicy(any(WriteRequest.RefreshPolicy.class)); + + BulkRequest mockRequest2 = Mockito.mock(BulkRequest.class); + test = BulkListener.getInstance(WriteRequest.RefreshPolicy.IMMEDIATE); + test.beforeBulk(0L, mockRequest2); + verify(mockRequest2, times(1)).setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/ESBulkProcessorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/ESBulkProcessorTest.java new file mode 100644 index 0000000000000..5c882e5158f90 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/elasticsearch/update/ESBulkProcessorTest.java @@ -0,0 +1,18 @@ +package com.linkedin.metadata.elasticsearch.update; + +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import org.elasticsearch.client.RestHighLevelClient; +import org.mockito.Mockito; 
+import org.testng.annotations.Test; + +import static org.testng.Assert.assertNotNull; + +public class ESBulkProcessorTest { + + @Test + public void testESBulkProcessorBuilder() { + RestHighLevelClient mock = Mockito.mock(RestHighLevelClient.class); + ESBulkProcessor test = ESBulkProcessor.builder(mock).build(); + assertNotNull(test); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java index ca80bf567672d..f0cbdfdad36a5 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java @@ -27,6 +27,8 @@ import java.util.stream.IntStream; import javax.annotation.Nonnull; import javax.annotation.Nullable; + +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.Assert; import org.testng.annotations.BeforeMethod; import org.testng.annotations.DataProvider; @@ -54,7 +56,7 @@ * Feel free to add a test to your test implementation that calls `getPopulatedGraphService` and * asserts the state of the graph in an implementation specific way. */ -abstract public class GraphServiceTestBase { +abstract public class GraphServiceTestBase extends AbstractTestNGSpringContextTests { private static class RelatedEntityComparator implements Comparator { @Override diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphServiceTest.java index 888b606e4f0a5..9f812dbf65fc5 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphServiceTest.java @@ -1,8 +1,12 @@ package com.linkedin.metadata.graph.elastic; +import com.linkedin.common.FabricType; +import com.linkedin.common.urn.DataPlatformUrn; +import com.linkedin.common.urn.DatasetUrn; +import com.linkedin.common.urn.TagUrn; import com.linkedin.common.urn.Urn; -import com.linkedin.metadata.ElasticSearchTestUtils; -import com.linkedin.metadata.ElasticTestUtils; +import com.linkedin.metadata.ElasticSearchTestConfiguration; +import com.linkedin.metadata.graph.Edge; import com.linkedin.metadata.graph.GraphService; import com.linkedin.metadata.graph.GraphServiceTestBase; import com.linkedin.metadata.graph.RelatedEntitiesResult; @@ -12,13 +16,15 @@ import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.RelationshipDirection; import com.linkedin.metadata.query.filter.RelationshipFilter; -import com.linkedin.metadata.search.elasticsearch.ElasticSearchServiceTest; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; +import java.util.Collections; import org.elasticsearch.client.RestHighLevelClient; -import org.testcontainers.elasticsearch.ElasticsearchContainer; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; import org.testng.SkipException; -import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; import 
org.testng.annotations.Test; @@ -28,25 +34,28 @@ import java.util.HashSet; import java.util.List; -import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; import static com.linkedin.metadata.graph.elastic.ElasticSearchGraphService.INDEX_NAME; +import static com.linkedin.metadata.search.utils.QueryUtils.*; import static org.testng.Assert.assertEquals; - +@Import(ElasticSearchTestConfiguration.class) public class ElasticSearchGraphServiceTest extends GraphServiceTestBase { - private ElasticsearchContainer _elasticsearchContainer; + @Autowired private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + private final IndexConvention _indexConvention = new IndexConventionImpl(null); private final String _indexName = _indexConvention.getIndexName(INDEX_NAME); private ElasticSearchGraphService _client; + private static final String TAG_RELATIONSHIP = "SchemaFieldTaggedWith"; + @BeforeClass public void setup() { - _elasticsearchContainer = ElasticTestUtils.getNewElasticsearchContainer(); - checkContainerEngine(_elasticsearchContainer.getDockerClient()); - _elasticsearchContainer.start(); - _searchClient = ElasticTestUtils.buildRestClient(_elasticsearchContainer); _client = buildService(); _client.configure(); } @@ -61,15 +70,9 @@ public void wipe() throws Exception { private ElasticSearchGraphService buildService() { LineageRegistry lineageRegistry = new LineageRegistry(SnapshotEntityRegistry.getInstance()); ESGraphQueryDAO readDAO = new ESGraphQueryDAO(_searchClient, lineageRegistry, _indexConvention); - ESGraphWriteDAO writeDAO = - new ESGraphWriteDAO(_searchClient, _indexConvention, ElasticSearchServiceTest.getBulkProcessor(_searchClient)); - return new ElasticSearchGraphService(lineageRegistry, _searchClient, _indexConvention, writeDAO, readDAO, - ElasticSearchServiceTest.getIndexBuilder(_searchClient)); - } - - @AfterClass - public void tearDown() { - _elasticsearchContainer.stop(); + ESGraphWriteDAO writeDAO = new ESGraphWriteDAO(_indexConvention, _bulkProcessor, 1); + return new ElasticSearchGraphService(lineageRegistry, _bulkProcessor, _indexConvention, writeDAO, readDAO, + _esIndexBuilder); } @Override @@ -80,7 +83,7 @@ protected GraphService getGraphService() { @Override protected void syncAfterWrite() throws Exception { - ElasticSearchTestUtils.syncAfterWrite(_searchClient, _indexName); + com.linkedin.metadata.ElasticSearchTestConfiguration.syncAfterWrite(); } @Override @@ -180,6 +183,28 @@ public void testRemoveEdgesFromNodeNoRelationshipTypes() { throw new SkipException("ElasticSearchGraphService does not support empty list of relationship types"); } + @Test + // TODO: Only in ES for now since unimplemented in other services + public void testRemoveEdge() throws Exception { + DatasetUrn datasetUrn = new DatasetUrn(new DataPlatformUrn("snowflake"), "test", FabricType.TEST); + TagUrn tagUrn = new TagUrn("newTag"); + Edge edge = new Edge(datasetUrn, tagUrn, TAG_RELATIONSHIP); + getGraphService().addEdge(edge); + syncAfterWrite(); + RelatedEntitiesResult result = getGraphService().findRelatedEntities(Collections.singletonList(datasetType), + newFilter(Collections.singletonMap("urn", datasetUrn.toString())), Collections.singletonList("tag"), + EMPTY_FILTER, Collections.singletonList(TAG_RELATIONSHIP), + newRelationshipFilter(EMPTY_FILTER, RelationshipDirection.OUTGOING), 0, 100); + assertEquals(result.getTotal(), 1); + getGraphService().removeEdge(edge); + syncAfterWrite(); + 
result = getGraphService().findRelatedEntities(Collections.singletonList(datasetType), + newFilter(Collections.singletonMap("urn", datasetUrn.toString())), Collections.singletonList("tag"), + EMPTY_FILTER, Collections.singletonList(TAG_RELATIONSHIP), + newRelationshipFilter(EMPTY_FILTER, RelationshipDirection.OUTGOING), 0, 100); + assertEquals(result.getTotal(), 0); + } + @Test @Override public void testConcurrentAddEdge() { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageSearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageSearchServiceTest.java index 00302bd06a99c..b70d210903cf6 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageSearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageSearchServiceTest.java @@ -7,7 +7,7 @@ import com.linkedin.common.urn.TestEntityUrn; import com.linkedin.common.urn.Urn; import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; -import com.linkedin.metadata.ElasticTestUtils; +import com.linkedin.metadata.ElasticSearchTestConfiguration; import com.linkedin.metadata.TestEntityUtil; import com.linkedin.metadata.graph.EntityLineageResult; import com.linkedin.metadata.graph.GraphService; @@ -21,21 +21,23 @@ import com.linkedin.metadata.search.cache.EntityDocCountCache; import com.linkedin.metadata.search.client.CachingEntitySearchService; import com.linkedin.metadata.search.elasticsearch.ElasticSearchService; -import com.linkedin.metadata.search.elasticsearch.ElasticSearchServiceTest; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import com.linkedin.metadata.search.elasticsearch.indexbuilder.EntityIndexBuilders; import com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder; import com.linkedin.metadata.search.elasticsearch.query.ESBrowseDAO; import com.linkedin.metadata.search.elasticsearch.query.ESSearchDAO; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.search.elasticsearch.update.ESWriteDAO; import com.linkedin.metadata.search.ranker.SimpleRanker; import com.linkedin.metadata.search.utils.QueryUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; import org.elasticsearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.cache.CacheManager; import org.springframework.cache.concurrent.ConcurrentMapCacheManager; -import org.testcontainers.elasticsearch.ElasticsearchContainer; -import org.testng.annotations.AfterClass; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; @@ -44,19 +46,23 @@ import java.util.Collections; import java.util.List; -import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; -import static com.linkedin.metadata.ElasticSearchTestUtils.syncAfterWrite; +import static com.linkedin.metadata.ElasticSearchTestConfiguration.syncAfterWrite; import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; +@Import(ElasticSearchTestConfiguration.class) +public class 
LineageSearchServiceTest extends AbstractTestNGSpringContextTests { -public class LineageSearchServiceTest { - - private ElasticsearchContainer _elasticsearchContainer; + @Autowired private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + private EntityRegistry _entityRegistry; private IndexConvention _indexConvention; private SettingsBuilder _settingsBuilder; @@ -77,12 +83,8 @@ public void disableAssert() { @BeforeClass public void setup() { _entityRegistry = new SnapshotEntityRegistry(new Snapshot()); - _indexConvention = new IndexConventionImpl(null); - _elasticsearchContainer = ElasticTestUtils.getNewElasticsearchContainer(); + _indexConvention = new IndexConventionImpl("lineage_search_service_test"); _settingsBuilder = new SettingsBuilder(Collections.emptyList(), null); - checkContainerEngine(_elasticsearchContainer.getDockerClient()); - _elasticsearchContainer.start(); - _searchClient = ElasticTestUtils.buildRestClient(_elasticsearchContainer); _elasticSearchService = buildEntitySearchService(); _elasticSearchService.configure(); _cacheManager = new ConcurrentMapCacheManager(); @@ -109,18 +111,17 @@ private void resetService(boolean withCache) { public void wipe() throws Exception { _elasticSearchService.clear(); clearCache(); - syncAfterWrite(_searchClient); + syncAfterWrite(); } @Nonnull private ElasticSearchService buildEntitySearchService() { EntityIndexBuilders indexBuilders = - new EntityIndexBuilders(ElasticSearchServiceTest.getIndexBuilder(_searchClient), _entityRegistry, + new EntityIndexBuilders(_esIndexBuilder, _entityRegistry, _indexConvention, _settingsBuilder); ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, _searchClient, _indexConvention); ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, _searchClient, _indexConvention); - ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, _searchClient, _indexConvention, - ElasticSearchServiceTest.getBulkProcessor(_searchClient)); + ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, _searchClient, _indexConvention, _bulkProcessor, 1); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, writeDAO); } @@ -129,11 +130,6 @@ private void clearCache() { resetService(true); } - @AfterClass - public void tearDown() { - _elasticsearchContainer.stop(); - } - private EntityLineageResult mockResult(List lineageRelationships) { return new EntityLineageResult().setRelationships(new LineageRelationshipArray(lineageRelationships)) .setStart(0) @@ -175,7 +171,7 @@ public void testSearchService() throws Exception { document.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride")); document.set("browsePaths", JsonNodeFactory.instance.textNode("/a/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document.toString(), urn.toString()); - syncAfterWrite(_searchClient); + syncAfterWrite(); when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), anyInt())).thenReturn(mockResult(Collections.emptyList())); @@ -217,7 +213,7 @@ public void testSearchService() throws Exception { document2.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride2")); document2.set("browsePaths", JsonNodeFactory.instance.textNode("/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); - syncAfterWrite(_searchClient); + syncAfterWrite(); searchResult = _lineageSearchService.searchAcrossLineage(TEST_URN, 
LineageDirection.DOWNSTREAM, ImmutableList.of(), "test", @@ -238,7 +234,7 @@ public void testSearchService() throws Exception { _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); _elasticSearchService.deleteDocument(ENTITY_NAME, urn2.toString()); - syncAfterWrite(_searchClient); + syncAfterWrite(); when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), anyInt())).thenReturn( @@ -246,6 +242,7 @@ public void testSearchService() throws Exception { searchResult = _lineageSearchService.searchAcrossLineage(TEST_URN, LineageDirection.DOWNSTREAM, ImmutableList.of(), "test", null, null, null, 0, 10); + assertEquals(searchResult.getNumEntities().intValue(), 0); } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java index f62c008ddf46d..0f79edf2a0d7f 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTest.java @@ -7,7 +7,7 @@ import com.linkedin.common.urn.TestEntityUrn; import com.linkedin.common.urn.Urn; import com.linkedin.data.template.StringArray; -import com.linkedin.metadata.ElasticTestUtils; +import com.linkedin.metadata.ElasticSearchTestConfiguration; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; import com.linkedin.metadata.query.filter.Condition; @@ -21,20 +21,22 @@ import com.linkedin.metadata.search.cache.EntityDocCountCache; import com.linkedin.metadata.search.client.CachingEntitySearchService; import com.linkedin.metadata.search.elasticsearch.ElasticSearchService; -import com.linkedin.metadata.search.elasticsearch.ElasticSearchServiceTest; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import com.linkedin.metadata.search.elasticsearch.indexbuilder.EntityIndexBuilders; import com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder; import com.linkedin.metadata.search.elasticsearch.query.ESBrowseDAO; import com.linkedin.metadata.search.elasticsearch.query.ESSearchDAO; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.search.elasticsearch.update.ESWriteDAO; import com.linkedin.metadata.search.ranker.SimpleRanker; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; import org.elasticsearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.cache.CacheManager; import org.springframework.cache.concurrent.ConcurrentMapCacheManager; -import org.testcontainers.elasticsearch.ElasticsearchContainer; -import org.testng.annotations.AfterClass; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; @@ -42,15 +44,18 @@ import javax.annotation.Nonnull; import java.util.Collections; -import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; -import static com.linkedin.metadata.ElasticSearchTestUtils.syncAfterWrite; +import static com.linkedin.metadata.ElasticSearchTestConfiguration.syncAfterWrite; import static org.testng.Assert.assertEquals; 
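+// The test now runs in a shared Spring TestNG context: the Elasticsearch container, REST client, bulk processor, and
+// index builder are @Autowired from ElasticSearchTestConfiguration instead of being built and torn down per test class.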
+@Import(ElasticSearchTestConfiguration.class) +public class SearchServiceTest extends AbstractTestNGSpringContextTests { -public class SearchServiceTest { - - private ElasticsearchContainer _elasticsearchContainer; + @Autowired private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; private EntityRegistry _entityRegistry; private IndexConvention _indexConvention; private SettingsBuilder _settingsBuilder; @@ -63,12 +68,8 @@ public class SearchServiceTest { @BeforeClass public void setup() { _entityRegistry = new SnapshotEntityRegistry(new Snapshot()); - _indexConvention = new IndexConventionImpl(null); - _elasticsearchContainer = ElasticTestUtils.getNewElasticsearchContainer(); + _indexConvention = new IndexConventionImpl("search_service_test"); _settingsBuilder = new SettingsBuilder(Collections.emptyList(), null); - checkContainerEngine(_elasticsearchContainer.getDockerClient()); - _elasticsearchContainer.start(); - _searchClient = ElasticTestUtils.buildRestClient(_elasticsearchContainer); _elasticSearchService = buildEntitySearchService(); _elasticSearchService.configure(); _cacheManager = new ConcurrentMapCacheManager(); @@ -99,18 +100,18 @@ private void resetSearchService() { @BeforeMethod public void wipe() throws Exception { _elasticSearchService.clear(); - syncAfterWrite(_searchClient); + syncAfterWrite(); } @Nonnull private ElasticSearchService buildEntitySearchService() { EntityIndexBuilders indexBuilders = - new EntityIndexBuilders(ElasticSearchServiceTest.getIndexBuilder(_searchClient), _entityRegistry, + new EntityIndexBuilders(_esIndexBuilder, _entityRegistry, _indexConvention, _settingsBuilder); ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, _searchClient, _indexConvention); ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, _searchClient, _indexConvention); ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, _searchClient, _indexConvention, - ElasticSearchServiceTest.getBulkProcessor(_searchClient)); + _bulkProcessor, 1); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, writeDAO); } @@ -119,11 +120,6 @@ private void clearCache() { resetSearchService(); } - @AfterClass - public void tearDown() { - _elasticsearchContainer.stop(); - } - @Test public void testSearchService() throws Exception { SearchResult searchResult = @@ -140,7 +136,7 @@ public void testSearchService() throws Exception { document.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride")); document.set("browsePaths", JsonNodeFactory.instance.textNode("/a/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document.toString(), urn.toString()); - syncAfterWrite(_searchClient); + syncAfterWrite(); searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", null, null, 0, 10, null); assertEquals(searchResult.getNumEntities().intValue(), 1); @@ -154,7 +150,7 @@ public void testSearchService() throws Exception { document2.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride2")); document2.set("browsePaths", JsonNodeFactory.instance.textNode("/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); - syncAfterWrite(_searchClient); + syncAfterWrite(); searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", null, null, 0, 10, null); assertEquals(searchResult.getNumEntities().intValue(), 1); @@ -163,7 +159,7 @@ public void testSearchService() throws 
Exception { _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); _elasticSearchService.deleteDocument(ENTITY_NAME, urn2.toString()); - syncAfterWrite(_searchClient); + syncAfterWrite(); searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", null, null, 0, 10, null); assertEquals(searchResult.getNumEntities().intValue(), 0); } @@ -227,7 +223,7 @@ public void testAdvancedSearchOr() throws Exception { document3.set("platform", JsonNodeFactory.instance.textNode("snowflake")); _elasticSearchService.upsertDocument(ENTITY_NAME, document3.toString(), urn3.toString()); - syncAfterWrite(_searchClient); + syncAfterWrite(); searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, null, 0, 10, null); assertEquals(searchResult.getNumEntities().intValue(), 2); @@ -296,7 +292,7 @@ public void testAdvancedSearchSoftDelete() throws Exception { document.set("removed", JsonNodeFactory.instance.booleanNode(false)); _elasticSearchService.upsertDocument(ENTITY_NAME, document3.toString(), urn3.toString()); - syncAfterWrite(_searchClient); + syncAfterWrite(); searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, null, 0, 10, null); assertEquals(searchResult.getNumEntities().intValue(), 1); @@ -359,7 +355,7 @@ public void testAdvancedSearchNegated() throws Exception { document.set("removed", JsonNodeFactory.instance.booleanNode(false)); _elasticSearchService.upsertDocument(ENTITY_NAME, document3.toString(), urn3.toString()); - syncAfterWrite(_searchClient); + syncAfterWrite(); searchResult = _searchService.searchAcrossEntities(ImmutableList.of(), "test", filterWithCondition, null, 0, 10, null); assertEquals(searchResult.getNumEntities().intValue(), 1); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java index 5c2f0e651734e..d2091602a04df 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchServiceTest.java @@ -5,7 +5,7 @@ import com.fasterxml.jackson.databind.node.ObjectNode; import com.linkedin.common.urn.TestEntityUrn; import com.linkedin.common.urn.Urn; -import com.linkedin.metadata.ElasticTestUtils; +import com.linkedin.metadata.ElasticSearchTestConfiguration; import com.linkedin.metadata.browse.BrowseResult; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; @@ -15,18 +15,15 @@ import com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder; import com.linkedin.metadata.search.elasticsearch.query.ESBrowseDAO; import com.linkedin.metadata.search.elasticsearch.query.ESSearchDAO; -import com.linkedin.metadata.search.elasticsearch.update.BulkListener; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.search.elasticsearch.update.ESWriteDAO; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; -import org.elasticsearch.action.bulk.BackoffPolicy; -import org.elasticsearch.action.bulk.BulkProcessor; -import org.elasticsearch.client.RequestOptions; import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.common.unit.TimeValue; -import 
org.testcontainers.elasticsearch.ElasticsearchContainer; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testcontainers.shaded.com.google.common.collect.ImmutableMap; -import org.testng.annotations.AfterClass; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; @@ -34,15 +31,19 @@ import javax.annotation.Nonnull; import java.util.Collections; -import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; -import static com.linkedin.metadata.ElasticSearchTestUtils.syncAfterWrite; +import static com.linkedin.metadata.ElasticSearchTestConfiguration.syncAfterWrite; import static org.testng.Assert.assertEquals; +@Import(ElasticSearchTestConfiguration.class) +public class ElasticSearchServiceTest extends AbstractTestNGSpringContextTests { -public class ElasticSearchServiceTest { - - private ElasticsearchContainer _elasticsearchContainer; + @Autowired private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + private EntityRegistry _entityRegistry; private IndexConvention _indexConvention; private SettingsBuilder _settingsBuilder; @@ -53,12 +54,8 @@ public class ElasticSearchServiceTest { @BeforeClass public void setup() { _entityRegistry = new SnapshotEntityRegistry(new Snapshot()); - _indexConvention = new IndexConventionImpl(null); - _elasticsearchContainer = ElasticTestUtils.getNewElasticsearchContainer(); + _indexConvention = new IndexConventionImpl("es_service_test"); _settingsBuilder = new SettingsBuilder(Collections.emptyList(), null); - checkContainerEngine(_elasticsearchContainer.getDockerClient()); - _elasticsearchContainer.start(); - _searchClient = ElasticTestUtils.buildRestClient(_elasticsearchContainer); _elasticSearchService = buildService(); _elasticSearchService.configure(); } @@ -66,39 +63,19 @@ public void setup() { @BeforeMethod public void wipe() throws Exception { _elasticSearchService.clear(); - syncAfterWrite(_searchClient); - } - - public static BulkProcessor getBulkProcessor(RestHighLevelClient searchClient) { - return BulkProcessor.builder((request, bulkListener) -> { - searchClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener); - }, BulkListener.getInstance()) - .setBulkActions(1) - .setFlushInterval(TimeValue.timeValueSeconds(1)) - .setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(1000), 1)) - .build(); - } - - public static ESIndexBuilder getIndexBuilder(RestHighLevelClient searchClient) { - return new ESIndexBuilder(searchClient, 1, 1, 3); } @Nonnull private ElasticSearchService buildService() { EntityIndexBuilders indexBuilders = - new EntityIndexBuilders(getIndexBuilder(_searchClient), _entityRegistry, _indexConvention, _settingsBuilder); + new EntityIndexBuilders(_esIndexBuilder, _entityRegistry, _indexConvention, _settingsBuilder); ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, _searchClient, _indexConvention); ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, _searchClient, _indexConvention); ESWriteDAO writeDAO = - new ESWriteDAO(_entityRegistry, _searchClient, _indexConvention, getBulkProcessor(_searchClient)); + new ESWriteDAO(_entityRegistry, _searchClient, _indexConvention, _bulkProcessor, 1); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, 
writeDAO); } - @AfterClass - public void tearDown() { - _elasticsearchContainer.stop(); - } - @Test public void testElasticSearchService() throws Exception { SearchResult searchResult = _elasticSearchService.search(ENTITY_NAME, "test", null, null, 0, 10); @@ -116,7 +93,7 @@ public void testElasticSearchService() throws Exception { document.set("browsePaths", JsonNodeFactory.instance.textNode("/a/b/c")); document.set("foreignKey", JsonNodeFactory.instance.textNode("urn:li:tag:Node.Value")); _elasticSearchService.upsertDocument(ENTITY_NAME, document.toString(), urn.toString()); - syncAfterWrite(_searchClient); + syncAfterWrite(); searchResult = _elasticSearchService.search(ENTITY_NAME, "test", null, null, 0, 10); assertEquals(searchResult.getNumEntities().intValue(), 1); @@ -141,7 +118,7 @@ public void testElasticSearchService() throws Exception { document2.set("textFieldOverride", JsonNodeFactory.instance.textNode("textFieldOverride2")); document2.set("browsePaths", JsonNodeFactory.instance.textNode("/b/c")); _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); - syncAfterWrite(_searchClient); + syncAfterWrite(); searchResult = _elasticSearchService.search(ENTITY_NAME, "test", null, null, 0, 10); assertEquals(searchResult.getNumEntities().intValue(), 1); @@ -159,7 +136,7 @@ public void testElasticSearchService() throws Exception { _elasticSearchService.deleteDocument(ENTITY_NAME, urn.toString()); _elasticSearchService.deleteDocument(ENTITY_NAME, urn2.toString()); - syncAfterWrite(_searchClient); + syncAfterWrite(); searchResult = _elasticSearchService.search(ENTITY_NAME, "test", null, null, 0, 10); assertEquals(searchResult.getNumEntities().intValue(), 0); browseResult = _elasticSearchService.browse(ENTITY_NAME, "", null, 0, 10); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilderTest.java new file mode 100644 index 0000000000000..407c8f7f16b40 --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/ESIndexBuilderTest.java @@ -0,0 +1,199 @@ +package com.linkedin.metadata.search.elasticsearch.indexbuilder; + +import com.google.common.collect.ImmutableMap; +import com.linkedin.metadata.ElasticSearchTestConfiguration; +import com.linkedin.metadata.systemmetadata.SystemMetadataMappingsBuilder; +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.action.admin.indices.alias.get.GetAliasesRequest; +import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest; +import org.elasticsearch.client.RestHighLevelClient; +import org.elasticsearch.client.indices.GetIndexRequest; +import org.elasticsearch.client.IndicesClient; +import org.elasticsearch.client.RequestOptions; +import org.elasticsearch.client.indices.GetIndexResponse; +import org.elasticsearch.cluster.metadata.AliasMetadata; +import org.elasticsearch.rest.RestStatus; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static 
org.testng.Assert.assertTrue; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotEquals; + +@Import(ElasticSearchTestConfiguration.class) +public class ESIndexBuilderTest extends AbstractTestNGSpringContextTests { + + @Autowired + private RestHighLevelClient _searchClient; + private static IndicesClient _indexClient; + private static final String TEST_INDEX_NAME = "esindex_builder_test"; + private static ESIndexBuilder testDefaultBuilder; + + + @BeforeClass + public void setup() { + _indexClient = _searchClient.indices(); + testDefaultBuilder = new ESIndexBuilder(_searchClient, 1, 0, 0, 0, Map.of(), false); + } + + @BeforeMethod + public static void wipe() throws Exception { + try { + _indexClient.getAlias(new GetAliasesRequest(TEST_INDEX_NAME), RequestOptions.DEFAULT) + .getAliases().keySet().forEach(index -> { + try { + _indexClient.delete(new DeleteIndexRequest(index), RequestOptions.DEFAULT); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + + _indexClient.delete(new DeleteIndexRequest(TEST_INDEX_NAME), RequestOptions.DEFAULT); + } catch (ElasticsearchException exception) { + if (exception.status() != RestStatus.NOT_FOUND) { + throw exception; + } + } + } + + public static GetIndexResponse getTestIndex() throws IOException { + return _indexClient.get(new GetIndexRequest(TEST_INDEX_NAME).includeDefaults(true), RequestOptions.DEFAULT); + } + + @Test + public void testESIndexBuilderCreation() throws Exception { + ESIndexBuilder customIndexBuilder = new ESIndexBuilder(_searchClient, 2, 0, 1, 0, Map.of(), false); + customIndexBuilder.buildIndex(TEST_INDEX_NAME, Map.of(), Map.of()); + GetIndexResponse resp = getTestIndex(); + + assertEquals("2", resp.getSetting(TEST_INDEX_NAME, "index.number_of_shards")); + assertEquals("0", resp.getSetting(TEST_INDEX_NAME, "index.number_of_replicas")); + assertEquals("0s", resp.getSetting(TEST_INDEX_NAME, "index.refresh_interval")); + } + + @Test + public void testMappingReindex() throws Exception { + // No mappings + testDefaultBuilder.buildIndex(TEST_INDEX_NAME, Map.of(), Map.of()); + String beforeCreationDate = getTestIndex().getSetting(TEST_INDEX_NAME, "index.creation_date"); + + // add new mappings + testDefaultBuilder.buildIndex(TEST_INDEX_NAME, SystemMetadataMappingsBuilder.getMappings(), Map.of()); + + String afterAddedMappingCreationDate = getTestIndex().getSetting(TEST_INDEX_NAME, "index.creation_date"); + assertEquals(beforeCreationDate, afterAddedMappingCreationDate, "Expected no reindex on *adding* mappings"); + + // change mappings + Map newProps = ((Map) SystemMetadataMappingsBuilder.getMappings().get("properties")) + .entrySet().stream() + .map(m -> !m.getKey().equals("urn") ? 
m + : Map.entry("urn", ImmutableMap.<String, Object>builder().put("type", "wildcard").build())) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + testDefaultBuilder.buildIndex(TEST_INDEX_NAME, Map.of("properties", newProps), Map.of()); + + assertTrue(Arrays.stream(getTestIndex().getIndices()).noneMatch(name -> name.equals(TEST_INDEX_NAME)), + "Expected original index to be replaced with alias"); + + Map.Entry<String, List<AliasMetadata>> newIndex = getTestIndex().getAliases().entrySet().stream() + .filter(e -> e.getValue().stream().anyMatch(aliasMeta -> aliasMeta.alias().equals(TEST_INDEX_NAME))) + .findFirst().get(); + String afterChangedMappingCreationDate = getTestIndex().getSetting(newIndex.getKey(), "index.creation_date"); + assertNotEquals(beforeCreationDate, afterChangedMappingCreationDate, "Expected reindex on *changing* mappings"); + } + + @Test + public void testSettingsNumberOfShardsReindex() throws Exception { + // Set test defaults + testDefaultBuilder.buildIndex(TEST_INDEX_NAME, Map.of(), Map.of()); + assertEquals("1", getTestIndex().getSetting(TEST_INDEX_NAME, "index.number_of_shards")); + String beforeCreationDate = getTestIndex().getSetting(TEST_INDEX_NAME, "index.creation_date"); + + String expectedShards = "5"; + ESIndexBuilder changedShardBuilder = new ESIndexBuilder(_searchClient, + Integer.parseInt(expectedShards), + testDefaultBuilder.getNumReplicas(), + testDefaultBuilder.getNumRetries(), + testDefaultBuilder.getRefreshIntervalSeconds(), + Map.of(), + true); + + // add new shard setting + changedShardBuilder.buildIndex(TEST_INDEX_NAME, Map.of(), Map.of()); + assertTrue(Arrays.stream(getTestIndex().getIndices()).noneMatch(name -> name.equals(TEST_INDEX_NAME)), + "Expected original index to be replaced with alias"); + + Map.Entry<String, List<AliasMetadata>> newIndex = getTestIndex().getAliases().entrySet().stream() + .filter(e -> e.getValue().stream().anyMatch(aliasMeta -> aliasMeta.alias().equals(TEST_INDEX_NAME))) + .findFirst().get(); + + String afterCreationDate = getTestIndex().getSetting(newIndex.getKey(), "index.creation_date"); + assertNotEquals(beforeCreationDate, afterCreationDate, "Expected reindex to result in different timestamp"); + assertEquals(expectedShards, getTestIndex().getSetting(newIndex.getKey(), "index.number_of_shards"), + "Expected number of shards: " + expectedShards); + } + + @Test + public void testSettingsNoReindex() throws Exception { + List<ESIndexBuilder> noReindexBuilders = List.of( + new ESIndexBuilder(_searchClient, + testDefaultBuilder.getNumShards(), + testDefaultBuilder.getNumReplicas() + 1, + testDefaultBuilder.getNumRetries(), + testDefaultBuilder.getRefreshIntervalSeconds(), + Map.of(), + true), + new ESIndexBuilder(_searchClient, + testDefaultBuilder.getNumShards(), + testDefaultBuilder.getNumReplicas(), + testDefaultBuilder.getNumRetries(), + testDefaultBuilder.getRefreshIntervalSeconds() + 10, + Map.of(), + true), + new ESIndexBuilder(_searchClient, + testDefaultBuilder.getNumShards() + 1, + testDefaultBuilder.getNumReplicas(), + testDefaultBuilder.getNumRetries(), + testDefaultBuilder.getRefreshIntervalSeconds(), + Map.of(), + false), + new ESIndexBuilder(_searchClient, + testDefaultBuilder.getNumShards(), + testDefaultBuilder.getNumReplicas() + 1, + testDefaultBuilder.getNumRetries(), + testDefaultBuilder.getRefreshIntervalSeconds(), + Map.of(), + false) + ); + + for (ESIndexBuilder builder : noReindexBuilders) { + // Set test defaults + testDefaultBuilder.buildIndex(TEST_INDEX_NAME, Map.of(), Map.of()); + assertEquals("0", getTestIndex().getSetting(TEST_INDEX_NAME, 
"index.number_of_replicas")); + assertEquals("0s", getTestIndex().getSetting(TEST_INDEX_NAME, "index.refresh_interval")); + String beforeCreationDate = getTestIndex().getSetting(TEST_INDEX_NAME, "index.creation_date"); + + // build index with builder + builder.buildIndex(TEST_INDEX_NAME, Map.of(), Map.of()); + assertTrue(Arrays.asList(getTestIndex().getIndices()).contains(TEST_INDEX_NAME), + "Expected original index to remain"); + String afterCreationDate = getTestIndex().getSetting(TEST_INDEX_NAME, "index.creation_date"); + + assertEquals(beforeCreationDate, afterCreationDate, "Expected no difference in index timestamp"); + assertEquals(String.valueOf(builder.getNumReplicas()), getTestIndex().getSetting(TEST_INDEX_NAME, "index.number_of_replicas")); + assertEquals(builder.getRefreshIntervalSeconds() + "s", getTestIndex().getSetting(TEST_INDEX_NAME, "index.refresh_interval")); + + wipe(); + } + } + +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAOTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAOTest.java index 1b2259626a9da..d8c28ec34d213 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAOTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAOTest.java @@ -30,7 +30,7 @@ public class ESBrowseDAOTest { @BeforeMethod public void setup() { _mockClient = mock(RestHighLevelClient.class); - _browseDAO = new ESBrowseDAO(new TestEntityRegistry(), _mockClient, new IndexConventionImpl(null)); + _browseDAO = new ESBrowseDAO(new TestEntityRegistry(), _mockClient, new IndexConventionImpl("es_browse_dao_test")); } public static Urn makeUrn(Object id) { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataServiceTest.java index 4b4401545b73b..0451664f1d3bf 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataServiceTest.java @@ -1,16 +1,18 @@ package com.linkedin.metadata.systemmetadata; -import com.linkedin.metadata.ElasticTestUtils; +import com.linkedin.metadata.ElasticSearchTestConfiguration; import com.linkedin.metadata.run.AspectRowSummary; import com.linkedin.metadata.run.IngestionRunSummary; -import com.linkedin.metadata.search.elasticsearch.ElasticSearchServiceTest; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; import com.linkedin.mxe.SystemMetadata; import org.elasticsearch.client.RestHighLevelClient; -import org.testcontainers.elasticsearch.ElasticsearchContainer; -import org.testng.annotations.AfterClass; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; @@ -18,26 +20,25 @@ import javax.annotation.Nonnull; import 
java.util.List; -import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; -import static com.linkedin.metadata.ElasticSearchTestUtils.syncAfterWrite; +import static com.linkedin.metadata.ElasticSearchTestConfiguration.syncAfterWrite; import static com.linkedin.metadata.systemmetadata.ElasticSearchSystemMetadataService.INDEX_NAME; import static org.testng.Assert.assertEquals; +@Import(ElasticSearchTestConfiguration.class) +public class ElasticSearchSystemMetadataServiceTest extends AbstractTestNGSpringContextTests { -public class ElasticSearchSystemMetadataServiceTest { - - private ElasticsearchContainer _elasticsearchContainer; + @Autowired private RestHighLevelClient _searchClient; - private final IndexConvention _indexConvention = new IndexConventionImpl(null); + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; + private final IndexConvention _indexConvention = new IndexConventionImpl("es_system_metadata_service_test"); private final String _indexName = _indexConvention.getIndexName(INDEX_NAME); private ElasticSearchSystemMetadataService _client; @BeforeClass public void setup() { - _elasticsearchContainer = ElasticTestUtils.getNewElasticsearchContainer(); - checkContainerEngine(_elasticsearchContainer.getDockerClient()); - _elasticsearchContainer.start(); - _searchClient = ElasticTestUtils.buildRestClient(_elasticsearchContainer); _client = buildService(); _client.configure(); } @@ -45,20 +46,12 @@ public void setup() { @BeforeMethod public void wipe() throws Exception { _client.clear(); - syncAfterWrite(_searchClient, _indexName); } @Nonnull private ElasticSearchSystemMetadataService buildService() { - ESSystemMetadataDAO dao = new ESSystemMetadataDAO(_searchClient, _indexConvention, - ElasticSearchServiceTest.getBulkProcessor(_searchClient)); - return new ElasticSearchSystemMetadataService(_searchClient, _indexConvention, dao, - ElasticSearchServiceTest.getIndexBuilder(_searchClient)); - } - - @AfterClass - public void tearDown() { - _elasticsearchContainer.stop(); + ESSystemMetadataDAO dao = new ESSystemMetadataDAO(_searchClient, _indexConvention, _bulkProcessor, 1); + return new ElasticSearchSystemMetadataService(_bulkProcessor, _indexConvention, dao, _esIndexBuilder); } @Test @@ -78,7 +71,7 @@ public void testListRuns() throws Exception { _client.insert(metadata2, "urn:li:chart:2", "chartKey"); _client.insert(metadata2, "urn:li:chart:2", "Ownership"); - syncAfterWrite(_searchClient, _indexName); + syncAfterWrite(); List runs = _client.listRuns(0, 20, false); @@ -107,7 +100,7 @@ public void testOverwriteRuns() throws Exception { _client.insert(metadata2, "urn:li:chart:2", "chartKey"); _client.insert(metadata2, "urn:li:chart:2", "Ownership"); - syncAfterWrite(_searchClient, _indexName); + syncAfterWrite(); List runs = _client.listRuns(0, 20, false); @@ -136,7 +129,7 @@ public void testFindByRunId() throws Exception { _client.insert(metadata2, "urn:li:chart:2", "chartKey"); _client.insert(metadata2, "urn:li:chart:2", "Ownership"); - syncAfterWrite(_searchClient, _indexName); + syncAfterWrite(); List rows = _client.findByRunId("abc-456", false, 0, ESUtils.MAX_RESULT_SIZE); @@ -164,11 +157,11 @@ public void testDelete() throws Exception { _client.insert(metadata2, "urn:li:chart:2", "chartKey"); _client.insert(metadata2, "urn:li:chart:2", "Ownership"); - syncAfterWrite(_searchClient, _indexName); + syncAfterWrite(); _client.deleteUrn("urn:li:chart:1"); - syncAfterWrite(_searchClient, _indexName); + 
syncAfterWrite(); List rows = _client.findByRunId("abc-456", false, 0, ESUtils.MAX_RESULT_SIZE); @@ -180,7 +173,7 @@ public void testDelete() throws Exception { public void testInsertNullData() throws Exception { _client.insert(null, "urn:li:chart:1", "chartKey"); - syncAfterWrite(_searchClient, _indexName); + syncAfterWrite(); List runs = _client.listRuns(0, 20, false); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java index 3c175834c9946..94a0381f9ca28 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java @@ -15,7 +15,7 @@ import com.linkedin.data.template.StringArrayArray; import com.linkedin.data.template.StringMap; import com.linkedin.data.template.StringMapArray; -import com.linkedin.metadata.ElasticTestUtils; +import com.linkedin.metadata.ElasticSearchTestConfiguration; import com.linkedin.metadata.aspect.EnvelopedAspect; import com.linkedin.metadata.models.AspectSpec; import com.linkedin.metadata.models.DataSchemaFactory; @@ -26,7 +26,8 @@ import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.query.filter.CriterionArray; import com.linkedin.metadata.query.filter.Filter; -import com.linkedin.metadata.search.elasticsearch.ElasticSearchServiceTest; +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.search.utils.QueryUtils; import com.linkedin.metadata.timeseries.elastic.indexbuilder.TimeseriesAspectIndexBuilders; import com.linkedin.metadata.timeseries.transformer.TimeseriesAspectTransformer; @@ -42,8 +43,9 @@ import com.linkedin.timeseries.GroupingBucketType; import com.linkedin.timeseries.TimeWindowSize; import org.elasticsearch.client.RestHighLevelClient; -import org.testcontainers.elasticsearch.ElasticsearchContainer; -import org.testng.annotations.AfterClass; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Import; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -55,14 +57,13 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static com.linkedin.metadata.DockerTestUtils.checkContainerEngine; -import static com.linkedin.metadata.ElasticSearchTestUtils.syncAfterWrite; +import static com.linkedin.metadata.ElasticSearchTestConfiguration.syncAfterWrite; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertNotNull; import static org.testng.Assert.fail; - -public class ElasticSearchTimeseriesAspectServiceTest { +@Import(ElasticSearchTestConfiguration.class) +public class ElasticSearchTimeseriesAspectServiceTest extends AbstractTestNGSpringContextTests { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final String ENTITY_NAME = "testEntity"; @@ -75,8 +76,12 @@ public class ElasticSearchTimeseriesAspectServiceTest { private static final String ES_FILED_TIMESTAMP = "timestampMillis"; private static final String ES_FILED_STAT = "stat"; - private ElasticsearchContainer 
_elasticsearchContainer; + @Autowired private RestHighLevelClient _searchClient; + @Autowired + private ESBulkProcessor _bulkProcessor; + @Autowired + private ESIndexBuilder _esIndexBuilder; private EntityRegistry _entityRegistry; private IndexConvention _indexConvention; private ElasticSearchTimeseriesAspectService _elasticSearchTimeseriesAspectService; @@ -93,11 +98,7 @@ public class ElasticSearchTimeseriesAspectServiceTest { public void setup() { _entityRegistry = new ConfigEntityRegistry(new DataSchemaFactory("com.datahub.test"), TestEntityProfile.class.getClassLoader().getResourceAsStream("test-entity-registry.yml")); - _indexConvention = new IndexConventionImpl(null); - _elasticsearchContainer = ElasticTestUtils.getNewElasticsearchContainer(); - checkContainerEngine(_elasticsearchContainer.getDockerClient()); - _elasticsearchContainer.start(); - _searchClient = ElasticTestUtils.buildRestClient(_elasticsearchContainer); + _indexConvention = new IndexConventionImpl("es_timeseries_aspect_service_test"); _elasticSearchTimeseriesAspectService = buildService(); _elasticSearchTimeseriesAspectService.configure(); EntitySpec entitySpec = _entityRegistry.getEntitySpec(ENTITY_NAME); @@ -107,13 +108,8 @@ public void setup() { @Nonnull private ElasticSearchTimeseriesAspectService buildService() { return new ElasticSearchTimeseriesAspectService(_searchClient, _indexConvention, - new TimeseriesAspectIndexBuilders(ElasticSearchServiceTest.getIndexBuilder(_searchClient), _entityRegistry, - _indexConvention), _entityRegistry, ElasticSearchServiceTest.getBulkProcessor(_searchClient)); - } - - @AfterClass - public void tearDown() { - _elasticsearchContainer.stop(); + new TimeseriesAspectIndexBuilders(_esIndexBuilder, _entityRegistry, + _indexConvention), _entityRegistry, _bulkProcessor, 1); } /* @@ -185,7 +181,7 @@ public void testUpsertProfiles() throws Exception { } }); - syncAfterWrite(_searchClient); + syncAfterWrite(); } @Test(groups = "upsertUniqueMessageId") @@ -211,7 +207,7 @@ public void testUpsertProfilesWithUniqueMessageIds() throws Exception { } }); - syncAfterWrite(_searchClient); + syncAfterWrite(); List resultAspects = _elasticSearchTimeseriesAspectService.getAspectValues(urn, ENTITY_NAME, ASPECT_NAME, null, null, diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticsearchConnector.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticsearchConnector.java index 496333cd2d95a..322f8a5c5c4d0 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticsearchConnector.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticsearchConnector.java @@ -2,60 +2,24 @@ import com.linkedin.events.metadata.ChangeType; import javax.annotation.Nonnull; + +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import lombok.extern.slf4j.Slf4j; import org.elasticsearch.action.DocWriteRequest; -import org.elasticsearch.action.bulk.BackoffPolicy; -import org.elasticsearch.action.bulk.BulkProcessor; -import org.elasticsearch.action.bulk.BulkRequest; -import org.elasticsearch.action.bulk.BulkResponse; import org.elasticsearch.action.delete.DeleteRequest; import org.elasticsearch.action.index.IndexRequest; import org.elasticsearch.action.update.UpdateRequest; -import org.elasticsearch.client.RequestOptions; -import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.common.unit.TimeValue; @Slf4j 
public class ElasticsearchConnector { - private BulkProcessor _bulkProcessor; - private static final int DEFAULT_NUMBER_OF_RETRIES = 3; // TODO: Test and also add these into config - private static final long DEFAULT_RETRY_INTERVAL = 1L; - - public ElasticsearchConnector(RestHighLevelClient elasticSearchRestClient, Integer bulkRequestsLimit, - Integer bulkFlushPeriod) { - initBulkProcessor(elasticSearchRestClient, bulkRequestsLimit, bulkFlushPeriod); - } - - private void initBulkProcessor(RestHighLevelClient elasticSearchRestClient, Integer bulkRequestsLimit, - Integer bulkFlushPeriod) { - BulkProcessor.Listener listener = new BulkProcessor.Listener() { - @Override - public void beforeBulk(long executionId, BulkRequest request) { - - } - - @Override - public void afterBulk(long executionId, BulkRequest request, BulkResponse response) { - log.info("Successfully feeded bulk request. Number of events: " + response.getItems().length + " Took time ms: " - + response.getIngestTookInMillis()); - } - - @Override - public void afterBulk(long executionId, BulkRequest request, Throwable failure) { - log.error("Error feeding bulk request. No retries left", failure); - } - }; + private final ESBulkProcessor _bulkProcessor; + private final int _numRetries; - _bulkProcessor = BulkProcessor.builder( - (request, bulkListener) -> elasticSearchRestClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener), - listener) - .setBulkActions(bulkRequestsLimit) - .setFlushInterval(TimeValue.timeValueSeconds(bulkFlushPeriod)) - .setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(DEFAULT_RETRY_INTERVAL), - DEFAULT_NUMBER_OF_RETRIES)) - .build(); + public ElasticsearchConnector(ESBulkProcessor bulkProcessor, int numRetries) { + _bulkProcessor = bulkProcessor; + _numRetries = numRetries; } public void feedElasticEvent(@Nonnull ElasticEvent event) { @@ -81,10 +45,11 @@ private static DeleteRequest createDeleteRequest(@Nonnull ElasticEvent event) { } @Nonnull - private static UpdateRequest createUpsertRequest(@Nonnull ElasticEvent event) { + private UpdateRequest createUpsertRequest(@Nonnull ElasticEvent event) { final IndexRequest indexRequest = new IndexRequest(event.getIndex()).id(event.getId()).source(event.buildJson()); return new UpdateRequest(event.getIndex(), event.getId()).doc(event.buildJson()) .detectNoop(false) + .retryOnConflict(_numRetries) .upsert(indexRequest); } } diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticsearchConnectorFactory.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticsearchConnectorFactory.java index a83f754cb91e3..a3672975e42e6 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticsearchConnectorFactory.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/elasticsearch/ElasticsearchConnectorFactory.java @@ -1,7 +1,9 @@ package com.linkedin.metadata.kafka.elasticsearch; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import lombok.extern.slf4j.Slf4j; -import org.elasticsearch.client.RestHighLevelClient; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; @@ -13,17 +15,17 @@ @Slf4j @Configuration public class 
ElasticsearchConnectorFactory { + @Autowired + @Qualifier("elasticSearchBulkProcessor") + private ESBulkProcessor bulkProcessor; - @Value("${ES_BULK_REQUESTS_LIMIT:1}") - private Integer bulkRequestsLimit; - - @Value("${ES_BULK_FLUSH_PERIOD:1}") - private Integer bulkFlushPeriod; + @Value("${elasticsearch.bulkProcessor.numRetries}") + private Integer numRetries; @Bean(name = "elasticsearchConnector") @Nonnull - public ElasticsearchConnector createInstance(@Nonnull RestHighLevelClient elasticSearchRestHighLevelClient) { - return new ElasticsearchConnector(elasticSearchRestHighLevelClient, bulkRequestsLimit, bulkFlushPeriod); + public ElasticsearchConnector createInstance() { + return new ElasticsearchConnector(bulkProcessor, numRetries); } } \ No newline at end of file diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHook.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHook.java index b02a0c8948343..35f41d7282341 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHook.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHook.java @@ -37,6 +37,7 @@ import com.linkedin.metadata.timeseries.transformer.TimeseriesAspectTransformer; import com.linkedin.metadata.utils.EntityKeyUtils; import com.linkedin.metadata.utils.GenericRecordUtils; +import com.linkedin.mxe.GenericAspect; import com.linkedin.mxe.MetadataChangeLog; import com.linkedin.mxe.SystemMetadata; import com.linkedin.util.Pair; @@ -45,6 +46,7 @@ import java.net.URLEncoder; import java.util.ArrayList; import java.util.HashMap; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -56,6 +58,7 @@ import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Import; import org.springframework.stereotype.Component; @@ -75,6 +78,9 @@ public class UpdateIndicesHook implements MetadataChangeLogHook { private final EntityRegistry _entityRegistry; private final SearchDocumentTransformer _searchDocumentTransformer; + @Value("${featureFlags.graphServiceDiffModeEnabled:false}") + private boolean _diffMode; + public static final String DOWNSTREAM_OF = "DownstreamOf"; private static final Set VALID_CHANGE_TYPES = Stream.of( @@ -129,13 +135,24 @@ public void invoke(@Nonnull MetadataChangeLog event) { RecordTemplate aspect = GenericRecordUtils.deserializeAspect(event.getAspect().getValue(), event.getAspect().getContentType(), aspectSpec); + GenericAspect previousAspectValue = event.getPreviousAspectValue(); + RecordTemplate previousAspect = null; + if (previousAspectValue != null) { + previousAspect = GenericRecordUtils.deserializeAspect(previousAspectValue.getValue(), + previousAspectValue.getContentType(), aspectSpec); + } if (aspectSpec.isTimeseries()) { updateTimeseriesFields(event.getEntityType(), event.getAspectName(), urn, aspect, aspectSpec, event.getSystemMetadata()); } else { - updateSearchService(entitySpec.getName(), urn, aspectSpec, aspect, event.hasSystemMetadata() ? event.getSystemMetadata().getRunId() : null); - updateGraphService(urn, aspectSpec, aspect); + updateSearchService(entitySpec.getName(), urn, aspectSpec, aspect, + event.hasSystemMetadata() ? 
event.getSystemMetadata().getRunId() : null); updateSystemMetadata(event.getSystemMetadata(), urn, aspectSpec, aspect); + if (_diffMode) { + updateGraphServiceDiff(urn, aspectSpec, previousAspect, aspect); + } else { + updateGraphService(urn, aspectSpec, aspect); + } } } else if (event.getChangeType() == ChangeType.DELETE) { if (!event.hasAspectName() || !event.hasPreviousAspectValue()) { @@ -162,9 +179,9 @@ public void invoke(@Nonnull MetadataChangeLog event) { } private void updateFineGrainedEdgesAndRelationships( - RecordTemplate aspect, - List<Edge> edgesToAdd, - HashMap<Urn, Set<String>> urnToRelationshipTypesBeingAdded + RecordTemplate aspect, + List<Edge> edgesToAdd, + HashMap<Urn, Set<String>> urnToRelationshipTypesBeingAdded ) { UpstreamLineage upstreamLineage = new UpstreamLineage(aspect.data()); if (upstreamLineage.getFineGrainedLineages() != null) { @@ -193,10 +210,10 @@ private Urn generateSchemaFieldUrn(@Nonnull final String resourceUrn, @Nonnull f } private void updateInputFieldEdgesAndRelationships( - @Nonnull final Urn urn, - @Nonnull final InputFields inputFields, - @Nonnull final List<Edge> edgesToAdd, - @Nonnull final HashMap<Urn, Set<String>> urnToRelationshipTypesBeingAdded + @Nonnull final Urn urn, + @Nonnull final InputFields inputFields, + @Nonnull final List<Edge> edgesToAdd, + @Nonnull final HashMap<Urn, Set<String>> urnToRelationshipTypesBeingAdded ) { if (inputFields.hasFields()) { for (final InputField field : inputFields.getFields()) { @@ -211,7 +228,7 @@ private void updateInputFieldEdgesAndRelationships( } } - private Pair<List<Edge>, HashMap<Urn, Set<String>>> getEdgesAndRelationshipTypesFromAspect(Urn urn, AspectSpec aspectSpec, RecordTemplate aspect) { + private Pair<List<Edge>, HashMap<Urn, Set<String>>> getEdgesAndRelationshipTypesFromAspect(Urn urn, AspectSpec aspectSpec, @Nonnull RecordTemplate aspect) { final List<Edge> edgesToAdd = new ArrayList<>(); final HashMap<Urn, Set<String>> urnToRelationshipTypesBeingAdded = new HashMap<>(); @@ -260,7 +277,43 @@ private void updateGraphService(Urn urn, AspectSpec aspectSpec, RecordTemplate a _graphService.removeEdgesFromNode(entry.getKey(), new ArrayList<>(entry.getValue()), newRelationshipFilter(new Filter().setOr(new ConjunctiveCriterionArray()), RelationshipDirection.OUTGOING)); } - edgesToAdd.forEach(edge -> _graphService.addEdge(edge)); + edgesToAdd.forEach(_graphService::addEdge); + } + } + + private void updateGraphServiceDiff(Urn urn, AspectSpec aspectSpec, @Nullable RecordTemplate oldAspect, @Nonnull RecordTemplate newAspect) { + Pair<List<Edge>, HashMap<Urn, Set<String>>> oldEdgeAndRelationTypes = null; + if (oldAspect != null) { + oldEdgeAndRelationTypes = getEdgesAndRelationshipTypesFromAspect(urn, aspectSpec, oldAspect); + } + + final List<Edge> oldEdges = oldEdgeAndRelationTypes != null ? 
oldEdgeAndRelationTypes.getFirst() : Collections.emptyList(); + final Set<Edge> oldEdgeSet = new HashSet<>(oldEdges); + + Pair<List<Edge>, HashMap<Urn, Set<String>>> newEdgeAndRelationTypes = + getEdgesAndRelationshipTypesFromAspect(urn, aspectSpec, newAspect); + + final List<Edge> newEdges = newEdgeAndRelationTypes.getFirst(); + final Set<Edge> newEdgeSet = new HashSet<>(newEdges); + + List<Edge> additiveDifference = newEdges.stream() + .filter(edge -> !oldEdgeSet.contains(edge)) + .collect(Collectors.toList()); + + List<Edge> subtractiveDifference = oldEdges.stream() + .filter(edge -> !newEdgeSet.contains(edge)) + .collect(Collectors.toList()); + + // Add new edges + if (additiveDifference.size() > 0) { + log.debug("Adding edges: {}", additiveDifference); + additiveDifference.forEach(_graphService::addEdge); + } + + // Remove any old edges that no longer exist + if (subtractiveDifference.size() > 0) { + log.debug("Removing edges: {}", subtractiveDifference); + subtractiveDifference.forEach(_graphService::removeEdge); } }
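Note: the diff-mode write path added above is, at its core, two set differences over the edge lists derived from the previous and the new aspect. A minimal standalone sketch of that idea, with plain strings standing in for DataHub's Edge objects (the names here are illustrative, not the project's API):

import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class GraphDiffSketch {
  public static void main(String[] args) {
    // Edges produced from the previous and the incoming aspect value.
    List<String> oldEdges = List.of("a->b", "a->c");
    List<String> newEdges = List.of("a->b", "a->d");
    Set<String> oldEdgeSet = new HashSet<>(oldEdges);
    Set<String> newEdgeSet = new HashSet<>(newEdges);

    // Additive difference: only these edges are written.
    List<String> toAdd = newEdges.stream().filter(e -> !oldEdgeSet.contains(e)).collect(Collectors.toList());
    // Subtractive difference: only these edges are deleted.
    List<String> toRemove = oldEdges.stream().filter(e -> !newEdgeSet.contains(e)).collect(Collectors.toList());

    System.out.println("add=" + toAdd + " remove=" + toRemove); // add=[a->d] remove=[a->c]
  }
}

diff --git a/metadata-service/factories/build.gradle b/metadata-service/factories/build.gradle index 6ad9fc6b89b16..1a82f4d52f210 100644 --- a/metadata-service/factories/build.gradle +++ b/metadata-service/factories/build.gradle @@ -30,6 +30,7 @@ dependencies { compile spec.product.pegasus.restliSpringBridge + testImplementation externalDependency.springBootTest testCompile externalDependency.mockito testCompile externalDependency.testng diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java index 12ede1ef940b5..403f27a1367e4 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java @@ -33,9 +33,8 @@ public class ElasticSearchGraphServiceFactory { @Nonnull protected ElasticSearchGraphService getInstance() { LineageRegistry lineageRegistry = new LineageRegistry(entityRegistry); - return new ElasticSearchGraphService(lineageRegistry, components.getSearchClient(), components.getIndexConvention(), - new ESGraphWriteDAO(components.getSearchClient(), components.getIndexConvention(), - components.getBulkProcessor()), + return new ElasticSearchGraphService(lineageRegistry, components.getBulkProcessor(), components.getIndexConvention(), + new ESGraphWriteDAO(components.getIndexConvention(), components.getBulkProcessor(), components.getNumRetries()), new ESGraphQueryDAO(components.getSearchClient(), lineageRegistry, components.getIndexConvention()), components.getIndexBuilder()); } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java index 25afaef5e8eb7..89f196b056ee0 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java @@ -24,8 +24,8 @@ public class ElasticSearchSystemMetadataServiceFactory { @Bean(name = "elasticSearchSystemMetadataService") @Nonnull protected ElasticSearchSystemMetadataService getInstance() { - return new 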
ElasticSearchSystemMetadataService(components.getSearchClient(), components.getIndexConvention(), + return new ElasticSearchSystemMetadataService(components.getBulkProcessor(), components.getIndexConvention(), new ESSystemMetadataDAO(components.getSearchClient(), components.getIndexConvention(), - components.getBulkProcessor()), components.getIndexBuilder()); + components.getBulkProcessor(), components.getNumRetries()), components.getIndexBuilder()); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java index 9c84b74363966..eeb32ae1ddbf9 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/BaseElasticSearchComponentsFactory.java @@ -4,10 +4,10 @@ import com.linkedin.gms.factory.common.RestHighLevelClientFactory; import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import javax.annotation.Nonnull; -import lombok.Value; -import org.elasticsearch.action.bulk.BulkProcessor; +import org.springframework.beans.factory.annotation.Value; import org.elasticsearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; @@ -25,14 +25,18 @@ ElasticSearchIndexBuilderFactory.class}) @PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class) public class BaseElasticSearchComponentsFactory { - @Value + @lombok.Value public static class BaseElasticSearchComponents { RestHighLevelClient searchClient; IndexConvention indexConvention; - BulkProcessor bulkProcessor; + ESBulkProcessor bulkProcessor; ESIndexBuilder indexBuilder; + int numRetries; } + @Value("${elasticsearch.bulkProcessor.numRetries}") + private Integer numRetries; + @Autowired @Qualifier("elasticSearchRestHighLevelClient") private RestHighLevelClient searchClient; @@ -43,7 +47,7 @@ public static class BaseElasticSearchComponents { @Autowired @Qualifier("elasticSearchBulkProcessor") - private BulkProcessor bulkProcessor; + private ESBulkProcessor bulkProcessor; @Autowired @Qualifier("elasticSearchIndexBuilder") @@ -52,6 +56,6 @@ public static class BaseElasticSearchComponents { @Bean(name = "baseElasticSearchComponents") @Nonnull protected BaseElasticSearchComponents getInstance() { - return new BaseElasticSearchComponents(searchClient, indexConvention, bulkProcessor, indexBuilder); + return new BaseElasticSearchComponents(searchClient, indexConvention, bulkProcessor, indexBuilder, numRetries); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java index 0a7877acce8cf..60bb89cf3c589 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactory.java @@ -2,13 +2,12 @@ import 
com.linkedin.gms.factory.common.RestHighLevelClientFactory; import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; -import com.linkedin.metadata.search.elasticsearch.update.BulkListener; import javax.annotation.Nonnull; -import org.elasticsearch.action.bulk.BackoffPolicy; -import org.elasticsearch.action.bulk.BulkProcessor; -import org.elasticsearch.client.RequestOptions; + +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import lombok.extern.slf4j.Slf4j; +import org.elasticsearch.action.support.WriteRequest; import org.elasticsearch.client.RestHighLevelClient; -import org.elasticsearch.common.unit.TimeValue; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.beans.factory.annotation.Value; @@ -18,6 +17,7 @@ import org.springframework.context.annotation.PropertySource; +@Slf4j @Configuration @Import({RestHighLevelClientFactory.class}) @PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class) @@ -38,15 +38,22 @@ public class ElasticSearchBulkProcessorFactory { @Value("${elasticsearch.bulkProcessor.retryInterval}") private Long retryInterval; + @Value("#{new Boolean('${elasticsearch.bulkProcessor.async}')}") + private boolean async; + + @Value("${elasticsearch.bulkProcessor.refreshPolicy}") + private String refreshPolicy; + @Bean(name = "elasticSearchBulkProcessor") @Nonnull - protected BulkProcessor getInstance() { - return BulkProcessor.builder((request, bulkListener) -> { - searchClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener); - }, BulkListener.getInstance()) - .setBulkActions(bulkRequestsLimit) - .setFlushInterval(TimeValue.timeValueSeconds(bulkFlushPeriod)) - .setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(retryInterval), numRetries)) - .build(); + protected ESBulkProcessor getInstance() { + return ESBulkProcessor.builder(searchClient) + .async(async) + .bulkFlushPeriod(bulkFlushPeriod) + .bulkRequestsLimit(bulkRequestsLimit) + .retryInterval(retryInterval) + .numRetries(numRetries) + .writeRequestRefreshPolicy(WriteRequest.RefreshPolicy.valueOf(refreshPolicy)) + .build(); } }
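Note: the refreshPolicy string above is parsed into the stock Elasticsearch WriteRequest.RefreshPolicy enum. A short hedged sketch of the trade-off between its values, assuming only the standard client enum:

import org.elasticsearch.action.support.WriteRequest;

public class RefreshPolicySketch {
  public static void main(String[] args) {
    // NONE: bulk calls return without forcing a refresh; writes become searchable
    // on the next periodic index refresh (the production default above).
    // IMMEDIATE: force a refresh after each request (costly; test-only territory).
    // WAIT_UNTIL: block until a refresh has made the writes visible, which is what
    // ES_BULK_REFRESH_POLICY=WAIT_UNTIL selects in the quickstart/test environments.
    WriteRequest.RefreshPolicy policy = WriteRequest.RefreshPolicy.valueOf("WAIT_UNTIL");
    System.out.println(policy.name()); // WAIT_UNTIL
  }
}

diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java index d43603b328c17..224f865a11482 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactory.java @@ -1,9 +1,15 @@ package com.linkedin.gms.factory.search; +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; +import com.linkedin.gms.factory.common.IndexConventionFactory; import com.linkedin.gms.factory.common.RestHighLevelClientFactory; import com.linkedin.gms.factory.spring.YamlPropertySourceFactory; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import org.elasticsearch.client.RestHighLevelClient; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; @@ -13,11 +19,19 @@ import org.springframework.context.annotation.Import; import 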
org.springframework.context.annotation.PropertySource; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static com.linkedin.gms.factory.common.IndexConventionFactory.INDEX_CONVENTION_BEAN; + @Configuration -@Import({RestHighLevelClientFactory.class}) +@Import({RestHighLevelClientFactory.class, IndexConventionFactory.class}) @PropertySource(value = "classpath:/application.yml", factory = YamlPropertySourceFactory.class) public class ElasticSearchIndexBuilderFactory { + @Autowired @Qualifier("elasticSearchRestHighLevelClient") private RestHighLevelClient searchClient; @@ -31,9 +45,44 @@ public class ElasticSearchIndexBuilderFactory { @Value("${elasticsearch.index.numRetries}") private Integer numRetries; + @Value("${elasticsearch.index.refreshIntervalSeconds}") + private Integer refreshIntervalSeconds; + + @Value("${elasticsearch.index.settingsOverrides}") + private String indexSettingOverrides; + + @Value("${elasticsearch.index.entitySettingsOverrides}") + private String entityIndexSettingOverrides; + + @Value("#{new Boolean('${elasticsearch.index.enableSettingsReindex}')}") + private boolean enableSettingsReindex; + + @Bean(name = "elasticSearchIndexSettingsOverrides") + @Nonnull + protected Map<String, Map<String, String>> getIndexSettingsOverrides( + @Qualifier(INDEX_CONVENTION_BEAN) IndexConvention indexConvention) { + + return Stream.concat( + parseIndexSettingsMap(indexSettingOverrides).entrySet().stream() + .map(e -> Map.entry(indexConvention.getIndexName(e.getKey()), e.getValue())), + parseIndexSettingsMap(entityIndexSettingOverrides).entrySet().stream() + .map(e -> Map.entry(indexConvention.getEntityIndexName(e.getKey()), e.getValue()))) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + @Bean(name = "elasticSearchIndexBuilder") @Nonnull - protected ESIndexBuilder getInstance() { - return new ESIndexBuilder(searchClient, numShards, numReplicas, numRetries); + protected ESIndexBuilder getInstance( + @Qualifier("elasticSearchIndexSettingsOverrides") Map<String, Map<String, String>> overrides) { + return new ESIndexBuilder(searchClient, numShards, numReplicas, numRetries, refreshIntervalSeconds, overrides, + enableSettingsReindex); + } + + @Nonnull + private static Map<String, Map<String, String>> parseIndexSettingsMap(@Nullable String json) { + Optional<Map<String, Map<String, String>>> parseOpt = Optional.ofNullable( + new Gson().fromJson(json, + new TypeToken<Map<String, Map<String, String>>>() { }.getType())); + return parseOpt.orElse(Map.of()); } } \ No newline at end of file
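Note: settingsOverrides and entitySettingsOverrides are expected to hold a JSON map from index name to settings. A small sketch of the parse that parseIndexSettingsMap performs, using Gson the same way (the index name is made up):

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import java.util.Map;

public class SettingsOverridesSketch {
  public static void main(String[] args) {
    // e.g. ELASTICSEARCH_INDEX_BUILDER_SETTINGS_OVERRIDES='{"my_index":{"number_of_shards":"10"}}'
    String json = "{\"my_index\":{\"number_of_shards\":\"10\"}}";
    Map<String, Map<String, String>> parsed = new Gson().fromJson(json,
        new TypeToken<Map<String, Map<String, String>>>() { }.getType());
    System.out.println(parsed.get("my_index").get("number_of_shards")); // 10
  }
}

diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchServiceFactory.java index 551085fd7e363..5c6f8a0476e61 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/search/ElasticSearchServiceFactory.java @@ -44,6 +44,6 @@ protected ElasticSearchService getInstance() { settingsBuilder), esSearchDAO, new ESBrowseDAO(entityRegistry, components.getSearchClient(), components.getIndexConvention()), new ESWriteDAO(entityRegistry, components.getSearchClient(), components.getIndexConvention(), - components.getBulkProcessor())); + components.getBulkProcessor(), components.getNumRetries())); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeseries/ElasticSearchTimeseriesAspectServiceFactory.java 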
b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeseries/ElasticSearchTimeseriesAspectServiceFactory.java index 06d9cf951025e..717adf7d559b7 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeseries/ElasticSearchTimeseriesAspectServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/timeseries/ElasticSearchTimeseriesAspectServiceFactory.java @@ -32,6 +32,6 @@ public class ElasticSearchTimeseriesAspectServiceFactory { protected ElasticSearchTimeseriesAspectService getInstance() { return new ElasticSearchTimeseriesAspectService(components.getSearchClient(), components.getIndexConvention(), new TimeseriesAspectIndexBuilders(components.getIndexBuilder(), entityRegistry, - components.getIndexConvention()), entityRegistry, components.getBulkProcessor()); + components.getIndexConvention()), entityRegistry, components.getBulkProcessor(), components.getNumRetries()); } } \ No newline at end of file diff --git a/metadata-service/factories/src/main/resources/application.yml b/metadata-service/factories/src/main/resources/application.yml index a54a16d59d408..30dfe3aaa8f25 100644 --- a/metadata-service/factories/src/main/resources/application.yml +++ b/metadata-service/factories/src/main/resources/application.yml @@ -141,18 +141,24 @@ elasticsearch: keyStorePassword: ${ELASTICSEARCH_SSL_KEYSTORE_PASSWORD:#{null}} keyPassword: ${ELASTICSEARCH_SSL_KEY_PASSWORD:#{null}} bulkProcessor: + async: ${ES_BULK_ASYNC:true} requestsLimit: ${ES_BULK_REQUESTS_LIMIT:1000} flushPeriod: ${ES_BULK_FLUSH_PERIOD:1} numRetries: ${ES_BULK_NUM_RETRIES:3} retryInterval: ${ES_BULK_RETRY_INTERVAL:1} + refreshPolicy: ${ES_BULK_REFRESH_POLICY:NONE} index: prefix: ${INDEX_PREFIX:} numShards: ${ELASTICSEARCH_NUM_SHARDS_PER_INDEX:1} numReplicas: ${ELASTICSEARCH_NUM_REPLICAS_PER_INDEX:1} numRetries: ${ELASTICSEARCH_INDEX_BUILDER_NUM_RETRIES:3} + refreshIntervalSeconds: ${ELASTICSEARCH_INDEX_BUILDER_REFRESH_INTERVAL_SECONDS:1} # increase to 30 if the expected indexing rate is greater than 100/s maxArrayLength: ${SEARCH_DOCUMENT_MAX_ARRAY_LENGTH:1000} maxObjectKeys: ${SEARCH_DOCUMENT_MAX_OBJECT_KEYS:1000} mainTokenizer: ${ELASTICSEARCH_MAIN_TOKENIZER:#{null}} + enableSettingsReindex: ${ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX:false} + settingsOverrides: ${ELASTICSEARCH_INDEX_BUILDER_SETTINGS_OVERRIDES:#{null}} + entitySettingsOverrides: ${ELASTICSEARCH_INDEX_BUILDER_ENTITY_SETTINGS_OVERRIDES:#{null}} # TODO: Kafka topic convention kafka: @@ -206,6 +212,7 @@ bootstrap: featureFlags: showSimplifiedHomepageByDefault: ${SHOW_SIMPLIFIED_HOMEPAGE_BY_DEFAULT:false} # shows a simplified homepage with just datasets, charts and dashboards by default to users. 
This can be configured in user settings. lineageSearchCacheEnabled: ${LINEAGE_SEARCH_CACHE_ENABLED:false} # Enables in-memory cache for searchAcrossLineage query, disabled by default to prevent unexpected update delays + graphServiceDiffModeEnabled: ${GRAPH_SERVICE_DIFF_MODE_ENABLED:true} # Enables diff mode for graph writes: uses a different code path that produces a diff from the previous to the next aspect and writes only the changed relationships, instead of wholesale deleting edges and re-adding them entityChangeEvents: enabled: ${ENABLE_ENTITY_CHANGE_EVENTS_HOOK:true} diff --git a/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactoryTest.java b/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactoryTest.java new file mode 100644 index 0000000000000..970306cc91a03 --- /dev/null +++ b/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchBulkProcessorFactoryTest.java @@ -0,0 +1,25 @@ +package com.linkedin.gms.factory.search; + +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import org.elasticsearch.action.support.WriteRequest; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.test.context.TestPropertySource; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; + +@TestPropertySource(locations = "classpath:/application.yml") +@SpringBootTest(classes = {ElasticSearchBulkProcessorFactory.class}) +public class ElasticSearchBulkProcessorFactoryTest extends AbstractTestNGSpringContextTests { + @Autowired + ESBulkProcessor test; + + @Test + void testInjection() { + assertNotNull(test); + assertEquals(WriteRequest.RefreshPolicy.NONE, test.getWriteRequestRefreshPolicy()); + } +} diff --git a/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactoryEmptyTest.java b/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactoryEmptyTest.java new file mode 100644 index 0000000000000..3022308b42faa --- /dev/null +++ b/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactoryEmptyTest.java @@ -0,0 +1,32 @@ +package com.linkedin.gms.factory.search; + +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.context.annotation.PropertySource; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Test; + +import java.util.Map; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; + +@PropertySource("classpath:/test-empty-application.yml") +@SpringBootTest( + properties = { + "elasticsearch.index.settingsOverrides=", + "elasticsearch.index.entitySettingsOverrides=", + "elasticsearch.index.prefix=test_prefix" + }, + classes = {ElasticSearchIndexBuilderFactory.class}) +public class ElasticSearchIndexBuilderFactoryEmptyTest extends AbstractTestNGSpringContextTests { + @Autowired + ESIndexBuilder test; + + @Test + void testInjection() { + assertNotNull(test); + assertEquals(Map.of(), test.getIndexSettingOverrides()); 
+ } +} diff --git a/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactoryOverridesTest.java b/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactoryOverridesTest.java new file mode 100644 index 0000000000000..2f14507371f19 --- /dev/null +++ b/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactoryOverridesTest.java @@ -0,0 +1,28 @@ +package com.linkedin.gms.factory.search; + +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Test; +import org.springframework.beans.factory.annotation.Autowired; + +import static org.testng.Assert.*; + +@SpringBootTest( + properties = { + "elasticsearch.index.settingsOverrides={\"my_index\":{\"number_of_shards\":\"10\"}}", + "elasticsearch.index.entitySettingsOverrides={\"my_entity\":{\"number_of_shards\":\"5\"}}", + "elasticsearch.index.prefix=test_prefix" + }, + classes = {ElasticSearchIndexBuilderFactory.class}) +public class ElasticSearchIndexBuilderFactoryOverridesTest extends AbstractTestNGSpringContextTests { + @Autowired + ESIndexBuilder test; + + @Test + void testInjection() { + assertNotNull(test); + assertEquals("10", test.getIndexSettingOverrides().get("test_prefix_my_index").get("number_of_shards")); + assertEquals("5", test.getIndexSettingOverrides().get("test_prefix_my_entityindex_v2").get("number_of_shards")); + } +} diff --git a/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactoryTest.java b/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactoryTest.java new file mode 100644 index 0000000000000..f00a86191ae5d --- /dev/null +++ b/metadata-service/factories/src/test/java/com/linkedin/gms/factory/search/ElasticSearchIndexBuilderFactoryTest.java @@ -0,0 +1,26 @@ +package com.linkedin.gms.factory.search; + +import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.test.context.TestPropertySource; +import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Test; + +import java.util.Map; + +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertEquals; + +@TestPropertySource(locations = "classpath:/application.yml") +@SpringBootTest(classes = {ElasticSearchIndexBuilderFactory.class}) +public class ElasticSearchIndexBuilderFactoryTest extends AbstractTestNGSpringContextTests { + @Autowired + ESIndexBuilder test; + + @Test + void testInjection() { + assertNotNull(test); + assertEquals(Map.of(), test.getIndexSettingOverrides()); + } +} diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricUtils.java b/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricUtils.java index 2ba29b0007f5f..3d90cba85b0fb 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricUtils.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/utils/metrics/MetricUtils.java @@ -11,6 +11,8 @@ public class MetricUtils { private MetricUtils() { } + public static final String DELIMITER = "_"; + 
public static final String NAME = "default"; private static final MetricRegistry REGISTRY = SharedMetricRegistries.getOrCreate(NAME); @@ -27,6 +29,14 @@ public static Counter counter(Class<?> klass, String metricName) { return REGISTRY.counter(MetricRegistry.name(klass, metricName)); } + public static void exceptionCounter(Class<?> klass, String metricName, Throwable t) { + String[] splitClassName = t.getClass().getName().split("[.]"); + String snakeCase = splitClassName[splitClassName.length - 1].replaceAll("([A-Z][a-z])", DELIMITER + "$1"); + + counter(klass, metricName).inc(); + counter(klass, metricName + DELIMITER + snakeCase).inc(); + } + public static Counter counter(String metricName) { return REGISTRY.counter(MetricRegistry.name(metricName)); }
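Note: a worked sketch of the name transform inside exceptionCounter — the regex inserts DELIMITER before each uppercase-lowercase pair, so an IOException increments both the base counter and an exception-specific one (counter plumbing omitted; the metric name "consumer_failure" is hypothetical):

public class ExceptionMetricNameSketch {
  public static void main(String[] args) {
    Throwable t = new java.io.IOException("boom");
    String[] parts = t.getClass().getName().split("[.]");
    // DELIMITER is "_": "IOException" becomes "IO_Exception".
    String snakeCase = parts[parts.length - 1].replaceAll("([A-Z][a-z])", "_$1");
    System.out.println("consumer_failure" + "_" + snakeCase); // consumer_failure_IO_Exception
  }
}

diff --git a/smoke-test/requests_wrapper/__init__.py b/smoke-test/requests_wrapper/__init__.py new file mode 100644 index 0000000000000..d9956e8434a89 --- /dev/null +++ b/smoke-test/requests_wrapper/__init__.py @@ -0,0 +1,3 @@ +from .utils_requests_wrapper import CustomSession as Session +from .utils_requests_wrapper import get, post +from .constants import * diff --git a/smoke-test/requests_wrapper/constants.py b/smoke-test/requests_wrapper/constants.py new file mode 100644 index 0000000000000..64b1d7c4495d2 --- /dev/null +++ b/smoke-test/requests_wrapper/constants.py @@ -0,0 +1,2 @@ + +ELASTICSEARCH_REFRESH_INTERVAL_SECONDS = 2 diff --git a/smoke-test/requests_wrapper/utils_requests_wrapper.py b/smoke-test/requests_wrapper/utils_requests_wrapper.py new file mode 100644 index 0000000000000..276a95c74767f --- /dev/null +++ b/smoke-test/requests_wrapper/utils_requests_wrapper.py @@ -0,0 +1,28 @@ +import requests +from .constants import * +from time import sleep + + +class CustomSession(requests.Session): + """ + Create a custom session to add consistency delay on writes + """ + + def post(self, *args, **kwargs): + response = super(CustomSession, self).post(*args, **kwargs) + if "/logIn" not in args[0]: + print("sleeping.") + sleep(ELASTICSEARCH_REFRESH_INTERVAL_SECONDS) + return response + + +def post(*args, **kwargs): + response = requests.post(*args, **kwargs) + if "/logIn" not in args[0]: + print("sleeping.") + sleep(ELASTICSEARCH_REFRESH_INTERVAL_SECONDS) + return response + + +def get(*args, **kwargs): + return requests.get(*args, **kwargs) diff --git a/smoke-test/requirements.txt b/smoke-test/requirements.txt index 3ff4af3aa14f5..c71982fceb560 100644 --- a/smoke-test/requirements.txt +++ b/smoke-test/requirements.txt @@ -4,4 +4,4 @@ psutil tenacity -e ../metadata-ingestion[datahub-rest,datahub-kafka,mysql] slack-sdk==3.18.1 -aiohttp +aiohttp \ No newline at end of file diff --git a/smoke-test/smoke-dev.sh b/smoke-test/smoke-dev.sh new file mode 100755 index 0000000000000..9237065e94835 --- /dev/null +++ b/smoke-test/smoke-dev.sh @@ -0,0 +1,29 @@ +#!/bin/bash +set -euxo pipefail + +# Runs a basic e2e test. It is not meant to be fully comprehensive, +# but rather should catch obvious bugs before they make it into prod. +# +# Script assumptions: +# - The gradle build has already been run. +# - Python 3.6+ is installed and in the PATH. 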
+ +# Log the locally loaded images +# docker images | grep "datahub-" + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +cd "$DIR" + +python3 -m venv venv +source venv/bin/activate +pip install --upgrade pip wheel setuptools +pip install -r requirements.txt + +echo "DATAHUB_VERSION = ${DATAHUB_VERSION:=acryl-datahub 0.0.0.dev0}" +DATAHUB_TELEMETRY_ENABLED=false \ +DOCKER_COMPOSE_BASE="file://$( dirname "$DIR" )" \ +datahub docker quickstart --build-locally --standalone_consumers --dump-logs-on-failure + +(cd ..; ./gradlew :smoke-test:yarnInstall) + +pytest -rP --durations=20 -vv --junit-xml=junit.smoke.xml $@ diff --git a/smoke-test/smoke.sh b/smoke-test/smoke.sh index 033fcfd82c7ff..68347101e7692 100755 --- a/smoke-test/smoke.sh +++ b/smoke-test/smoke.sh @@ -23,7 +23,9 @@ mkdir -p ~/.datahub/plugins/frontend/auth/ echo "test_user:test_pass" >> ~/.datahub/plugins/frontend/auth/user.props echo "DATAHUB_VERSION = $DATAHUB_VERSION" -DATAHUB_TELEMETRY_ENABLED=false datahub docker quickstart --standalone_consumers --dump-logs-on-failure +DATAHUB_TELEMETRY_ENABLED=false \ +DOCKER_COMPOSE_BASE="file://$( dirname "$DIR" )" \ +datahub docker quickstart --standalone_consumers --dump-logs-on-failure (cd ..; ./gradlew :smoke-test:yarnInstall) diff --git a/smoke-test/test_e2e.py b/smoke-test/test_e2e.py index 734f2deda9698..e3d110ef216c9 100644 --- a/smoke-test/test_e2e.py +++ b/smoke-test/test_e2e.py @@ -3,7 +3,7 @@ from typing import Any, Optional import pytest -import requests +import requests_wrapper as requests import tenacity from datahub.ingestion.run.pipeline import Pipeline @@ -27,7 +27,7 @@ restli_default_headers = { "X-RestLi-Protocol-Version": "2.0.0", } -kafka_post_ingestion_wait_sec = 60 +kafka_post_ingestion_wait_sec = 30 sleep_sec, sleep_times = get_sleep_info() @@ -92,7 +92,7 @@ def _ensure_user_relationship_present(frontend_session, urn, relationships): assert res_data["data"] assert res_data["data"]["corpUser"] assert res_data["data"]["corpUser"]["relationships"] - assert res_data["data"]["corpUser"]["relationships"]["total"] == 1 + assert res_data["data"]["corpUser"]["relationships"]["total"] == relationships @tenacity.retry( diff --git a/smoke-test/test_rapid.py b/smoke-test/test_rapid.py index 69e194a218b39..98db3ee50ee40 100644 --- a/smoke-test/test_rapid.py +++ b/smoke-test/test_rapid.py @@ -1,5 +1,4 @@ import pytest -import requests import tenacity from tests.utils import ( @@ -73,6 +72,7 @@ def _ensure_dataset_present_correctly(frontend_session): assert res_data["data"] assert res_data["data"]["dataset"] assert res_data["data"]["dataset"]["urn"] == urn + assert len(res_data["data"]["dataset"]["outgoing"]["relationships"]) == 1 def test_ingestion_via_rest_rapid(frontend_session, wait_for_healthchecks): diff --git a/smoke-test/test_resources/graph_data.json b/smoke-test/test_resources/graph_data.json new file mode 100644 index 0000000000000..fdb245552e2af --- /dev/null +++ b/smoke-test/test_resources/graph_data.json @@ -0,0 +1,172 @@ +[ + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:graph,graph-test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/prod/kafka/SampleKafkaDataset" + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "description": null, + "uri": null, + "tags": [], + "customProperties": { + "prop1": "fakeprop", + "prop2": "pikachu" + } + } + }, + { + 
"com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:jdoe", + "type": "DATAOWNER", + "source": null + }, + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + }, + { + "com.linkedin.pegasus2avro.common.InstitutionalMemory": { + "elements": [ + { + "url": "https://www.linkedin.com", + "description": "Sample doc", + "createStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "SampleKafkaSchema", + "platform": "urn:li:dataPlatform:kafka", + "version": 0, + "created": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.KafkaSchema": { + "documentSchema": "{\"type\":\"record\",\"name\":\"SampleKafkaSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample Kafka dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=boolean].field_foo_2", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Foo field description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "varchar(100)", + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Legacy" + }, + { + "tag": "urn:li:tag:Cypress" + } + ] + }, + "recursive": false + }, + { + "fieldPath": "[version=2.0].[type=boolean].field_bar", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Bar field description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Legacy" + }, + { + "tag": "urn:li:tag:Cypress" + } + ] + }, + "recursive": false + }, + { + "fieldPath": "[version=2.0].[key=True].[type=int].id", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Id specifying which partition the message should go to" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Legacy" + }, + { + "tag": "urn:li:tag:Cypress" + } + ] + }, + "recursive": false + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null + } + } + ] + } + }, + "proposedDelta": null + } +] \ No newline at end of file diff --git a/smoke-test/test_resources/graph_dataDiff.json b/smoke-test/test_resources/graph_dataDiff.json new file mode 100644 index 0000000000000..1a5efd0865076 --- /dev/null +++ b/smoke-test/test_resources/graph_dataDiff.json @@ -0,0 +1,163 @@ +[ + { + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:graph,graph-test,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/prod/kafka/SampleKafkaDataset" + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "description": null, + "uri": null, + "tags": [], + 
"customProperties": { + "prop1": "fakeprop", + "prop2": "pikachu" + } + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:jdoe", + "type": "DATAOWNER", + "source": null + }, + { + "owner": "urn:li:corpuser:datahub", + "type": "DATAOWNER", + "source": null + } + ], + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + }, + { + "com.linkedin.pegasus2avro.common.InstitutionalMemory": { + "elements": [ + { + "url": "https://www.linkedin.com", + "description": "Sample doc", + "createStamp": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + } + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "SampleKafkaSchema", + "platform": "urn:li:dataPlatform:kafka", + "version": 0, + "created": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "lastModified": { + "time": 1581407189000, + "actor": "urn:li:corpuser:jdoe", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.KafkaSchema": { + "documentSchema": "{\"type\":\"record\",\"name\":\"SampleKafkaSchema\",\"namespace\":\"com.linkedin.dataset\",\"doc\":\"Sample Kafka dataset\",\"fields\":[{\"name\":\"field_foo\",\"type\":[\"string\"]},{\"name\":\"field_bar\",\"type\":[\"boolean\"]}]}" + } + }, + "fields": [ + { + "fieldPath": "[version=2.0].[type=boolean].field_foo_2", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Foo field description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "varchar(100)", + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:NeedsDocumentation1" + } + ] + }, + "recursive": false + }, + { + "fieldPath": "[version=2.0].[type=boolean].field_bar", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Bar field description" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:NeedsDocumentation2" + } + ] + }, + "recursive": false + }, + { + "fieldPath": "[version=2.0].[key=True].[type=int].id", + "jsonPath": null, + "nullable": false, + "description": { + "string": "Id specifying which partition the message should go to" + }, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:NeedsDocumentation3" + } + ] + }, + "recursive": false + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null + } + } + ] + } + }, + "proposedDelta": null + } +] \ No newline at end of file diff --git a/smoke-test/tests/assertions/assertions_test.py b/smoke-test/tests/assertions/assertions_test.py index e6661feb97f14..4aa64c512f684 100644 --- a/smoke-test/tests/assertions/assertions_test.py +++ b/smoke-test/tests/assertions/assertions_test.py @@ -2,7 +2,7 @@ import urllib import pytest -import requests +import requests_wrapper as requests import tenacity from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper diff --git a/smoke-test/tests/browse/browse_test.py b/smoke-test/tests/browse/browse_test.py index 7bedd37d6a026..49071327d5d47 100644 --- a/smoke-test/tests/browse/browse_test.py +++ 
b/smoke-test/tests/browse/browse_test.py @@ -1,9 +1,7 @@ -import json -import urllib import time import pytest -import requests +import requests_wrapper as requests from tests.utils import delete_urns_from_file, get_frontend_url, ingest_file_via_rest @@ -16,7 +14,7 @@ def ingest_cleanup_data(request): print("ingesting browse test data") ingest_file_via_rest("tests/browse/data.json") - time.sleep(5) # Allow for indexing time + yield print("removing browse test data") delete_urns_from_file("tests/browse/data.json") diff --git a/smoke-test/tests/cli/datahub-cli.py b/smoke-test/tests/cli/datahub-cli.py index 775ce155e9949..b89cadf257f2c 100644 --- a/smoke-test/tests/cli/datahub-cli.py +++ b/smoke-test/tests/cli/datahub-cli.py @@ -1,15 +1,14 @@ import json import pytest from time import sleep -from datahub.cli import delete_cli, ingest_cli from datahub.cli.cli_utils import guess_entity_type, post_entity, get_aspects_for_entity from datahub.cli.ingest_cli import get_session_and_host, rollback -from datahub.cli.delete_cli import guess_entity_type, delete_one_urn_cmd, delete_references -from tests.utils import ingest_file_via_rest, delete_urns_from_file +from tests.utils import ingest_file_via_rest ingested_dataset_run_id = "" ingested_editable_run_id = "" + @pytest.fixture(autouse=True) def test_setup(): """Fixture to execute asserts before and after a test is run""" @@ -30,7 +29,6 @@ def test_setup(): ingested_dataset_run_id = ingest_file_via_rest("tests/cli/cli_test_data.json").config.run_id print("Setup ingestion id: " + ingested_dataset_run_id) - sleep(5) assert "browsePaths" in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["browsePaths"], typed=False) @@ -41,11 +39,11 @@ def test_setup(): session.post(rollback_url, data=json.dumps({"runId": ingested_editable_run_id, "dryRun": False, "hardDelete": True})) session.post(rollback_url, data=json.dumps({"runId": ingested_dataset_run_id, "dryRun": False, "hardDelete": True})) - sleep(5) assert "browsePaths" not in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["browsePaths"], typed=False) assert "editableDatasetProperties" not in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False) + @pytest.mark.dependency() def test_rollback_editable(): global ingested_dataset_run_id diff --git a/smoke-test/tests/cli/datahub_graph_test.py b/smoke-test/tests/cli/datahub_graph_test.py index 371edd66563b4..16925d26f6983 100644 --- a/smoke-test/tests/cli/datahub_graph_test.py +++ b/smoke-test/tests/cli/datahub_graph_test.py @@ -1,12 +1,25 @@ import pytest +import tenacity from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph from datahub.metadata.schema_classes import KafkaSchemaClass, SchemaMetadataClass -from tests.utils import delete_urns_from_file, ingest_file_via_rest, get_gms_url +from tests.utils import ( + delete_urns_from_file, + get_gms_url, + get_sleep_info, + ingest_file_via_rest, +) +sleep_sec, sleep_times = get_sleep_info() + + +graph = "test_resources/graph_data.json" +graph_2 = "test_resources/graph_dataDiff.json" @pytest.fixture(scope="module", autouse=False) def ingest_cleanup_data(request): + print("removing graph test data") + delete_urns_from_file("tests/cli/graph_data.json") print("ingesting graph test data") ingest_file_via_rest("tests/cli/graph_data.json") yield @@ -36,3 +49,60 @@ def test_get_aspect_v2(frontend_session, ingest_cleanup_data): k_schema.documentSchema == 
'{"type":"record","name":"SampleKafkaSchema","namespace":"com.linkedin.dataset","doc":"Sample Kafka dataset","fields":[{"name":"field_foo","type":["string"]},{"name":"field_bar","type":["boolean"]}]}' ) + + +@tenacity.retry( + stop=tenacity.stop_after_attempt(sleep_times), wait=tenacity.wait_fixed(sleep_sec) +) +def _ensure_dataset_present_correctly(graph_client: DataHubGraph): + urn = "urn:li:dataset:(urn:li:dataPlatform:graph,graph-test,PROD)" + json = { + "query": """query getDataset($urn: String!) {\n + dataset(urn: $urn) {\n + urn\n + name\n + description\n + platform {\n + urn\n + }\n + schemaMetadata {\n + name\n + version\n + createdAt\n + }\n + outgoing: relationships(\n + input: { types: ["SchemaFieldTaggedWith"], direction: OUTGOING, start: 0, count: 10000 }\n + ) {\n + start\n + count\n + total\n + relationships {\n + type\n + direction\n + entity {\n + urn\n + type\n + }\n + }\n + }\n + }\n + }""", + "variables": {"urn": urn}, + } + res_data = graph_client._post_generic("http://localhost:8080/api/graphql", json) + + assert res_data + assert res_data["data"] + assert res_data["data"]["dataset"] + assert res_data["data"]["dataset"]["urn"] == urn + assert len(res_data["data"]["dataset"]["outgoing"]["relationships"]) == 3 + + +@pytest.mark.dependency(depends=["test_healthchecks"]) +def test_graph_relationships(): + delete_urns_from_file(graph) + delete_urns_from_file(graph_2) + ingest_file_via_rest(graph) + ingest_file_via_rest(graph_2) + graph_client: DataHubGraph = DataHubGraph(DatahubClientConfig(server=get_gms_url())) + _ensure_dataset_present_correctly(graph_client) diff --git a/smoke-test/tests/cli/delete_cmd/test_timeseries_delete.py b/smoke-test/tests/cli/delete_cmd/test_timeseries_delete.py index e4787955860a1..c5913c6e84e3f 100644 --- a/smoke-test/tests/cli/delete_cmd/test_timeseries_delete.py +++ b/smoke-test/tests/cli/delete_cmd/test_timeseries_delete.py @@ -12,6 +12,7 @@ from tests.aspect_generators.timeseries.dataset_profile_gen import \ gen_dataset_profiles from tests.utils import get_strftime_from_timestamp_millis +import requests_wrapper as requests test_aspect_name: str = "datasetProfile" test_dataset_urn: str = builder.make_dataset_urn_with_platform_instance( @@ -25,8 +26,7 @@ def sync_elastic() -> None: - elastic_sync_wait_time_seconds: int = 5 - time.sleep(elastic_sync_wait_time_seconds) + time.sleep(requests.ELASTICSEARCH_REFRESH_INTERVAL_SECONDS) def datahub_put_profile(dataset_profile: DatasetProfileClass) -> None: diff --git a/smoke-test/tests/cli/ingest_cmd/test_timeseries_rollback.py b/smoke-test/tests/cli/ingest_cmd/test_timeseries_rollback.py index 70f2d15fbe383..f9c7786009f58 100644 --- a/smoke-test/tests/cli/ingest_cmd/test_timeseries_rollback.py +++ b/smoke-test/tests/cli/ingest_cmd/test_timeseries_rollback.py @@ -9,13 +9,13 @@ from datahub.entrypoints import datahub from datahub.metadata.schema_classes import DatasetProfileClass from tests.utils import ingest_file_via_rest +import requests_wrapper as requests runner = CliRunner() def sync_elastic() -> None: - elastic_sync_wait_time_seconds: int = 5 - time.sleep(elastic_sync_wait_time_seconds) + time.sleep(requests.ELASTICSEARCH_REFRESH_INTERVAL_SECONDS) def datahub_rollback(run_id: str) -> None: diff --git a/smoke-test/tests/delete/delete_test.py b/smoke-test/tests/delete/delete_test.py index dc04542cb33ba..51eaa36d05424 100644 --- a/smoke-test/tests/delete/delete_test.py +++ b/smoke-test/tests/delete/delete_test.py @@ -40,8 +40,6 @@ def test_setup(): ingested_dataset_run_id = 
ingest_file_via_rest("tests/delete/cli_test_data.json").config.run_id - sleep(3) - assert "browsePaths" in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["browsePaths"], typed=False) yield diff --git a/smoke-test/tests/domains/domains_test.py b/smoke-test/tests/domains/domains_test.py index 5c24b6e8fe232..7ffe1682cafd8 100644 --- a/smoke-test/tests/domains/domains_test.py +++ b/smoke-test/tests/domains/domains_test.py @@ -52,6 +52,11 @@ def _ensure_more_domains(frontend_session, list_domains_json, before_count): @pytest.mark.dependency(depends=["test_healthchecks"]) def test_create_list_get_domain(frontend_session): + # Setup: Delete the domain (if exists) + response = frontend_session.post( + f"{get_gms_url()}/entities?action=delete", json={"urn": "urn:li:domain:test id"} + ) + # Get count of existing secrets list_domains_json = { "query": """query listDomains($input: ListDomainsInput!) {\n diff --git a/smoke-test/tests/managed-ingestion/managed_ingestion_test.py b/smoke-test/tests/managed-ingestion/managed_ingestion_test.py index aaea338478ed6..7367118c69d0f 100644 --- a/smoke-test/tests/managed-ingestion/managed_ingestion_test.py +++ b/smoke-test/tests/managed-ingestion/managed_ingestion_test.py @@ -1,5 +1,4 @@ import json -import time import pytest import tenacity diff --git a/smoke-test/tests/timeline/timeline_test.py b/smoke-test/tests/timeline/timeline_test.py index 0683792c7ec4d..824f297766056 100644 --- a/smoke-test/tests/timeline/timeline_test.py +++ b/smoke-test/tests/timeline/timeline_test.py @@ -1,9 +1,11 @@ import json +from time import sleep from datahub.cli import delete_cli from datahub.cli import timeline_cli from datahub.cli.cli_utils import guess_entity_type, post_entity from tests.utils import ingest_file_via_rest +from requests_wrapper import ELASTICSEARCH_REFRESH_INTERVAL_SECONDS def test_all(): @@ -21,6 +23,7 @@ def test_all(): res_data = timeline_cli.get_timeline(dataset_urn, ["TAG", "DOCUMENTATION", "TECHNICAL_SCHEMA", "GLOSSARY_TERM", "OWNER"], None, None, False) delete_cli.delete_one_urn_cmd(urn=dataset_urn) + assert res_data assert len(res_data) == 3 assert res_data[0]["semVerChange"] == "MINOR" @@ -174,3 +177,4 @@ def put(urn: str, aspect: str, aspect_data: str) -> None: entity_type=entity_type, aspect_value=aspect_obj, ) + sleep(ELASTICSEARCH_REFRESH_INTERVAL_SECONDS) diff --git a/smoke-test/tests/utils.py b/smoke-test/tests/utils.py index 490f258558a67..85704a1a636ba 100644 --- a/smoke-test/tests/utils.py +++ b/smoke-test/tests/utils.py @@ -2,8 +2,9 @@ import os from datetime import datetime, timedelta from typing import Tuple +from time import sleep -import requests +import requests_wrapper as requests from datahub.cli import cli_utils from datahub.cli.docker_cli import check_local_docker_containers @@ -62,7 +63,7 @@ def get_mysql_password(): def get_sleep_info() -> Tuple[int, int]: return ( int(os.getenv("DATAHUB_TEST_SLEEP_BETWEEN", 20)), - int(os.getenv("DATAHUB_TEST_SLEEP_TIMES", 15)), + int(os.getenv("DATAHUB_TEST_SLEEP_TIMES", 3)), ) @@ -106,6 +107,7 @@ def ingest_file_via_rest(filename: str) -> Pipeline: ) pipeline.run() pipeline.raise_from_status() + sleep(requests.ELASTICSEARCH_REFRESH_INTERVAL_SECONDS) return pipeline @@ -138,6 +140,7 @@ def delete_urns_from_file(filename: str) -> None: get_gms_url() + "/entities?action=delete", payload_obj, ) + sleep(requests.ELASTICSEARCH_REFRESH_INTERVAL_SECONDS) # Fixed now value
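Note on MetricUtils.exceptionCounter: the helper increments two counters per call, the base metric and a variant suffixed with the throwable's simple class name run through the `([A-Z][a-z])` regex. A rough Python transliteration of that substitution, just to show the shape of the resulting metric names; the `ingestFailed` metric name is a made-up example, not taken from this patch:

import re

def exception_metric_suffix(exc: BaseException) -> str:
    # Mirrors the Java replaceAll: insert the "_" delimiter before each
    # uppercase-then-lowercase pair, i.e. before each TitleCase word.
    simple_name = type(exc).__name__
    return re.sub(r"([A-Z][a-z])", r"_\1", simple_name)

print("ingestFailed" + "_" + exception_metric_suffix(ValueError()))
# -> ingestFailed__Value_Error  (the leading capital yields a doubled delimiter)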
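Note on the requests_wrapper package: the smoke tests now import it in place of requests, so every POST except login blocks for ELASTICSEARCH_REFRESH_INTERVAL_SECONDS before returning, and ingest_file_via_rest / delete_urns_from_file in tests/utils.py sleep for the same constant, so follow-up reads hit a refreshed index instead of needing ad-hoc time.sleep calls. A minimal usage sketch, assuming the frontend runs at the default http://localhost:9002; the GraphQL body is a placeholder:

import requests_wrapper as requests

session = requests.Session()  # CustomSession under the hood

# "/logIn" is special-cased in post(), so authentication adds no delay.
session.post(
    "http://localhost:9002/logIn",
    data='{"username": "datahub", "password": "datahub"}',
    headers={"Content-Type": "application/json"},
)

# Any other POST sleeps ELASTICSEARCH_REFRESH_INTERVAL_SECONDS (2s) before
# returning, so assertions after it run against a refreshed index.
resp = session.post(
    "http://localhost:9002/api/v2/graphql",
    json={"query": "...", "variables": {}},
)
assert resp.status_code == 200

requests.get("http://localhost:9002/api/v2/graphql")  # reads are never delayed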