diff --git a/.github/pr-labeler-config.yml b/.github/pr-labeler-config.yml new file mode 100644 index 00000000000000..893193810d89d1 --- /dev/null +++ b/.github/pr-labeler-config.yml @@ -0,0 +1,11 @@ +ingestion: +- any: ['metadata-ingestion/**/*'] + +devops: +- any: ['docker/**/*', '.github/**/*'] + +product: +- any: ['datahub-frontend/**/*', 'datahub-graphql-core/**/*'] + +docs: +- any: ['docs/**/*'] \ No newline at end of file diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 7afa33c1eaeb3f..20194bd650369c 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -128,7 +128,7 @@ jobs: publish: ${{ needs.setup.outputs.publish }} context: . file: ./docker/datahub-mae-consumer/Dockerfile - platforms: linux/amd64 + platforms: linux/amd64,linux/arm64 mae_consumer_scan: name: "[Monitoring] Scan MAE consumer images for vulnerabilities" runs-on: ubuntu-latest @@ -171,7 +171,7 @@ jobs: publish: ${{ needs.setup.outputs.publish }} context: . file: ./docker/datahub-mce-consumer/Dockerfile - platforms: linux/amd64 + platforms: linux/amd64,linux/arm64 mce_consumer_scan: name: "[Monitoring] Scan MCE consumer images for vulnerabilities" runs-on: ubuntu-latest diff --git a/.github/workflows/pr-labeler.yml b/.github/workflows/pr-labeler.yml new file mode 100644 index 00000000000000..75b13d74e1dbe7 --- /dev/null +++ b/.github/workflows/pr-labeler.yml @@ -0,0 +1,16 @@ +name: "Pull Request Labeler" +on: + pull_request_target: + types: [opened, reopened] + +jobs: + triage: + permissions: + contents: read + pull-requests: write + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@v4 + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" + configuration-path: '.github/pr-labeler-config.yml' diff --git a/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx b/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx index 705eec806dd454..27982d3037207a 100644 --- a/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx +++ b/datahub-web-react/src/app/entity/chart/ChartSnippet.tsx @@ -9,12 +9,13 @@ import { getMatchPrioritizingPrimary } from '../shared/utils'; type Props = { matchedFields: MatchedField[]; inputFields: Maybe | undefined; + isMatchingDashboard?: boolean; }; const LABEL_INDEX_NAME = 'fieldLabels'; const TYPE_PROPERTY_KEY_NAME = 'type'; -export const ChartSnippet = ({ matchedFields, inputFields }: Props) => { +export const ChartSnippet = ({ matchedFields, inputFields, isMatchingDashboard = false }: Props) => { const matchedField = getMatchPrioritizingPrimary(matchedFields, 'fieldLabels'); if (matchedField?.name === LABEL_INDEX_NAME) { @@ -36,7 +37,8 @@ export const ChartSnippet = ({ matchedFields, inputFields }: Props) => { return ( - Matches {termType} + Matches {termType} {' '} + {isMatchingDashboard && 'on a contained Chart'} ); } @@ -44,7 +46,8 @@ export const ChartSnippet = ({ matchedFields, inputFields }: Props) => { return matchedField ? 
( - Matches {FIELDS_TO_HIGHLIGHT.get(matchedField.name)} {matchedField.value} + Matches {FIELDS_TO_HIGHLIGHT.get(matchedField.name)} {matchedField.value}{' '} + {isMatchingDashboard && 'on a contained Chart'} ) : null; }; diff --git a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx index 8a245b9eaf2411..4358c910822286 100644 --- a/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx +++ b/datahub-web-react/src/app/entity/dashboard/DashboardEntity.tsx @@ -23,7 +23,6 @@ import { SidebarDomainSection } from '../shared/containers/profile/sidebar/Domai import { EntityMenuItems } from '../shared/EntityDropdown/EntityDropdown'; import { LineageTab } from '../shared/tabs/Lineage/LineageTab'; import { DashboardStatsSummarySubHeader } from './profile/DashboardStatsSummarySubHeader'; -import { InputFieldsTab } from '../shared/tabs/Entity/InputFieldsTab'; import { ChartSnippet } from '../chart/ChartSnippet'; /** @@ -87,16 +86,6 @@ export class DashboardEntity implements Entity { name: 'Documentation', component: DocumentationTab, }, - { - name: 'Fields', - component: InputFieldsTab, - display: { - visible: (_, dashboard: GetDashboardQuery) => - (dashboard?.dashboard?.inputFields?.fields?.length || 0) > 0, - enabled: (_, dashboard: GetDashboardQuery) => - (dashboard?.dashboard?.inputFields?.fields?.length || 0) > 0, - }, - }, { name: 'Properties', component: PropertiesTab, @@ -215,7 +204,13 @@ export class DashboardEntity implements Entity { statsSummary={data.statsSummary} lastUpdatedMs={data.properties?.lastModified?.time} createdMs={data.properties?.created?.time} - snippet={} + snippet={ + + } /> ); }; diff --git a/docker/datahub-mae-consumer/Dockerfile b/docker/datahub-mae-consumer/Dockerfile index b8b28244dd7c28..f4e3a6946f1dfc 100644 --- a/docker/datahub-mae-consumer/Dockerfile +++ b/docker/datahub-mae-consumer/Dockerfile @@ -1,15 +1,30 @@ # Defining environment ARG APP_ENV=prod -FROM adoptopenjdk/openjdk8:alpine-jre as base +FROM alpine:3.14 AS base + ENV DOCKERIZE_VERSION v0.6.1 -RUN apk --no-cache add curl tar wget bash coreutils \ + +# Upgrade Alpine and base packages +RUN apk --no-cache --update-cache --available upgrade \ + && if [ $(arch) = "aarch64" ]; then \ + DOCKERIZE_ARCH='aarch64';\ + elif [ $(arch) = "x86_64" ]; then \ + DOCKERIZE_ARCH='amd64'; \ + else \ + echo >&2 "Unsupported architecture $(arch)" ; exit 1; \ + fi \ + && apk --no-cache add tar curl bash openjdk8-jre \ && wget --no-verbose https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.4.1/opentelemetry-javaagent-all.jar \ && wget --no-verbose https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.16.1/jmx_prometheus_javaagent-0.16.1.jar -O jmx_prometheus_javaagent.jar \ - && curl -sSL https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz | tar -C /usr/local/bin -xzv + && curl -sSL https://github.com/treff7es/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-${DOCKERIZE_ARCH}-$DOCKERIZE_VERSION.tar.gz | tar -C /usr/local/bin -xzv + +FROM --platform=$BUILDPLATFORM alpine:3.14.2 AS prod-build + +# Upgrade Alpine and base packages +RUN apk --no-cache --update-cache --available upgrade \ + && apk --no-cache add openjdk8 perl -FROM adoptopenjdk/openjdk8:alpine-slim as prod-build -RUN apk --no-cache add openjdk8-jre perl COPY . 
datahub-src RUN cd datahub-src && ./gradlew :metadata-jobs:mae-consumer-job:build -x test RUN cd datahub-src && cp metadata-jobs/mae-consumer-job/build/libs/mae-consumer-job.jar ../mae-consumer-job.jar diff --git a/docker/datahub-mce-consumer/Dockerfile b/docker/datahub-mce-consumer/Dockerfile index f15af8426dfdff..3626bb8f7af5f6 100644 --- a/docker/datahub-mce-consumer/Dockerfile +++ b/docker/datahub-mce-consumer/Dockerfile @@ -1,15 +1,31 @@ # Defining environment ARG APP_ENV=prod -FROM adoptopenjdk/openjdk8:alpine-jre as base +FROM alpine:3.14 AS base + ENV DOCKERIZE_VERSION v0.6.1 -RUN apk --no-cache add curl tar wget openjdk8-jre bash \ + +# Upgrade Alpine and base packages +RUN apk --no-cache --update-cache --available upgrade \ + && if [ $(arch) = "aarch64" ]; then \ + DOCKERIZE_ARCH='aarch64';\ + elif [ $(arch) = "x86_64" ]; then \ + DOCKERIZE_ARCH='amd64'; \ + else \ + echo >&2 "Unsupported architecture $(arch)" ; exit 1; \ + fi \ + && apk --no-cache add tar curl bash openjdk8-jre \ && wget --no-verbose https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.4.1/opentelemetry-javaagent-all.jar \ && wget --no-verbose https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.16.1/jmx_prometheus_javaagent-0.16.1.jar -O jmx_prometheus_javaagent.jar \ && cp /usr/lib/jvm/java-1.8-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks \ - && curl -sSL https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz | tar -C /usr/local/bin -xzv + && curl -sSL https://github.com/treff7es/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-${DOCKERIZE_ARCH}-$DOCKERIZE_VERSION.tar.gz | tar -C /usr/local/bin -xzv + +FROM --platform=$BUILDPLATFORM alpine:3.14.2 AS prod-build + +# Upgrade Alpine and base packages +RUN apk --no-cache --update-cache --available upgrade \ + && apk --no-cache add openjdk8 perl -FROM openjdk:8 as prod-build COPY . 
datahub-src RUN cd datahub-src && ./gradlew :metadata-jobs:mce-consumer-job:build RUN cd datahub-src && cp metadata-jobs/mce-consumer-job/build/libs/mce-consumer-job.jar ../mce-consumer-job.jar diff --git a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml new file mode 100644 index 00000000000000..966e3542074814 --- /dev/null +++ b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml @@ -0,0 +1,41 @@ +services: + datahub-gms: + environment: + - MAE_CONSUMER_ENABLED=false + - MCE_CONSUMER_ENABLED=false + datahub-mae-consumer: + container_name: datahub-mae-consumer + depends_on: + - kafka-setup + - elasticsearch-setup + environment: + - DATAHUB_GMS_HOST=datahub-gms + - DATAHUB_GMS_PORT=8080 + - MAE_CONSUMER_ENABLED=true + - PE_CONSUMER_ENABLED=true + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - ELASTICSEARCH_HOST=elasticsearch + - ELASTICSEARCH_PORT=9200 + - GRAPH_SERVICE_IMPL=elasticsearch + - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml + hostname: datahub-mae-consumer + image: linkedin/datahub-mae-consumer:${DATAHUB_VERSION:-head} + ports: + - 9091:9091 + datahub-mce-consumer: + container_name: datahub-mce-consumer + depends_on: + - kafka-setup + - datahub-gms + environment: + - MCE_CONSUMER_ENABLED=true + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - DATAHUB_GMS_HOST=datahub-gms + - DATAHUB_GMS_PORT=8080 + hostname: datahub-mce-consumer + image: linkedin/datahub-mce-consumer:${DATAHUB_VERSION:-head} + ports: + - 9090:9090 +version: '2.3' diff --git a/docker/quickstart/docker-compose.consumers.quickstart.yml b/docker/quickstart/docker-compose.consumers.quickstart.yml new file mode 100644 index 00000000000000..700f48d42de272 --- /dev/null +++ b/docker/quickstart/docker-compose.consumers.quickstart.yml @@ -0,0 +1,46 @@ +services: + datahub-gms: + environment: + - MAE_CONSUMER_ENABLED=false + - MCE_CONSUMER_ENABLED=false + datahub-mae-consumer: + container_name: datahub-mae-consumer + depends_on: + - kafka-setup + - elasticsearch-setup + - neo4j + environment: + - DATAHUB_GMS_HOST=datahub-gms + - DATAHUB_GMS_PORT=8080 + - MAE_CONSUMER_ENABLED=true + - PE_CONSUMER_ENABLED=true + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - ELASTICSEARCH_HOST=elasticsearch + - ELASTICSEARCH_PORT=9200 + - NEO4J_HOST=http://neo4j:7474 + - NEO4J_URI=bolt://neo4j + - NEO4J_USERNAME=neo4j + - NEO4J_PASSWORD=datahub + - GRAPH_SERVICE_IMPL=neo4j + - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml + hostname: datahub-mae-consumer + image: linkedin/datahub-mae-consumer:${DATAHUB_VERSION:-head} + ports: + - 9091:9091 + datahub-mce-consumer: + container_name: datahub-mce-consumer + depends_on: + - kafka-setup + - datahub-gms + environment: + - MCE_CONSUMER_ENABLED=true + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - DATAHUB_GMS_HOST=datahub-gms + - DATAHUB_GMS_PORT=8080 + hostname: datahub-mce-consumer + image: linkedin/datahub-mce-consumer:${DATAHUB_VERSION:-head} + ports: + - 9090:9090 +version: '2.3' diff --git a/docker/quickstart/generate_and_compare.sh b/docker/quickstart/generate_and_compare.sh index 6939eae7c2d1a6..4e7c26b2655a24 100755 --- a/docker/quickstart/generate_and_compare.sh +++ 
b/docker/quickstart/generate_and_compare.sh @@ -15,6 +15,8 @@ pip install -r requirements.txt python generate_docker_quickstart.py ../docker-compose.yml ../docker-compose.override.yml temp.quickstart.yml python generate_docker_quickstart.py ../docker-compose-without-neo4j.yml ../docker-compose-without-neo4j.override.yml temp-without-neo4j.quickstart.yml python generate_docker_quickstart.py ../monitoring/docker-compose.monitoring.yml temp.monitoring.quickstart.yml +python generate_docker_quickstart.py ../docker-compose.consumers.yml temp.consumers.quickstart.yml +python generate_docker_quickstart.py ../docker-compose.consumers-without-neo4j.yml temp.consumers-without-neo4j.quickstart.yml for flavour in "${FLAVOURS[@]}" do diff --git a/docker/quickstart/generate_docker_quickstart.sh b/docker/quickstart/generate_docker_quickstart.sh index d8427a67b593ab..aa3c767430df1c 100755 --- a/docker/quickstart/generate_docker_quickstart.sh +++ b/docker/quickstart/generate_docker_quickstart.sh @@ -12,3 +12,5 @@ pip install -r requirements.txt python generate_docker_quickstart.py ../docker-compose.yml ../docker-compose.override.yml docker-compose.quickstart.yml python generate_docker_quickstart.py ../docker-compose-without-neo4j.yml ../docker-compose-without-neo4j.override.yml docker-compose-without-neo4j.quickstart.yml python generate_docker_quickstart.py ../monitoring/docker-compose.monitoring.yml docker-compose.monitoring.quickstart.yml +python generate_docker_quickstart.py ../docker-compose.consumers.yml docker-compose.consumers.quickstart.yml +python generate_docker_quickstart.py ../docker-compose.consumers-without-neo4j.yml docker-compose.consumers-without-neo4j.quickstart.yml diff --git a/docs-website/genJsonSchema/gen_json_schema.py b/docs-website/genJsonSchema/gen_json_schema.py index dac5aba68fcd14..0b6212c8131ad6 100644 --- a/docs-website/genJsonSchema/gen_json_schema.py +++ b/docs-website/genJsonSchema/gen_json_schema.py @@ -159,7 +159,7 @@ def get_base() -> Any: "type": "array", "items": { "type": "object", - "description": "Transformer configs see at https://datahubproject.io/docs/metadata-ingestion/transformers", + "description": "Transformer configs see at https://datahubproject.io/docs/metadata-ingestion/docs/transformer", "properties": { "type": {"type": "string", "description": "Transformer type"}, "config": { diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 91d3b8473a6968..f4ad3fb2f74099 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -137,7 +137,12 @@ module.exports = { { Sinks: list_ids_in_directory("metadata-ingestion/sink_docs"), }, - "metadata-ingestion/transformers", + { + Transformers: [ + "metadata-ingestion/docs/transformer/intro", + "metadata-ingestion/docs/transformer/dataset_transformer", + ], + }, { "Advanced Guides": [ { diff --git a/metadata-ingestion/README.md b/metadata-ingestion/README.md index cb291a49f4fd2f..206684ad8966d8 100644 --- a/metadata-ingestion/README.md +++ b/metadata-ingestion/README.md @@ -183,7 +183,7 @@ transformers: # an array of transformers applied sequentially # default sink, no config needed ``` -Check out the [transformers guide](./transformers.md) to learn more about how you can create really flexible pipelines for processing metadata using Transformers! +Check out the [transformers guide](./docs/transformer/intro.md) to learn more about how you can create really flexible pipelines for processing metadata using Transformers! 
## Using as a library (SDK) @@ -195,5 +195,5 @@ In some cases, you might want to configure and run a pipeline entirely from with ## Developing -See the guides on [developing](./developing.md), [adding a source](./adding-source.md) and [using transformers](./transformers.md). +See the guides on [developing](./developing.md), [adding a source](./adding-source.md) and [using transformers](./docs/transformer/intro.md). diff --git a/metadata-ingestion/build.gradle b/metadata-ingestion/build.gradle index 2e6695a43022ea..b26c468c33d867 100644 --- a/metadata-ingestion/build.gradle +++ b/metadata-ingestion/build.gradle @@ -8,7 +8,7 @@ ext { } task checkPythonVersion(type: Exec) { - commandLine python_executable, '-c', 'import sys; assert sys.version_info >= (3, 7)' + commandLine python_executable, '-c', 'import sys; assert sys.version_info >= (3, 6)' } task environmentSetup(type: Exec, dependsOn: checkPythonVersion) { diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md new file mode 100644 index 00000000000000..6b8a6ac8fe99ed --- /dev/null +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -0,0 +1,1203 @@ +--- +title: "Dataset" +--- +# Dataset Transformers +The table below lists the transformers that can modify aspects of the [Dataset](../../../docs/generated/metamodel/entities/dataset.md) entity. + +| Dataset Aspect | Transformer | +|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `status` | - [Mark Dataset status](#mark-dataset-status) | +| `ownership` | - [Simple Add Dataset ownership](#simple-add-dataset-ownership)
- [Pattern Add Dataset ownership](#pattern-add-dataset-ownership)
- [Simple Remove Dataset Ownership](#simple-remove-dataset-ownership) | +| `globalTags` | - [Simple Add Dataset globalTags](#simple-add-dataset-globaltags)
- [Pattern Add Dataset globalTags](#pattern-add-dataset-globaltags)
- [Add Dataset globalTags](#add-dataset-globaltags) | +| `browsePaths` | - [Set Dataset browsePath](#set-dataset-browsepath) | +| `glossaryTerms` | - [Simple Add Dataset glossaryTerms](#simple-add-dataset-glossaryterms)
- [Pattern Add Dataset glossaryTerms](#pattern-add-dataset-glossaryterms) | +| `schemaMetadata` | - [Pattern Add Dataset Schema Field glossaryTerms](#pattern-add-dataset-schema-field-glossaryterms)
- [Pattern Add Dataset Schema Field globalTags](#pattern-add-dataset-schema-field-globaltags) | +| `datasetProperties` | - [Simple Add Dataset datasetProperties](#simple-add-dataset-datasetproperties)
- [Add Dataset datasetProperties](#add-dataset-datasetproperties) | +| `domains` | - [Simple Add Dataset domains](#simple-add-dataset-domains)
- [Pattern Add Dataset domains](#pattern-add-dataset-domains) | + +## Mark Dataset Status +### Config Details +| Field | Required | Type | Default | Description | +|-----------|----------|---------|---------|-------------------------------------------------------| +| `removed` | ✅ | boolean | | Flag to control visibility of the dataset in the UI. | + +If you would like to stop a dataset from appearing in the UI, then you need to mark the status of the dataset as removed. + +You can use this transformer in your source recipe to mark the status as removed. + +```yaml +transformers: + - type: "mark_dataset_status" + config: + removed: true +``` +## Simple Add Dataset ownership +### Config Details +| Field | Required | Type | Default | Description | +|--------------------|----------|--------------|-------------|-------------------------------------------------------------------| +| `owner_urns` | ✅ | list[string] | | List of owner urns. | +| `ownership_type` | | string | `DATAOWNER` | ownership type of the owners. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +For transformer behaviour on `replace_existing` and `semantics`, please refer to the section [Relationship Between replace_existing And semantics](#relationship-between-replace_existing-and-semantics). +
+Let’s suppose we’d like to append a series of users who we know to own a dataset but aren't detected during normal ingestion. To do so, we can use the `simple_add_dataset_ownership` transformer that’s included in the ingestion framework. + +The config, which we’d append to our ingestion recipe YAML, would look like this: + +Below configuration will add listed owner_urns in ownership aspect + +```yaml +transformers: + - type: "simple_add_dataset_ownership" + config: + owner_urns: + - "urn:li:corpuser:username1" + - "urn:li:corpuser:username2" + - "urn:li:corpGroup:groupname" + ownership_type: "PRODUCER" +``` + + +`simple_add_dataset_ownership` can be configured in below different way + +- Add owners, however replace existing owners sent by ingestion source + ```yaml + transformers: + - type: "simple_add_dataset_ownership" + config: + replace_existing: true # false is default behaviour + owner_urns: + - "urn:li:corpuser:username1" + - "urn:li:corpuser:username2" + - "urn:li:corpGroup:groupname" + ownership_type: "PRODUCER" + ``` +- Add owners, however overwrite the owners available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "simple_add_dataset_ownership" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + owner_urns: + - "urn:li:corpuser:username1" + - "urn:li:corpuser:username2" + - "urn:li:corpGroup:groupname" + ownership_type: "PRODUCER" + ``` +- Add owners, however keep the owners available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "simple_add_dataset_ownership" + config: + semantics: PATCH + owner_urns: + - "urn:li:corpuser:username1" + - "urn:li:corpuser:username2" + - "urn:li:corpGroup:groupname" + ownership_type: "PRODUCER" + ``` + +## Pattern Add Dataset ownership +### Config Details +| Field | Required | Type | Default | Description | +|-----------------------------|--------- |-----------------------|------------------|-----------------------------------------------------------------------------------------| +| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | +| `ownership_type` | | string | `DATAOWNER` | ownership type of the owners. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +let’s suppose we’d like to append a series of users who we know to own a different dataset from a data source but aren't detected during normal ingestion. To do so, we can use the `pattern_add_dataset_ownership` module that’s included in the ingestion framework. This will match the pattern to `urn` of the dataset and assign the respective owners. 
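The keys under `rules` are regular expressions evaluated against the full dataset urn, so a rule key such as `.*example1.*` would match a dataset urn like `urn:li:dataset:(urn:li:dataPlatform:postgres,postgres.public.example1_table,PROD)` (the platform and table name here are purely illustrative).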
+ +The config, which we’d append to our ingestion recipe YAML, would look like this: + + ```yaml + transformers: + - type: "pattern_add_dataset_ownership" + config: + owner_pattern: + rules: + ".*example1.*": ["urn:li:corpuser:username1"] + ".*example2.*": ["urn:li:corpuser:username2"] + ownership_type: "DEVELOPER" + ``` + +`pattern_add_dataset_ownership` can be configured in below different way + +- Add owner, however replace existing owner sent by ingestion source + ```yaml + transformers: + - type: "pattern_add_dataset_ownership" + config: + replace_existing: true # false is default behaviour + owner_pattern: + rules: + ".*example1.*": ["urn:li:corpuser:username1"] + ".*example2.*": ["urn:li:corpuser:username2"] + ownership_type: "PRODUCER" + ``` +- Add owner, however overwrite the owners available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "pattern_add_dataset_ownership" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + owner_pattern: + rules: + ".*example1.*": ["urn:li:corpuser:username1"] + ".*example2.*": ["urn:li:corpuser:username2"] + ownership_type: "PRODUCER" + ``` +- Add owner, however keep the owners available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "pattern_add_dataset_ownership" + config: + semantics: PATCH + owner_pattern: + rules: + ".*example1.*": ["urn:li:corpuser:username1"] + ".*example2.*": ["urn:li:corpuser:username2"] + ownership_type: "PRODUCER" + ``` + +## Simple Remove Dataset ownership +If we wanted to clear existing owners sent by ingestion source we can use the `simple_remove_dataset_ownership` transformer which removes all owners sent by the ingestion source. + +```yaml +transformers: + - type: "simple_remove_dataset_ownership" + config: {} +``` + +The main use case of `simple_remove_dataset_ownership` is to remove incorrect owners present in the source. You can use it along with the [Simple Add Dataset ownership](#simple-add-dataset-ownership) to remove wrong owners and add the correct ones. + +Note that whatever owners you send via `simple_remove_dataset_ownership` will overwrite the owners present in the UI. +## Simple Add Dataset globalTags +### Config Details +| Field | Required | Type | Default | Description | +|-----------------------------|----------|--------------|---------------|------------------------------------------------------------------| +| `tag_urns` | ✅ | list[string] | | List of globalTags urn. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +Let’s suppose we’d like to add a set of dataset tags. To do so, we can use the `simple_add_dataset_tags` transformer that’s included in the ingestion framework. 
+ +The config, which we’d append to our ingestion recipe YAML, would look like this: + + ```yaml + transformers: + - type: "simple_add_dataset_tags" + config: + tag_urns: + - "urn:li:tag:NeedsDocumentation" + - "urn:li:tag:Legacy" + ``` + +`simple_add_dataset_tags` can be configured in below different way + +- Add tags, however replace existing tags sent by ingestion source + ```yaml + transformers: + - type: "simple_add_dataset_tags" + config: + replace_existing: true # false is default behaviour + tag_urns: + - "urn:li:tag:NeedsDocumentation" + - "urn:li:tag:Legacy" + ``` +- Add tags, however overwrite the tags available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "simple_add_dataset_tags" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + tag_urns: + - "urn:li:tag:NeedsDocumentation" + - "urn:li:tag:Legacy" + ``` +- Add tags, however keep the tags available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "simple_add_dataset_tags" + config: + semantics: PATCH + tag_urns: + - "urn:li:tag:NeedsDocumentation" + - "urn:li:tag:Legacy" + ``` +## Pattern Add Dataset globalTags +### Config Details +| Field | Required | Type | Default | Description | +|-----------------------------|----------|----------------------|-------------|---------------------------------------------------------------------------------------| +| `tag_pattern` | ✅ | map[regx, list[urn]] | | Entity urn with regular expression and list of tags urn apply to matching entity urn. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +Let’s suppose we’d like to append a series of tags to specific datasets. To do so, we can use the `pattern_add_dataset_tags` module that’s included in the ingestion framework. This will match the regex pattern to `urn` of the dataset and assign the respective tags urns given in the array. 
+ +The config, which we’d append to our ingestion recipe YAML, would look like this: + + ```yaml + transformers: + - type: "pattern_add_dataset_tags" + config: + tag_pattern: + rules: + ".*example1.*": ["urn:li:tag:NeedsDocumentation", "urn:li:tag:Legacy"] + ".*example2.*": ["urn:li:tag:NeedsDocumentation"] + ``` + +`pattern_add_dataset_tags` can be configured in below different way + +- Add tags, however replace existing tags sent by ingestion source + ```yaml + transformers: + - type: "pattern_add_dataset_tags" + config: + replace_existing: true # false is default behaviour + tag_pattern: + rules: + ".*example1.*": ["urn:li:tag:NeedsDocumentation", "urn:li:tag:Legacy"] + ".*example2.*": ["urn:li:tag:NeedsDocumentation"] + ``` +- Add tags, however overwrite the tags available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "pattern_add_dataset_tags" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + tag_pattern: + rules: + ".*example1.*": ["urn:li:tag:NeedsDocumentation", "urn:li:tag:Legacy"] + ".*example2.*": ["urn:li:tag:NeedsDocumentation"] + ``` +- Add tags, however keep the tags available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "pattern_add_dataset_tags" + config: + semantics: PATCH + tag_pattern: + rules: + ".*example1.*": ["urn:li:tag:NeedsDocumentation", "urn:li:tag:Legacy"] + ".*example2.*": ["urn:li:tag:NeedsDocumentation"] + ``` +## Add Dataset globalTags +### Config Details +| Field | Required | Type | Default | Description | +|-----------------------------|----------|--------------------------------------------|---------------|----------------------------------------------------------------------------| +| `get_tags_to_add` | ✅ | callable[[str], list[TagAssociationClass]] | | A function which takes entity urn as input and return TagAssociationClass. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +If you'd like to add more complex logic for assigning tags, you can use the more generic add_dataset_tags transformer, which calls a user-provided function to determine the tags for each dataset. + +```yaml +transformers: + - type: "add_dataset_tags" + config: + get_tags_to_add: "." +``` + +Then define your function to return a list of TagAssociationClass tags, for example: + +```python +import logging + +import datahub.emitter.mce_builder as builder +from datahub.metadata.schema_classes import ( + DatasetSnapshotClass, + TagAssociationClass +) + +def custom_tags(current: DatasetSnapshotClass) -> List[TagAssociationClass]: + """ Returns tags to associate to a dataset depending on custom logic + + This function receives a DatasetSnapshotClass, performs custom logic and returns + a list of TagAssociationClass-wrapped tags. + + Args: + current (DatasetSnapshotClass): Single DatasetSnapshotClass object + + Returns: + List of TagAssociationClass objects. + """ + + tag_strings = [] + + ### Add custom logic here + tag_strings.append('custom1') + tag_strings.append('custom2') + + tag_strings = [builder.make_tag_urn(tag=n) for n in tag_strings] + tags = [TagAssociationClass(tag=tag) for tag in tag_strings] + + logging.info(f"Tagging dataset {current.urn} with {tag_strings}.") + return tags +``` +Finally, you can install and use your custom transformer as [shown here](#installing-the-package). 
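Assuming the `custom_tags` function above lives in a module named `custom_tags_module.py` sitting next to your recipe (the module and function names here are illustrative), the recipe would point `get_tags_to_add` at its dotted path — a minimal sketch:

```yaml
transformers:
  - type: "add_dataset_tags"
    config:
      # dotted path to the callable defined above (hypothetical module name)
      get_tags_to_add: "custom_tags_module.custom_tags"
```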
+ +`add_dataset_tags` can be configured in below different way + +- Add tags, however replace existing tags sent by ingestion source + ```yaml + transformers: + - type: "add_dataset_tags" + config: + replace_existing: true # false is default behaviour + get_tags_to_add: "." + ``` +- Add tags, however overwrite the tags available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "add_dataset_tags" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + get_tags_to_add: "." + ``` +- Add tags, however keep the tags available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "add_dataset_tags" + config: + semantics: PATCH + get_tags_to_add: "." + ``` +## Set Dataset browsePath +### Config Details +| Field | Required | Type | Default | Description | +|-----------------------------|----------|--------------|--------------|------------------------------------------------------------------| +| `path_templates` | ✅ | list[string] | | List of path templates. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +If you would like to add to browse paths of dataset can use this transformer. There are 3 optional variables that you can use to get information from the dataset `urn`: +- ENV: env passed (default: prod) +- PLATFORM: `mysql`, `postgres` or different platform supported by datahub +- DATASET_PARTS: slash separated parts of dataset name. e.g. `database_name/schema_name/[table_name]` for postgres + +e.g. this can be used to create browse paths like `/prod/postgres/superset/public/logs` for table `superset.public.logs` in a `postgres` database +```yaml +transformers: + - type: "set_dataset_browse_path" + config: + path_templates: + - /ENV/PLATFORM/DATASET_PARTS +``` + +If you don't want the environment but wanted to add something static in the browse path like the database instance name you can use this. +```yaml +transformers: + - type: "set_dataset_browse_path" + config: + path_templates: + - /PLATFORM/marketing_db/DATASET_PARTS +``` +It will create browse path like `/mysql/marketing_db/sales/orders` for a table `sales.orders` in `mysql` database instance. + +You can use this to add multiple browse paths. Different people might know the same data assets by different names. +```yaml +transformers: + - type: "set_dataset_browse_path" + config: + path_templates: + - /PLATFORM/marketing_db/DATASET_PARTS + - /data_warehouse/DATASET_PARTS +``` +This will add 2 browse paths like `/mysql/marketing_db/sales/orders` and `/data_warehouse/sales/orders` for a table `sales.orders` in `mysql` database instance. + +Default behaviour of the transform is to add new browse paths, you can optionally set `replace_existing: True` so +the transform becomes a _set_ operation instead of an _append_. +```yaml +transformers: + - type: "set_dataset_browse_path" + config: + replace_existing: True + path_templates: + - /ENV/PLATFORM/DATASET_PARTS +``` +In this case, the resulting dataset will have only 1 browse path, the one from the transform. 
+ +`set_dataset_browse_path` can be configured in below different way + +- Add browsePath, however replace existing browsePath sent by ingestion source + ```yaml + transformers: + - type: "set_dataset_browse_path" + config: + replace_existing: true # false is default behaviour + path_templates: + - /PLATFORM/marketing_db/DATASET_PARTS + ``` +- Add browsePath, however overwrite the browsePath available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "set_dataset_browse_path" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + path_templates: + - /PLATFORM/marketing_db/DATASET_PARTS + ``` +- Add browsePath, however keep the browsePath available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "set_dataset_browse_path" + config: + semantics: PATCH + path_templates: + - /PLATFORM/marketing_db/DATASET_PARTS + ``` + +## Simple Add Dataset glossaryTerms +### Config Details +| Field | Required | Type | Default | Description | +|-----------------------------|----------|--------------|---------------|------------------------------------------------------------------| +| `term_urns` | ✅ | list[string] | | List of glossaryTerms urn. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +We can use a similar convention to associate [Glossary Terms](../../../docs/generated/ingestion/sources/business-glossary.md) to datasets. +We can use the `simple_add_dataset_terms` transformer that’s included in the ingestion framework. + +The config, which we’d append to our ingestion recipe YAML, would look like this: + + ```yaml + transformers: + - type: "simple_add_dataset_terms" + config: + term_urns: + - "urn:li:glossaryTerm:Email" + - "urn:li:glossaryTerm:Address" + ``` + +`simple_add_dataset_terms` can be configured in below different way + +- Add terms, however replace existing terms sent by ingestion source + ```yaml + transformers: + - type: "simple_add_dataset_terms" + config: + replace_existing: true # false is default behaviour + term_urns: + - "urn:li:glossaryTerm:Email" + - "urn:li:glossaryTerm:Address" + ``` +- Add terms, however overwrite the terms available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "simple_add_dataset_terms" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + term_urns: + - "urn:li:glossaryTerm:Email" + - "urn:li:glossaryTerm:Address" + ``` +- Add terms, however keep the terms available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "simple_add_dataset_terms" + config: + semantics: PATCH + term_urns: + - "urn:li:glossaryTerm:Email" + - "urn:li:glossaryTerm:Address" + ``` + +## Pattern Add Dataset glossaryTerms +### Config Details +| Field | Required | Type | Default | Description | +|-----------------------------|--------|----------------------|--------------|-------------------------------------------------------------------------------------------------| +| `term_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of glossaryTerms urn apply to matching entity urn. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +We can add glossary terms to datasets based on a regex filter. 
+ + ```yaml + transformers: + - type: "pattern_add_dataset_terms" + config: + term_pattern: + rules: + ".*example1.*": ["urn:li:glossaryTerm:Email", "urn:li:glossaryTerm:Address"] + ".*example2.*": ["urn:li:glossaryTerm:PostalCode"] + ``` + +`pattern_add_dataset_terms` can be configured in below different way + +- Add terms, however replace existing terms sent by ingestion source + ```yaml + transformers: + - type: "pattern_add_dataset_terms" + config: + replace_existing: true # false is default behaviour + term_pattern: + rules: + ".*example1.*": ["urn:li:glossaryTerm:Email", "urn:li:glossaryTerm:Address"] + ".*example2.*": ["urn:li:glossaryTerm:PostalCode"] + + ``` +- Add terms, however overwrite the terms available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "pattern_add_dataset_terms" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + term_pattern: + rules: + ".*example1.*": ["urn:li:glossaryTerm:Email", "urn:li:glossaryTerm:Address"] + ".*example2.*": ["urn:li:glossaryTerm:PostalCode"] + ``` +- Add terms, however keep the terms available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "pattern_add_dataset_terms" + config: + semantics: PATCH + term_pattern: + rules: + ".*example1.*": ["urn:li:glossaryTerm:Email", "urn:li:glossaryTerm:Address"] + ".*example2.*": ["urn:li:glossaryTerm:PostalCode"] + ``` +## Pattern Add Dataset Schema Field glossaryTerms +### Config Details +| Field | Required | Type | Default | Description | +|-----------------------------|---------|----------------------|-------------|------------------------------------------------------------------------------------------------| +| `term_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of glossaryTerms urn apply to matching entity urn. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +We can add glossary terms to schema fields based on a regex filter. + +Note that only terms from the first matching pattern will be applied. 
+ + ```yaml + transformers: + - type: "pattern_add_dataset_schema_terms" + config: + term_pattern: + rules: + ".*email.*": ["urn:li:glossaryTerm:Email"] + ".*name.*": ["urn:li:glossaryTerm:Name"] + ``` + +`pattern_add_dataset_schema_terms` can be configured in below different way + +- Add terms, however replace existing terms sent by ingestion source + ```yaml + transformers: + - type: "pattern_add_dataset_schema_terms" + config: + replace_existing: true # false is default behaviour + term_pattern: + rules: + ".*email.*": ["urn:li:glossaryTerm:Email"] + ".*name.*": ["urn:li:glossaryTerm:Name"] + ``` +- Add terms, however overwrite the terms available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "pattern_add_dataset_schema_terms" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + term_pattern: + rules: + ".*email.*": ["urn:li:glossaryTerm:Email"] + ".*name.*": ["urn:li:glossaryTerm:Name"] + ``` +- Add terms, however keep the terms available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "pattern_add_dataset_schema_terms" + config: + semantics: PATCH + term_pattern: + rules: + ".*email.*": ["urn:li:glossaryTerm:Email"] + ".*name.*": ["urn:li:glossaryTerm:Name"] + ``` +## Pattern Add Dataset Schema Field globalTags +### Config Details +| Field | Required | Type | Default | Description | +|-----------------------------|----------|----------------------|-------------|---------------------------------------------------------------------------------------| +| `tag_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of tags urn apply to matching entity urn. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + + +We can also append a series of tags to specific schema fields. To do so, we can use the `pattern_add_dataset_schema_tags` transformer. This will match the regex pattern to each schema field path and assign the respective tags urns given in the array. + +Note that the tags from the first matching pattern will be applied, not all matching patterns. 
+ +The config would look like this: + + ```yaml + transformers: + - type: "pattern_add_dataset_schema_tags" + config: + tag_pattern: + rules: + ".*email.*": ["urn:li:tag:Email"] + ".*name.*": ["urn:li:tag:Name"] + ``` + +`pattern_add_dataset_schema_tags` can be configured in below different way + +- Add tags, however replace existing tag sent by ingestion source + ```yaml + transformers: + - type: "pattern_add_dataset_schema_tags" + config: + replace_existing: true # false is default behaviour + tag_pattern: + rules: + ".*example1.*": ["urn:li:tag:NeedsDocumentation", "urn:li:tag:Legacy"] + ".*example2.*": ["urn:li:tag:NeedsDocumentation"] + ``` +- Add tags, however overwrite the tags available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "pattern_add_dataset_schema_tags" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + tag_pattern: + rules: + ".*example1.*": ["urn:li:tag:NeedsDocumentation", "urn:li:tag:Legacy"] + ".*example2.*": ["urn:li:tag:NeedsDocumentation"] + ``` +- Add tags, however keep the tags available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "pattern_add_dataset_schema_tags" + config: + semantics: PATCH + tag_pattern: + rules: + ".*example1.*": ["urn:li:tag:NeedsDocumentation", "urn:li:tag:Legacy"] + ".*example2.*": ["urn:li:tag:NeedsDocumentation"] + ``` +## Simple Add Dataset datasetProperties +### Config Details +| Field | Required | Type | Default | Description | +|--------------------|---------|----------------|-------------|------------------------------------------------------------------| +| `properties` | ✅ | dict[str, str] | | Map of key value pair. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +`simple_add_dataset_properties` transformer assigns the properties to dataset entity from the configuration. +`properties` field is a dictionary of string values. Note in case of any key collision, the value in the config will +overwrite the previous value. 
+ + ```yaml + transformers: + - type: "simple_add_dataset_properties" + config: + properties: + prop1: value1 + prop2: value2 + ``` + +`simple_add_dataset_properties` can be configured in below different way + +- Add dataset-properties, however replace existing dataset-properties sent by ingestion source + ```yaml + transformers: + - type: "simple_add_dataset_properties" + config: + replace_existing: true # false is default behaviour + properties: + prop1: value1 + prop2: value2 + + + ``` +- Add dataset-properties, however overwrite the dataset-properties available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "simple_add_dataset_properties" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + properties: + prop1: value1 + prop2: value2 + + + ``` +- Add dataset-properties, however keep the dataset-properties available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "simple_add_dataset_properties" + config: + semantics: PATCH + properties: + prop1: value1 + prop2: value2 + + ``` + +## Add Dataset datasetProperties +### Config Details +| Field | Required | Type | Default | Description | +|--------------------------------|----------|--------------------------------------------|-------------|------------------------------------------------------------------| +| `add_properties_resolver_class`| ✅ | Type[AddDatasetPropertiesResolverBase] | | A class extends from `AddDatasetPropertiesResolverBase` | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +If you'd like to add more complex logic for assigning properties, you can use the `add_dataset_properties` transformer, which calls a user-provided class (that extends from `AddDatasetPropertiesResolverBase` class) to determine the properties for each dataset. + +The config, which we’d append to our ingestion recipe YAML, would look like this: + + ```yaml + transformers: + - type: "add_dataset_properties" + config: + add_properties_resolver_class: "." + ``` + +Then define your class to return a list of custom properties, for example: + + ```python + import logging + from typing import Dict + from datahub.ingestion.transformer.add_dataset_properties import AddDatasetPropertiesResolverBase + from datahub.metadata.schema_classes import DatasetSnapshotClass + + class MyPropertiesResolver(AddDatasetPropertiesResolverBase): + def get_properties_to_add(self, current: DatasetSnapshotClass) -> Dict[str, str]: + ### Add custom logic here + properties= {'my_custom_property': 'property value'} + logging.info(f"Adding properties: {properties} to dataset: {current.urn}.") + return properties + ``` + +`add_dataset_properties` can be configured in below different way + +- Add dataset-properties, however replace existing dataset-properties sent by ingestion source + ```yaml + transformers: + - type: "add_dataset_properties" + config: + replace_existing: true # false is default behaviour + add_properties_resolver_class: "." + + ``` +- Add dataset-properties, however overwrite the dataset-properties available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "add_dataset_properties" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + add_properties_resolver_class: "." 
+ + ``` +- Add dataset-properties, however keep the dataset-properties available for the dataset on DataHub GMS + ```yaml + transformers: + - type: "add_dataset_properties" + config: + semantics: PATCH + add_properties_resolver_class: "." + ``` + +## Simple Add Dataset domains +### Config Details +| Field | Required | Type | Default | Description | +|--------------------|----------|------------------------|---------------|------------------------------------------------------------------| +| `domains` | ✅ | list[union[urn, str]] | | List of simple domain name or domain urns. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +For transformer behaviour on `replace_existing` and `semantics`, please refer section [Relationship Between replace_existing And semantics](#relationship-between-replace_existing-and-semantics). + +
+ +let’s suppose we’d like to add a series of domain to dataset, in this case you can use `simple_add_dataset_domain` transformer. + +The config, which we’d append to our ingestion recipe YAML, would look like this: + +Here we can set domains to either urn (i.e. urn:li:domain:engineering) or simple domain name (i.e. engineering) in both of the cases domain should be provisioned on DataHub GMS +```yaml +transformers: + - type: "simple_add_dataset_domain" + config: + semantics: OVERWRITE + domains: + - urn:li:domain:engineering +``` + + +`simple_add_dataset_domain` can be configured in below different way + +- Add domains, however replace existing domains sent by ingestion source +```yaml + transformers: + - type: "simple_add_dataset_domain" + config: + replace_existing: true # false is default behaviour + domains: + - "urn:li:domain:engineering" + - "urn:li:domain:hr" + ``` +- Add domains, however overwrite the domains available for the dataset on DataHub GMS +```yaml + transformers: + - type: "simple_add_dataset_domain" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + domains: + - "urn:li:domain:engineering" + - "urn:li:domain:hr" + ``` +- Add domains, however keep the domains available for the dataset on DataHub GMS +```yaml + transformers: + - type: "simple_add_dataset_domain" + config: + semantics: PATCH + domains: + - "urn:li:domain:engineering" + - "urn:li:domain:hr" + ``` + +## Pattern Add Dataset domains +### Config Details +| Field | Required | Type | Default | Description | +|----------------------------|-----------|---------------------------------|-----------------|----------------------------------------------------------------------------------------------------------------------------| +| `domain_pattern` | ✅ | map[regx, list[union[urn, str]] | | dataset urn with regular expression and list of simple domain name or domain urn need to be apply on matching dataset urn. | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | + +Let’s suppose we’d like to append a series of domain to specific datasets. To do so, we can use the pattern_add_dataset_domain transformer that’s included in the ingestion framework. +This will match the regex pattern to urn of the dataset and assign the respective domain urns given in the array. + +The config, which we’d append to our ingestion recipe YAML, would look like this: +Here we can set domain list to either urn (i.e. urn:li:domain:hr) or simple domain name (i.e. 
hr) +in both of the cases domain should be provisioned on DataHub GMS + + ```yaml + transformers: + - type: "pattern_add_dataset_domain" + config: + semantics: OVERWRITE + domain_pattern: + rules: + 'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.n.*': ["hr"] + 'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.t.*': ["urn:li:domain:finance"] + ``` + +`pattern_add_dataset_domain` can be configured in below different way + +- Add domains, however replace existing domains sent by ingestion source +```yaml + transformers: + - type: "pattern_add_dataset_ownership" + config: + replace_existing: true # false is default behaviour + domain_pattern: + rules: + 'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.n.*': ["hr"] + 'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.t.*': ["urn:li:domain:finance"] + ``` +- Add domains, however overwrite the domains available for the dataset on DataHub GMS +```yaml + transformers: + - type: "pattern_add_dataset_ownership" + config: + semantics: OVERWRITE # OVERWRITE is default behaviour + domain_pattern: + rules: + 'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.n.*': ["hr"] + 'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.t.*': ["urn:li:domain:finance"] + ``` +- Add domains, however keep the domains available for the dataset on DataHub GMS +```yaml + transformers: + - type: "pattern_add_dataset_ownership" + config: + semantics: PATCH + domain_pattern: + rules: + 'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.n.*': ["hr"] + 'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.t.*': ["urn:li:domain:finance"] + ``` +## Relationship Between replace_existing and semantics +The transformer behaviour mentioned here is in context of `simple_add_dataset_ownership`, however it is applicable for all dataset transformers which are supporting `replace_existing` +and `semantics` configuration attributes, for example `simple_add_dataset_tags` will add or remove tags as per behaviour mentioned in this section. + +`replace_existing` controls whether to remove owners from currently executing ingestion pipeline. + +`semantics` controls whether to overwrite or patch owners present on DataHub GMS server. These owners might be added from DataHub Portal. + +if `replace_existing` is set to `true` and `semantics` is set to `OVERWRITE` then transformer takes below steps +1. As `replace_existing` is set to `true`, remove the owners from input entity (i.e. dataset) +2. Add owners mentioned in ingestion recipe to input entity +3. As `semantics` is set to `OVERWRITE` no need to fetch owners present on DataHub GMS server for the input entity +4. Return input entity + +if `replace_existing` is set to `true` and `semantics` is set to `PATCH` then transformer takes below steps +1. `replace_existing` is set to `true`, first remove the owners from input entity (i.e. dataset) +2. Add owners mentioned in ingestion recipe to input entity +3. As `semantics` is set to `PATCH` fetch owners for the input entity from DataHub GMS Server +4. Add owners fetched from DataHub GMS Server to input entity +5. Return input entity + +if `replace_existing` is set to `false` and `semantics` is set to `OVERWRITE` then transformer takes below steps +1. As `replace_existing` is set to `false`, keep the owners present in input entity as is +2. Add owners mentioned in ingestion recipe to input entity +3. 
As `semantics` is set to `OVERWRITE` no need to fetch owners from DataHub GMS Server for the input entity +4. Return input entity + +if `replace_existing` is set to `false` and `semantics` is set to `PATCH` then transformer takes below steps +1. `replace_existing` is set to `false`, keep the owners present in input entity as is +2. Add owners mentioned in ingestion recipe to input entity +3. As `semantics` is set to `PATCH` fetch owners for the input entity from DataHub GMS Server +4. Add owners fetched from DataHub GMS Server to input entity +5. Return input entity + + + +## Writing a custom transformer from scratch + +In the above couple of examples, we use classes that have already been implemented in the ingestion framework. However, it’s common for more advanced cases to pop up where custom code is required, for instance if you'd like to utilize conditional logic or rewrite properties. In such cases, we can add our own modules and define the arguments it takes as a custom transformer. + +As an example, suppose we want to append a set of ownership fields to our metadata that are dependent upon an external source – for instance, an API endpoint or file – rather than a preset list like above. In this case, we can set a JSON file as an argument to our custom config, and our transformer will read this file and append the included ownership elements to all metadata events. + +Our JSON file might look like the following: + +```json +[ + "urn:li:corpuser:athos", + "urn:li:corpuser:porthos", + "urn:li:corpuser:aramis", + "urn:li:corpGroup:the_three_musketeers" +] +``` + +### Defining a config + +To get started, we’ll initiate an `AddCustomOwnershipConfig` class that inherits from [`datahub.configuration.common.ConfigModel`](../../src/datahub/configuration/common.py). The sole parameter will be an `owners_json` which expects a path to a JSON file containing a list of owner URNs. This will go in a file called `custom_transform_example.py`. + +```python +from datahub.configuration.common import ConfigModel + +class AddCustomOwnershipConfig(ConfigModel): + owners_json: str +``` + +### Defining the transformer + +Next, we’ll define the transformer itself, which must inherit from [`datahub.ingestion.api.transform.Transformer`](../../src/datahub/ingestion/api/transform.py). The framework provides a helper class called [`datahub.ingestion.transformer.base_transformer.BaseTransformer`](../../src/datahub/ingestion/transformer/base_transformer.py) that makes it super-simple to write transformers. 
+First, let's get all our imports in:
+
+```python
+# append these to the start of custom_transform_example.py
+import json
+from typing import List, Optional
+
+from datahub.configuration.common import ConfigModel
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.transformer.add_dataset_ownership import Semantics
+from datahub.ingestion.transformer.base_transformer import (
+    BaseTransformer,
+    SingleAspectTransformer,
+)
+from datahub.metadata.schema_classes import (
+    OwnerClass,
+    OwnershipClass,
+    OwnershipTypeClass,
+)
+
+```
+
+Next, let's define the base scaffolding for the class:
+
+```python
+# append this to the end of custom_transform_example.py
+
+class AddCustomOwnership(BaseTransformer, SingleAspectTransformer):
+    """Transformer that adds owners to datasets according to a callback function."""
+
+    # context param to generate run metadata such as a run ID
+    ctx: PipelineContext
+    # as defined in the previous block
+    config: AddCustomOwnershipConfig
+
+    def __init__(self, config: AddCustomOwnershipConfig, ctx: PipelineContext):
+        super().__init__()
+        self.ctx = ctx
+        self.config = config
+
+        with open(self.config.owners_json, "r") as f:
+            raw_owner_urns = json.load(f)
+
+        self.owners = [
+            OwnerClass(owner=owner, type=OwnershipTypeClass.DATAOWNER)
+            for owner in raw_owner_urns
+        ]
+```
+
+A transformer must have two functions: a `create()` function for initialization and a `transform()` function for executing the transformation. Transformers that extend `BaseTransformer` and `SingleAspectTransformer` can avoid having to implement the more complex `transform` function and just implement the `transform_aspect` function.
+
+Let's begin by adding a `create()` method for parsing our configuration dictionary:
+
+```python
+# add this as a function of AddCustomOwnership
+
+@classmethod
+def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddCustomOwnership":
+    config = AddCustomOwnershipConfig.parse_obj(config_dict)
+    return cls(config, ctx)
+```
+
+Next, we need to tell the helper classes which entity types and aspect we are interested in transforming. In this case, we only want to process `dataset` entities and transform the `ownership` aspect.
+
+```python
+# add these as functions of AddCustomOwnership
+
+def entity_types(self) -> List[str]:
+    return ["dataset"]
+
+def aspect_name(self) -> str:
+    return "ownership"
+```
+
+Finally, we need to implement the `transform_aspect()` method that does the work of adding our custom ownership classes. This method will be called by the framework with an optional aspect value filled out if the upstream source produced a value for this aspect. The framework takes care of pre-processing both MCEs and MCPs so that the `transform_aspect()` function is only called once per entity. Our job is merely to inspect the incoming aspect (or its absence) and produce a transformed value for this aspect. Returning `None` from this method will effectively suppress this aspect from being emitted.
+ +```python +# add this as a function of AddCustomOwnership + + def transform_aspect( # type: ignore + self, entity_urn: str, aspect_name: str, aspect: Optional[OwnershipClass] + ) -> Optional[OwnershipClass]: + + owners_to_add = self.owners + assert aspect is None or isinstance(aspect, OwnershipClass) + + if owners_to_add: + ownership = ( + aspect + if aspect + else OwnershipClass( + owners=[], + ) + ) + ownership.owners.extend(owners_to_add) + + return ownership +``` + +### More Sophistication: Making calls to DataHub during Transformation + +In some advanced cases, you might want to check with DataHub before performing a transformation. A good example for this might be retrieving the current set of owners of a dataset before providing the new set of owners during an ingestion process. To allow transformers to always be able to query the graph, the framework provides them access to the graph through the context object `ctx`. Connectivity to the graph is automatically instantiated anytime the pipeline uses a REST sink. In case you are using the Kafka sink, you can additionally provide access to the graph by configuring it in your pipeline. + +Here is an example of a recipe that uses Kafka as the sink, but provides access to the graph by explicitly configuring the `datahub_api`. + +```yaml +source: + type: mysql + config: + # ..source configs + +sink: + type: datahub-kafka + config: + connection: + bootstrap: localhost:9092 + schema_registry_url: "http://localhost:8081" + +datahub_api: + server: http://localhost:8080 + # standard configs accepted by datahub rest client ... +``` + +#### Advanced Use-Case: Patching Owners + +With the above capability, we can now build more powerful transformers that can check with the server-side state before issuing changes in metadata. +e.g. Here is how the AddDatasetOwnership transformer can now support PATCH semantics by ensuring that it never deletes any owners that are stored on the server. + +```python +def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass: + if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass): + return mce + owners_to_add = self.config.get_owners_to_add(mce.proposedSnapshot) + if owners_to_add: + ownership = builder.get_or_add_aspect( + mce, + OwnershipClass( + owners=[], + ), + ) + ownership.owners.extend(owners_to_add) + + if self.config.semantics == Semantics.PATCH: + assert self.ctx.graph + patch_ownership = AddDatasetOwnership.get_ownership_to_set( + self.ctx.graph, mce.proposedSnapshot.urn, ownership + ) + builder.set_aspect( + mce, aspect=patch_ownership, aspect_type=OwnershipClass + ) + return mce +``` + +### Installing the package + +Now that we've defined the transformer, we need to make it visible to DataHub. The easiest way to do this is to just place it in the same directory as your recipe, in which case the module name is the same as the file – in this case, `custom_transform_example`. + +
+<details>
+  <summary>Advanced: installing as a package</summary>
+
+Alternatively, create a `setup.py` in the same directory as our transform script to make it visible globally. After installing this package (e.g. with `python setup.py install` or `pip install -e .`), our module will be installed and importable as `custom_transform_example`.
+
+```python
+from setuptools import find_packages, setup
+
+setup(
+    name="custom_transform_example",
+    version="1.0",
+    packages=find_packages(),
+    # if you don't already have DataHub installed, add it under install_requires
+    # install_requires=["acryl-datahub"]
+)
+```
+
+</details>
+
+### Running the transform
+
+```yaml
+transformers:
+  - type: "custom_transform_example.AddCustomOwnership"
+    config:
+      owners_json: "<path/to/owners.json>"  # the JSON file mentioned at the start
+```
+
+After running `datahub ingest -c <path/to/recipe.yml>`, our MCEs will now have the following owners appended:
+
+```json
+"owners": [
+    {
+        "owner": "urn:li:corpuser:athos",
+        "type": "DATAOWNER",
+        "source": null
+    },
+    {
+        "owner": "urn:li:corpuser:porthos",
+        "type": "DATAOWNER",
+        "source": null
+    },
+    {
+        "owner": "urn:li:corpuser:aramis",
+        "type": "DATAOWNER",
+        "source": null
+    },
+    {
+        "owner": "urn:li:corpGroup:the_three_musketeers",
+        "type": "DATAOWNER",
+        "source": null
+    },
+    // ...and any additional owners
+],
+```
+
+All the files for this tutorial may be found [here](../../examples/transforms/).
diff --git a/metadata-ingestion/docs/transformer/intro.md b/metadata-ingestion/docs/transformer/intro.md
new file mode 100644
index 00000000000000..18629d0abb6758
--- /dev/null
+++ b/metadata-ingestion/docs/transformer/intro.md
@@ -0,0 +1,33 @@
+---
+title: "Introduction"
+---
+
+# Transformers
+
+## What’s a transformer?
+
+Oftentimes we want to modify metadata before it reaches the ingestion sink – for instance, we might want to add custom tags, ownership, properties, or patch some fields. A transformer allows us to do exactly these things.
+
+Moreover, a transformer allows you to have fine-grained control over the ingested metadata without having to modify the ingestion framework's code yourself. Instead, you can write your own module that can transform metadata events however you like. To include a transformer in a recipe, all that's needed is the name of the transformer as well as any configuration that the transformer needs.
+
+## Provided transformers
+
+Aside from the option of writing your own transformer (see below), we provide some simple transformers for common use cases: adding tags, glossary terms, properties, and ownership information.
+ +DataHub provided transformers for dataset are: +- [Simple Add Dataset ownership](./dataset_transformer.md#simple-add-dataset-ownership) +- [Pattern Add Dataset ownership](./dataset_transformer.md#pattern-add-dataset-ownership) +- [Simple Remove Dataset ownership](./dataset_transformer.md#simple-remove-dataset-ownership) +- [Mark Dataset Status](./dataset_transformer.md#mark-dataset-status) +- [Simple Add Dataset globalTags](./dataset_transformer.md#simple-add-dataset-globaltags) +- [Pattern Add Dataset globalTags](./dataset_transformer.md#pattern-add-dataset-globaltags) +- [Add Dataset globalTags](./dataset_transformer.md#add-dataset-globaltags) +- [Set Dataset browsePath](./dataset_transformer.md#set-dataset-browsepath) +- [Simple Add Dataset glossaryTerms](./dataset_transformer.md#simple-add-dataset-glossaryterms) +- [Pattern Add Dataset glossaryTerms](./dataset_transformer.md#pattern-add-dataset-glossaryterms) +- [Pattern Add Dataset Schema Field glossaryTerms](./dataset_transformer.md#pattern-add-dataset-schema-field-glossaryterms) +- [Pattern Add Dataset Schema Field globalTags](./dataset_transformer.md#pattern-add-dataset-schema-field-globaltags) +- [Simple Add Dataset datasetProperties](./dataset_transformer.md#simple-add-dataset-datasetproperties) +- [Add Dataset datasetProperties](./dataset_transformer.md#add-dataset-datasetproperties) +- [Simple Add Dataset domains](./dataset_transformer.md#simple-add-dataset-domains) +- [Pattern Add Dataset domains](./dataset_transformer.md#pattern-add-dataset-domains) diff --git a/metadata-ingestion/examples/transforms/custom_transform_example.py b/metadata-ingestion/examples/transforms/custom_transform_example.py index 57560e75cf7e92..4886185647a6d2 100644 --- a/metadata-ingestion/examples/transforms/custom_transform_example.py +++ b/metadata-ingestion/examples/transforms/custom_transform_example.py @@ -2,9 +2,8 @@ import json from typing import List, Optional -from datahub.configuration.common import ConfigModel +from datahub.configuration.common import ConfigModel, TransformerSemantics from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.transformer.add_dataset_ownership import Semantics from datahub.ingestion.transformer.base_transformer import ( BaseTransformer, SingleAspectTransformer, @@ -18,7 +17,7 @@ class AddCustomOwnershipConfig(ConfigModel): owners_json: str - semantics: Semantics = Semantics.OVERWRITE + semantics: TransformerSemantics = TransformerSemantics.OVERWRITE class AddCustomOwnership(BaseTransformer, SingleAspectTransformer): diff --git a/metadata-ingestion/src/datahub/cli/check_cli.py b/metadata-ingestion/src/datahub/cli/check_cli.py index 65bf9c0cab553a..b6484549a7de13 100644 --- a/metadata-ingestion/src/datahub/cli/check_cli.py +++ b/metadata-ingestion/src/datahub/cli/check_cli.py @@ -27,7 +27,10 @@ def check() -> None: is_flag=True, help="Rewrite the JSON file to it's canonical form.", ) -def metadata_file(json_file: str, rewrite: bool) -> None: +@click.option( + "--unpack-mces", default=False, is_flag=True, help="Converts MCEs into MCPs" +) +def metadata_file(json_file: str, rewrite: bool, unpack_mces: bool) -> None: """Check the schema of a metadata (MCE or MCP) JSON file.""" if not rewrite: @@ -42,7 +45,10 @@ def metadata_file(json_file: str, rewrite: bool) -> None: "type": "file", "config": {"filename": json_file}, "extractor": "generic", - "extractor_config": {"set_system_metadata": False}, + "extractor_config": { + "set_system_metadata": False, + "unpack_mces_into_mcps": unpack_mces, 
+ }, }, "sink": { "type": "file", diff --git a/metadata-ingestion/src/datahub/cli/cli_utils.py b/metadata-ingestion/src/datahub/cli/cli_utils.py index bddea0c0342a43..e026e7ba55a121 100644 --- a/metadata-ingestion/src/datahub/cli/cli_utils.py +++ b/metadata-ingestion/src/datahub/cli/cli_utils.py @@ -17,7 +17,7 @@ from datahub.emitter.request_helper import _make_curl_command from datahub.emitter.serialization_helper import post_json_transform from datahub.metadata.schema_classes import _ASPECT_CLASSES, _Aspect -from datahub.utilities.urns.urn import Urn +from datahub.utilities.urns.urn import Urn, guess_entity_type log = logging.getLogger(__name__) @@ -125,11 +125,6 @@ def first_non_null(ls: List[Optional[str]]) -> Optional[str]: return next((el for el in ls if el is not None and el.strip() != ""), None) -def guess_entity_type(urn: str) -> str: - assert urn.startswith("urn:li:"), "urns must start with urn:li:" - return urn.split(":")[2] - - def get_system_auth() -> Optional[str]: system_client_id = os.environ.get(ENV_DATAHUB_SYSTEM_CLIENT_ID) system_client_secret = os.environ.get(ENV_DATAHUB_SYSTEM_CLIENT_SECRET) diff --git a/metadata-ingestion/src/datahub/cli/delete_cli.py b/metadata-ingestion/src/datahub/cli/delete_cli.py index 5174c4b77f8e1e..32fb531fa299c6 100644 --- a/metadata-ingestion/src/datahub/cli/delete_cli.py +++ b/metadata-ingestion/src/datahub/cli/delete_cli.py @@ -10,7 +10,6 @@ from tabulate import tabulate from datahub.cli import cli_utils -from datahub.cli.cli_utils import guess_entity_type from datahub.emitter import rest_emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( @@ -20,6 +19,7 @@ ) from datahub.telemetry import telemetry from datahub.upgrade import upgrade +from datahub.utilities.urns.urn import guess_entity_type logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/cli/docker.py b/metadata-ingestion/src/datahub/cli/docker.py index 63fc4c6144bdd0..51c5197f564c1d 100644 --- a/metadata-ingestion/src/datahub/cli/docker.py +++ b/metadata-ingestion/src/datahub/cli/docker.py @@ -36,10 +36,17 @@ M1_QUICKSTART_COMPOSE_FILE = ( "docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml" ) +CONSUMERS_QUICKSTART_COMPOSE_FILE = ( + "docker/quickstart/docker-compose.consumers.quickstart.yml" +) +ELASTIC_CONSUMERS_QUICKSTART_COMPOSE_FILE = ( + "docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml" +) BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json" GITHUB_BASE_URL = "https://raw.githubusercontent.com/datahub-project/datahub/master" + GITHUB_NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL = ( f"{GITHUB_BASE_URL}/{NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE}" ) @@ -188,7 +195,8 @@ def _attempt_stop(quickstart_compose_file: List[pathlib.Path]) -> None: if compose_files_for_stopping: # docker-compose stop base_command: List[str] = [ - "docker-compose", + "docker", + "compose", *itertools.chain.from_iterable( ("-f", f"{path}") for path in compose_files_for_stopping ), @@ -473,6 +481,13 @@ def _restore( default=False, help="Disables the restoration of indices of a running quickstart instance when used in conjunction with --restore.", ) +@click.option( + "--standalone_consumers", + required=False, + is_flag=True, + default=False, + help="Launches MAE & MCE consumers as stand alone docker containers", +) @upgrade.check_upgrade @telemetry.with_telemetry def quickstart( @@ -493,6 +508,7 @@ def quickstart( restore_file: str, restore_indices: bool, 
no_restore_indices: bool, + standalone_consumers: bool, ) -> None: """Start an instance of DataHub locally using docker-compose. @@ -570,6 +586,32 @@ def quickstart( tmp_file.write(quickstart_download_response.content) logger.debug(f"Copied to {path}") + if standalone_consumers: + consumer_github_file = ( + f"{GITHUB_BASE_URL}/{CONSUMERS_QUICKSTART_COMPOSE_FILE}" + if should_use_neo4j + else f"{GITHUB_BASE_URL}/{ELASTIC_CONSUMERS_QUICKSTART_COMPOSE_FILE}" + ) + + default_consumer_compose_file = ( + Path(DATAHUB_ROOT_FOLDER) / "quickstart/docker-compose.consumers.yml" + ) + with open( + default_consumer_compose_file, "wb" + ) if default_consumer_compose_file else tempfile.NamedTemporaryFile( + suffix=".yml", delete=False + ) as tmp_file: + path = pathlib.Path(tmp_file.name) + quickstart_compose_file.append(path) + click.echo( + f"Fetching consumer docker-compose file {consumer_github_file} from GitHub" + ) + # Download the quickstart docker-compose file from GitHub. + quickstart_download_response = requests.get(consumer_github_file) + quickstart_download_response.raise_for_status() + tmp_file.write(quickstart_download_response.content) + logger.debug(f"Copied to {path}") + # set version _set_environment_variables( version=version, @@ -581,7 +623,8 @@ def quickstart( ) base_command: List[str] = [ - "docker-compose", + "docker", + "compose", *itertools.chain.from_iterable( ("-f", f"{path}") for path in quickstart_compose_file ), @@ -597,7 +640,7 @@ def quickstart( ) except subprocess.CalledProcessError: click.secho( - "Error while pulling images. Going to attempt to move on to docker-compose up assuming the images have " + "Error while pulling images. Going to attempt to move on to docker compose up assuming the images have " "been built locally", fg="red", ) @@ -623,7 +666,7 @@ def quickstart( up_interval = datetime.timedelta(seconds=30) up_attempts = 0 while (datetime.datetime.now() - start_time) < max_wait_time: - # Attempt to run docker-compose up every minute. + # Attempt to run docker compose up every minute. 
if (datetime.datetime.now() - start_time) > up_attempts * up_interval: click.echo() subprocess.run(base_command + ["up", "-d", "--remove-orphans"]) @@ -651,7 +694,7 @@ def quickstart( if dump_logs_on_failure: with open(log_file.name, "r") as logs: - click.echo("Dumping docker-compose logs:") + click.echo("Dumping docker compose logs:") click.echo(logs.read()) click.echo() diff --git a/metadata-ingestion/src/datahub/cli/put_cli.py b/metadata-ingestion/src/datahub/cli/put_cli.py index 2f326804bfc833..cd40455c1a2397 100644 --- a/metadata-ingestion/src/datahub/cli/put_cli.py +++ b/metadata-ingestion/src/datahub/cli/put_cli.py @@ -4,9 +4,10 @@ import click -from datahub.cli.cli_utils import guess_entity_type, post_entity +from datahub.cli.cli_utils import post_entity from datahub.telemetry import telemetry from datahub.upgrade import upgrade +from datahub.utilities.urns.urn import guess_entity_type logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index 0484bfc3190dbc..35a30f025aefdc 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -26,6 +26,7 @@ class TransformerSemantics(Enum): class TransformerSemanticsConfigModel(ConfigModel): semantics: TransformerSemantics = TransformerSemantics.OVERWRITE + replace_existing: bool = False @validator("semantics", pre=True) def ensure_semantics_is_upper_case(cls, v: str) -> str: diff --git a/metadata-ingestion/src/datahub/emitter/mcp_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_builder.py index 87b5307a7e4e24..ad2df41a1bb069 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mcp_builder.py @@ -1,6 +1,6 @@ import hashlib import json -from typing import Any, Iterable, List, Optional, TypeVar, Union +from typing import Any, Iterable, List, Optional, TypeVar from pydantic.fields import Field from pydantic.main import BaseModel @@ -16,6 +16,7 @@ ContainerClass, DomainsClass, GlobalTagsClass, + MetadataChangeEventClass, OwnerClass, OwnershipClass, OwnershipTypeClass, @@ -23,6 +24,7 @@ TagAssociationClass, _Aspect, ) +from datahub.utilities.urns.urn import guess_entity_type class DatahubKey(BaseModel): @@ -250,7 +252,7 @@ def add_dataset_to_container( # FIXME: Union requires two or more type arguments container_key: KeyType, dataset_urn: str, -) -> Iterable[Union[MetadataWorkUnit]]: +) -> Iterable[MetadataWorkUnit]: container_urn = make_container_urn( guid=container_key.guid(), ) @@ -280,3 +282,17 @@ def add_entity_to_container( ) wu = MetadataWorkUnit(id=f"container-{container_urn}-to-{entity_urn}", mcp=mcp) yield wu + + +def mcps_from_mce( + mce: MetadataChangeEventClass, +) -> Iterable[MetadataChangeProposalWrapper]: + for aspect in mce.proposedSnapshot.aspects: + yield MetadataChangeProposalWrapper( + entityType=guess_entity_type(mce.proposedSnapshot.urn), + changeType=ChangeTypeClass.UPSERT, + entityUrn=mce.proposedSnapshot.urn, + auditHeader=mce.auditHeader, + aspect=aspect, + systemMetadata=mce.systemMetadata, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/api/workunit.py b/metadata-ingestion/src/datahub/ingestion/api/workunit.py index 1db67d3b9f79e1..e74dd7be819155 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/workunit.py +++ b/metadata-ingestion/src/datahub/ingestion/api/workunit.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Union, overload 
+from typing import Iterable, Union, overload from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.source import WorkUnit @@ -64,6 +64,20 @@ def __init__( def get_metadata(self) -> dict: return {"metadata": self.metadata} + def decompose_mce_into_mcps(self) -> Iterable["MetadataWorkUnit"]: + from datahub.emitter.mcp_builder import mcps_from_mce + + assert isinstance(self.metadata, MetadataChangeEvent) + + yield from [ + MetadataWorkUnit( + id=self.id, + mcp=mcpw, + treat_errors_as_warnings=self.treat_errors_as_warnings, + ) + for mcpw in mcps_from_mce(self.metadata) + ] + @dataclass class UsageStatsWorkUnit(WorkUnit): diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py b/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py index 9a9e15fcac68a0..fd74312b6439be 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/mce_extractor.py @@ -21,6 +21,7 @@ class WorkUnitRecordExtractorConfig(ConfigModel): set_system_metadata = True + unpack_mces_into_mcps = False class WorkUnitRecordExtractor( @@ -41,6 +42,13 @@ def get_records( ] ]: if isinstance(workunit, MetadataWorkUnit): + if self.config.unpack_mces_into_mcps and isinstance( + workunit.metadata, MetadataChangeEvent + ): + for inner_workunit in workunit.decompose_mce_into_mcps(): + yield from self.get_records(inner_workunit) + return + if isinstance( workunit.metadata, ( diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index e1d2de4204534c..0c406a646776da 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -14,15 +14,18 @@ from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.emitter.serialization_helper import post_json_transform from datahub.metadata.schema_classes import ( + BrowsePathsClass, + DatasetPropertiesClass, DatasetUsageStatisticsClass, DomainPropertiesClass, DomainsClass, GlobalTagsClass, GlossaryTermsClass, OwnershipClass, + SchemaMetadataClass, TelemetryClientIdClass, ) -from datahub.utilities.urns.urn import Urn +from datahub.utilities.urns.urn import Urn, guess_entity_type logger = logging.getLogger(__name__) @@ -108,11 +111,6 @@ def _post_generic(self, url: str, payload_dict: Dict) -> Dict: "Unable to get metadata from DataHub", {"message": str(e)} ) from e - @staticmethod - def _guess_entity_type(urn: str) -> str: - assert urn.startswith("urn:li:"), "urns must start with urn:li:" - return urn.split(":")[2] - @deprecated( reason="Use get_aspect_v2 instead which makes aspect_type_name truly optional" ) @@ -185,6 +183,13 @@ def get_ownership(self, entity_urn: str) -> Optional[OwnershipClass]: aspect_type=OwnershipClass, ) + def get_schema_metadata(self, entity_urn: str) -> Optional[SchemaMetadataClass]: + return self.get_aspect_v2( + entity_urn=entity_urn, + aspect="schemaMetadata", + aspect_type=SchemaMetadataClass, + ) + def get_domain_properties(self, entity_urn: str) -> Optional[DomainPropertiesClass]: return self.get_aspect_v2( entity_urn=entity_urn, @@ -192,6 +197,15 @@ def get_domain_properties(self, entity_urn: str) -> Optional[DomainPropertiesCla aspect_type=DomainPropertiesClass, ) + def get_dataset_properties( + self, entity_urn: str + ) -> Optional[DatasetPropertiesClass]: + return self.get_aspect_v2( + entity_urn=entity_urn, + aspect="datasetProperties", + 
aspect_type=DatasetPropertiesClass, + ) + def get_tags(self, entity_urn: str) -> Optional[GlobalTagsClass]: return self.get_aspect_v2( entity_urn=entity_urn, @@ -213,6 +227,13 @@ def get_domain(self, entity_urn: str) -> Optional[DomainsClass]: aspect_type=DomainsClass, ) + def get_browse_path(self, entity_urn: str) -> Optional[BrowsePathsClass]: + return self.get_aspect_v2( + entity_urn=entity_urn, + aspect="browsePaths", + aspect_type=BrowsePathsClass, + ) + def get_usage_aspects_from_urn( self, entity_urn: str, start_timestamp: int, end_timestamp: int ) -> Optional[List[DatasetUsageStatisticsClass]]: @@ -286,7 +307,7 @@ def get_latest_timeseries_value( ] query_body = { "urn": entity_urn, - "entity": self._guess_entity_type(entity_urn), + "entity": guess_entity_type(entity_urn), "aspect": aspect_name, "latestValue": True, "filter": {"or": [{"and": filter_criteria}]}, diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_browse_path.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_browse_path.py index 542c568cb89502..d0533af2db7151 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_browse_path.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_browse_path.py @@ -1,17 +1,20 @@ -from typing import List +from typing import List, Optional, cast -import datahub.emitter.mce_builder as builder -from datahub.configuration.common import ConfigModel +from datahub.configuration.common import ( + TransformerSemantics, + TransformerSemanticsConfigModel, +) +from datahub.emitter.mce_builder import Aspect from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.transformer.dataset_transformer import ( DatasetBrowsePathsTransformer, ) -from datahub.metadata.schema_classes import BrowsePathsClass, MetadataChangeEventClass +from datahub.metadata.schema_classes import BrowsePathsClass -class AddDatasetBrowsePathConfig(ConfigModel): +class AddDatasetBrowsePathConfig(TransformerSemanticsConfigModel): path_templates: List[str] - replace_existing: bool = False class AddDatasetBrowsePathTransformer(DatasetBrowsePathsTransformer): @@ -32,24 +35,42 @@ def create( config = AddDatasetBrowsePathConfig.parse_obj(config_dict) return cls(config, ctx) - def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass: + @staticmethod + def get_browse_paths_to_set( + graph: DataHubGraph, urn: str, mce_browse_paths: Optional[BrowsePathsClass] + ) -> Optional[BrowsePathsClass]: + if not mce_browse_paths or not mce_browse_paths.paths: + # nothing to add, no need to consult server + return None + + server_browse_paths = graph.get_browse_path(entity_urn=urn) + if server_browse_paths: + # compute patch + # we only include domain who are not present in the server domain list + paths_to_add: List[str] = [] + for path in mce_browse_paths.paths: + if path not in server_browse_paths.paths: + paths_to_add.append(path) + # Lets patch + mce_browse_paths.paths = [] + mce_browse_paths.paths.extend(server_browse_paths.paths) + mce_browse_paths.paths.extend(paths_to_add) + + return mce_browse_paths + + def transform_aspect( + self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] + ) -> Optional[Aspect]: platform_part, dataset_fqdn, env = ( - mce.proposedSnapshot.urn.replace("urn:li:dataset:(", "") - .replace(")", "") - .split(",") + entity_urn.replace("urn:li:dataset:(", "").replace(")", "").split(",") ) + platform = 
platform_part.replace("urn:li:dataPlatform:", "") dataset = dataset_fqdn.replace(".", "/") - browse_paths = builder.get_or_add_aspect( - mce, - BrowsePathsClass( - paths=[], - ), - ) - - if self.config.replace_existing: - browse_paths.paths = [] + browse_paths = BrowsePathsClass(paths=[]) + if aspect is not None and self.config.replace_existing is False: + browse_paths.paths.extend(aspect.paths) # type: ignore[attr-defined] for template in self.config.path_templates: browse_path = ( @@ -57,7 +78,16 @@ def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventCla .replace("DATASET_PARTS", dataset) .replace("ENV", env.lower()) ) - browse_paths.paths.append(browse_path) - return mce + if self.config.semantics == TransformerSemantics.PATCH: + assert self.ctx.graph + patch_browse_paths: Optional[ + BrowsePathsClass + ] = AddDatasetBrowsePathTransformer.get_browse_paths_to_set( + self.ctx.graph, entity_urn, browse_paths + ) + if patch_browse_paths is not None: + browse_paths = patch_browse_paths + + return cast(Optional[Aspect], browse_paths) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py index ea30f957044cf4..07778351f8c137 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_ownership.py @@ -1,54 +1,38 @@ -from enum import Enum -from typing import Callable, List, Optional, Union - -from pydantic import validator +from typing import Callable, List, Optional, Union, cast import datahub.emitter.mce_builder as builder from datahub.configuration.common import ( - ConfigModel, ConfigurationError, KeyValuePattern, + TransformerSemantics, + TransformerSemanticsConfigModel, ) from datahub.configuration.import_resolver import pydantic_resolve_key +from datahub.emitter.mce_builder import Aspect from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.transformer.dataset_transformer import ( DatasetOwnershipTransformer, ) from datahub.metadata.schema_classes import ( - DatasetSnapshotClass, - MetadataChangeEventClass, + AuditStampClass, OwnerClass, OwnershipClass, OwnershipTypeClass, ) -class Semantics(Enum): - """Describes semantics for ownership changes""" - - OVERWRITE = "OVERWRITE" # Apply changes blindly - PATCH = "PATCH" # Only apply differences from what exists already on the server - - -class AddDatasetOwnershipConfig(ConfigModel): +class AddDatasetOwnershipConfig(TransformerSemanticsConfigModel): # Workaround for https://github.com/python/mypy/issues/708. # Suggested by https://stackoverflow.com/a/64528725/5004662. 
get_owners_to_add: Union[ - Callable[[DatasetSnapshotClass], List[OwnerClass]], - Callable[[DatasetSnapshotClass], List[OwnerClass]], + Callable[[str], List[OwnerClass]], + Callable[[str], List[OwnerClass]], ] default_actor: str = builder.make_user_urn("etl") - semantics: Semantics = Semantics.OVERWRITE _resolve_owner_fn = pydantic_resolve_key("get_owners_to_add") - @validator("semantics", pre=True) - def ensure_semantics_is_upper_case(cls, v): - if isinstance(v, str): - return v.upper() - return v - class AddDatasetOwnership(DatasetOwnershipTransformer): """Transformer that adds owners to datasets according to a callback function.""" @@ -60,9 +44,12 @@ def __init__(self, config: AddDatasetOwnershipConfig, ctx: PipelineContext): super().__init__() self.ctx = ctx self.config = config - if self.config.semantics == Semantics.PATCH and self.ctx.graph is None: + if ( + self.config.semantics == TransformerSemantics.PATCH + and self.ctx.graph is None + ): raise ConfigurationError( - "With PATCH semantics, AddDatasetOwnership requires a datahub_api to connect to. Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe" + "With PATCH TransformerSemantics, AddDatasetOwnership requires a datahub_api to connect to. Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe" ) @classmethod @@ -71,7 +58,7 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetOwnership return cls(config, ctx) @staticmethod - def get_ownership_to_set( + def get_patch_ownership_aspect( graph: DataHubGraph, urn: str, mce_ownership: Optional[OwnershipClass] ) -> Optional[OwnershipClass]: if not mce_ownership or not mce_ownership.owners: @@ -107,43 +94,47 @@ def get_ownership_to_set( else: return mce_ownership - def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass: - assert isinstance(mce.proposedSnapshot, DatasetSnapshotClass) - owners_to_add = self.config.get_owners_to_add(mce.proposedSnapshot) - if owners_to_add: - ownership = builder.get_or_add_aspect( - mce, - OwnershipClass( - owners=[], - ), + def transform_aspect( + self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] + ) -> Optional[Aspect]: + + in_ownership_aspect: Optional[OwnershipClass] = cast(OwnershipClass, aspect) + out_ownership_aspect: OwnershipClass = OwnershipClass( + owners=[], + lastModified=in_ownership_aspect.lastModified + if in_ownership_aspect is not None + else AuditStampClass.construct_with_defaults(), + ) + + # Check if user want to keep existing ownerships + if in_ownership_aspect is not None and self.config.replace_existing is False: + out_ownership_aspect.owners.extend(in_ownership_aspect.owners) + + owners_to_add = self.config.get_owners_to_add(entity_urn) + if owners_to_add is not None: + out_ownership_aspect.owners.extend(owners_to_add) + + patch_ownership: Optional[OwnershipClass] = None + if self.config.semantics == TransformerSemantics.PATCH: + assert self.ctx.graph + patch_ownership = AddDatasetOwnership.get_patch_ownership_aspect( + self.ctx.graph, entity_urn, out_ownership_aspect ) - ownership.owners.extend(owners_to_add) - if self.config.semantics == Semantics.PATCH: - assert self.ctx.graph - patch_ownership = AddDatasetOwnership.get_ownership_to_set( - self.ctx.graph, mce.proposedSnapshot.urn, ownership - ) - builder.set_aspect( - mce, aspect=patch_ownership, aspect_type=OwnershipClass - ) - return mce + return ( + cast(Aspect, patch_ownership) + if patch_ownership is not None + else 
cast(Aspect, out_ownership_aspect) + ) -class DatasetOwnershipBaseConfig(ConfigModel): +class DatasetOwnershipBaseConfig(TransformerSemanticsConfigModel): ownership_type: Optional[str] = OwnershipTypeClass.DATAOWNER class SimpleDatasetOwnershipConfig(DatasetOwnershipBaseConfig): owner_urns: List[str] default_actor: str = builder.make_user_urn("etl") - semantics: Semantics = Semantics.OVERWRITE - - @validator("semantics", pre=True) - def upper_case_semantics(cls, v): - if isinstance(v, str): - return v.upper() - return v class SimpleAddDatasetOwnership(AddDatasetOwnership): @@ -163,6 +154,7 @@ def __init__(self, config: SimpleDatasetOwnershipConfig, ctx: PipelineContext): get_owners_to_add=lambda _: owners, default_actor=config.default_actor, semantics=config.semantics, + replace_existing=config.replace_existing, ) super().__init__(generic_config, ctx) @@ -201,14 +193,16 @@ def __init__(self, config: PatternDatasetOwnershipConfig, ctx: PipelineContext): ownership_type = builder.validate_ownership_type(config.ownership_type) owner_pattern = config.owner_pattern generic_config = AddDatasetOwnershipConfig( - get_owners_to_add=lambda _: [ + get_owners_to_add=lambda urn: [ OwnerClass( owner=owner, type=ownership_type, ) - for owner in owner_pattern.value(_.urn) + for owner in owner_pattern.value(urn) ], default_actor=config.default_actor, + semantics=config.semantics, + replace_existing=config.replace_existing, ) super().__init__(generic_config, ctx) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_properties.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_properties.py index 8c778931ddcf52..76d89c68b89033 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_properties.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_properties.py @@ -1,27 +1,28 @@ +import copy from abc import ABC, abstractmethod -from typing import Any, Dict, Type +from typing import Any, Dict, Optional, Type, cast -import datahub.emitter.mce_builder as builder -from datahub.configuration.common import ConfigModel +from datahub.configuration.common import ( + TransformerSemantics, + TransformerSemanticsConfigModel, +) from datahub.configuration.import_resolver import pydantic_resolve_key +from datahub.emitter.mce_builder import Aspect from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.transformer.dataset_transformer import ( DatasetPropertiesTransformer, ) -from datahub.metadata.schema_classes import ( - DatasetPropertiesClass, - DatasetSnapshotClass, - MetadataChangeEventClass, -) +from datahub.metadata.schema_classes import DatasetPropertiesClass class AddDatasetPropertiesResolverBase(ABC): @abstractmethod - def get_properties_to_add(self, current: DatasetSnapshotClass) -> Dict[str, str]: + def get_properties_to_add(self, entity_urn: str) -> Dict[str, str]: pass -class AddDatasetPropertiesConfig(ConfigModel): +class AddDatasetPropertiesConfig(TransformerSemanticsConfigModel): add_properties_resolver_class: Type[AddDatasetPropertiesResolverBase] class Config: @@ -52,23 +53,75 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetPropertie config = AddDatasetPropertiesConfig.parse_obj(config_dict) return cls(config, ctx) - def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass: - if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass): - return mce + @staticmethod + def 
get_patch_dataset_properties_aspect( + graph: DataHubGraph, + entity_urn: str, + dataset_properties_aspect: Optional[DatasetPropertiesClass], + ) -> Optional[DatasetPropertiesClass]: + assert dataset_properties_aspect + + server_dataset_properties_aspect: Optional[ + DatasetPropertiesClass + ] = graph.get_dataset_properties(entity_urn) + # No need to take any action if server properties is None or there is not customProperties in server properties + if ( + server_dataset_properties_aspect is None + or not server_dataset_properties_aspect.customProperties + ): + return dataset_properties_aspect + + custom_properties_to_add = server_dataset_properties_aspect.customProperties + # Give precedence to ingestion custom properties + # if same property exist on server as well as in input aspect then value of input aspect would get set in output + custom_properties_to_add.update(dataset_properties_aspect.customProperties) + + patch_dataset_properties: DatasetPropertiesClass = copy.deepcopy( + dataset_properties_aspect + ) + patch_dataset_properties.customProperties = custom_properties_to_add + + return patch_dataset_properties + + def transform_aspect( + self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] + ) -> Optional[Aspect]: + + in_dataset_properties_aspect: DatasetPropertiesClass = cast( + DatasetPropertiesClass, aspect + ) + + assert in_dataset_properties_aspect + + out_dataset_properties_aspect: DatasetPropertiesClass = copy.deepcopy( + in_dataset_properties_aspect + ) + if self.config.replace_existing is True: + # clean the existing properties + out_dataset_properties_aspect.customProperties = {} properties_to_add = self.config.add_properties_resolver_class( # type: ignore **self.resolver_args - ).get_properties_to_add(mce.proposedSnapshot) - if properties_to_add: - properties = builder.get_or_add_aspect( - mce, DatasetPropertiesClass(customProperties={}) + ).get_properties_to_add(entity_urn) + + out_dataset_properties_aspect.customProperties.update(properties_to_add) + if self.config.semantics == TransformerSemantics.PATCH: + assert self.ctx.graph + patch_dataset_properties_aspect = ( + AddDatasetProperties.get_patch_dataset_properties_aspect( + self.ctx.graph, entity_urn, out_dataset_properties_aspect + ) + ) + out_dataset_properties_aspect = ( + patch_dataset_properties_aspect + if patch_dataset_properties_aspect is not None + else out_dataset_properties_aspect ) - properties.customProperties.update(properties_to_add) - return mce + return cast(Aspect, out_dataset_properties_aspect) -class SimpleAddDatasetPropertiesConfig(ConfigModel): +class SimpleAddDatasetPropertiesConfig(TransformerSemanticsConfigModel): properties: Dict[str, str] @@ -76,7 +129,7 @@ class SimpleAddDatasetPropertiesResolverClass(AddDatasetPropertiesResolverBase): def __init__(self, properties: Dict[str, str]): self.properties = properties - def get_properties_to_add(self, current: DatasetSnapshotClass) -> Dict[str, str]: + def get_properties_to_add(self, entity_urn: str) -> Dict[str, str]: return self.properties @@ -85,7 +138,9 @@ class SimpleAddDatasetProperties(AddDatasetProperties): def __init__(self, config: SimpleAddDatasetPropertiesConfig, ctx: PipelineContext): generic_config = AddDatasetPropertiesConfig( - add_properties_resolver_class=SimpleAddDatasetPropertiesResolverClass + add_properties_resolver_class=SimpleAddDatasetPropertiesResolverClass, + replace_existing=config.replace_existing, + semantics=config.semantics, ) resolver_args = {"properties": config.properties} 
super().__init__(generic_config, ctx, **resolver_args) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_tags.py index f71ae3f98733e5..f17d4c60ce243a 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_tags.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_tags.py @@ -1,12 +1,15 @@ -from typing import Callable, List, Optional, Union +from typing import Callable, List, Optional, Union, cast import datahub.emitter.mce_builder as builder -from datahub.configuration.common import ConfigModel, KeyValuePattern +from datahub.configuration.common import ( + KeyValuePattern, + TransformerSemantics, + TransformerSemanticsConfigModel, +) from datahub.configuration.import_resolver import pydantic_resolve_key from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.transformer.base_transformer import ( - BaseTransformer, - SingleAspectTransformer, +from datahub.ingestion.transformer.dataset_transformer import ( + DatasetSchemaMetadataTransformer, ) from datahub.metadata.schema_classes import ( GlobalTagsClass, @@ -16,7 +19,7 @@ ) -class AddDatasetSchemaTagsConfig(ConfigModel): +class AddDatasetSchemaTagsConfig(TransformerSemanticsConfigModel): # Workaround for https://github.com/python/mypy/issues/708. # Suggested by https://stackoverflow.com/a/64528725/5004662. get_tags_to_add: Union[ @@ -27,7 +30,7 @@ class AddDatasetSchemaTagsConfig(ConfigModel): _resolve_tag_fn = pydantic_resolve_key("get_tags_to_add") -class AddDatasetSchemaTags(BaseTransformer, SingleAspectTransformer): +class AddDatasetSchemaTags(DatasetSchemaMetadataTransformer): """Transformer that adds glossary tags to datasets according to a callback function.""" ctx: PipelineContext @@ -38,50 +41,90 @@ def __init__(self, config: AddDatasetSchemaTagsConfig, ctx: PipelineContext): self.ctx = ctx self.config = config - def aspect_name(self) -> str: - return "schemaMetadata" - - def entity_types(self) -> List[str]: - return ["dataset"] - @classmethod def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetSchemaTags": config = AddDatasetSchemaTagsConfig.parse_obj(config_dict) return cls(config, ctx) - def extend_field(self, schema_field: SchemaFieldClass) -> SchemaFieldClass: - tags_to_add = self.config.get_tags_to_add(schema_field.fieldPath) - if len(tags_to_add) > 0: - new_tags = ( - schema_field.globalTags - if schema_field.globalTags is not None - else GlobalTagsClass( - tags=[], - ) - ) - new_tags.tags.extend(tags_to_add) - schema_field.globalTags = new_tags + def extend_field( + self, schema_field: SchemaFieldClass, server_field: Optional[SchemaFieldClass] + ) -> SchemaFieldClass: + all_tags = self.config.get_tags_to_add(schema_field.fieldPath) + if len(all_tags) == 0: + # return input schema field as there is no tags to add + return schema_field + + # Add existing tags if user want to keep them + if ( + schema_field.globalTags is not None + and self.config.replace_existing is False + ): + all_tags.extend(schema_field.globalTags.tags) + + tags_to_add: List[TagAssociationClass] = [] + server_tags: List[TagAssociationClass] = [] + if server_field is not None and server_field.globalTags is not None: + # Go for patch + server_tags = server_field.globalTags.tags + server_tag_urns = [tag_association.tag for tag_association in server_tags] + for tag in all_tags: + if tag.tag not in server_tag_urns: + tags_to_add.append(tag) + 
+ # Set tags_to_add to all_tags if server tags were not present + if len(tags_to_add) == 0: + tags_to_add = all_tags + + new_global_tag_class: GlobalTagsClass = GlobalTagsClass(tags=[]) + new_global_tag_class.tags.extend(tags_to_add) + new_global_tag_class.tags.extend(server_tags) + + schema_field.globalTags = new_global_tag_class return schema_field def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[builder.Aspect] ) -> Optional[builder.Aspect]: - assert aspect is None or isinstance(aspect, SchemaMetadataClass) - if aspect is None: - return aspect + schema_metadata_aspect: SchemaMetadataClass = cast(SchemaMetadataClass, aspect) + assert ( + schema_metadata_aspect is None + or isinstance(schema_metadata_aspect, SchemaMetadataClass) + or schema_metadata_aspect.field is None + ) - schema_metadata_aspect: SchemaMetadataClass = aspect + server_field_map: dict = {} + if self.config.semantics == TransformerSemantics.PATCH: + assert self.ctx.graph + server_schema_metadata_aspect: Optional[ + SchemaMetadataClass + ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn) + if server_schema_metadata_aspect is not None: + input_field_path = [ + field.fieldPath + for field in schema_metadata_aspect.fields + if field is not None + ] + server_field_to_add: List[SchemaFieldClass] = [] + # cache the server field to use in patching the schema-field later + for field in server_schema_metadata_aspect.fields: + server_field_map[field.fieldPath] = field + if field.fieldPath not in input_field_path: + # This field is present on server but not in input aspect + server_field_to_add.append(field) + # Add field present on server + schema_metadata_aspect.fields.extend(server_field_to_add) schema_metadata_aspect.fields = [ - self.extend_field(field) for field in schema_metadata_aspect.fields + self.extend_field(field, server_field=server_field_map.get(field.fieldPath)) + for field in schema_metadata_aspect.fields ] return schema_metadata_aspect # type: ignore -class PatternDatasetTagsConfig(ConfigModel): +class PatternDatasetTagsConfig(TransformerSemanticsConfigModel): tag_pattern: KeyValuePattern = KeyValuePattern.all() @@ -94,6 +137,8 @@ def __init__(self, config: PatternDatasetTagsConfig, ctx: PipelineContext): get_tags_to_add=lambda path: [ TagAssociationClass(tag=urn) for urn in tag_pattern.value(path) ], + semantics=config.semantics, + replace_existing=config.replace_existing, ) super().__init__(generic_config, ctx) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_terms.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_terms.py index 18949f62d1996a..40d4c950f5dbda 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_terms.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_schema_terms.py @@ -1,12 +1,15 @@ -from typing import Callable, List, Optional, Union +from typing import Callable, Dict, List, Optional, Union, cast import datahub.emitter.mce_builder as builder -from datahub.configuration.common import ConfigModel, KeyValuePattern +from datahub.configuration.common import ( + KeyValuePattern, + TransformerSemantics, + TransformerSemanticsConfigModel, +) from datahub.configuration.import_resolver import pydantic_resolve_key from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.transformer.base_transformer import ( - BaseTransformer, - SingleAspectTransformer, +from datahub.ingestion.transformer.dataset_transformer import ( + 
DatasetSchemaMetadataTransformer, ) from datahub.metadata.schema_classes import ( AuditStampClass, @@ -17,7 +20,7 @@ ) -class AddDatasetSchemaTermsConfig(ConfigModel): +class AddDatasetSchemaTermsConfig(TransformerSemanticsConfigModel): # Workaround for https://github.com/python/mypy/issues/708. # Suggested by https://stackoverflow.com/a/64528725/5004662. get_terms_to_add: Union[ @@ -28,7 +31,7 @@ class AddDatasetSchemaTermsConfig(ConfigModel): _resolve_term_fn = pydantic_resolve_key("get_terms_to_add") -class AddDatasetSchemaTerms(BaseTransformer, SingleAspectTransformer): +class AddDatasetSchemaTerms(DatasetSchemaMetadataTransformer): """Transformer that adds glossary terms to datasets according to a callback function.""" ctx: PipelineContext @@ -39,53 +42,97 @@ def __init__(self, config: AddDatasetSchemaTermsConfig, ctx: PipelineContext): self.ctx = ctx self.config = config - def aspect_name(self) -> str: - return "schemaMetadata" - - def entity_types(self) -> List[str]: - return ["dataset"] - @classmethod def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetSchemaTerms": config = AddDatasetSchemaTermsConfig.parse_obj(config_dict) return cls(config, ctx) - def extend_field(self, schema_field: SchemaFieldClass) -> SchemaFieldClass: - terms_to_add = self.config.get_terms_to_add(schema_field.fieldPath) - if len(terms_to_add) > 0: - new_terms = ( - schema_field.glossaryTerms - if schema_field.glossaryTerms is not None - else GlossaryTermsClass( - terms=[], - auditStamp=AuditStampClass( - time=builder.get_sys_time(), actor="urn:li:corpUser:restEmitter" - ), - ) - ) - new_terms.terms.extend(terms_to_add) - schema_field.glossaryTerms = new_terms + def extend_field( + self, schema_field: SchemaFieldClass, server_field: Optional[SchemaFieldClass] + ) -> SchemaFieldClass: + all_terms = self.config.get_terms_to_add(schema_field.fieldPath) + if len(all_terms) == 0: + # return input schema field as there is no terms to add + return schema_field + + # Add existing terms if user want to keep them + if ( + schema_field.glossaryTerms is not None + and self.config.replace_existing is False + ): + all_terms.extend(schema_field.glossaryTerms.terms) + + terms_to_add: List[GlossaryTermAssociationClass] = [] + server_terms: List[GlossaryTermAssociationClass] = [] + if server_field is not None and server_field.glossaryTerms is not None: + # Go for patch + server_terms = server_field.glossaryTerms.terms + server_term_urns = [term.urn for term in server_terms] + for term in all_terms: + if term.urn not in server_term_urns: + terms_to_add.append(term) + + # Set terms_to_add to all_terms if server terms were not present + if len(terms_to_add) == 0: + terms_to_add = all_terms + + new_glossary_term = GlossaryTermsClass( + terms=[], + auditStamp=schema_field.glossaryTerms.auditStamp + if schema_field.glossaryTerms is not None + else AuditStampClass.construct_with_defaults(), + ) + new_glossary_term.terms.extend(terms_to_add) + new_glossary_term.terms.extend(server_terms) + + schema_field.glossaryTerms = new_glossary_term return schema_field def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[builder.Aspect] ) -> Optional[builder.Aspect]: - assert aspect is None or isinstance(aspect, SchemaMetadataClass) + schema_metadata_aspect: SchemaMetadataClass = cast(SchemaMetadataClass, aspect) - if aspect is None: - return aspect + assert ( + schema_metadata_aspect is None + or isinstance(schema_metadata_aspect, SchemaMetadataClass) + or schema_metadata_aspect.field is None + ) 
- schema_metadata_aspect: SchemaMetadataClass = aspect + server_field_map: Dict[ + str, SchemaFieldClass + ] = {} # Map to cache server field objects, where fieldPath is key + if self.config.semantics == TransformerSemantics.PATCH: + assert self.ctx.graph + server_schema_metadata_aspect: Optional[ + SchemaMetadataClass + ] = self.ctx.graph.get_schema_metadata(entity_urn=entity_urn) + if server_schema_metadata_aspect is not None: + input_field_path = [ + field.fieldPath + for field in schema_metadata_aspect.fields + if field is not None + ] + server_field_to_add: List[SchemaFieldClass] = [] + # cache the server field to use in patching the schema-field later + for field in server_schema_metadata_aspect.fields: + server_field_map[field.fieldPath] = field + if field.fieldPath not in input_field_path: + # This field is present on server but not in input aspect + server_field_to_add.append(field) + # Add field present on server + schema_metadata_aspect.fields.extend(server_field_to_add) schema_metadata_aspect.fields = [ - self.extend_field(field) for field in schema_metadata_aspect.fields + self.extend_field(field, server_field=server_field_map.get(field.fieldPath)) + for field in schema_metadata_aspect.fields ] return schema_metadata_aspect # type: ignore -class PatternDatasetTermsConfig(ConfigModel): +class PatternDatasetTermsConfig(TransformerSemanticsConfigModel): term_pattern: KeyValuePattern = KeyValuePattern.all() @@ -99,6 +146,8 @@ def __init__(self, config: PatternDatasetTermsConfig, ctx: PipelineContext): GlossaryTermAssociationClass(urn=urn) for urn in term_pattern.value(path) ], + semantics=config.semantics, + replace_existing=config.replace_existing, ) super().__init__(generic_config, ctx) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py index d8bd78f82c90f5..a991c7ea4d6c0d 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_tags.py @@ -1,24 +1,24 @@ -from typing import Callable, List, Union +from typing import Callable, List, Optional, Union, cast -import datahub.emitter.mce_builder as builder -from datahub.configuration.common import ConfigModel, KeyValuePattern +from datahub.configuration.common import ( + KeyValuePattern, + TransformerSemantics, + TransformerSemanticsConfigModel, +) from datahub.configuration.import_resolver import pydantic_resolve_key +from datahub.emitter.mce_builder import Aspect from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.transformer.dataset_transformer import DatasetTagsTransformer -from datahub.metadata.schema_classes import ( - DatasetSnapshotClass, - GlobalTagsClass, - MetadataChangeEventClass, - TagAssociationClass, -) +from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass -class AddDatasetTagsConfig(ConfigModel): +class AddDatasetTagsConfig(TransformerSemanticsConfigModel): # Workaround for https://github.com/python/mypy/issues/708. # Suggested by https://stackoverflow.com/a/64528725/5004662. 
get_tags_to_add: Union[ - Callable[[DatasetSnapshotClass], List[TagAssociationClass]], - Callable[[DatasetSnapshotClass], List[TagAssociationClass]], + Callable[[str], List[TagAssociationClass]], + Callable[[str], List[TagAssociationClass]], ] _resolve_tag_fn = pydantic_resolve_key("get_tags_to_add") @@ -40,22 +40,64 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetTags": config = AddDatasetTagsConfig.parse_obj(config_dict) return cls(config, ctx) - def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass: - assert isinstance(mce.proposedSnapshot, DatasetSnapshotClass) - tags_to_add = self.config.get_tags_to_add(mce.proposedSnapshot) - if tags_to_add: - tags = builder.get_or_add_aspect( - mce, - GlobalTagsClass( - tags=[], - ), + @staticmethod + def get_patch_global_tags_aspect( + graph: DataHubGraph, urn: str, global_tags_aspect: Optional[GlobalTagsClass] + ) -> Optional[GlobalTagsClass]: + if not global_tags_aspect or not global_tags_aspect.tags: + # nothing to add, no need to consult server + return global_tags_aspect + + server_global_tags_aspect = graph.get_tags(entity_urn=urn) + # No server aspect to patch + if server_global_tags_aspect is None: + return global_tags_aspect + + # Compute patch + # We only include tags which are not present in the server tag list + server_tag_urns: List[str] = [ + tag_association.tag for tag_association in server_global_tags_aspect.tags + ] + tags_to_add: List[TagAssociationClass] = [ + tag_association + for tag_association in global_tags_aspect.tags + if tag_association.tag not in server_tag_urns + ] + # Lets patch + patch_global_tags_aspect: GlobalTagsClass = GlobalTagsClass(tags=[]) + patch_global_tags_aspect.tags.extend(server_global_tags_aspect.tags) + patch_global_tags_aspect.tags.extend(tags_to_add) + + return patch_global_tags_aspect + + def transform_aspect( + self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] + ) -> Optional[Aspect]: + in_global_tags_aspect: GlobalTagsClass = cast(GlobalTagsClass, aspect) + out_global_tags_aspect: GlobalTagsClass = GlobalTagsClass(tags=[]) + # Check if user want to keep existing tags + if in_global_tags_aspect is not None and self.config.replace_existing is False: + out_global_tags_aspect.tags.extend(in_global_tags_aspect.tags) + + tags_to_add = self.config.get_tags_to_add(entity_urn) + if tags_to_add is not None: + out_global_tags_aspect.tags.extend(tags_to_add) + + patch_global_tags_aspect: Optional[GlobalTagsClass] = None + if self.config.semantics == TransformerSemantics.PATCH: + assert self.ctx.graph + patch_global_tags_aspect = AddDatasetTags.get_patch_global_tags_aspect( + self.ctx.graph, entity_urn, out_global_tags_aspect ) - tags.tags.extend(tags_to_add) - return mce + return ( + cast(Optional[Aspect], patch_global_tags_aspect) + if patch_global_tags_aspect is not None + else cast(Optional[Aspect], out_global_tags_aspect) + ) -class SimpleDatasetTagConfig(ConfigModel): +class SimpleDatasetTagConfig(TransformerSemanticsConfigModel): tag_urns: List[str] @@ -67,6 +109,8 @@ def __init__(self, config: SimpleDatasetTagConfig, ctx: PipelineContext): generic_config = AddDatasetTagsConfig( get_tags_to_add=lambda _: tags, + replace_existing=config.replace_existing, + semantics=config.semantics, ) super().__init__(generic_config, ctx) @@ -76,7 +120,7 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "SimpleAddDatasetTag return cls(config, ctx) -class PatternDatasetTagsConfig(ConfigModel): +class 
PatternDatasetTagsConfig(TransformerSemanticsConfigModel): tag_pattern: KeyValuePattern = KeyValuePattern.all() @@ -87,8 +131,10 @@ def __init__(self, config: PatternDatasetTagsConfig, ctx: PipelineContext): tag_pattern = config.tag_pattern generic_config = AddDatasetTagsConfig( get_tags_to_add=lambda _: [ - TagAssociationClass(tag=urn) for urn in tag_pattern.value(_.urn) + TagAssociationClass(tag=urn) for urn in tag_pattern.value(_) ], + replace_existing=config.replace_existing, + semantics=config.semantics, ) super().__init__(generic_config, ctx) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_terms.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_terms.py index 675897f0a41e74..ce933d952ff413 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_terms.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_terms.py @@ -1,25 +1,29 @@ -from typing import Callable, List, Union +import logging +from typing import Callable, List, Optional, Union, cast -import datahub.emitter.mce_builder as builder -from datahub.configuration.common import ConfigModel, KeyValuePattern +from datahub.configuration.common import ( + KeyValuePattern, + TransformerSemantics, + TransformerSemanticsConfigModel, +) from datahub.configuration.import_resolver import pydantic_resolve_key +from datahub.emitter.mce_builder import Aspect from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.transformer.dataset_transformer import DatasetTermsTransformer from datahub.metadata.schema_classes import ( AuditStampClass, - DatasetSnapshotClass, GlossaryTermAssociationClass, GlossaryTermsClass, - MetadataChangeEventClass, ) -class AddDatasetTermsConfig(ConfigModel): +class AddDatasetTermsConfig(TransformerSemanticsConfigModel): # Workaround for https://github.com/python/mypy/issues/708. # Suggested by https://stackoverflow.com/a/64528725/5004662. 
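The globalTags patch computed above and the glossaryTerms patch that follows implement the same idea: a de-duplicating union of what is already stored on the server with what the transformer wants to add. A minimal sketch under that reading; union_urns is an illustrative helper, not part of this change.

from typing import List

def union_urns(server_urns: List[str], proposed_urns: List[str]) -> List[str]:
    # Keep everything already on the server, then add only the URNs it does not have yet.
    merged = list(server_urns)
    merged.extend(urn for urn in proposed_urns if urn not in server_urns)
    return merged

# union_urns(["urn:li:tag:Legacy"], ["urn:li:tag:Legacy", "urn:li:tag:PII"])
# -> ["urn:li:tag:Legacy", "urn:li:tag:PII"]

The real code carries full TagAssociationClass / GlossaryTermAssociationClass objects rather than bare URN strings, but the membership test is the same.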
get_terms_to_add: Union[ - Callable[[DatasetSnapshotClass], List[GlossaryTermAssociationClass]], - Callable[[DatasetSnapshotClass], List[GlossaryTermAssociationClass]], + Callable[[str], List[GlossaryTermAssociationClass]], + Callable[[str], List[GlossaryTermAssociationClass]], ] _resolve_term_fn = pydantic_resolve_key("get_terms_to_add") @@ -35,32 +39,84 @@ def __init__(self, config: AddDatasetTermsConfig, ctx: PipelineContext): super().__init__() self.ctx = ctx self.config = config + self.log = logging.getLogger(__name__) @classmethod def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetTerms": config = AddDatasetTermsConfig.parse_obj(config_dict) return cls(config, ctx) - def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass: - if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass): - return mce - terms_to_add = self.config.get_terms_to_add(mce.proposedSnapshot) - if terms_to_add: - terms = builder.get_or_add_aspect( - mce, - GlossaryTermsClass( - terms=[], - auditStamp=AuditStampClass( - time=builder.get_sys_time(), actor="urn:li:corpUser:restEmitter" - ), - ), + @staticmethod + def get_patch_glossary_terms_aspect( + graph: DataHubGraph, + urn: str, + glossary_terms_aspect: Optional[GlossaryTermsClass], + ) -> Optional[GlossaryTermsClass]: + if not glossary_terms_aspect or not glossary_terms_aspect.terms: + # nothing to add, no need to consult server + return glossary_terms_aspect + + server_glossary_terms_aspect = graph.get_glossary_terms(entity_urn=urn) + # No server glossary_terms_aspect to compute a patch + if server_glossary_terms_aspect is None: + return glossary_terms_aspect + + # Compute patch + server_term_urns: List[str] = [ + term.urn for term in server_glossary_terms_aspect.terms + ] + # We only include terms which are not present in the server_term_urns list + terms_to_add: List[GlossaryTermAssociationClass] = [ + term + for term in glossary_terms_aspect.terms + if term.urn not in server_term_urns + ] + # Lets patch + patch_glossary_terms_aspect: GlossaryTermsClass = GlossaryTermsClass( + terms=[], auditStamp=glossary_terms_aspect.auditStamp + ) + patch_glossary_terms_aspect.terms.extend(server_glossary_terms_aspect.terms) + patch_glossary_terms_aspect.terms.extend(terms_to_add) + + return patch_glossary_terms_aspect + + def transform_aspect( + self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] + ) -> Optional[Aspect]: + + in_glossary_terms: Optional[GlossaryTermsClass] = cast( + Optional[GlossaryTermsClass], aspect + ) + out_glossary_terms: GlossaryTermsClass = GlossaryTermsClass( + terms=[], + auditStamp=in_glossary_terms.auditStamp + if in_glossary_terms is not None + else AuditStampClass.construct_with_defaults(), + ) + # Check if user want to keep existing terms + if in_glossary_terms is not None and self.config.replace_existing is False: + out_glossary_terms.terms.extend(in_glossary_terms.terms) + out_glossary_terms.auditStamp = in_glossary_terms.auditStamp + + terms_to_add = self.config.get_terms_to_add(entity_urn) + if terms_to_add is not None: + out_glossary_terms.terms.extend(terms_to_add) + + patch_glossary_terms: Optional[GlossaryTermsClass] = None + if self.config.semantics == TransformerSemantics.PATCH: + assert self.ctx.graph + patch_glossary_terms = AddDatasetTerms.get_patch_glossary_terms_aspect( + self.ctx.graph, entity_urn, out_glossary_terms ) - terms.terms.extend(terms_to_add) - return mce + return ( + cast(Optional[Aspect], patch_glossary_terms) + if patch_glossary_terms is not None 
+ else cast(Optional[Aspect], out_glossary_terms) + ) -class SimpleDatasetTermsConfig(ConfigModel): +class SimpleDatasetTermsConfig(TransformerSemanticsConfigModel): term_urns: List[str] @@ -72,6 +128,8 @@ def __init__(self, config: SimpleDatasetTermsConfig, ctx: PipelineContext): generic_config = AddDatasetTermsConfig( get_terms_to_add=lambda _: terms, + replace_existing=config.replace_existing, + semantics=config.semantics, ) super().__init__(generic_config, ctx) @@ -81,7 +139,7 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "SimpleAddDatasetTer return cls(config, ctx) -class PatternDatasetTermsConfig(ConfigModel): +class PatternDatasetTermsConfig(TransformerSemanticsConfigModel): term_pattern: KeyValuePattern = KeyValuePattern.all() @@ -92,9 +150,10 @@ def __init__(self, config: PatternDatasetTermsConfig, ctx: PipelineContext): term_pattern = config.term_pattern generic_config = AddDatasetTermsConfig( get_terms_to_add=lambda _: [ - GlossaryTermAssociationClass(urn=urn) - for urn in term_pattern.value(_.urn) + GlossaryTermAssociationClass(urn=urn) for urn in term_pattern.value(_) ], + replace_existing=config.replace_existing, + semantics=config.semantics, ) super().__init__(generic_config, ctx) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py index 6a2ca668589b78..369ba1cea98c72 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py @@ -48,6 +48,13 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetDomain": config = AddDatasetDomainSemanticsConfig.parse_obj(config_dict) return cls(config, ctx) + @staticmethod + def raise_ctx_configuration_error(ctx: PipelineContext) -> None: + if ctx.graph is None: + raise ConfigurationError( + "AddDatasetDomain requires a datahub_api to connect to. Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe" + ) + @staticmethod def get_domain_class( graph: Optional[DataHubGraph], domains: List[str] @@ -76,7 +83,8 @@ def get_domains_to_set( for domain in mce_domain.domains: if domain not in server_domain.domains: domains_to_add.append(domain) - + # Lets patch + mce_domain.domains = [] mce_domain.domains.extend(server_domain.domains) mce_domain.domains.extend(domains_to_add) @@ -86,10 +94,11 @@ def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] ) -> Optional[Aspect]: - domain_aspect: DomainsClass = DomainsClass(domains=[]) + in_domain_aspect: DomainsClass = cast(DomainsClass, aspect) + domain_aspect = DomainsClass(domains=[]) # Check if we have received existing aspect - if aspect is not None: - domain_aspect.domains.extend(cast(DomainsClass, aspect).domains) + if in_domain_aspect is not None and self.config.replace_existing is False: + domain_aspect.domains.extend(in_domain_aspect.domains) domain_to_add = self.config.get_domains_to_add(entity_urn) @@ -118,15 +127,12 @@ class SimpleAddDatasetDomain(AddDatasetDomain): def __init__( self, config: SimpleDatasetDomainSemanticsConfig, ctx: PipelineContext ): - if ctx.graph is None: - raise ConfigurationError( - "AddDatasetDomain requires a datahub_api to connect to. 
Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe" - ) - + AddDatasetDomain.raise_ctx_configuration_error(ctx) domains = AddDatasetDomain.get_domain_class(ctx.graph, config.domains) generic_config = AddDatasetDomainSemanticsConfig( get_domains_to_add=lambda _: domains, semantics=config.semantics, + replace_existing=config.replace_existing, ) super().__init__(generic_config, ctx) @@ -144,10 +150,7 @@ class PatternAddDatasetDomain(AddDatasetDomain): def __init__( self, config: PatternDatasetDomainSemanticsConfig, ctx: PipelineContext ): - if ctx.graph is None: - raise ConfigurationError( - "AddDatasetDomain requires a datahub_api to connect to. Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe" - ) + AddDatasetDomain.raise_ctx_configuration_error(ctx) domain_pattern = config.domain_pattern diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py index c2e6ddf141c5bf..22f8a110371376 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_transformer.py @@ -1,28 +1,17 @@ import logging -from abc import ABCMeta, abstractmethod -from typing import List, Optional +from abc import ABCMeta +from typing import List -from deprecated import deprecated - -from datahub.emitter.mce_builder import Aspect from datahub.ingestion.transformer.base_transformer import ( BaseTransformer, - LegacyMCETransformer, SingleAspectTransformer, ) -from datahub.metadata.schema_classes import ( - DatasetSnapshotClass, - MetadataChangeEventClass, -) log = logging.getLogger(__name__) -@deprecated( - reason="Legacy transformer that supports transforming MCE-s using transform_one method. 
Use BaseTransformer directly and implement the transform_aspect method" -) -class DatasetTransformer(BaseTransformer, LegacyMCETransformer): - """Transformer that does transforms sequentially on each dataset.""" +class DatasetTransformer(BaseTransformer, SingleAspectTransformer, metaclass=ABCMeta): + """Transformer that does transform sequentially on each dataset.""" def __init__(self): super().__init__() @@ -30,76 +19,42 @@ def __init__(self): def entity_types(self) -> List[str]: return ["dataset"] - @deprecated( - reason="preserved for backwards compatibility, subclasses should use transform_aspect directly instead" - ) - @abstractmethod - def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass: - pass - - def transform_aspect( # not marked as @abstractmethod to avoid impacting transformers that extend this class - self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] - ) -> Optional[Aspect]: - """A default implementation for transform_aspect that calls `transform_one` with a fake MCE to preserve compatibility with previous transformers coded against MCE""" - fake_mce: MetadataChangeEventClass = MetadataChangeEventClass( - proposedSnapshot=DatasetSnapshotClass( - urn=entity_urn, - aspects=[aspect] if aspect else [], # type: ignore - ) - ) - transformed_mce = self.transform_one(fake_mce) - assert transformed_mce.proposedSnapshot - assert ( - len(transformed_mce.proposedSnapshot.aspects) <= 1 - ), "This implementation assumes that transformers will return at most 1 aspect value back" - return ( - transformed_mce.proposedSnapshot.aspects[0] # type: ignore - if len(transformed_mce.proposedSnapshot.aspects) - else None - ) - - -# TODO: rename DatasetTransformerV2 to DatasetTransformer after upgrading all existing dataset transformer -class DatasetTransformerV2(BaseTransformer, SingleAspectTransformer, metaclass=ABCMeta): - """Transformer that does transforms sequentially on each dataset.""" - def __init__(self): - super().__init__() - - def entity_types(self) -> List[str]: - return ["dataset"] - - -class DatasetOwnershipTransformer(DatasetTransformer, SingleAspectTransformer): +class DatasetOwnershipTransformer(DatasetTransformer, metaclass=ABCMeta): def aspect_name(self) -> str: return "ownership" -class DatasetDomainTransformer(DatasetTransformerV2, SingleAspectTransformer): +class DatasetDomainTransformer(DatasetTransformer, metaclass=ABCMeta): def aspect_name(self) -> str: return "domains" -class DatasetStatusTransformer(DatasetTransformer, SingleAspectTransformer): +class DatasetStatusTransformer(DatasetTransformer, metaclass=ABCMeta): def aspect_name(self) -> str: return "status" -class DatasetTagsTransformer(DatasetTransformer, SingleAspectTransformer): +class DatasetTagsTransformer(DatasetTransformer, metaclass=ABCMeta): def aspect_name(self) -> str: return "globalTags" -class DatasetTermsTransformer(DatasetTransformer, SingleAspectTransformer): +class DatasetTermsTransformer(DatasetTransformer, metaclass=ABCMeta): def aspect_name(self) -> str: return "glossaryTerms" -class DatasetPropertiesTransformer(DatasetTransformer, SingleAspectTransformer): +class DatasetPropertiesTransformer(DatasetTransformer, metaclass=ABCMeta): def aspect_name(self) -> str: return "datasetProperties" -class DatasetBrowsePathsTransformer(DatasetTransformer, SingleAspectTransformer): +class DatasetBrowsePathsTransformer(DatasetTransformer, metaclass=ABCMeta): def aspect_name(self) -> str: return "browsePaths" + + +class 
DatasetSchemaMetadataTransformer(DatasetTransformer, metaclass=ABCMeta): + def aspect_name(self) -> str: + return "schemaMetadata" diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/mark_dataset_status.py b/metadata-ingestion/src/datahub/ingestion/transformer/mark_dataset_status.py index d833e9bcc75a64..00ef29183a0c9a 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/mark_dataset_status.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/mark_dataset_status.py @@ -1,12 +1,9 @@ -from typing import List, Optional +from typing import Optional, cast import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.transformer.base_transformer import ( - BaseTransformer, - SingleAspectTransformer, -) +from datahub.ingestion.transformer.dataset_transformer import DatasetStatusTransformer from datahub.metadata.schema_classes import StatusClass @@ -14,7 +11,7 @@ class MarkDatasetStatusConfig(ConfigModel): removed: bool -class MarkDatasetStatus(BaseTransformer, SingleAspectTransformer): +class MarkDatasetStatus(DatasetStatusTransformer): """Transformer that marks status of each dataset.""" ctx: PipelineContext @@ -25,12 +22,6 @@ def __init__(self, config: MarkDatasetStatusConfig, ctx: PipelineContext): self.ctx = ctx self.config = config - def aspect_name(self) -> str: - return "status" - - def entity_types(self) -> List[str]: - return ["dataset"] - @classmethod def create(cls, config_dict: dict, ctx: PipelineContext) -> "MarkDatasetStatus": config = MarkDatasetStatusConfig.parse_obj(config_dict) @@ -42,4 +33,4 @@ def transform_aspect( assert aspect is None or isinstance(aspect, StatusClass) status_aspect: StatusClass = aspect or StatusClass(removed=None) status_aspect.removed = self.config.removed - return status_aspect # type: ignore + return cast(Optional[builder.Aspect], status_aspect) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/remove_dataset_ownership.py b/metadata-ingestion/src/datahub/ingestion/transformer/remove_dataset_ownership.py index 2cdc02d96ee29a..35b1319c41aaec 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/remove_dataset_ownership.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/remove_dataset_ownership.py @@ -1,23 +1,24 @@ -import datahub.emitter.mce_builder as builder +from typing import Optional, cast + from datahub.configuration.common import ConfigModel +from datahub.emitter.mce_builder import Aspect from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.transformer.dataset_transformer import DatasetTransformer -from datahub.metadata.schema_classes import MetadataChangeEventClass, OwnershipClass +from datahub.ingestion.transformer.dataset_transformer import ( + DatasetOwnershipTransformer, +) +from datahub.metadata.schema_classes import OwnershipClass class ClearDatasetOwnershipConfig(ConfigModel): pass -class SimpleRemoveDatasetOwnership(DatasetTransformer): +class SimpleRemoveDatasetOwnership(DatasetOwnershipTransformer): """Transformer that clears all owners on each dataset.""" def __init__(self, config: ClearDatasetOwnershipConfig, ctx: PipelineContext): super().__init__() - def aspect_name(self) -> str: - return "ownership" - @classmethod def create( cls, config_dict: dict, ctx: PipelineContext @@ -25,12 +26,13 @@ def create( config = ClearDatasetOwnershipConfig.parse_obj(config_dict) return cls(config, ctx) - def transform_one(self, mce: 
MetadataChangeEventClass) -> MetadataChangeEventClass: - ownership = builder.get_or_add_aspect( - mce, - OwnershipClass( - owners=[], - ), - ) - ownership.owners = [] - return mce + def transform_aspect( + self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] + ) -> Optional[Aspect]: + in_ownership_aspect = cast(OwnershipClass, aspect) + if in_ownership_aspect is None: + return cast(Aspect, in_ownership_aspect) + + in_ownership_aspect.owners = [] + + return cast(Aspect, in_ownership_aspect) diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn.py b/metadata-ingestion/src/datahub/utilities/urns/urn.py index 479e74331fd9b3..db6898d55ad2b3 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn.py @@ -4,6 +4,11 @@ from datahub.utilities.urns.error import InvalidUrnError +def guess_entity_type(urn: str) -> str: + assert urn.startswith("urn:li:"), "urns must start with urn:li:" + return urn.split(":")[2] + + class Urn: """ URNs are Globally Unique Identifiers (GUID) used to represent an entity. diff --git a/metadata-ingestion/tests/unit/serde/test_canonicalization_output.json b/metadata-ingestion/tests/unit/serde/test_canonicalization_output.json index 468dabf1561579..7ba3629a9e92ed 100644 --- a/metadata-ingestion/tests/unit/serde/test_canonicalization_output.json +++ b/metadata-ingestion/tests/unit/serde/test_canonicalization_output.json @@ -1,118 +1,12 @@ [ { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metagalaxy.metadata_aspect,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "metagalaxy.metagalaxy.metadata_aspect", - "platform": "urn:li:dataPlatform:mysql", - "version": 0, - "created": { - "time": 1613070834000, - "actor": "urn:li:corpuser:etl" - }, - "lastModified": { - "time": 1613070834000, - "actor": "urn:li:corpuser:etl" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "urn", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=500)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "aspect", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=200)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "version", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "BIGINT()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "metadata", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "LONGTEXT()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "createdon", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} - } - }, - "nativeDataType": "DATETIME(fsp=6)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "createdby", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=255)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "createdfor", - "nullable": false, - "type": { - 
"type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=255)", - "recursive": false, - "isPartOfKey": false - } - ] - } - } - ] - } + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metagalaxy.metadata_aspect,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "value": "{\"schemaName\": \"metagalaxy.metagalaxy.metadata_aspect\", \"platform\": \"urn:li:dataPlatform:mysql\", \"version\": 0, \"created\": {\"time\": 1613070834000, \"actor\": \"urn:li:corpuser:etl\"}, \"lastModified\": {\"time\": 1613070834000, \"actor\": \"urn:li:corpuser:etl\"}, \"hash\": \"\", \"platformSchema\": {\"com.linkedin.schema.MySqlDDL\": {\"tableSchema\": \"\"}}, \"fields\": [{\"fieldPath\": \"urn\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"VARCHAR(length=500)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"aspect\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"VARCHAR(length=200)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"version\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.NumberType\": {}}}, \"nativeDataType\": \"BIGINT()\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"metadata\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"LONGTEXT()\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"createdon\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.NullType\": {}}}, \"nativeDataType\": \"DATETIME(fsp=6)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"createdby\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"VARCHAR(length=255)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"createdfor\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"VARCHAR(length=255)\", \"recursive\": false, \"isPartOfKey\": false}]}", + "contentType": "application/json" }, "systemMetadata": { "lastObserved": 1626980046000, @@ -120,119 +14,13 @@ } }, { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metagalaxy.metadata_index,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "metagalaxy.metagalaxy.metadata_index", - "platform": "urn:li:dataPlatform:mysql", - "version": 0, - "created": { - "time": 1613070834000, - "actor": "urn:li:corpuser:etl" - }, - "lastModified": { - "time": 1613070834000, - "actor": "urn:li:corpuser:etl" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "id", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "BIGINT()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "urn", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=200)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "aspect", - "nullable": false, - "type": { - "type": { - 
"com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=150)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "path", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=150)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "longVal", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "BIGINT()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "stringVal", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=200)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "doubleVal", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "DOUBLE(asdecimal=True)", - "recursive": false, - "isPartOfKey": false - } - ] - } - } - ] - } + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metagalaxy.metadata_index,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "value": "{\"schemaName\": \"metagalaxy.metagalaxy.metadata_index\", \"platform\": \"urn:li:dataPlatform:mysql\", \"version\": 0, \"created\": {\"time\": 1613070834000, \"actor\": \"urn:li:corpuser:etl\"}, \"lastModified\": {\"time\": 1613070834000, \"actor\": \"urn:li:corpuser:etl\"}, \"hash\": \"\", \"platformSchema\": {\"com.linkedin.schema.MySqlDDL\": {\"tableSchema\": \"\"}}, \"fields\": [{\"fieldPath\": \"id\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.NumberType\": {}}}, \"nativeDataType\": \"BIGINT()\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"urn\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"VARCHAR(length=200)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"aspect\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"VARCHAR(length=150)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"path\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"VARCHAR(length=150)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"longVal\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.NumberType\": {}}}, \"nativeDataType\": \"BIGINT()\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"stringVal\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"VARCHAR(length=200)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"doubleVal\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.NumberType\": {}}}, \"nativeDataType\": \"DOUBLE(asdecimal=True)\", \"recursive\": false, \"isPartOfKey\": false}]}", + "contentType": "application/json" }, "systemMetadata": { "lastObserved": 1626980046000, @@ -240,119 +28,13 @@ } }, { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.mysql.columns_priv,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "metagalaxy.mysql.columns_priv", - "platform": 
"urn:li:dataPlatform:mysql", - "version": 0, - "created": { - "time": 1613070834000, - "actor": "urn:li:corpuser:etl" - }, - "lastModified": { - "time": 1613070834000, - "actor": "urn:li:corpuser:etl" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "Host", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(charset='ascii', collation='ascii_general_ci', length=255)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "Db", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(collation='utf8_bin', length=64)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "User", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(collation='utf8_bin', length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "Table_name", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(collation='utf8_bin', length=64)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "Column_name", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(collation='utf8_bin', length=64)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "Timestamp", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} - } - }, - "nativeDataType": "TIMESTAMP()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "Column_priv", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "SET(charset='utf8', collation='utf8_general_ci', length=10)", - "recursive": false, - "isPartOfKey": false - } - ] - } - } - ] - } + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.mysql.columns_priv,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "value": "{\"schemaName\": \"metagalaxy.mysql.columns_priv\", \"platform\": \"urn:li:dataPlatform:mysql\", \"version\": 0, \"created\": {\"time\": 1613070834000, \"actor\": \"urn:li:corpuser:etl\"}, \"lastModified\": {\"time\": 1613070834000, \"actor\": \"urn:li:corpuser:etl\"}, \"hash\": \"\", \"platformSchema\": {\"com.linkedin.schema.MySqlDDL\": {\"tableSchema\": \"\"}}, \"fields\": [{\"fieldPath\": \"Host\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"CHAR(charset='ascii', collation='ascii_general_ci', length=255)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"Db\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"CHAR(collation='utf8_bin', length=64)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"User\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"CHAR(collation='utf8_bin', length=32)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"Table_name\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"CHAR(collation='utf8_bin', 
length=64)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"Column_name\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"CHAR(collation='utf8_bin', length=64)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"Timestamp\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.NullType\": {}}}, \"nativeDataType\": \"TIMESTAMP()\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"Column_priv\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"SET(charset='utf8', collation='utf8_general_ci', length=10)\", \"recursive\": false, \"isPartOfKey\": false}]}", + "contentType": "application/json" }, "systemMetadata": { "lastObserved": 1626980046000, @@ -360,71 +42,13 @@ } }, { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.mysql.component,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "metagalaxy.mysql.component", - "platform": "urn:li:dataPlatform:mysql", - "version": 0, - "created": { - "time": 1613070834000, - "actor": "urn:li:corpuser:etl" - }, - "lastModified": { - "time": 1613070834000, - "actor": "urn:li:corpuser:etl" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "component_id", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER(unsigned=True)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "component_group_id", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER(unsigned=True)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "component_urn", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "TEXT()", - "recursive": false, - "isPartOfKey": false - } - ] - } - } - ] - } + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.mysql.component,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "value": "{\"schemaName\": \"metagalaxy.mysql.component\", \"platform\": \"urn:li:dataPlatform:mysql\", \"version\": 0, \"created\": {\"time\": 1613070834000, \"actor\": \"urn:li:corpuser:etl\"}, \"lastModified\": {\"time\": 1613070834000, \"actor\": \"urn:li:corpuser:etl\"}, \"hash\": \"\", \"platformSchema\": {\"com.linkedin.schema.MySqlDDL\": {\"tableSchema\": \"\"}}, \"fields\": [{\"fieldPath\": \"component_id\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.NumberType\": {}}}, \"nativeDataType\": \"INTEGER(unsigned=True)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"component_group_id\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.NumberType\": {}}}, \"nativeDataType\": \"INTEGER(unsigned=True)\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"component_urn\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"TEXT()\", \"recursive\": false, \"isPartOfKey\": false}]}", + "contentType": "application/json" }, "systemMetadata": { "lastObserved": 1626980046000, @@ -432,59 +56,13 @@ 
} }, { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "DemoData.dbo.Products", - "platform": "urn:li:dataPlatform:mssql", - "version": 0, - "created": { - "time": 1613095693000, - "actor": "urn:li:corpuser:etl" - }, - "lastModified": { - "time": 1613095693000, - "actor": "urn:li:corpuser:etl" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "ID", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "ProductName", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "NVARCHAR()", - "recursive": false, - "isPartOfKey": false - } - ] - } - } - ] - } + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.dbo.Products,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "value": "{\"schemaName\": \"DemoData.dbo.Products\", \"platform\": \"urn:li:dataPlatform:mssql\", \"version\": 0, \"created\": {\"time\": 1613095693000, \"actor\": \"urn:li:corpuser:etl\"}, \"lastModified\": {\"time\": 1613095693000, \"actor\": \"urn:li:corpuser:etl\"}, \"hash\": \"\", \"platformSchema\": {\"com.linkedin.schema.MySqlDDL\": {\"tableSchema\": \"\"}}, \"fields\": [{\"fieldPath\": \"ID\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.NumberType\": {}}}, \"nativeDataType\": \"INTEGER()\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"ProductName\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"NVARCHAR()\", \"recursive\": false, \"isPartOfKey\": false}]}", + "contentType": "application/json" }, "systemMetadata": { "lastObserved": 1626980046000, @@ -492,59 +70,13 @@ } }, { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "DemoData.Foo.Items", - "platform": "urn:li:dataPlatform:mssql", - "version": 0, - "created": { - "time": 1613095694000, - "actor": "urn:li:corpuser:etl" - }, - "lastModified": { - "time": 1613095694000, - "actor": "urn:li:corpuser:etl" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ - { - "fieldPath": "ID", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "ItemName", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "NVARCHAR()", - "recursive": false, - "isPartOfKey": false - } - ] - } - } - ] - } + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "value": "{\"schemaName\": \"DemoData.Foo.Items\", \"platform\": \"urn:li:dataPlatform:mssql\", 
\"version\": 0, \"created\": {\"time\": 1613095694000, \"actor\": \"urn:li:corpuser:etl\"}, \"lastModified\": {\"time\": 1613095694000, \"actor\": \"urn:li:corpuser:etl\"}, \"hash\": \"\", \"platformSchema\": {\"com.linkedin.schema.MySqlDDL\": {\"tableSchema\": \"\"}}, \"fields\": [{\"fieldPath\": \"ID\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.NumberType\": {}}}, \"nativeDataType\": \"INTEGER()\", \"recursive\": false, \"isPartOfKey\": false}, {\"fieldPath\": \"ItemName\", \"nullable\": false, \"type\": {\"type\": {\"com.linkedin.schema.StringType\": {}}}, \"nativeDataType\": \"NVARCHAR()\", \"recursive\": false, \"isPartOfKey\": false}]}", + "contentType": "application/json" }, "systemMetadata": { "lastObserved": 1626980046000, diff --git a/metadata-ingestion/tests/unit/serde/test_serde.py b/metadata-ingestion/tests/unit/serde/test_serde.py index 828ccb50a78dda..d99b2970124dd9 100644 --- a/metadata-ingestion/tests/unit/serde/test_serde.py +++ b/metadata-ingestion/tests/unit/serde/test_serde.py @@ -161,7 +161,9 @@ def test_check_metadata_rewrite( output_file_path = tmp_path / "output.json" shutil.copyfile(json_input, output_file_path) - run_datahub_cmd(["check", "metadata-file", f"{output_file_path}", "--rewrite"]) + run_datahub_cmd( + ["check", "metadata-file", f"{output_file_path}", "--rewrite", "--unpack-mces"] + ) mce_helpers.check_golden_file( pytestconfig, output_path=output_file_path, golden_path=json_output_reference diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index f3c02ef9ba0735..51a8568327d5b4 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -1,5 +1,15 @@ import re -from typing import Any, Dict, List, MutableSequence, Optional, Union +from typing import ( + Any, + Callable, + Dict, + List, + MutableSequence, + Optional, + Type, + Union, + cast, +) from unittest import mock from uuid import uuid4 @@ -8,9 +18,11 @@ import datahub.emitter.mce_builder as builder import datahub.metadata.schema_classes as models import tests.test_helpers.mce_helpers +from datahub.configuration.common import TransformerSemantics from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api import workunit from datahub.ingestion.api.common import EndOfStream, PipelineContext, RecordEnvelope +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.transformer.add_dataset_browse_path import ( AddDatasetBrowsePathTransformer, @@ -44,6 +56,7 @@ BaseTransformer, SingleAspectTransformer, ) +from datahub.ingestion.transformer.dataset_domain import SimpleAddDatasetDomain from datahub.ingestion.transformer.dataset_transformer import DatasetTransformer from datahub.ingestion.transformer.mark_dataset_status import MarkDatasetStatus from datahub.ingestion.transformer.remove_dataset_ownership import ( @@ -53,7 +66,6 @@ BrowsePathsClass, ChangeTypeClass, DatasetPropertiesClass, - DatasetSnapshotClass, GlobalTagsClass, MetadataChangeEventClass, OwnershipClass, @@ -833,7 +845,7 @@ def test_ownership_patching_intersect(mock_time): mce_ownership = gen_owners(["baz", "foo"]) mock_graph.get_ownership.return_value = server_ownership - test_ownership = AddDatasetOwnership.get_ownership_to_set( + test_ownership = AddDatasetOwnership.get_patch_ownership_aspect( mock_graph, "test_urn", mce_ownership ) assert 
test_ownership and test_ownership.owners @@ -846,7 +858,7 @@ def test_ownership_patching_with_nones(mock_time): mock_graph = mock.MagicMock() mce_ownership = gen_owners(["baz", "foo"]) mock_graph.get_ownership.return_value = None - test_ownership = AddDatasetOwnership.get_ownership_to_set( + test_ownership = AddDatasetOwnership.get_patch_ownership_aspect( mock_graph, "test_urn", mce_ownership ) assert test_ownership and test_ownership.owners @@ -855,7 +867,7 @@ def test_ownership_patching_with_nones(mock_time): server_ownership = gen_owners(["baz", "foo"]) mock_graph.get_ownership.return_value = server_ownership - test_ownership = AddDatasetOwnership.get_ownership_to_set( + test_ownership = AddDatasetOwnership.get_patch_ownership_aspect( mock_graph, "test_urn", None ) assert not test_ownership @@ -865,7 +877,7 @@ def test_ownership_patching_with_empty_mce_none_server(mock_time): mock_graph = mock.MagicMock() mce_ownership = gen_owners([]) mock_graph.get_ownership.return_value = None - test_ownership = AddDatasetOwnership.get_ownership_to_set( + test_ownership = AddDatasetOwnership.get_patch_ownership_aspect( mock_graph, "test_urn", mce_ownership ) # nothing to add, so we omit writing @@ -877,7 +889,7 @@ def test_ownership_patching_with_empty_mce_nonempty_server(mock_time): server_ownership = gen_owners(["baz", "foo"]) mce_ownership = gen_owners([]) mock_graph.get_ownership.return_value = server_ownership - test_ownership = AddDatasetOwnership.get_ownership_to_set( + test_ownership = AddDatasetOwnership.get_patch_ownership_aspect( mock_graph, "test_urn", mce_ownership ) # nothing to add, so we omit writing @@ -889,7 +901,7 @@ def test_ownership_patching_with_different_types_1(mock_time): server_ownership = gen_owners(["baz", "foo"], models.OwnershipTypeClass.PRODUCER) mce_ownership = gen_owners(["foo"], models.OwnershipTypeClass.DATAOWNER) mock_graph.get_ownership.return_value = server_ownership - test_ownership = AddDatasetOwnership.get_ownership_to_set( + test_ownership = AddDatasetOwnership.get_patch_ownership_aspect( mock_graph, "test_urn", mce_ownership ) assert test_ownership and test_ownership.owners @@ -907,7 +919,7 @@ def test_ownership_patching_with_different_types_2(mock_time): server_ownership = gen_owners(["baz", "foo"], models.OwnershipTypeClass.PRODUCER) mce_ownership = gen_owners(["foo", "baz"], models.OwnershipTypeClass.DATAOWNER) mock_graph.get_ownership.return_value = server_ownership - test_ownership = AddDatasetOwnership.get_ownership_to_set( + test_ownership = AddDatasetOwnership.get_patch_ownership_aspect( mock_graph, "test_urn", mce_ownership ) assert test_ownership and test_ownership.owners @@ -925,7 +937,7 @@ def test_ownership_patching_with_different_types_2(mock_time): class DummyPropertiesResolverClass(AddDatasetPropertiesResolverBase): - def get_properties_to_add(self, current: DatasetSnapshotClass) -> Dict[str, str]: + def get_properties_to_add(self, entity_urn: str) -> Dict[str, str]: return PROPERTIES_TO_ADD @@ -957,36 +969,133 @@ def test_add_dataset_properties(mock_time): } -def test_simple_add_dataset_properties(mock_time): - dataset_mce = make_dataset_with_properties() +def run_simple_add_dataset_properties_transformer_semantics( + semantics: TransformerSemantics, + new_properties: dict, + server_properties: dict, + mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph], +) -> List[RecordEnvelope]: + pipeline_context = PipelineContext(run_id="test_pattern_dataset_schema_terms") + pipeline_context.graph = 
mock_datahub_graph(DatahubClientConfig()) - new_properties = {"new-simple-property": "new-value"} - transformer = SimpleAddDatasetProperties.create( - { + # fake the server response + def fake_dataset_properties(entity_urn: str) -> models.DatasetPropertiesClass: + return DatasetPropertiesClass(customProperties=server_properties) + + pipeline_context.graph.get_dataset_properties = fake_dataset_properties # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetProperties, + pipeline_context=pipeline_context, + aspect=models.DatasetPropertiesClass( + customProperties=EXISTING_PROPERTIES.copy() + ), + config={ + "semantics": semantics, "properties": new_properties, }, - PipelineContext(run_id="test-simple-properties"), ) - outputs = list( - transformer.transform( - [RecordEnvelope(input, metadata={}) for input in [dataset_mce]] - ) + return output + + +def test_simple_add_dataset_properties_overwrite(mock_datahub_graph): + new_properties = {"new-simple-property": "new-value"} + server_properties = {"p1": "value1"} + + output = run_simple_add_dataset_properties_transformer_semantics( + semantics=TransformerSemantics.OVERWRITE, + new_properties=new_properties, + server_properties=server_properties, + mock_datahub_graph=mock_datahub_graph, ) - assert len(outputs) == 1 - custom_properties = builder.get_aspect_if_available( - outputs[0].record, models.DatasetPropertiesClass + assert len(output) == 2 + assert output[0].record + assert output[0].record.aspect + custom_properties_aspect: models.DatasetPropertiesClass = cast( + models.DatasetPropertiesClass, output[0].record.aspect ) - print(str(custom_properties)) - assert custom_properties is not None - assert custom_properties.customProperties == { + assert custom_properties_aspect.customProperties == { + **EXISTING_PROPERTIES, + **new_properties, + } + + +def test_simple_add_dataset_properties_patch(mock_datahub_graph): + new_properties = {"new-simple-property": "new-value"} + server_properties = {"p1": "value1"} + + output = run_simple_add_dataset_properties_transformer_semantics( + semantics=TransformerSemantics.PATCH, + new_properties=new_properties, + server_properties=server_properties, + mock_datahub_graph=mock_datahub_graph, + ) + + assert len(output) == 2 + assert output[0].record + assert output[0].record.aspect + custom_properties_aspect: models.DatasetPropertiesClass = cast( + models.DatasetPropertiesClass, output[0].record.aspect + ) + assert custom_properties_aspect.customProperties == { + **EXISTING_PROPERTIES, + **new_properties, + **server_properties, + } + + +def test_simple_add_dataset_properties(mock_time): + new_properties = {"new-simple-property": "new-value"} + outputs = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetProperties, + aspect=models.DatasetPropertiesClass( + customProperties=EXISTING_PROPERTIES.copy() + ), + config={ + "properties": new_properties, + }, + ) + + assert len(outputs) == 2 + assert outputs[0].record + assert outputs[0].record.aspect + custom_properties_aspect: models.DatasetPropertiesClass = cast( + models.DatasetPropertiesClass, outputs[0].record.aspect + ) + assert custom_properties_aspect.customProperties == { **EXISTING_PROPERTIES, **new_properties, } +def test_simple_add_dataset_properties_replace_existing(mock_time): + new_properties = {"new-simple-property": "new-value"} + outputs = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetProperties, + aspect=models.DatasetPropertiesClass( + 
customProperties=EXISTING_PROPERTIES.copy() + ), + config={ + "replace_existing": True, + "properties": new_properties, + }, + ) + + assert len(outputs) == 2 + assert outputs[0].record + assert outputs[0].record.aspect + custom_properties_aspect: models.DatasetPropertiesClass = cast( + models.DatasetPropertiesClass, outputs[0].record.aspect + ) + + assert custom_properties_aspect.customProperties == { + **new_properties, + } + + def test_simple_dataset_terms_transformation(mock_time): dataset_mce = make_generic_dataset() @@ -1337,84 +1446,6 @@ def test_supression_works(): assert len(outputs) == 2 # MCP will be dropped -class OldMCETransformer(DatasetTransformer): - """A simulated legacy MCE transformer""" - - @classmethod - def create(cls, config_dict: dict, ctx: PipelineContext) -> "OldMCETransformer": - return OldMCETransformer() - - def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass: - # legacy transformers should not receive metadata change proposal events - assert not isinstance(mce, MetadataChangeProposalWrapper) - if isinstance(mce, MetadataChangeEventClass): - assert isinstance(mce.proposedSnapshot, DatasetSnapshotClass) - mce.proposedSnapshot.aspects.append( - DatasetPropertiesClass(description="Old Transformer was here") - ) - - return mce - - -def test_old_transformers_working_as_before(mock_time): - - dataset_mce = make_generic_dataset() - dataset_mcp = make_generic_dataset_mcp() - transformer = OldMCETransformer.create( - {}, - PipelineContext(run_id="test-old-transformer"), - ) - - outputs = list( - transformer.transform( - [ - RecordEnvelope(input, metadata={}) - for input in [dataset_mce, dataset_mcp, EndOfStream()] - ] - ) - ) - - assert len(outputs) == 3 # MCP will come back untouched - - assert outputs[0].record == dataset_mce - # Check that glossary terms were added. 
- props_aspect = builder.get_aspect_if_available( - outputs[0].record, DatasetPropertiesClass - ) - assert props_aspect - assert props_aspect.description == "Old Transformer was here" - - assert outputs[1].record == dataset_mcp - - assert isinstance(outputs[-1].record, EndOfStream) - - # MCP only stream - dataset_mcps = [ - make_generic_dataset_mcp(), - make_generic_dataset_mcp( - aspect_name="datasetProperties", - aspect=DatasetPropertiesClass(description="Another test MCP"), - ), - EndOfStream(), - ] - transformer = OldMCETransformer.create( - {}, - PipelineContext(run_id="test-old-transformer"), - ) - - outputs = list( - transformer.transform( - [RecordEnvelope(input, metadata={}) for input in dataset_mcps] - ) - ) - - assert len(outputs) == 3 # MCP-s will come back untouched - - assert outputs[0].record == dataset_mcps[0] - assert outputs[1].record == dataset_mcps[1] - assert isinstance(outputs[-1].record, EndOfStream) - - def test_pattern_dataset_schema_terms_transformation(mock_time): dataset_mce = make_generic_dataset( aspects=[ @@ -1601,3 +1632,572 @@ def test_pattern_dataset_schema_tags_transformation(mock_time): assert schema_aspect.fields[2].globalTags.tags[1].tag == builder.make_tag_urn( "LastName" ) + + +def run_dataset_transformer_pipeline( + transformer_type: Type[DatasetTransformer], + aspect: builder.Aspect, + config: dict, + pipeline_context: PipelineContext = PipelineContext(run_id="transformer_pipe_line"), +) -> List[RecordEnvelope]: + + transformer: DatasetTransformer = cast( + DatasetTransformer, transformer_type.create(config, pipeline_context) + ) + + dataset_mcp = make_generic_dataset_mcp( + aspect=aspect, aspect_name=transformer.aspect_name() + ) + + outputs = list( + transformer.transform( + [ + RecordEnvelope(input, metadata={}) + for input in [dataset_mcp, EndOfStream()] + ] + ) + ) + return outputs + + +def test_simple_add_dataset_domain(mock_datahub_graph): + acryl_domain = builder.make_domain_urn("acryl.io") + gslab_domain = builder.make_domain_urn("gslab.io") + + pipeline_context: PipelineContext = PipelineContext( + run_id="test_simple_add_dataset_domain" + ) + pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[gslab_domain]), + config={"domains": [acryl_domain]}, + pipeline_context=pipeline_context, + ) + + assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.DomainsClass) + transformed_aspect = cast(models.DomainsClass, output[0].record.aspect) + assert len(transformed_aspect.domains) == 2 + assert gslab_domain in transformed_aspect.domains + assert acryl_domain in transformed_aspect.domains + + +def test_simple_add_dataset_domain_replace_existing(mock_datahub_graph): + acryl_domain = builder.make_domain_urn("acryl.io") + gslab_domain = builder.make_domain_urn("gslab.io") + + pipeline_context: PipelineContext = PipelineContext( + run_id="test_simple_add_dataset_domain" + ) + pipeline_context.graph = mock_datahub_graph(DatahubClientConfig) + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[gslab_domain]), + config={"replace_existing": True, "domains": [acryl_domain]}, + pipeline_context=pipeline_context, + ) + + assert len(output) == 2 + 
assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.DomainsClass) + transformed_aspect = cast(models.DomainsClass, output[0].record.aspect) + assert len(transformed_aspect.domains) == 1 + assert gslab_domain not in transformed_aspect.domains + assert acryl_domain in transformed_aspect.domains + + +def test_simple_add_dataset_domain_semantics_overwrite(mock_datahub_graph): + acryl_domain = builder.make_domain_urn("acryl.io") + gslab_domain = builder.make_domain_urn("gslab.io") + server_domain = builder.make_domain_urn("test.io") + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + + # Return fake aspect to simulate server behaviour + def fake_get_domain(entity_urn: str) -> models.DomainsClass: + return models.DomainsClass(domains=[server_domain]) + + pipeline_context.graph.get_domain = fake_get_domain # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[gslab_domain]), + config={ + "semantics": TransformerSemantics.OVERWRITE, + "domains": [acryl_domain], + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.DomainsClass) + transformed_aspect = cast(models.DomainsClass, output[0].record.aspect) + assert len(transformed_aspect.domains) == 2 + assert gslab_domain in transformed_aspect.domains + assert acryl_domain in transformed_aspect.domains + assert server_domain not in transformed_aspect.domains + + +def test_simple_add_dataset_domain_semantics_patch( + pytestconfig, tmp_path, mock_time, mock_datahub_graph +): + acryl_domain = builder.make_domain_urn("acryl.io") + gslab_domain = builder.make_domain_urn("gslab.io") + server_domain = builder.make_domain_urn("test.io") + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + + # Return fake aspect to simulate server behaviour + def fake_get_domain(entity_urn: str) -> models.DomainsClass: + return models.DomainsClass(domains=[server_domain]) + + pipeline_context.graph.get_domain = fake_get_domain # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[gslab_domain]), + config={ + "replace_existing": False, + "semantics": TransformerSemantics.PATCH, + "domains": [acryl_domain], + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.DomainsClass) + transformed_aspect = cast(models.DomainsClass, output[0].record.aspect) + assert len(transformed_aspect.domains) == 3 + assert gslab_domain in transformed_aspect.domains + assert acryl_domain in transformed_aspect.domains + assert server_domain in transformed_aspect.domains + + +def test_simple_dataset_ownership_transformer_semantics_patch(mock_datahub_graph): + + pipeline_context 
= PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + + server_owner: str = builder.make_owner_urn( + "mohd@acryl.io", owner_type=builder.OwnerType.USER + ) + owner1: str = builder.make_owner_urn( + "john@acryl.io", owner_type=builder.OwnerType.USER + ) + owner2: str = builder.make_owner_urn( + "pedro@acryl.io", owner_type=builder.OwnerType.USER + ) + + # Return fake aspect to simulate server behaviour + def fake_ownership_class(entity_urn: str) -> models.OwnershipClass: + return models.OwnershipClass( + owners=[ + models.OwnerClass( + owner=server_owner, type=models.OwnershipTypeClass.DATAOWNER + ) + ] + ) + + pipeline_context.graph.get_ownership = fake_ownership_class # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetOwnership, + aspect=models.OwnershipClass( + owners=[ + models.OwnerClass(owner=owner1, type=models.OwnershipTypeClass.PRODUCER) + ] + ), + config={ + "replace_existing": False, + "semantics": TransformerSemantics.PATCH, + "owner_urns": [owner2], + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.OwnershipClass) + transformed_aspect: models.OwnershipClass = cast( + models.OwnershipClass, output[0].record.aspect + ) + assert len(transformed_aspect.owners) == 3 + owner_urns: List[str] = [ + owner_class.owner for owner_class in transformed_aspect.owners + ] + assert owner1 in owner_urns + assert owner2 in owner_urns + assert server_owner in owner_urns + + +def run_pattern_dataset_schema_terms_transformation_semantics( + semantics: TransformerSemantics, + mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph], +) -> List[RecordEnvelope]: + pipeline_context = PipelineContext(run_id="test_pattern_dataset_schema_terms") + pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + + # fake the server response + def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: + return models.SchemaMetadataClass( + schemaName="customer", # not used + platform=builder.make_data_platform_urn( + "hive" + ), # important <- platform must be an urn + version=0, + # when the source system has a notion of versioning of schemas, insert this in, otherwise leave as 0 + hash="", + # when the source system has a notion of unique schemas identified via hash, include a hash, else leave it as empty string + platformSchema=models.OtherSchemaClass( + rawSchema="__insert raw schema here__" + ), + fields=[ + models.SchemaFieldClass( + fieldPath="first_name", + glossaryTerms=models.GlossaryTermsClass( + terms=[ + models.GlossaryTermAssociationClass( + urn=builder.make_term_urn("pii") + ) + ], + auditStamp=models.AuditStampClass.construct_with_defaults(), + ), + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + models.SchemaFieldClass( + fieldPath="mobile_number", + glossaryTerms=models.GlossaryTermsClass( + terms=[ + models.GlossaryTermAssociationClass( + urn=builder.make_term_urn("pii") + ) + ], + auditStamp=models.AuditStampClass.construct_with_defaults(), + ), + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use 
this to provide the type of the field in the source system's vernacular + ), + ], + ) + + pipeline_context.graph.get_schema_metadata = fake_schema_metadata # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=PatternAddDatasetSchemaTerms, + pipeline_context=pipeline_context, + config={ + "semantics": semantics, + "term_pattern": { + "rules": { + ".*first_name.*": [ + builder.make_term_urn("Name"), + builder.make_term_urn("FirstName"), + ], + ".*last_name.*": [ + builder.make_term_urn("Name"), + builder.make_term_urn("LastName"), + ], + } + }, + }, + aspect=models.SchemaMetadataClass( + schemaName="customer", # not used + platform=builder.make_data_platform_urn( + "hive" + ), # important <- platform must be an urn + version=0, + # when the source system has a notion of versioning of schemas, insert this in, otherwise leave as 0 + hash="", + # when the source system has a notion of unique schemas identified via hash, include a hash, else leave it as empty string + platformSchema=models.OtherSchemaClass( + rawSchema="__insert raw schema here__" + ), + fields=[ + models.SchemaFieldClass( + fieldPath="address", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + models.SchemaFieldClass( + fieldPath="first_name", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + models.SchemaFieldClass( + fieldPath="last_name", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + ], + ), + ) + + return output + + +def test_pattern_dataset_schema_terms_transformation_patch( + mock_time, mock_datahub_graph +): + output = run_pattern_dataset_schema_terms_transformation_semantics( + TransformerSemantics.PATCH, mock_datahub_graph + ) + assert len(output) == 2 + # Check that glossary terms were added. + assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.SchemaMetadataClass) + transform_aspect = cast(models.SchemaMetadataClass, output[0].record.aspect) + field_path_vs_field: Dict[str, models.SchemaFieldClass] = { + field.fieldPath: field for field in transform_aspect.fields + } + + assert ( + field_path_vs_field.get("mobile_number") is not None + ) # server field should be preserved during patch + + assert field_path_vs_field["first_name"].glossaryTerms is not None + assert len(field_path_vs_field["first_name"].glossaryTerms.terms) == 3 + glossary_terms_urn = [ + term.urn for term in field_path_vs_field["first_name"].glossaryTerms.terms + ] + assert builder.make_term_urn("pii") in glossary_terms_urn + assert builder.make_term_urn("FirstName") in glossary_terms_urn + assert builder.make_term_urn("Name") in glossary_terms_urn + + +def test_pattern_dataset_schema_terms_transformation_overwrite( + mock_time, mock_datahub_graph +): + output = run_pattern_dataset_schema_terms_transformation_semantics( + TransformerSemantics.OVERWRITE, mock_datahub_graph + ) + + assert len(output) == 2 + # Check that glossary terms were added. 
+ assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.SchemaMetadataClass) + transform_aspect = cast(models.SchemaMetadataClass, output[0].record.aspect) + field_path_vs_field: Dict[str, models.SchemaFieldClass] = { + field.fieldPath: field for field in transform_aspect.fields + } + + assert ( + field_path_vs_field.get("mobile_number") is None + ) # server field should not be preserved during overwrite + + assert field_path_vs_field["first_name"].glossaryTerms is not None + assert len(field_path_vs_field["first_name"].glossaryTerms.terms) == 2 + glossary_terms_urn = [ + term.urn for term in field_path_vs_field["first_name"].glossaryTerms.terms + ] + assert builder.make_term_urn("pii") not in glossary_terms_urn + assert builder.make_term_urn("FirstName") in glossary_terms_urn + assert builder.make_term_urn("Name") in glossary_terms_urn + + +def run_pattern_dataset_schema_tags_transformation_semantics( + semantics: TransformerSemantics, + mock_datahub_graph: Callable[[DatahubClientConfig], DataHubGraph], +) -> List[RecordEnvelope]: + pipeline_context = PipelineContext(run_id="test_pattern_dataset_schema_terms") + pipeline_context.graph = mock_datahub_graph(DatahubClientConfig()) + + # fake the server response + def fake_schema_metadata(entity_urn: str) -> models.SchemaMetadataClass: + return models.SchemaMetadataClass( + schemaName="customer", # not used + platform=builder.make_data_platform_urn( + "hive" + ), # important <- platform must be an urn + version=0, + # when the source system has a notion of versioning of schemas, insert this in, otherwise leave as 0 + hash="", + # when the source system has a notion of unique schemas identified via hash, include a hash, else leave it as empty string + platformSchema=models.OtherSchemaClass( + rawSchema="__insert raw schema here__" + ), + fields=[ + models.SchemaFieldClass( + fieldPath="first_name", + globalTags=models.GlobalTagsClass( + tags=[ + models.TagAssociationClass(tag=builder.make_tag_urn("pii")) + ], + ), + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + models.SchemaFieldClass( + fieldPath="mobile_number", + globalTags=models.GlobalTagsClass( + tags=[ + models.TagAssociationClass(tag=builder.make_tag_urn("pii")) + ], + ), + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + ], + ) + + pipeline_context.graph.get_schema_metadata = fake_schema_metadata # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=PatternAddDatasetSchemaTags, + pipeline_context=pipeline_context, + config={ + "semantics": semantics, + "tag_pattern": { + "rules": { + ".*first_name.*": [ + builder.make_tag_urn("Name"), + builder.make_tag_urn("FirstName"), + ], + ".*last_name.*": [ + builder.make_tag_urn("Name"), + builder.make_tag_urn("LastName"), + ], + } + }, + }, + aspect=models.SchemaMetadataClass( + schemaName="customer", # not used + platform=builder.make_data_platform_urn( + "hive" + ), # important <- platform must be an urn + version=0, + # when the source system has a notion of versioning of schemas, insert this in, otherwise leave as 0 + hash="", + # 
when the source system has a notion of unique schemas identified via hash, include a hash, else leave it as empty string + platformSchema=models.OtherSchemaClass( + rawSchema="__insert raw schema here__" + ), + fields=[ + models.SchemaFieldClass( + fieldPath="address", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + models.SchemaFieldClass( + fieldPath="first_name", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + models.SchemaFieldClass( + fieldPath="last_name", + type=models.SchemaFieldDataTypeClass(type=models.StringTypeClass()), + nativeDataType="VARCHAR(100)", + # use this to provide the type of the field in the source system's vernacular + ), + ], + ), + ) + return output + + +def test_pattern_dataset_schema_tags_transformation_overwrite( + mock_time, mock_datahub_graph +): + output = run_pattern_dataset_schema_tags_transformation_semantics( + TransformerSemantics.OVERWRITE, mock_datahub_graph + ) + + assert len(output) == 2 + # Check that glossary terms were added. + assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.SchemaMetadataClass) + transform_aspect = cast(models.SchemaMetadataClass, output[0].record.aspect) + field_path_vs_field: Dict[str, models.SchemaFieldClass] = { + field.fieldPath: field for field in transform_aspect.fields + } + + assert ( + field_path_vs_field.get("mobile_number") is None + ) # server field should not be preserved during overwrite + + assert field_path_vs_field["first_name"].globalTags is not None + assert len(field_path_vs_field["first_name"].globalTags.tags) == 2 + global_tags_urn = [ + tag.tag for tag in field_path_vs_field["first_name"].globalTags.tags + ] + assert builder.make_tag_urn("pii") not in global_tags_urn + assert builder.make_tag_urn("FirstName") in global_tags_urn + assert builder.make_tag_urn("Name") in global_tags_urn + + +def test_pattern_dataset_schema_tags_transformation_patch( + mock_time, mock_datahub_graph +): + output = run_pattern_dataset_schema_tags_transformation_semantics( + TransformerSemantics.PATCH, mock_datahub_graph + ) + + assert len(output) == 2 + # Check that global tags were added. 
+ assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.SchemaMetadataClass) + transform_aspect = cast(models.SchemaMetadataClass, output[0].record.aspect) + field_path_vs_field: Dict[str, models.SchemaFieldClass] = { + field.fieldPath: field for field in transform_aspect.fields + } + + assert ( + field_path_vs_field.get("mobile_number") is not None + ) # server field should be preserved during patch + + assert field_path_vs_field["first_name"].globalTags is not None + assert len(field_path_vs_field["first_name"].globalTags.tags) == 3 + global_tags_urn = [ + tag.tag for tag in field_path_vs_field["first_name"].globalTags.tags + ] + assert builder.make_tag_urn("pii") in global_tags_urn + assert builder.make_tag_urn("FirstName") in global_tags_urn + assert builder.make_tag_urn("Name") in global_tags_urn diff --git a/metadata-ingestion/transformers.md b/metadata-ingestion/transformers.md deleted file mode 100644 index 8529e29a567676..00000000000000 --- a/metadata-ingestion/transformers.md +++ /dev/null @@ -1,608 +0,0 @@ -# Transformers - -## What’s a transformer? - -Oftentimes we want to modify metadata before it reaches the ingestion sink – for instance, we might want to add custom tags, ownership, properties, or patch some fields. A transformer allows us to do exactly these things. - -Moreover, a transformer allows one to have fine-grained control over the metadata that’s ingested without having to modify the ingestion framework's code yourself. Instead, you can write your own module that can transform metadata events however you like. To include a transformer into a recipe, all that's needed is the name of the transformer as well as any configuration that the transformer needs. - -## Provided transformers - -Aside from the option of writing your own transformer (see below), we provide some simple transformers for the use cases of adding: dataset tags, dataset glossary terms, dataset properties and ownership information. - -### Adding a set of tags - -Let’s suppose we’d like to add a set of dataset tags. To do so, we can use the `simple_add_dataset_tags` module that’s included in the ingestion framework. - -The config, which we’d append to our ingestion recipe YAML, would look like this: - -```yaml -transformers: - - type: "simple_add_dataset_tags" - config: - tag_urns: - - "urn:li:tag:NeedsDocumentation" - - "urn:li:tag:Legacy" -``` - -### Adding tags by dataset urn pattern - -Let’s suppose we’d like to append a series of tags to specific datasets. To do so, we can use the `pattern_add_dataset_tags` module that’s included in the ingestion framework. This will match the regex pattern to `urn` of the dataset and assign the respective tags urns given in the array. - -The config, which we’d append to our ingestion recipe YAML, would look like this: - -```yaml -transformers: - - type: "pattern_add_dataset_tags" - config: - tag_pattern: - rules: - ".*example1.*": ["urn:li:tag:NeedsDocumentation", "urn:li:tag:Legacy"] - ".*example2.*": ["urn:li:tag:NeedsDocumentation"] -``` - -### Adding tags by schema field pattern - -We can also append a series of tags to specific schema fields. To do so, we can use the `pattern_add_dataset_schema_tags` module. This will match the regex pattern to each schema field path and assign the respective tags urns given in the array. 
- -Note that the tags from the first matching pattern will be applied, not all matching patterns. - -The config would look like this: - -```yaml -transformers: - - type: "pattern_add_dataset_schema_tags" - config: - tag_pattern: - rules: - ".*email.*": ["urn:li:tag:Email"] - ".*name.*": ["urn:li:tag:Name"] -``` - -### Add your own custom Transformer - -If you'd like to add more complex logic for assigning tags, you can use the more generic add_dataset_tags transformer, which calls a user-provided function to determine the tags for each dataset. - -```yaml -transformers: - - type: "add_dataset_tags" - config: - get_tags_to_add: "." -``` - -Then define your function to return a list of TagAssociationClass tags, for example: - -```python -import logging - -import datahub.emitter.mce_builder as builder -from datahub.metadata.schema_classes import ( - DatasetSnapshotClass, - TagAssociationClass -) - -def custom_tags(current: DatasetSnapshotClass) -> List[TagAssociationClass]: - """ Returns tags to associate to a dataset depending on custom logic - - This function receives a DatasetSnapshotClass, performs custom logic and returns - a list of TagAssociationClass-wrapped tags. - - Args: - current (DatasetSnapshotClass): Single DatasetSnapshotClass object - - Returns: - List of TagAssociationClass objects. - """ - - tag_strings = [] - - ### Add custom logic here - tag_strings.append('custom1') - tag_strings.append('custom2') - - tag_strings = [builder.make_tag_urn(tag=n) for n in tag_strings] - tags = [TagAssociationClass(tag=tag) for tag in tag_strings] - - logging.info(f"Tagging dataset {current.urn} with {tag_strings}.") - return tags -``` -Finally, you can install and use your custom transformer as [shown here](#installing-the-package). - -### Adding a set of glossary terms - -We can use a similar convention to associate [Glossary Terms](../docs/generated/ingestion/sources/business-glossary.md) to datasets. We can use the `simple_add_dataset_terms` module that’s included in the ingestion framework. - -The config, which we’d append to our ingestion recipe YAML, would look like this: - -```yaml -transformers: - - type: "simple_add_dataset_terms" - config: - term_urns: - - "urn:li:glossaryTerm:Email" - - "urn:li:glossaryTerm:Address" -``` - -### Adding glossary terms by dataset urn pattern - -Similar to the above example with tags, we can add glossary terms to datasets based on a regex filter. - -```yaml -transformers: - - type: "pattern_add_dataset_terms" - config: - term_pattern: - rules: - ".*example1.*": ["urn:li:glossaryTerm:Email", "urn:li:glossaryTerm:Address"] - ".*example2.*": ["urn:li:glossaryTerm:PostalCode"] -``` - -### Adding glossary terms by schema field pattern - -Similar to the above example with tags applied to schema fields, we can add glossary terms to schema fields based on a regex filter. -Again, note that only terms from the first matching pattern will be applied. - -```yaml -transformers: - - type: "pattern_add_dataset_schema_terms" - config: - term_pattern: - rules: - ".*email.*": ["urn:li:glossaryTerm:Email"] - ".*name.*": ["urn:li:glossaryTerm:Name"] -``` - -### Change owners - -If we wanted to clear existing owners sent by ingestion source we can use the `simple_remove_dataset_ownership` module which removes all owners sent by the ingestion source. - -```yaml -transformers: - - type: "simple_remove_dataset_ownership" - config: {} -``` - -The main use case of `simple_remove_dataset_ownership` is to remove incorrect owners present in the source. 
You can use it along with the next `simple_add_dataset_ownership` to remove wrong owners and add the correct ones. - -Let’s suppose we’d like to append a series of users who we know to own a dataset but aren't detected during normal ingestion. To do so, we can use the `simple_add_dataset_ownership` module that’s included in the ingestion framework. - -The config, which we’d append to our ingestion recipe YAML, would look like this: - -```yaml -transformers: - - type: "simple_add_dataset_ownership" - config: - owner_urns: - - "urn:li:corpuser:username1" - - "urn:li:corpuser:username2" - - "urn:li:corpGroup:groupname" - ownership_type: "PRODUCER" -``` - -Note `ownership_type` is an optional field with `DATAOWNER` as default value. - -### Setting ownership by dataset urn pattern - -Again, let’s suppose we’d like to append a series of users who we know to own a different dataset from a data source but aren't detected during normal ingestion. To do so, we can use the `pattern_add_dataset_ownership` module that’s included in the ingestion framework. This will match the pattern to `urn` of the dataset and assign the respective owners. - -The config, which we’d append to our ingestion recipe YAML, would look like this: - -```yaml -transformers: - - type: "pattern_add_dataset_ownership" - config: - owner_pattern: - rules: - ".*example1.*": ["urn:li:corpuser:username1"] - ".*example2.*": ["urn:li:corpuser:username2"] - ownership_type: "DEVELOPER" -``` - -Note `ownership_type` is an optional field with `DATAOWNER` as default value. - -If you'd like to add more complex logic for assigning ownership, you can use the more generic `add_dataset_ownership` transformer, which calls a user-provided function to determine the ownership of each dataset. - -```yaml -transformers: - - type: "add_dataset_ownership" - config: - get_owners_to_add: "." -``` - -Note that whatever owners you send via this will overwrite the owners present in the UI. -### Add domain to dataset -let’s suppose we’d like to add a series of domain to dataset, in this case you can use `simple_add_dataset_domain` transformer. - -The config, which we’d append to our ingestion recipe YAML, would look like this: - -Here we can set `domains` to either urn (i.e. `urn:li:domain:engineering`) or simple domain name (i.e. `engineering`) -in both of the cases domain should be provisioned on DataHub GMS - -```yaml -transformers: - - type: "simple_add_dataset_domain" - config: - semantics: OVERWRITE - domains: - - urn:li:domain:engineering -``` -It will add domain to all datasets, above yaml configuration will overwrite the existing domain of the datasets on DataHub GMS, -if you want to preserve domain stored on DataHub GMS then set semantics to `PATCH` as shown in below configuration - -```yaml -transformers: - - type: "simple_add_dataset_domain" - config: - semantics: PATCH - domains: - - urn:li:domain:engineering -``` - -### Adding domain by dataset urn pattern -Let’s suppose we’d like to append a series of domain to specific datasets. To do so, we can use the `pattern_add_dataset_domain` transformer that’s included in the ingestion framework. This will match the regex pattern to `urn` of the dataset and assign the respective domain urns given in the array. - -The config, which we’d append to our ingestion recipe YAML, would look like this: - -Here we can set domain list to either urn (i.e. `urn:li:domain:hr`) or simple domain name (i.e. 
`hr`) -in both of the cases domain should be provisioned on DataHub GMS - -```yaml - transformers: - - type: "pattern_add_dataset_domain" - config: - semantics: OVERWRITE - domain_pattern: - rules: - 'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.n.*': ["hr"] - 'urn:li:dataset:\(urn:li:dataPlatform:postgres,postgres\.public\.t.*': ["urn:li:domain:finance"] -``` - -We can set semantics to `PATCH` to preserve the domain of the dataset stored on DataHub GMS. - -### Mark dataset status - -If you would like to stop a dataset from appearing in the UI, then you need to mark the status of the dataset as removed. You can use this transformer after filtering for the specific datasets that you want to mark as removed. - -```yaml -transformers: - - type: "mark_dataset_status" - config: - removed: true -``` - -### Add dataset browse paths - -If you would like to add to browse paths of dataset can use this transformer. There are 3 optional variables that you can use to get information from the dataset `urn`: -- ENV: env passed (default: prod) -- PLATFORM: `mysql`, `postgres` or different platform supported by datahub -- DATASET_PARTS: slash separated parts of dataset name. e.g. `database_name/schema_name/[table_name]` for postgres - -e.g. this can be used to create browse paths like `/prod/postgres/superset/public/logs` for table `superset.public.logs` in a `postgres` database -```yaml -transformers: - - type: "set_dataset_browse_path" - config: - path_templates: - - /ENV/PLATFORM/DATASET_PARTS -``` - -If you don't want the environment but wanted to add something static in the browse path like the database instance name you can use this. -```yaml -transformers: - - type: "set_dataset_browse_path" - config: - path_templates: - - /PLATFORM/marketing_db/DATASET_PARTS -``` -It will create browse path like `/mysql/marketing_db/sales/orders` for a table `sales.orders` in `mysql` database instance. - -You can use this to add multiple browse paths. Different people might know the same data assets by different names. -```yaml -transformers: - - type: "set_dataset_browse_path" - config: - path_templates: - - /PLATFORM/marketing_db/DATASET_PARTS - - /data_warehouse/DATASET_PARTS -``` -This will add 2 browse paths like `/mysql/marketing_db/sales/orders` and `/data_warehouse/sales/orders` for a table `sales.orders` in `mysql` database instance. - -Default behaviour of the transform is to add new browse paths, you can optionally set `replace_existing: True` so -the transform becomes a _set_ operation instead of an _append_. -```yaml -transformers: - - type: "set_dataset_browse_path" - config: - replace_existing: True - path_templates: - - /ENV/PLATFORM/DATASET_PARTS -``` -In this case, the resulting dataset will have only 1 browse path, the one from the transform. - -Note that whatever browse paths you send via this will overwrite the browse paths present in the UI. - - -### Adding a set of properties - -If you'd like to add more complex logic for assigning properties, you can use the `add_dataset_properties` transformer, which calls a user-provided class (that extends from `AddDatasetPropertiesResolverBase` class) to determine the properties for each dataset. - -The config, which we’d append to our ingestion recipe YAML, would look like this: - -```yaml -transformers: - - type: "add_dataset_properties" - config: - add_properties_resolver_class: "." 
-``` - -Then define your class to return a list of custom properties, for example: - -```python -import logging -from typing import Dict -from datahub.ingestion.transformer.add_dataset_properties import AddDatasetPropertiesResolverBase -from datahub.metadata.schema_classes import DatasetSnapshotClass - -class MyPropertiesResolver(AddDatasetPropertiesResolverBase): - def get_properties_to_add(self, current: DatasetSnapshotClass) -> Dict[str, str]: - ### Add custom logic here - properties= {'my_custom_property': 'property value'} - logging.info(f"Adding properties: {properties} to dataset: {current.urn}.") - return properties -``` - -There also exists `simple_add_dataset_properties` transformer for directly assigning properties from the configuration. -`properties` field is a dictionary of string values. Note in case of any key collision, the value in the config will -overwrite the previous value. - -```yaml -transformers: - - type: "simple_add_dataset_properties" - config: - properties: - prop1: value1 - prop2: value2 -``` - -## Writing a custom transformer from scratch - -In the above couple of examples, we use classes that have already been implemented in the ingestion framework. However, it’s common for more advanced cases to pop up where custom code is required, for instance if you'd like to utilize conditional logic or rewrite properties. In such cases, we can add our own modules and define the arguments it takes as a custom transformer. - -As an example, suppose we want to append a set of ownership fields to our metadata that are dependent upon an external source – for instance, an API endpoint or file – rather than a preset list like above. In this case, we can set a JSON file as an argument to our custom config, and our transformer will read this file and append the included ownership elements to all metadata events. - -Our JSON file might look like the following: - -```json -[ - "urn:li:corpuser:athos", - "urn:li:corpuser:porthos", - "urn:li:corpuser:aramis", - "urn:li:corpGroup:the_three_musketeers" -] -``` - -### Defining a config - -To get started, we’ll initiate an `AddCustomOwnershipConfig` class that inherits from [`datahub.configuration.common.ConfigModel`](./src/datahub/configuration/common.py). The sole parameter will be an `owners_json` which expects a path to a JSON file containing a list of owner URNs. This will go in a file called `custom_transform_example.py`. - -```python -from datahub.configuration.common import ConfigModel - -class AddCustomOwnershipConfig(ConfigModel): - owners_json: str -``` - -### Defining the transformer - -Next, we’ll define the transformer itself, which must inherit from [`datahub.ingestion.api.transform.Transformer`](./src/datahub/ingestion/api/transform.py). The framework provides a helper class called [`datahub.ingestion.transformer.base_transformer.BaseTransformer`](./src/datahub/ingestion/transformer/base_transformer.py) that makes it super-simple to write transformers. 
-First, let's get all our imports in: - -```python -# append these to the start of custom_transform_example.py -import json -from typing import List, Optional - -from datahub.configuration.common import ConfigModel -from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.transformer.add_dataset_ownership import Semantics -from datahub.ingestion.transformer.base_transformer import ( - BaseTransformer, - SingleAspectTransformer, -) -from datahub.metadata.schema_classes import ( - OwnerClass, - OwnershipClass, - OwnershipTypeClass, -) - -``` - -Next, let's define the base scaffolding for the class: - -```python -# append this to the end of custom_transform_example.py - -class AddCustomOwnership(BaseTransformer, SingleAspectTransformer): - """Transformer that adds owners to datasets according to a callback function.""" - - # context param to generate run metadata such as a run ID - ctx: PipelineContext - # as defined in the previous block - config: AddCustomOwnershipConfig - - def __init__(self, config: AddCustomOwnershipConfig, ctx: PipelineContext): - super().__init__() - self.ctx = ctx - self.config = config - - with open(self.config.owners_json, "r") as f: - raw_owner_urns = json.load(f) - - self.owners = [ - OwnerClass(owner=owner, type=OwnershipTypeClass.DATAOWNER) - for owner in raw_owner_urns - ] -``` - -A transformer must have two functions: a `create()` function for initialization and a `transform()` function for executing the transformation. Transformers that extend `BaseTransformer` and `SingleAspectTransformer` can avoid having to implement the more complex `transform` function and just implement the `transform_aspect` function. - -Let's begin by adding a `create()` method for parsing our configuration dictionary: - -```python -# add this as a function of AddCustomOwnership - -@classmethod -def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddCustomOwnership": - config = AddCustomOwnershipConfig.parse_obj(config_dict) - return cls(config, ctx) -``` - -Next we need to tell the helper classes which entity types and aspect we are interested in transforming. In this case, we want to only process `dataset` entities and transform the `ownership` aspect. - -```python -def entity_types(self) -> List[str]: - return ["dataset"] - - def aspect_name(self) -> str: - return "ownership" -``` - -Finally we need to implement the `transform_aspect()` method that does the work of adding our custom ownership classes. This method will be called be the framework with an optional aspect value filled out if the upstream source produced a value for this aspect. The framework takes care of pre-processing both MCE-s and MCP-s so that the `transform_aspect()` function is only called one per entity. Our job is merely to inspect the incoming aspect (or absence) and produce a transformed value for this aspect. Returning `None` from this method will effectively suppress this aspect from being emitted. 
- -```python -# add this as a function of AddCustomOwnership - - def transform_aspect( # type: ignore - self, entity_urn: str, aspect_name: str, aspect: Optional[OwnershipClass] - ) -> Optional[OwnershipClass]: - - owners_to_add = self.owners - assert aspect is None or isinstance(aspect, OwnershipClass) - - if owners_to_add: - ownership = ( - aspect - if aspect - else OwnershipClass( - owners=[], - ) - ) - ownership.owners.extend(owners_to_add) - - return ownership -``` - -### More Sophistication: Making calls to DataHub during Transformation - -In some advanced cases, you might want to check with DataHub before performing a transformation. A good example for this might be retrieving the current set of owners of a dataset before providing the new set of owners during an ingestion process. To allow transformers to always be able to query the graph, the framework provides them access to the graph through the context object `ctx`. Connectivity to the graph is automatically instantiated anytime the pipeline uses a REST sink. In case you are using the Kafka sink, you can additionally provide access to the graph by configuring it in your pipeline. - -Here is an example of a recipe that uses Kafka as the sink, but provides access to the graph by explicitly configuring the `datahub_api`. - -```yaml -source: - type: mysql - config: - # ..source configs - -sink: - type: datahub-kafka - config: - connection: - bootstrap: localhost:9092 - schema_registry_url: "http://localhost:8081" - -datahub_api: - server: http://localhost:8080 - # standard configs accepted by datahub rest client ... -``` - -#### Advanced Use-Case: Patching Owners - -With the above capability, we can now build more powerful transformers that can check with the server-side state before issuing changes in metadata. -e.g. Here is how the AddDatasetOwnership transformer can now support PATCH semantics by ensuring that it never deletes any owners that are stored on the server. - -```python -def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass: - if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass): - return mce - owners_to_add = self.config.get_owners_to_add(mce.proposedSnapshot) - if owners_to_add: - ownership = builder.get_or_add_aspect( - mce, - OwnershipClass( - owners=[], - ), - ) - ownership.owners.extend(owners_to_add) - - if self.config.semantics == Semantics.PATCH: - assert self.ctx.graph - patch_ownership = AddDatasetOwnership.get_ownership_to_set( - self.ctx.graph, mce.proposedSnapshot.urn, ownership - ) - builder.set_aspect( - mce, aspect=patch_ownership, aspect_type=OwnershipClass - ) - return mce -``` - -### Installing the package - -Now that we've defined the transformer, we need to make it visible to DataHub. The easiest way to do this is to just place it in the same directory as your recipe, in which case the module name is the same as the file – in this case, `custom_transform_example`. - -
- Advanced: installing as a package -Alternatively, create a `setup.py` in the same directory as our transform script to make it visible globally. After installing this package (e.g. with `python setup.py` or `pip install -e .`), our module will be installed and importable as `custom_transform_example`. - -```python -from setuptools import find_packages, setup - -setup( - name="custom_transform_example", - version="1.0", - packages=find_packages(), - # if you don't already have DataHub installed, add it under install_requires - # install_requires=["acryl-datahub"] -) -``` - -
- -### Running the transform - -```yaml -transformers: - - type: "custom_transform_example.AddCustomOwnership" - config: - owners_json: "" # the JSON file mentioned at the start -``` - -After running `datahub ingest -c `, our MCEs will now have the following owners appended: - -```json -"owners": [ - { - "owner": "urn:li:corpuser:athos", - "type": "DATAOWNER", - "source": null - }, - { - "owner": "urn:li:corpuser:porthos", - "type": "DATAOWNER", - "source": null - }, - { - "owner": "urn:li:corpuser:aramis", - "type": "DATAOWNER", - "source": null - }, - { - "owner": "urn:li:corpGroup:the_three_musketeers", - "type": "DATAOWNER", - "source": null - }, - // ...and any additional owners -], -``` - -All the files for this tutorial may be found [here](./examples/transforms/). diff --git a/metadata-jobs/mae-consumer-job/src/main/java/com/linkedin/metadata/kafka/MaeConsumerApplication.java b/metadata-jobs/mae-consumer-job/src/main/java/com/linkedin/metadata/kafka/MaeConsumerApplication.java index cd856399c4e890..cf04c5bfb61b2a 100644 --- a/metadata-jobs/mae-consumer-job/src/main/java/com/linkedin/metadata/kafka/MaeConsumerApplication.java +++ b/metadata-jobs/mae-consumer-job/src/main/java/com/linkedin/metadata/kafka/MaeConsumerApplication.java @@ -2,6 +2,7 @@ import com.linkedin.gms.factory.telemetry.ScheduledAnalyticsFactory; import org.springframework.boot.SpringApplication; +import org.springframework.boot.actuate.autoconfigure.solr.SolrHealthContributorAutoConfiguration; import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.cassandra.CassandraAutoConfiguration; import org.springframework.boot.autoconfigure.elasticsearch.ElasticsearchRestClientAutoConfiguration; @@ -10,7 +11,8 @@ @SuppressWarnings("checkstyle:HideUtilityClassConstructor") -@SpringBootApplication(exclude = {ElasticsearchRestClientAutoConfiguration.class, CassandraAutoConfiguration.class}) +@SpringBootApplication(exclude = {ElasticsearchRestClientAutoConfiguration.class, CassandraAutoConfiguration.class, + SolrHealthContributorAutoConfiguration.class}) @ComponentScan(excludeFilters = { @ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, classes = ScheduledAnalyticsFactory.class)}) public class MaeConsumerApplication { diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeLogProcessor.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeLogProcessor.java index 02946464d5fab3..6700dec65a11d8 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeLogProcessor.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/MetadataChangeLogProcessor.java @@ -81,6 +81,9 @@ public void consume(final ConsumerRecord consumerRecord) // Here - plug in additional "custom processor hooks" for (MetadataChangeLogHook hook : this.hooks) { + if (!hook.isEnabled()) { + continue; + } try (Timer.Context ignored = MetricUtils.timer(this.getClass(), hook.getClass().getSimpleName() + "_latency") .time()) { hook.invoke(event); diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/MetadataChangeLogHook.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/MetadataChangeLogHook.java index bc5ab2a01d055e..c7857eb7baffc1 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/MetadataChangeLogHook.java +++ 
b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/MetadataChangeLogHook.java @@ -18,6 +18,13 @@ public interface MetadataChangeLogHook { */ default void init() { } + /** + * Return whether the hook is enabled or not. If not enabled, the below invoke method is not triggered + */ + default boolean isEnabled() { + return true; + } + /** * Invoke the hook when a MetadataChangeLog is received */ diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java index c0603d07cb6e48..1b78f77687e33d 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHook.java @@ -35,6 +35,7 @@ import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Import; import org.springframework.stereotype.Component; @@ -91,17 +92,25 @@ public class EntityChangeEventGeneratorHook implements MetadataChangeLogHook { private final EntityClient _entityClient; private final Authentication _systemAuthentication; private final EntityRegistry _entityRegistry; + private final Boolean _isEnabled; @Autowired public EntityChangeEventGeneratorHook( @Nonnull final AspectDifferRegistry aspectDifferRegistry, @Nonnull final RestliEntityClient entityClient, @Nonnull final Authentication systemAuthentication, - @Nonnull final EntityRegistry entityRegistry) { + @Nonnull final EntityRegistry entityRegistry, + @Nonnull @Value("${entityChangeEvents.enabled:true}") Boolean isEnabled) { _aspectDifferRegistry = Objects.requireNonNull(aspectDifferRegistry); _entityClient = Objects.requireNonNull(entityClient); _systemAuthentication = Objects.requireNonNull(systemAuthentication); _entityRegistry = Objects.requireNonNull(entityRegistry); + _isEnabled = isEnabled; + } + + @Override + public boolean isEnabled() { + return _isEnabled; } @Override diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHookTest.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHookTest.java index 4c87d2c93daf02..e2cecfbeb20d2f 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHookTest.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/event/EntityChangeEventGeneratorHookTest.java @@ -79,7 +79,8 @@ public void setupTest() { differRegistry, _mockClient, mockAuthentication, - createMockEntityRegistry()); + createMockEntityRegistry(), + true); } @Test diff --git a/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MceConsumerApplication.java b/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MceConsumerApplication.java index ba605a59abd884..840abedc20e2ad 100644 --- a/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MceConsumerApplication.java +++ b/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MceConsumerApplication.java @@ -2,6 +2,7 @@ import 
com.linkedin.gms.factory.telemetry.ScheduledAnalyticsFactory; import org.springframework.boot.SpringApplication; +import org.springframework.boot.actuate.autoconfigure.solr.SolrHealthContributorAutoConfiguration; import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.cassandra.CassandraAutoConfiguration; import org.springframework.boot.autoconfigure.elasticsearch.ElasticsearchRestClientAutoConfiguration; @@ -10,7 +11,8 @@ @SuppressWarnings("checkstyle:HideUtilityClassConstructor") -@SpringBootApplication(exclude = {ElasticsearchRestClientAutoConfiguration.class, CassandraAutoConfiguration.class}) +@SpringBootApplication(exclude = {ElasticsearchRestClientAutoConfiguration.class, CassandraAutoConfiguration.class, + SolrHealthContributorAutoConfiguration.class}) @ComponentScan(excludeFilters = { @ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, classes = ScheduledAnalyticsFactory.class)}) public class MceConsumerApplication { diff --git a/metadata-service/factories/src/main/resources/application.yml b/metadata-service/factories/src/main/resources/application.yml index edddf4a0ab701f..be259a12724a53 100644 --- a/metadata-service/factories/src/main/resources/application.yml +++ b/metadata-service/factories/src/main/resources/application.yml @@ -199,3 +199,6 @@ siblings: featureFlags: showSimplifiedHomepageByDefault: ${SHOW_SIMPLIFIED_HOMEPAGE_BY_DEFAULT:false} # shows a simplified homepage with just datasets, charts and dashboards by default to users. this can be configured in user settings + +entityChangeEvents: + enabled: ${ENABLE_ENTITY_CHANGE_EVENTS_HOOK:true} diff --git a/smoke-test/smoke.sh b/smoke-test/smoke.sh index 141f0c70e17f09..133ed51dda6439 100755 --- a/smoke-test/smoke.sh +++ b/smoke-test/smoke.sh @@ -21,6 +21,7 @@ pip install -r requirements.txt echo "DATAHUB_VERSION = $DATAHUB_VERSION" DATAHUB_TELEMETRY_ENABLED=false datahub docker quickstart --quickstart-compose-file ../docker/quickstart/docker-compose-without-neo4j.quickstart.yml --dump-logs-on-failure +#DATAHUB_TELEMETRY_ENABLED=false datahub docker quickstart --standalone_consumers --build-locally --dump-logs-on-failure (cd ..; ./gradlew :smoke-test:yarnInstall)
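
Note for reviewers: the snippet below is a minimal, hypothetical sketch of how a custom `MetadataChangeLogHook` could adopt the `isEnabled()` gate introduced in this diff, mirroring the `@Value`-driven flag on `EntityChangeEventGeneratorHook`. The package, class name, and `customHook.enabled` property are assumptions for illustration only; just `isEnabled()`, `invoke()`, and the skip logic in `MetadataChangeLogProcessor` come from this change, and Spring bean registration/scanning details are elided.

```java
// Hypothetical sketch (not part of this PR): a custom hook honoring the new isEnabled() gate.
// The package, class name, and "customHook.enabled" property are illustrative assumptions.
package com.example.datahub.hook;

import com.linkedin.metadata.kafka.hook.MetadataChangeLogHook;
import com.linkedin.mxe.MetadataChangeLog;
import javax.annotation.Nonnull;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

@Component
public class LoggingChangeLogHook implements MetadataChangeLogHook {

  private final Boolean _isEnabled;

  public LoggingChangeLogHook(@Nonnull @Value("${customHook.enabled:true}") Boolean isEnabled) {
    _isEnabled = isEnabled;
  }

  @Override
  public boolean isEnabled() {
    // When this returns false, MetadataChangeLogProcessor now skips invoke() for this hook.
    return _isEnabled;
  }

  @Override
  public void invoke(@Nonnull MetadataChangeLog event) {
    // Minimal body for illustration: report which aspect changed.
    System.out.println("Change log received for aspect " + event.getAspectName());
  }
}
```

With the matching `application.yml` entry added above, the shipped `EntityChangeEventGeneratorHook` can likewise be switched off by setting `ENABLE_ENTITY_CHANGE_EVENTS_HOOK=false` in the MAE consumer's environment.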