diff --git a/.github/actions/ci-optimization/action.yml b/.github/actions/ci-optimization/action.yml index 2f677a0e552c23..ff901b5de04b65 100644 --- a/.github/actions/ci-optimization/action.yml +++ b/.github/actions/ci-optimization/action.yml @@ -1,5 +1,5 @@ -name: 'Identify CI Optimizations' -description: 'Determine if code changes are specific to certain modules.' +name: "Identify CI Optimizations" +description: "Determine if code changes are specific to certain modules." outputs: frontend-only: @@ -44,27 +44,25 @@ outputs: runs: using: "composite" steps: - - uses: dorny/paths-filter@v2 + - uses: dorny/paths-filter@v3 id: filter with: + token: "" # Empty token forces it to use raw git commands. filters: | frontend: - "datahub-frontend/**" - "datahub-web-react/**" - - "smoke-test/tests/cypress/**" - "docker/datahub-frontend/**" ingestion: - "metadata-ingestion-modules/**" - "metadata-ingestion/**" - "metadata-models/**" - - "smoke-test/**" - "docker/datahub-ingestion**" ingestion-base: - "docker/datahub-ingestion-base/**" docker: - "docker/**" backend: - - ".github/**" - "metadata-models/**" - "datahub-upgrade/**" - "entity-registry/**" @@ -78,7 +76,6 @@ runs: - "metadata-utils/**" - "metadata-operation-context/**" - "datahub-graphql-core/**" - - "smoke-test/**" - "docker/**" kafka-setup: - "docker/kafka-setup/**" diff --git a/.github/actions/docker-custom-build-and-push/action.yml b/.github/actions/docker-custom-build-and-push/action.yml index 1c4a777c14802a..3805b3501ccec0 100644 --- a/.github/actions/docker-custom-build-and-push/action.yml +++ b/.github/actions/docker-custom-build-and-push/action.yml @@ -26,10 +26,13 @@ inputs: build-args: description: "List of build-time variables. Same as docker/build-push-action" required: false - tags: - # e.g. latest,head,sha12345 - description: "List of tags to use for the Docker image" + image_tag: + # e.g. pr12345 OR head OR v0.1.2.3 + description: "Main tag to use for the Docker image" required: true + flavor: + description: 'Image flavor (e.g., slim, full)' + required: false target: description: "Sets the target stage to build" required: false @@ -45,13 +48,16 @@ runs: steps: - name: Docker meta id: docker_meta - uses: crazy-max/ghaction-docker-meta@v1 + uses: docker/metadata-action@v5 with: - # list of Docker images to use as base name for tags images: ${{ inputs.images }} - # add git short SHA as Docker tag - tag-custom: ${{ inputs.tags }} - tag-custom-only: true + flavor: | + latest=false + suffix=${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }} + tags: | + type=raw,value=${{ inputs.image_tag }} + type=raw,value=head,enable={{is_default_branch}} + type=sha,prefix=,format=short # Code for testing the build when not pushing to Docker Hub. 
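With the switch from `crazy-max/ghaction-docker-meta@v1` to `docker/metadata-action@v5`, the tag list is now assembled from three sources: the explicit `image_tag` input, a `head` tag enabled only on the default branch, and a short commit SHA, with an optional `-<flavor>` suffix appended to all of them (and `latest` disabled). A minimal sketch of how the inputs are expected to resolve; the image name, PR number, and SHA below are hypothetical, and unrelated required inputs (registry credentials, `publish`) are omitted:

```yaml
# Hypothetical PR build of a "slim" image via the composite action above.
# Assuming image_tag resolves to "pr12345" and the short commit SHA is "abc1234",
# docker/metadata-action@v5 should emit:
#   acryldata/datahub-ingestion:pr12345-slim   # type=raw from the image_tag input
#   acryldata/datahub-ingestion:abc1234-slim   # type=sha with empty prefix, short format
# plus, only when building the default branch:
#   acryldata/datahub-ingestion:head-slim      # type=raw,value=head,enable={{is_default_branch}}
- uses: ./.github/actions/docker-custom-build-and-push
  with:
    images: |
      acryldata/datahub-ingestion
    image_tag: pr12345
    flavor: slim
```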
- name: Build and Load image for testing (if not publishing) @@ -74,10 +80,13 @@ runs: if: ${{ inputs.publish != 'true' }} shell: bash run: | + IMAGES=""" + ${{ inputs.images }} + """ TAGS=""" - ${{ steps.docker_meta.outputs.tags }} + ${{ inputs.image_tag }} """ - echo "SINGLE_TAG=$(echo $TAGS | tr '\n' ' ' | awk -F' ' '{ print $1 }')" >> $GITHUB_OUTPUT + echo "SINGLE_TAG=$(echo $IMAGES | tr '\n' ' ' | awk -F' |,' '{ print $1 }'):$(echo $TAGS | tr '\n' ' ' | awk -F' |,' '{ print $1 }')" >> $GITHUB_OUTPUT id: single_tag - name: Upload image locally for testing (if not publishing) uses: ishworkh/docker-image-artifact-upload@v1 diff --git a/.github/scripts/docker_helpers.sh b/.github/scripts/docker_helpers.sh index e031a6d2a4d843..138c8649820ec5 100755 --- a/.github/scripts/docker_helpers.sh +++ b/.github/scripts/docker_helpers.sh @@ -5,22 +5,22 @@ export MAIN_BRANCH="master" export MAIN_BRANCH_TAG="head" function get_short_sha { - echo $(git rev-parse --short "$GITHUB_SHA") + echo $(git rev-parse --short "$GITHUB_SHA"|head -c7) } export SHORT_SHA=$(get_short_sha) echo "SHORT_SHA: $SHORT_SHA" function get_tag { - echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${MAIN_BRANCH_TAG},g" -e 's,refs/tags/,,g' -e 's,refs/pull/\([0-9]*\).*,pr\1,g'),${SHORT_SHA} + echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${MAIN_BRANCH_TAG},g" -e 's,refs/tags/,,g' -e 's,refs/pull/\([0-9]*\).*,pr\1,g') } function get_tag_slim { - echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${MAIN_BRANCH_TAG}-slim,g" -e 's,refs/tags/\(.*\),\1-slim,g' -e 's,refs/pull/\([0-9]*\).*,pr\1-slim,g'),${SHORT_SHA}-slim + echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${MAIN_BRANCH_TAG}-slim,g" -e 's,refs/tags/\(.*\),\1-slim,g' -e 's,refs/pull/\([0-9]*\).*,pr\1-slim,g') } function get_tag_full { - echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${MAIN_BRANCH_TAG}-full,g" -e 's,refs/tags/\(.*\),\1-full,g' -e 's,refs/pull/\([0-9]*\).*,pr\1-full,g'),${SHORT_SHA}-full + echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${MAIN_BRANCH_TAG}-full,g" -e 's,refs/tags/\(.*\),\1-full,g' -e 's,refs/pull/\([0-9]*\).*,pr\1-full,g') } function get_python_docker_release_v { @@ -38,3 +38,11 @@ function get_unique_tag_slim { function get_unique_tag_full { echo $(echo ${GITHUB_REF} | sed -e "s,refs/heads/${MAIN_BRANCH},${SHORT_SHA}-full,g" -e 's,refs/tags/\(.*\),\1-full,g' -e 's,refs/pull/\([0-9]*\).*,pr\1-full,g') } + +function get_platforms_based_on_branch { + if [ "${{ github.event_name }}" == 'push' && "${{ github.ref }}" == "refs/heads/${MAIN_BRANCH}" ]; then + echo "linux/amd64,linux/arm64" + else + echo "linux/amd64" + fi +} diff --git a/.github/scripts/docker_logs.sh b/.github/scripts/docker_logs.sh new file mode 100644 index 00000000000000..918b859fbe5b1d --- /dev/null +++ b/.github/scripts/docker_logs.sh @@ -0,0 +1,8 @@ +TARGET_DIR="${TARGET_DIR:=docker_logs}" +TEST_STRATEGY="${TEST_STRATEGY:=}" + +mkdir -p "$TARGET_DIR" +for name in `docker ps -a --format '{{.Names}}'`; +do + docker logs "$name" >& "${TARGET_DIR}/${name}${TEST_STRATEGY}.log" || true +done \ No newline at end of file diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index c93267947b65a8..b0666f4a42aac8 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -57,6 +57,11 @@ jobs: timeout-minutes: 60 needs: setup steps: + - name: Free up disk space + run: | + sudo apt-get remove 'dotnet-*' azure-cli || true + 
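One thing worth flagging in the `docker_helpers.sh` hunk above: the new `get_platforms_based_on_branch` embeds GitHub Actions `${{ ... }}` expressions inside a plain shell script and joins two tests with `&&` inside single brackets, neither of which works when the script actually runs under bash. A sketch of the presumably intended check, using the `GITHUB_EVENT_NAME` and `GITHUB_REF` environment variables the runner exports (assuming those are available wherever the helper is sourced):

```bash
# Sketch only: choose build platforms based on the triggering event and branch.
# GITHUB_EVENT_NAME / GITHUB_REF are set by the GitHub Actions runner;
# MAIN_BRANCH is exported earlier in docker_helpers.sh.
function get_platforms_based_on_branch {
  if [[ "${GITHUB_EVENT_NAME:-}" == "push" && "${GITHUB_REF:-}" == "refs/heads/${MAIN_BRANCH}" ]]; then
    echo "linux/amd64,linux/arm64"
  else
    echo "linux/amd64"
  fi
}
```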
sudo rm -rf /usr/local/lib/android/ || true + sudo docker image prune -a -f || true - uses: szenius/set-timezone@v1.2 with: timezoneLinux: ${{ matrix.timezone }} diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 9487e71e8da3d1..32e68a76a88f5a 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -47,7 +47,6 @@ jobs: publish: ${{ steps.publish.outputs.publish }} pr-publish: ${{ steps.pr-publish.outputs.publish }} python_release_version: ${{ steps.tag.outputs.python_release_version }} - short_sha: ${{ steps.tag.outputs.short_sha }} branch_name: ${{ steps.tag.outputs.branch_name }} repository_name: ${{ steps.tag.outputs.repository_name }} frontend_change: ${{ steps.ci-optimize.outputs.frontend-change == 'true' }} @@ -61,6 +60,7 @@ jobs: mysql_setup_change: ${{ steps.ci-optimize.outputs.mysql-setup-change == 'true' }} postgres_setup_change: ${{ steps.ci-optimize.outputs.postgres-setup-change == 'true' }} elasticsearch_setup_change: ${{ steps.ci-optimize.outputs.elasticsearch-setup-change == 'true' }} + smoke_test_change: ${{ steps.ci-optimize.outputs.smoke-test-change == 'true' }} steps: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 @@ -157,7 +157,7 @@ jobs: with: images: | ${{ env.DATAHUB_GMS_IMAGE }} - tags: ${{ needs.setup.outputs.tag }} + image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} @@ -221,7 +221,7 @@ jobs: with: images: | ${{ env.DATAHUB_MAE_CONSUMER_IMAGE }} - tags: ${{ needs.setup.outputs.tag }} + image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} @@ -285,7 +285,7 @@ jobs: with: images: | ${{ env.DATAHUB_MCE_CONSUMER_IMAGE }} - tags: ${{ needs.setup.outputs.tag }} + image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} @@ -349,7 +349,7 @@ jobs: with: images: | ${{ env.DATAHUB_UPGRADE_IMAGE }} - tags: ${{ needs.setup.outputs.tag }} + image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} @@ -394,7 +394,7 @@ jobs: name: Build and Push DataHub Frontend Docker Image runs-on: ubuntu-latest needs: setup - if: ${{ needs.setup.outputs.frontend_change == 'true' || needs.setup.outputs.publish == 'true' }} + if: ${{ needs.setup.outputs.frontend_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true'}} steps: - name: Set up JDK 17 uses: actions/setup-java@v3 @@ -415,7 +415,7 @@ jobs: with: images: | ${{ env.DATAHUB_FRONTEND_IMAGE }} - tags: ${{ needs.setup.outputs.tag }} + image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} @@ -469,7 +469,7 @@ jobs: with: images: | ${{ env.DATAHUB_KAFKA_SETUP_IMAGE }} - tags: ${{ needs.setup.outputs.tag }} + 
image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} @@ -490,7 +490,7 @@ jobs: with: images: | ${{ env.DATAHUB_MYSQL_SETUP_IMAGE }} - tags: ${{ needs.setup.outputs.tag }} + image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} @@ -502,7 +502,7 @@ jobs: name: Build and Push DataHub Elasticsearch Setup Docker Image runs-on: ubuntu-latest needs: setup - if: ${{ needs.setup.outputs.elasticsearch_setup_change == 'true' || (needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true') }} + if: ${{ needs.setup.outputs.elasticsearch_setup_change == 'true' || (needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' ) }} steps: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 @@ -511,7 +511,7 @@ jobs: with: images: | ${{ env.DATAHUB_ELASTIC_SETUP_IMAGE }} - tags: ${{ needs.setup.outputs.tag }} + image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} @@ -525,7 +525,7 @@ jobs: outputs: tag: ${{ steps.tag.outputs.tag }} needs: setup - if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' }} + if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} steps: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 @@ -536,7 +536,7 @@ jobs: target: base images: | ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} - tags: ${{ needs.setup.outputs.tag }} + image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} @@ -552,7 +552,7 @@ jobs: outputs: tag: ${{ steps.tag.outputs.tag }} needs: [setup, datahub_ingestion_base_build] - if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' }} + if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} steps: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 @@ -574,7 +574,7 @@ jobs: target: slim-install images: | ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} - tags: ${{ needs.setup.outputs.slim_tag }} + image_tag: ${{ needs.setup.outputs.slim_tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} build-args: | @@ -593,7 +593,7 @@ jobs: outputs: tag: ${{ steps.tag.outputs.tag }} needs: [setup, datahub_ingestion_base_build] - if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' }} + if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} steps: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 @@ -636,7 +636,7 @@ jobs: tag: ${{ steps.tag.outputs.tag }} needs_artifact_download: ${{ needs.setup.outputs.ingestion_change == 'true' && ( 
needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true') }} needs: [setup, datahub_ingestion_base_slim_build] - if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' }} + if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} steps: - name: Set up JDK 17 uses: actions/setup-java@v3 @@ -647,7 +647,7 @@ jobs: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 - name: Build codegen - if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' }} + if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish =='true' }} run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image uses: ishworkh/docker-image-artifact-download@v1 @@ -661,7 +661,7 @@ jobs: username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} - name: Build and push Slim Image - if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' }} + if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} uses: ./.github/actions/docker-custom-build-and-push with: target: final @@ -672,7 +672,7 @@ jobs: DOCKER_VERSION=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }} RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }} APP_ENV=slim - tags: ${{ needs.setup.outputs.slim_tag }} + image_tag: ${{ needs.setup.outputs.slim_tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} @@ -723,7 +723,7 @@ jobs: tag: ${{ steps.tag.outputs.tag }} needs_artifact_download: ${{ needs.setup.outputs.ingestion_change == 'true' && ( needs.setup.outputs.publish != 'true' && needs.setup.outputs.pr-publish != 'true' ) }} needs: [setup, datahub_ingestion_base_full_build] - if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' }} + if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} steps: - name: Set up JDK 17 uses: actions/setup-java@v3 @@ -734,7 +734,7 @@ jobs: - name: Check out the repo uses: acryldata/sane-checkout-action@v3 - name: Build codegen - if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' }} + if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image uses: ishworkh/docker-image-artifact-download@v1 @@ -748,7 +748,7 @@ jobs: username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} - name: Build and push Full Image - if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' }} + if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} uses: ./.github/actions/docker-custom-build-and-push with: target: final @@ -758,7 +758,7 @@ jobs: BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} DOCKER_VERSION=${{ 
needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }} RELEASE_VERSION=${{ needs.setup.outputs.python_release_version }} - tags: ${{ needs.setup.outputs.tag }} + image_tag: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} @@ -776,7 +776,7 @@ jobs: name: "[Monitoring] Scan Datahub Ingestion images for vulnerabilities" runs-on: ubuntu-latest needs: [setup, datahub_ingestion_full_build] - if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' }} + if: ${{ needs.setup.outputs.ingestion_change == 'true' || needs.setup.outputs.publish == 'true' || needs.setup.outputs.pr-publish == 'true' }} steps: - name: Checkout # adding checkout step just to make trivy upload happy uses: acryldata/sane-checkout-action@v3 @@ -814,7 +814,7 @@ jobs: echo 'matrix=["cypress_suite1","cypress_rest"]' >> $GITHUB_OUTPUT elif [ '${{ needs.setup.outputs.ingestion_only }}' == 'true' ]; then echo 'matrix=["no_cypress_suite0","no_cypress_suite1"]' >> $GITHUB_OUTPUT - elif [ '${{ needs.setup.outputs.backend_change }}' == 'true' ]; then + elif [[ '${{ needs.setup.outputs.backend_change }}' == 'true' || '${{ needs.setup.outputs.smoke_test_change }}' == 'true' ]]; then echo 'matrix=["no_cypress_suite0","no_cypress_suite1","cypress_suite1","cypress_rest"]' >> $GITHUB_OUTPUT else echo 'matrix=[]' >> $GITHUB_OUTPUT @@ -862,11 +862,6 @@ jobs: with: python-version: "3.10" cache: "pip" - - name: Install dependencies - run: ./metadata-ingestion/scripts/install_deps.sh - - name: Build datahub cli - run: | - ./gradlew :metadata-ingestion:install - name: Login to DockerHub uses: docker/login-action@v3 if: ${{ needs.setup.outputs.docker-login == 'true' }} @@ -965,7 +960,7 @@ jobs: echo 'datahub-ingestion head-slim images' docker pull '${{ env.DATAHUB_INGESTION_IMAGE }}:head-slim' if [ '${{ needs.datahub_ingestion_slim_build.outputs.tag || 'head-slim' }}' != 'head-slim' ]; then - docker tag '${{ env.DATAHUB_INGESTION_IMAGE }}:head-slim' '${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }}' + docker tag '${{ env.DATAHUB_INGESTION_IMAGE }}:head-slim' '${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.setup.outputs.unique_tag }}' fi fi - name: Disk Check @@ -994,6 +989,15 @@ jobs: } } }' + - name: Disk Check + run: df -h . && docker images + - name: Install dependencies + run: ./metadata-ingestion/scripts/install_deps.sh + - name: Build datahub cli + run: | + ./gradlew :metadata-ingestion:install + - name: Disk Check + run: df -h . && docker images - name: Remove Source Code run: find ./*/* ! -path "./metadata-ingestion*" ! -path "./smoke-test*" ! 
-path "./gradle*" -delete - name: Disk Check @@ -1014,21 +1018,14 @@ jobs: if: failure() run: | docker ps -a - docker logs datahub-datahub-gms-1 >& gms-${{ matrix.test_strategy }}.log || true - docker logs datahub-datahub-actions-1 >& actions-${{ matrix.test_strategy }}.log || true - docker logs datahub-datahub-mae-consumer-1 >& mae-${{ matrix.test_strategy }}.log || true - docker logs datahub-datahub-mce-consumer-1 >& mce-${{ matrix.test_strategy }}.log || true - docker logs datahub-broker-1 >& broker-${{ matrix.test_strategy }}.log || true - docker logs datahub-mysql-1 >& mysql-${{ matrix.test_strategy }}.log || true - docker logs datahub-elasticsearch-1 >& elasticsearch-${{ matrix.test_strategy }}.log || true - docker logs datahub-datahub-frontend-react-1 >& frontend-${{ matrix.test_strategy }}.log || true - docker logs datahub-upgrade-1 >& upgrade-${{ matrix.test_strategy }}.log || true + TEST_STRATEGY="-${{ matrix.test_strategy }}" + source .github/scripts/docker_logs.sh - name: Upload logs uses: actions/upload-artifact@v3 if: failure() with: name: docker logs - path: "*.log" + path: "docker_logs/*.log" - name: Upload screenshots uses: actions/upload-artifact@v3 if: failure() @@ -1049,7 +1046,7 @@ jobs: runs-on: ubuntu-latest needs: [setup, smoke_test] steps: - - uses: aws-actions/configure-aws-credentials@v1 + - uses: aws-actions/configure-aws-credentials@v4 if: ${{ needs.setup.outputs.publish != 'false' && github.repository_owner == 'datahub-project' && needs.setup.outputs.repository_name == 'datahub' }} with: aws-access-key-id: ${{ secrets.AWS_SQS_ACCESS_KEY_ID }} diff --git a/.github/workflows/lint-actions.yml b/.github/workflows/lint-actions.yml index 4d83adbeba08a1..8a1777522f416b 100644 --- a/.github/workflows/lint-actions.yml +++ b/.github/workflows/lint-actions.yml @@ -14,3 +14,8 @@ jobs: - uses: reviewdog/action-actionlint@v1 with: reporter: github-pr-review + permissions: + contents: read + checks: write + pull-requests: write + issues: write diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index 51b97552eb150a..a27013c4bf4887 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -46,6 +46,11 @@ jobs: - python-version: "3.10" fail-fast: false steps: + - name: Free up disk space + run: | + sudo apt-get remove 'dotnet-*' azure-cli || true + sudo rm -rf /usr/local/lib/android/ || true + sudo docker image prune -a -f || true - name: Set up JDK 17 uses: actions/setup-java@v3 with: diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml index 6797c7ad67c0b6..332330b4ed8984 100644 --- a/.github/workflows/metadata-io.yml +++ b/.github/workflows/metadata-io.yml @@ -47,6 +47,11 @@ jobs: timeout-minutes: 60 needs: setup steps: + - name: Free up disk space + run: | + sudo apt-get remove 'dotnet-*' azure-cli || true + sudo rm -rf /usr/local/lib/android/ || true + sudo docker image prune -a -f || true - uses: acryldata/sane-checkout-action@v3 - name: Set up JDK 17 uses: actions/setup-java@v3 diff --git a/.github/workflows/metadata-model.yml b/.github/workflows/metadata-model.yml index 558b7c80f727c1..d62c03057db3f0 100644 --- a/.github/workflows/metadata-model.yml +++ b/.github/workflows/metadata-model.yml @@ -49,7 +49,7 @@ jobs: run: ./gradlew :metadata-ingestion:modelDocGen - name: Configure AWS Credentials if: ${{ needs.setup.outputs.publish == 'true' }} - uses: aws-actions/configure-aws-credentials@v3 + uses: aws-actions/configure-aws-credentials@v4 with: 
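For context on the log-collection change in the smoke-test job above: the long list of per-container `docker logs` commands is replaced by the new `.github/scripts/docker_logs.sh`, which dumps logs for every container reported by `docker ps -a` into `${TARGET_DIR}/<container-name>${TEST_STRATEGY}.log`. A hedged example of running it by hand; the directory, strategy suffix, and container name are illustrative:

```bash
# Collect logs for all containers into ./my_logs, suffixing each file with "-cypress_suite1".
TARGET_DIR="my_logs" TEST_STRATEGY="-cypress_suite1" bash .github/scripts/docker_logs.sh
# Expected output files look like: my_logs/datahub-datahub-gms-1-cypress_suite1.log
ls my_logs/
```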
aws-access-key-id: ${{ secrets.ACRYL_CI_ARTIFACTS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.ACRYL_CI_ARTIFACTS_ACCESS_KEY }} diff --git a/.github/workflows/publish-datahub-jars.yml b/.github/workflows/publish-datahub-jars.yml index 7137302c73564c..aceee756339ada 100644 --- a/.github/workflows/publish-datahub-jars.yml +++ b/.github/workflows/publish-datahub-jars.yml @@ -45,6 +45,9 @@ jobs: echo "tag=$TAG" >> $GITHUB_OUTPUT publish: runs-on: ubuntu-latest + permissions: + id-token: write + contents: read needs: ["check-secret", "setup"] if: ${{ needs.check-secret.outputs.publish-enabled == 'true' }} steps: diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml index 8ffc8420ba9413..d1618c65285773 100644 --- a/.github/workflows/spark-smoke-test.yml +++ b/.github/workflows/spark-smoke-test.yml @@ -44,8 +44,11 @@ jobs: run: ./metadata-ingestion/scripts/install_deps.sh - name: Disk Check run: df -h . && docker images - - name: Remove images - run: docker image prune -a -f || true + - name: Free up disk space + run: | + sudo apt-get remove 'dotnet-*' azure-cli || true + sudo rm -rf /usr/local/lib/android/ || true + sudo docker image prune -a -f || true - name: Disk Check run: df -h . && docker images - name: Smoke test diff --git a/.github/workflows/test-results.yml b/.github/workflows/test-results.yml index c94a5fc340f473..a122ef3835f4d7 100644 --- a/.github/workflows/test-results.yml +++ b/.github/workflows/test-results.yml @@ -10,6 +10,11 @@ jobs: unit-test-results: name: Unit Test Results runs-on: ubuntu-latest + permissions: + contents: read + actions: read + checks: write + issues: read if: github.event.workflow_run.conclusion != 'skipped' steps: diff --git a/README.md b/README.md index 3ac0668918f708..8aa177c3d66754 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,7 @@ Here are the companies that have officially adopted DataHub. Please feel free to - [Peloton](https://www.onepeloton.com) - [PITS Global Data Recovery Services](https://www.pitsdatarecovery.net/) - [Razer](https://www.razer.com) +- [Rippling](https://www.rippling.com/) - [Showroomprive](https://www.showroomprive.com/) - [SpotHero](https://spothero.com) - [Stash](https://www.stash.com) @@ -153,6 +154,7 @@ Here are the companies that have officially adopted DataHub. Please feel free to - [Zynga](https://www.zynga.com) + ## Select Articles & Talks - [DataHub Blog](https://blog.datahubproject.io/) @@ -173,6 +175,23 @@ Here are the companies that have officially adopted DataHub. Please feel free to See the full list [here](docs/links.md). +## Security Notes + +### Multi-Component + +The DataHub project uses a wide range of code which is responsible for build automation, documentation generation, and +include both service (i.e. GMS) and client (i.e. ingestion) components. When evaluating security vulnerabilities in +upstream dependencies, it is important to consider which component and how it is used in the project. For example, an +upstream javascript library may include a Denial of Service (DoS) vulnerability however when used for generating +documentation it does not affect the running of DataHub itself and cannot be used to impact DataHub's service. Similarly, +python dependencies for ingestion are part of the DataHub client and are not exposed as a service. + +### Known False Positives + +DataHub's ingestion client does not include credentials in the code repository, python package, or Docker images. 
+Upstream python dependencies may include files that look like credentials and are often misinterpreted as credentials +by automated scanners. + ## License [Apache License 2.0](./LICENSE). diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/OwnerUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/OwnerUtils.java index 29056eb71a7a3a..ddb795189c0e3d 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/OwnerUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/OwnerUtils.java @@ -171,7 +171,7 @@ public static boolean isOwnerEqual( if (!owner.getOwner().equals(ownerUrn)) { return false; } - if (owner.getTypeUrn() != null) { + if (owner.getTypeUrn() != null && ownershipTypeUrn != null) { return owner.getTypeUrn().equals(ownershipTypeUrn); } if (ownershipTypeUrn == null) { diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/settings/docPropagation/DocPropagationSettingsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/settings/docPropagation/DocPropagationSettingsResolver.java index 84d3bcd7b376c0..0641d6aca63704 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/settings/docPropagation/DocPropagationSettingsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/settings/docPropagation/DocPropagationSettingsResolver.java @@ -33,7 +33,9 @@ public CompletableFuture get(final DataFetchingEnvironme final GlobalSettingsInfo globalSettings = _settingsService.getGlobalSettings(context.getOperationContext()); final DocPropagationSettings defaultSettings = new DocPropagationSettings(); - defaultSettings.setDocColumnPropagation(true); + // TODO: Enable by default. Currently the automation trusts the settings aspect, which + // does not have this. + defaultSettings.setDocColumnPropagation(false); return globalSettings != null && globalSettings.hasDocPropagation() ? 
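The `OwnerUtils.isOwnerEqual` change above relaxes matching when the caller passes no ownership type: an existing owner that carries an explicit type URN now matches a lookup with `ownershipTypeUrn == null`, while lookups that do pass a type still require both a matching owner URN and an exact type match. A small illustrative fragment of the resulting behaviour, mirroring the updated `OwnerUtilsTest` (the URN variables are assumed to be set up as in that test):

```java
// Owner with an explicit TECHNICAL_OWNER ownership type URN.
Owner technicalOwner = new Owner();
technicalOwner.setOwner(ownerUrn);                    // e.g. urn:li:corpuser:foo
technicalOwner.setTypeUrn(technicalOwnershipTypeUrn);

// Same owner and same requested type -> match.
assertTrue(OwnerUtils.isOwnerEqual(technicalOwner, ownerUrn, technicalOwnershipTypeUrn));
// Same owner but a different requested type -> no match.
assertFalse(OwnerUtils.isOwnerEqual(technicalOwner, ownerUrn, businessOwnershipTypeUrn));
// No type requested -> the owner URN alone decides (this case flipped from false to true).
assertTrue(OwnerUtils.isOwnerEqual(technicalOwner, ownerUrn, null));
// A different owner URN never matches, regardless of types.
assertFalse(OwnerUtils.isOwnerEqual(technicalOwner, otherOwnerUrn, null));
```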
mapDocPropagationSettings(globalSettings.getDocPropagation()) : defaultSettings; diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/OwnerUtilsTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/OwnerUtilsTest.java index b4097d9dd045df..d524d8bfb9a6b3 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/OwnerUtilsTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/OwnerUtilsTest.java @@ -59,6 +59,7 @@ public void testIsOwnerEqualOnlyOwnershipTypeUrn() throws URISyntaxException { Urn technicalOwnershipTypeUrn = new Urn(TECHNICAL_OWNER_OWNERSHIP_TYPE_URN); Urn businessOwnershipTypeUrn = new Urn(BUSINESS_OWNER_OWNERSHIP_TYPE_URN); Urn ownerUrn1 = new Urn("urn:li:corpuser:foo"); + Urn ownerUrn2 = new Urn("urn:li:corpuser:bar"); Owner ownerWithTechnicalOwnership = new Owner(); ownerWithTechnicalOwnership.setOwner(ownerUrn1); @@ -72,12 +73,17 @@ public void testIsOwnerEqualOnlyOwnershipTypeUrn() throws URISyntaxException { ownerWithoutOwnershipType.setOwner(ownerUrn1); ownerWithoutOwnershipType.setType(OwnershipType.NONE); + Owner owner2WithoutOwnershipType = new Owner(); + owner2WithoutOwnershipType.setOwner(ownerUrn2); + owner2WithoutOwnershipType.setType(OwnershipType.NONE); + assertTrue( OwnerUtils.isOwnerEqual(ownerWithTechnicalOwnership, ownerUrn1, technicalOwnershipTypeUrn)); assertFalse( OwnerUtils.isOwnerEqual(ownerWithBusinessOwnership, ownerUrn1, technicalOwnershipTypeUrn)); - assertFalse(OwnerUtils.isOwnerEqual(ownerWithTechnicalOwnership, ownerUrn1, null)); + assertTrue(OwnerUtils.isOwnerEqual(ownerWithTechnicalOwnership, ownerUrn1, null)); assertTrue(OwnerUtils.isOwnerEqual(ownerWithoutOwnershipType, ownerUrn1, null)); + assertFalse(OwnerUtils.isOwnerEqual(owner2WithoutOwnershipType, ownerUrn1, null)); } public void testIsOwnerEqualWithBothLegacyAndNewType() throws URISyntaxException { diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreIndicesConfig.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreIndicesConfig.java index 949b75edaa6ba0..26e40485787e90 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreIndicesConfig.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/config/RestoreIndicesConfig.java @@ -8,37 +8,23 @@ import io.ebean.Database; import javax.annotation.Nonnull; import lombok.extern.slf4j.Slf4j; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; -import org.springframework.context.ApplicationContext; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; -import org.springframework.context.annotation.DependsOn; @Slf4j @Configuration public class RestoreIndicesConfig { - @Autowired ApplicationContext applicationContext; @Bean(name = "restoreIndices") - @DependsOn({ - "ebeanServer", - "entityService", - "systemMetadataService", - "searchService", - "graphService" - }) @ConditionalOnProperty(name = "entityService.impl", havingValue = "ebean", matchIfMissing = true) @Nonnull - public RestoreIndices createInstance() { - final Database ebeanServer = applicationContext.getBean(Database.class); - final EntityService entityService = applicationContext.getBean(EntityService.class); - final SystemMetadataService systemMetadataService = - applicationContext.getBean(SystemMetadataService.class); 
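On the `RestoreIndicesConfig` rewrite that starts above: the `@Autowired ApplicationContext` field, the `@DependsOn` list, and the manual `getBean()` lookups are replaced by declaring the collaborators as `@Bean` method parameters, which Spring resolves from the context by type and which also expresses the bean's dependencies implicitly. A generic, hypothetical illustration of that pattern (the `ReportingJob`, `Repository`, and `SearchIndex` types below are stand-ins, not DataHub classes):

```java
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class ExampleConfig {

    // Spring injects each parameter from the application context by type, so the
    // parameter list itself declares the dependency graph; no @DependsOn annotation
    // or applicationContext.getBean(...) calls are needed.
    @Bean
    public ReportingJob reportingJob(final Repository repository, final SearchIndex searchIndex) {
        return new ReportingJob(repository, searchIndex);
    }
}
```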
- final EntitySearchService entitySearchService = - applicationContext.getBean(EntitySearchService.class); - final GraphService graphService = applicationContext.getBean(GraphService.class); - + public RestoreIndices createInstance( + final Database ebeanServer, + final EntityService entityService, + final EntitySearchService entitySearchService, + final GraphService graphService, + final SystemMetadataService systemMetadataService) { return new RestoreIndices( ebeanServer, entityService, systemMetadataService, entitySearchService, graphService); } diff --git a/datahub-web-react/.eslintrc.js b/datahub-web-react/.eslintrc.js index 5627283af1af1c..3fdf7b6a3042ca 100644 --- a/datahub-web-react/.eslintrc.js +++ b/datahub-web-react/.eslintrc.js @@ -48,7 +48,7 @@ module.exports = { ], 'vitest/prefer-to-be': 'off', '@typescript-eslint/no-use-before-define': ['error', { functions: false, classes: false }], - 'react-refresh/only-export-components': ['warn', { 'allowConstantExport': true }], + 'react-refresh/only-export-components': ['warn', { allowConstantExport: true }], }, settings: { react: { diff --git a/datahub-web-react/README.md b/datahub-web-react/README.md index 560f5315b2c71f..86bbb349b027c4 100644 --- a/datahub-web-react/README.md +++ b/datahub-web-react/README.md @@ -1,44 +1,47 @@ --- -title: "datahub-web-react" +title: 'datahub-web-react' --- # DataHub React App ## About -This module contains a React application that serves as the DataHub UI. -Feel free to take a look around, deploy, and contribute. +This module contains a React application that serves as the DataHub UI. +Feel free to take a look around, deploy, and contribute. ## Functional Goals + The initial milestone for the app was to achieve functional parity with the previous Ember app. This meant supporting -- Dataset Profiles, Search, Browse Experience -- User Profiles, Search -- LDAP Authentication Flow +- Dataset Profiles, Search, Browse Experience +- User Profiles, Search +- LDAP Authentication Flow -This has since been achieved. The new set of functional goals are reflected in the latest version of the [DataHub Roadmap](../docs/roadmap.md). +This has since been achieved. The new set of functional goals are reflected in the latest version of the [DataHub Roadmap](../docs/roadmap.md). ## Design Goals + In building out the client experience, we intend to leverage learnings from the previous Ember-based app and incorporate feedback gathered from organizations operating DataHub. Two themes have emerged to serve as guideposts: -1. **Configurability**: The client experience should be configurable, such that deploying organizations can tailor certain - aspects to their needs. This includes theme / styling configurability, showing and hiding specific functionality, - customizing copy & logos, etc. - -2. **Extensibility**: Extending the *functionality* of DataHub should be as simple as possible. Making changes like - extending an existing entity & adding a new entity should require minimal effort and should be well covered in detailed - documentation. +1. **Configurability**: The client experience should be configurable, such that deploying organizations can tailor certain + aspects to their needs. This includes theme / styling configurability, showing and hiding specific functionality, + customizing copy & logos, etc. +2. **Extensibility**: Extending the _functionality_ of DataHub should be as simple as possible. 
Making changes like + extending an existing entity & adding a new entity should require minimal effort and should be well covered in detailed + documentation. ## Starting the Application ### Quick Start Navigate to the `docker` directory and run the following to spin up the react app: + ``` ./quickstart.sh ``` + at `http://localhost:9002`. If you want to make changes to the UI see them live without having to rebuild the `datahub-frontend-react` docker image, you @@ -54,8 +57,9 @@ Optionally you could also start the app with the mock server without running the ### Testing your customizations There is two options to test your customizations: -* **Option 1**: Initialize the docker containers with the `quickstart.sh` script (or if any custom docker-compose file) and then run `yarn start` in this directory. This will start a forwarding server at `localhost:3000` that will use the `datahub-frontend` server at `http://localhost:9002` to fetch real data. -* **Option 2**: Change the environment variable `REACT_APP_PROXY_TARGET` in the `.env` file to point to your `datahub-frontend` server (ex: https://my_datahub_host.com) and then run `yarn start` in this directory. This will start a forwarding server at `localhost:3000` that will use the `datahub-frontend` server at some domain to fetch real data. + +- **Option 1**: Initialize the docker containers with the `quickstart.sh` script (or if any custom docker-compose file) and then run `yarn start` in this directory. This will start a forwarding server at `localhost:3000` that will use the `datahub-frontend` server at `http://localhost:9002` to fetch real data. +- **Option 2**: Change the environment variable `REACT_APP_PROXY_TARGET` in the `.env` file to point to your `datahub-frontend` server (ex: https://my_datahub_host.com) and then run `yarn start` in this directory. This will start a forwarding server at `localhost:3000` that will use the `datahub-frontend` server at some domain to fetch real data. The option 2 is useful if you want to test your React customizations without having to run the hole DataHub stack locally. However, if you changed other components of the DataHub stack, you will need to run the hole stack locally (building the docker images) and use the option 1. @@ -68,10 +72,10 @@ In order to start a server and run frontend unit tests using react-testing-frame There are also more automated tests using Cypress in the `smoke-test` folder of the repository root. #### Troubleshooting + `Error: error:0308010C:digital envelope routines::unsupported`: This error message shows up when using Node 17, due to an OpenSSL update related to md5. The best workaround is to revert to the Active LTS version of Node, 16.13.0 with the command `nvm install 16.13.0` and if necessary reinstall yarn `npm install --global yarn`. - ### Theming #### Customizing your App without rebuilding assets @@ -108,74 +112,74 @@ you to terminate and re-run `yarn start` to see updated styles. The `src` dir of the app is broken down into the following modules -**conf** - Stores global configuration flags that can be referenced across the app. For example, the number of +**conf** - Stores global configuration flags that can be referenced across the app. For example, the number of search results shown per page, or the placeholder text in the search bar box. It serves as a location where levels -for functional configurability should reside. +for functional configurability should reside. **app** - Contains all important components of the app. 
It has a few sub-modules: -- `auth`: Components used to render the user authentication experience. -- `browse`: Shared components used to render the 'browse-by-path' experience. The experience is akin to navigating a filesystem hierarchy. -- `preview`: Shared components used to render Entity 'preview' views. These can appear in search results, browse results, - and within entity profile pages. -- `search`: Shared components used to render the full-text search experience. -- `shared`: Misc. shared components -- `entity`: Contains Entity definitions, where entity-specific functionality resides. - Configuration is provided by implementing the 'Entity' interface. (See DatasetEntity.tsx for example) - There are 2 visual components each entity should supply: - - `profiles`: display relevant details about an individual entity. This serves as the entity's 'profile'. - - `previews`: provide a 'preview', or a smaller details card, containing the most important information about an entity instance. - - When rendering a preview, the entity's data and the type of preview (SEARCH, BROWSE, PREVIEW) are provided. This +- `auth`: Components used to render the user authentication experience. +- `browse`: Shared components used to render the 'browse-by-path' experience. The experience is akin to navigating a filesystem hierarchy. +- `preview`: Shared components used to render Entity 'preview' views. These can appear in search results, browse results, + and within entity profile pages. +- `search`: Shared components used to render the full-text search experience. +- `shared`: Misc. shared components +- `entity`: Contains Entity definitions, where entity-specific functionality resides. + Configuration is provided by implementing the 'Entity' interface. (See DatasetEntity.tsx for example) + There are 2 visual components each entity should supply: + + - `profiles`: display relevant details about an individual entity. This serves as the entity's 'profile'. + - `previews`: provide a 'preview', or a smaller details card, containing the most important information about an entity instance. + + When rendering a preview, the entity's data and the type of preview (SEARCH, BROWSE, PREVIEW) are provided. This allows you to optionally customize the way an entities preview is rendered in different views. - - - `entity registry`: There's another very important piece of code living within this module: the **EntityRegistry**. This is a layer + + - `entity registry`: There's another very important piece of code living within this module: the **EntityRegistry**. This is a layer of abstraction over the intimate details of rendering a particular entity. It is used to render a view associated with a particular entity type (user, dataset, etc.). - - +

-**graphql** - The React App talks to the `dathub-frontend` server using GraphQL. This module is where the *queries* issued -against the server are defined. Once defined, running `yarn run generate` will code-gen TypeScript objects to make invoking +**graphql** - The React App talks to the `dathub-frontend` server using GraphQL. This module is where the _queries_ issued +against the server are defined. Once defined, running `yarn run generate` will code-gen TypeScript objects to make invoking these queries extremely easy. An example can be found at the top of `SearchPage.tsx.` -**images** - Images to be displayed within the app. This is where one would place a custom logo image. +**images** - Images to be displayed within the app. This is where one would place a custom logo image. ## Adding an Entity The following outlines a series of steps required to introduce a new entity into the React app: -1. Declare the GraphQL Queries required to display the new entity - - If search functionality should be supported, extend the "search" query within `search.graphql` to fetch the new +1. Declare the GraphQL Queries required to display the new entity + + - If search functionality should be supported, extend the "search" query within `search.graphql` to fetch the new + entity data. + - If browse functionality should be supported, extend the "browse" query within `browse.graphql` to fetch the new entity data. - - If browse functionality should be supported, extend the "browse" query within `browse.graphql` to fetch the new - entity data. - - If display a 'profile' should be supported (most often), introduce a new `.graphql` file that contains a - `get` query to fetch the entity by primary key (urn). - - Note that your new entity *must* implement the `Entity` GraphQL type interface, and thus must have a corresponding - `EntityType`. - - -2. Implement the `Entity` interface + - If display a 'profile' should be supported (most often), introduce a new `.graphql` file that contains a + `get` query to fetch the entity by primary key (urn). + + Note that your new entity _must_ implement the `Entity` GraphQL type interface, and thus must have a corresponding + `EntityType`. + +2. Implement the `Entity` interface + - Create a new folder under `src/components/entity` corresponding to your entity - Create a class that implements the `Entity` interface (example: `DatasetEntity.tsx`) - - Provide an implementation each method defined on the interface. - - This class specifies whether your new entity should be searchable & browsable, defines the names used to - identify your entity when instances are rendered in collection / when entity appears - in the URL path, and provides the ability to render your entity given data returned by the GQL API. - + - Provide an implementation each method defined on the interface. + - This class specifies whether your new entity should be searchable & browsable, defines the names used to + identify your entity when instances are rendered in collection / when entity appears + in the URL path, and provides the ability to render your entity given data returned by the GQL API. 3. Register the new entity in the `EntityRegistry` - - Update `App.tsx` to register an instance of your new entity. Now your entity will be accessible via the registry + - Update `App.tsx` to register an instance of your new entity. Now your entity will be accessible via the registry and appear in the UI. 
To manually retrieve the info about your entity or others, simply use an instance - of the `EntityRegistry`, which is provided via `ReactContext` to *all* components in the hierarchy. + of the `EntityRegistry`, which is provided via `ReactContext` to _all_ components in the hierarchy. For example - ``` - entityRegistry.getCollectionName(EntityType.YOUR_NEW_ENTITY) - ``` - -That's it! For any questions, do not hesitate to reach out on the DataHub Slack community in #datahub-react. + ``` + entityRegistry.getCollectionName(EntityType.YOUR_NEW_ENTITY) + ``` + +That's it! For any questions, do not hesitate to reach out on the DataHub Slack community in #datahub-react. diff --git a/datahub-web-react/src/app/entity/dataset/profile/schema/components/SchemaDescriptionField.tsx b/datahub-web-react/src/app/entity/dataset/profile/schema/components/SchemaDescriptionField.tsx index ce8d03fbdc9602..e7d986028d4a66 100644 --- a/datahub-web-react/src/app/entity/dataset/profile/schema/components/SchemaDescriptionField.tsx +++ b/datahub-web-react/src/app/entity/dataset/profile/schema/components/SchemaDescriptionField.tsx @@ -5,6 +5,8 @@ import styled from 'styled-components'; import { FetchResult } from '@apollo/client'; import { UpdateDatasetMutation } from '../../../../../../graphql/dataset.generated'; +import { StringMapEntry } from '../../../../../../types.generated'; +import PropagationDetails from '../../../../shared/propagation/PropagationDetails'; import UpdateDescriptionModal from '../../../../shared/components/legacy/DescriptionModal'; import StripMarkdownText, { removeMarkdown } from '../../../../shared/components/styled/StripMarkdownText'; import SchemaEditableContext from '../../../../../shared/SchemaEditableContext'; @@ -28,6 +30,11 @@ const ExpandedActions = styled.div` height: 10px; `; +const DescriptionWrapper = styled.span` + display: inline-flex; + align-items: center; +`; + const DescriptionContainer = styled.div` position: relative; display: flex; @@ -105,6 +112,8 @@ type Props = { isEdited?: boolean; isReadOnly?: boolean; businessAttributeDescription?: string; + isPropagated?: boolean; + sourceDetail?: StringMapEntry[] | null; }; const ABBREVIATED_LIMIT = 80; @@ -120,6 +129,8 @@ export default function DescriptionField({ original, isReadOnly, businessAttributeDescription, + isPropagated, + sourceDetail, }: Props) { const [showAddModal, setShowAddModal] = useState(false); const overLimit = removeMarkdown(description).length > 80; @@ -163,7 +174,7 @@ export default function DescriptionField({ return ( - {expanded || !overLimit ? ( + {expanded ? 
( <> {!!description && } {!!description && (EditButton || overLimit) && ( @@ -184,25 +195,29 @@ export default function DescriptionField({ ) : ( <> - - { - e.stopPropagation(); - handleExpanded(true); - }} - > - Read More - - - } - suffix={EditButton} - shouldWrap - > - {description} - + + {isPropagated && } +   + + { + e.stopPropagation(); + handleExpanded(true); + }} + > + Read More + + + } + suffix={EditButton} + shouldWrap + > + {description} + + )} {isEdited && (edited)} diff --git a/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx b/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx index 0e899bc391e0a7..2d65a305b4cc8b 100644 --- a/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx +++ b/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx @@ -19,16 +19,29 @@ const StyledViewer = styled(Editor)` } `; +const OriginalDocumentation = styled(Form.Item)` + margin-bottom: 0; +`; + type Props = { title: string; description?: string | undefined; original?: string | undefined; + propagatedDescription?: string | undefined; onClose: () => void; onSubmit: (description: string) => void; isAddDesc?: boolean; }; -export default function UpdateDescriptionModal({ title, description, original, onClose, onSubmit, isAddDesc }: Props) { +export default function UpdateDescriptionModal({ + title, + description, + original, + propagatedDescription, + onClose, + onSubmit, + isAddDesc, +}: Props) { const [updatedDesc, setDesc] = useState(description || original || ''); const handleEditorKeyDown = (event: React.KeyboardEvent) => { @@ -72,9 +85,14 @@ export default function UpdateDescriptionModal({ title, description, original, o /> {!isAddDesc && description && original && ( - Original:}> + Original:}> - + + )} + {!isAddDesc && description && propagatedDescription && ( + Propagated:}> + + )} diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/__tests__/EntityHeader.test.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/__tests__/EntityHeader.test.tsx index db347d4f1cc54c..ec6a91df9019ab 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/__tests__/EntityHeader.test.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/__tests__/EntityHeader.test.tsx @@ -3,13 +3,14 @@ import { EntityType } from '../../../../../../types.generated'; import { getCanEditName } from '../header/EntityHeader'; describe('getCanEditName', () => { - const entityDataWithManagePrivileges = { privileges: { canManageEntity: true } }; - const entityDataWithoutManagePrivileges = { privileges: { canManageEntity: false } }; + const entityDataWithManagePrivileges = { privileges: { canManageEntity: true, canEditProperties: true } }; + const entityDataWithoutManagePrivileges = { privileges: { canManageEntity: false, canEditProperties: false } }; it('should return true for Terms if manageGlossaries privilege is true', () => { const canEditName = getCanEditName( EntityType.GlossaryTerm, entityDataWithoutManagePrivileges, + true, platformPrivileges, ); @@ -21,6 +22,7 @@ describe('getCanEditName', () => { const canEditName = getCanEditName( EntityType.GlossaryTerm, entityDataWithoutManagePrivileges, + true, privilegesWithoutGlossaries, ); @@ -32,6 +34,7 @@ describe('getCanEditName', () => { const canEditName = getCanEditName( EntityType.GlossaryTerm, entityDataWithManagePrivileges, + true, privilegesWithoutGlossaries, ); @@ -42,6 +45,7 @@ describe('getCanEditName', () => { 
const canEditName = getCanEditName( EntityType.GlossaryNode, entityDataWithoutManagePrivileges, + true, platformPrivileges, ); @@ -53,6 +57,7 @@ describe('getCanEditName', () => { const canEditName = getCanEditName( EntityType.GlossaryNode, entityDataWithoutManagePrivileges, + true, privilegesWithoutGlossaries, ); @@ -64,6 +69,7 @@ describe('getCanEditName', () => { const canEditName = getCanEditName( EntityType.GlossaryNode, entityDataWithManagePrivileges, + true, privilegesWithoutGlossaries, ); @@ -71,7 +77,12 @@ describe('getCanEditName', () => { }); it('should return true for Domains if manageDomains privilege is true', () => { - const canEditName = getCanEditName(EntityType.Domain, entityDataWithoutManagePrivileges, platformPrivileges); + const canEditName = getCanEditName( + EntityType.Domain, + entityDataWithoutManagePrivileges, + true, + platformPrivileges, + ); expect(canEditName).toBe(true); }); @@ -81,6 +92,7 @@ describe('getCanEditName', () => { const canEditName = getCanEditName( EntityType.Domain, entityDataWithoutManagePrivileges, + true, privilegesWithoutDomains, ); @@ -88,7 +100,30 @@ describe('getCanEditName', () => { }); it('should return false for an unsupported entity', () => { - const canEditName = getCanEditName(EntityType.Chart, entityDataWithManagePrivileges, platformPrivileges); + const canEditName = getCanEditName(EntityType.Chart, entityDataWithManagePrivileges, true, platformPrivileges); + + expect(canEditName).toBe(false); + }); + + it('should return true for a dataset if canEditProperties is true', () => { + const canEditName = getCanEditName(EntityType.Chart, entityDataWithManagePrivileges, true, platformPrivileges); + + expect(canEditName).toBe(false); + }); + + it('should return false for a dataset if canEditProperties is false', () => { + const canEditName = getCanEditName( + EntityType.Chart, + entityDataWithoutManagePrivileges, + true, + platformPrivileges, + ); + + expect(canEditName).toBe(false); + }); + + it('should return false for a dataset if isEditableDatasetNameEnabled is false', () => { + const canEditName = getCanEditName(EntityType.Chart, entityDataWithManagePrivileges, false, platformPrivileges); expect(canEditName).toBe(false); }); diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx index 11335d0378760c..12fa9131f33c73 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx @@ -60,6 +60,7 @@ const TopButtonsWrapper = styled.div` export function getCanEditName( entityType: EntityType, entityData: GenericEntityProperties | null, + isEditableDatasetNameEnabled: boolean, privileges?: PlatformPrivileges, ) { switch (entityType) { @@ -73,7 +74,7 @@ export function getCanEditName( case EntityType.BusinessAttribute: return privileges?.manageBusinessAttributes; case EntityType.Dataset: - return entityData?.privileges?.canEditProperties; + return isEditableDatasetNameEnabled && entityData?.privileges?.canEditProperties; default: return false; } @@ -99,9 +100,13 @@ export const EntityHeader = ({ headerDropdownItems, headerActionItems, isNameEdi const isEditableDatasetNameEnabled = useIsEditableDatasetNameEnabled(); const canEditName = - isEditableDatasetNameEnabled && isNameEditable && - getCanEditName(entityType, entityData, me?.platformPrivileges as PlatformPrivileges); + getCanEditName( + 
entityType, + entityData, + isEditableDatasetNameEnabled, + me?.platformPrivileges as PlatformPrivileges, + ); const entityRegistry = useEntityRegistry(); return ( diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/EntitySidebar.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/EntitySidebar.tsx index a8d1dceb71ec92..b5e3b221c736d0 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/EntitySidebar.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/EntitySidebar.tsx @@ -46,7 +46,7 @@ export const EntitySidebar = ({ sidebarSections, topSection }: Props) => { return ( <> {topSection && } - {entityData?.lastIngested && ( + {!!entityData?.lastIngested && ( diff --git a/datahub-web-react/src/app/entity/shared/propagation/PropagationDetails.tsx b/datahub-web-react/src/app/entity/shared/propagation/PropagationDetails.tsx new file mode 100644 index 00000000000000..646f47134938c4 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/propagation/PropagationDetails.tsx @@ -0,0 +1,109 @@ +import React from 'react'; +import styled from 'styled-components'; +import { Popover } from 'antd'; +import { StringMapEntry } from '../../../../types.generated'; +import PropagationEntityLink from './PropagationEntityLink'; +import { usePropagationDetails } from './utils'; +import { PropagateThunderbolt, PropagateThunderboltFilled } from './PropagationIcon'; + +const PopoverWrapper = styled.div` + display: flex; + flex-direction: column; +`; + +const PopoverTitle = styled.div` + font-weight: bold; + font-size: 14px; + padding: 6px 0px; + color: #eeecfa; +`; + +const PopoverDescription = styled.div` + max-width: 340px; + font-size: 14px; + color: #eeecfa; + display: inline; + padding: 0px 0px 8px 0px; +`; + +const PopoverAttributes = styled.div` + display: flex; +`; + +const PopoverAttribute = styled.div` + margin-right: 12px; + margin-bottom: 4px; +`; + +const PopoverAttributeTitle = styled.div` + font-size: 14px; + color: #eeecfa; + font-weight: bold; + margin: 8px 0px; + overflow: hidden; + text-overflow: ellipsis; +`; + +const PopoverDocumentation = styled.a` + margin-top: 12px; +`; + +interface Props { + sourceDetail?: StringMapEntry[] | null; +} + +export default function PropagationDetails({ sourceDetail }: Props) { + const { + isPropagated, + origin: { entity: originEntity }, + via: { entity: viaEntity }, + } = usePropagationDetails(sourceDetail); + + if (!sourceDetail || !isPropagated) return null; + + const popoverContent = + originEntity || viaEntity ? 
( + + + This description was automatically propagated from an upstream column.{' '} + + Learn more + + + + {originEntity && originEntity.urn !== viaEntity?.urn && ( + + Origin + + + )} + {viaEntity && ( + + Via + + + )} + + + ) : undefined; + + return ( + + + Propagated Description + + } + content={popoverContent} + > + + + ); +} diff --git a/datahub-web-react/src/app/entity/shared/propagation/PropagationEntityLink.tsx b/datahub-web-react/src/app/entity/shared/propagation/PropagationEntityLink.tsx new file mode 100644 index 00000000000000..8c1285dd5808b1 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/propagation/PropagationEntityLink.tsx @@ -0,0 +1,56 @@ +import React from 'react'; +import styled from 'styled-components'; +import { Link } from 'react-router-dom'; +import { useEntityRegistry } from '../../../useEntityRegistry'; +import { Entity, EntityType, SchemaFieldEntity } from '../../../../types.generated'; +import { GenericEntityProperties } from '../types'; + +const PreviewImage = styled.img<{ size: number }>` + height: ${(props) => props.size}px; + width: ${(props) => props.size}px; + min-width: ${(props) => props.size}px; + object-fit: contain; + background-color: transparent; + margin: 0px 4px 0px 0px; +`; + +const StyledLink = styled(Link)` + margin-right: 4px; + display: flex; + align-items: center; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +`; + +interface Props { + entity: Entity; +} + +export default function PropagationEntityLink({ entity }: Props) { + const entityRegistry = useEntityRegistry(); + + const isSchemaField = entity.type === EntityType.SchemaField; + const baseEntity = isSchemaField ? (entity as SchemaFieldEntity).parent : entity; + + const logoUrl = (baseEntity as GenericEntityProperties)?.platform?.properties?.logoUrl || ''; + let entityUrl = entityRegistry.getEntityUrl(baseEntity.type, baseEntity.urn); + let entityDisplayName = entityRegistry.getDisplayName(baseEntity.type, baseEntity); + + if (isSchemaField) { + entityUrl = `${entityUrl}/${encodeURIComponent('Columns')}?schemaFilter=${encodeURIComponent( + (entity as SchemaFieldEntity).fieldPath, + )}`; + const schemaFieldName = entityRegistry.getDisplayName(entity.type, entity); + entityDisplayName = `${entityDisplayName}.${schemaFieldName}`; + } + + return ( + <> + + + {entityDisplayName} + + + ); +} diff --git a/datahub-web-react/src/app/entity/shared/propagation/PropagationIcon.tsx b/datahub-web-react/src/app/entity/shared/propagation/PropagationIcon.tsx new file mode 100644 index 00000000000000..01b4570c4ca0df --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/propagation/PropagationIcon.tsx @@ -0,0 +1,22 @@ +import styled from 'styled-components'; +import { ThunderboltFilled } from '@ant-design/icons'; +import { REDESIGN_COLORS } from '../constants'; + +export const PropagateThunderbolt = styled(ThunderboltFilled)` + && { + color: #a7c7fa; + } + font-size: 16px; + &:hover { + color: ${REDESIGN_COLORS.BLUE}; + } + margin-right: 4px; +`; + +export const PropagateThunderboltFilled = styled(ThunderboltFilled)` + && { + color: ${REDESIGN_COLORS.BLUE}; + } + font-size: 16px; + margin-right: 4px; +`; diff --git a/datahub-web-react/src/app/entity/shared/propagation/utils.ts b/datahub-web-react/src/app/entity/shared/propagation/utils.ts new file mode 100644 index 00000000000000..d8b4d4d931f4ee --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/propagation/utils.ts @@ -0,0 +1,24 @@ +import { StringMapEntry } from '../../../../types.generated'; +import { 
useGetEntities } from '../useGetEntities'; + +export function usePropagationDetails(sourceDetail?: StringMapEntry[] | null) { + const isPropagated = !!sourceDetail?.find((mapEntry) => mapEntry.key === 'propagated' && mapEntry.value === 'true'); + const originEntityUrn = sourceDetail?.find((mapEntry) => mapEntry.key === 'origin')?.value || ''; + const viaEntityUrn = sourceDetail?.find((mapEntry) => mapEntry.key === 'via')?.value || ''; + + const entities = useGetEntities([originEntityUrn, viaEntityUrn]); + const originEntity = entities.find((e) => e.urn === originEntityUrn); + const viaEntity = entities.find((e) => e.urn === viaEntityUrn); + + return { + isPropagated, + origin: { + urn: originEntityUrn, + entity: originEntity, + }, + via: { + urn: viaEntityUrn, + entity: viaEntity, + }, + }; +} diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/components/SchemaFieldDrawer/FieldDescription.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/components/SchemaFieldDrawer/FieldDescription.tsx index be95cba3ab4f07..e64a1436b0b1c5 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/components/SchemaFieldDrawer/FieldDescription.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/components/SchemaFieldDrawer/FieldDescription.tsx @@ -6,6 +6,8 @@ import styled from 'styled-components'; import { SectionHeader, StyledDivider } from './components'; import UpdateDescriptionModal from '../../../../../components/legacy/DescriptionModal'; import { EditableSchemaFieldInfo, SchemaField, SubResourceType } from '../../../../../../../../types.generated'; +import { getFieldDescriptionDetails } from '../../utils/getFieldDescriptionDetails'; +import PropagationDetails from '../../../../../propagation/PropagationDetails'; import DescriptionSection from '../../../../../containers/profile/sidebar/AboutSection/DescriptionSection'; import { useEntityData, useMutationUrn, useRefetch } from '../../../../../EntityContext'; import { useSchemaRefetch } from '../../SchemaContext'; @@ -13,11 +15,6 @@ import { useUpdateDescriptionMutation } from '../../../../../../../../graphql/mu import analytics, { EntityActionType, EventType } from '../../../../../../../analytics'; import SchemaEditableContext from '../../../../../../../shared/SchemaEditableContext'; -const DescriptionWrapper = styled.div` - display: flex; - justify-content: space-between; -`; - const EditIcon = styled(Button)` border: none; box-shadow: none; @@ -25,6 +22,13 @@ const EditIcon = styled(Button)` width: 20px; `; +const DescriptionWrapper = styled.div` + display: flex; + gap: 4px; + align-items: center; + justify-content: space-between; +`; + interface Props { expandedField: SchemaField; editableFieldInfo?: EditableSchemaFieldInfo; @@ -76,7 +80,13 @@ export default function FieldDescription({ expandedField, editableFieldInfo }: P }, }); - const displayedDescription = editableFieldInfo?.description || expandedField.description; + const { schemaFieldEntity, description } = expandedField; + const { displayedDescription, isPropagated, sourceDetail, propagatedDescription } = getFieldDescriptionDetails({ + schemaFieldEntity, + editableFieldInfo, + defaultDescription: description, + }); + const baDescription = expandedField?.schemaFieldEntity?.businessAttributes?.businessAttribute?.businessAttribute?.properties ?.description; @@ -87,12 +97,17 @@ export default function FieldDescription({ expandedField, editableFieldInfo }: P
Description - + + {isPropagated && } + {!!displayedDescription && ( + + )} +
{isSchemaEditable && ( )} diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/getFieldDescriptionDetails.ts b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/getFieldDescriptionDetails.ts new file mode 100644 index 00000000000000..6434baddb77a66 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/getFieldDescriptionDetails.ts @@ -0,0 +1,25 @@ +import { EditableSchemaFieldInfo, SchemaFieldEntity } from '../../../../../../../types.generated'; + +interface Props { + schemaFieldEntity?: SchemaFieldEntity | null; + editableFieldInfo?: EditableSchemaFieldInfo; + defaultDescription?: string | null; +} + +export function getFieldDescriptionDetails({ schemaFieldEntity, editableFieldInfo, defaultDescription }: Props) { + const documentation = schemaFieldEntity?.documentation?.documentations?.[0]; + const isUsingDocumentationAspect = !editableFieldInfo?.description && !!documentation; + const isPropagated = + isUsingDocumentationAspect && + !!documentation?.attribution?.sourceDetail?.find( + (mapEntry) => mapEntry.key === 'propagated' && mapEntry.value === 'true', + ); + + const displayedDescription = + editableFieldInfo?.description || documentation?.documentation || defaultDescription || ''; + + const sourceDetail = documentation?.attribution?.sourceDetail; + const propagatedDescription = documentation?.documentation; + + return { displayedDescription, isPropagated, sourceDetail, propagatedDescription }; +} diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/useDescriptionRenderer.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/useDescriptionRenderer.tsx index 73e6d2ca6e9b3e..bb70c2cb493037 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/useDescriptionRenderer.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Schema/utils/useDescriptionRenderer.tsx @@ -6,6 +6,7 @@ import { useUpdateDescriptionMutation } from '../../../../../../../graphql/mutat import { useMutationUrn, useRefetch } from '../../../../EntityContext'; import { useSchemaRefetch } from '../SchemaContext'; import { pathMatchesNewPath } from '../../../../../dataset/profile/schema/utils/utils'; +import { getFieldDescriptionDetails } from './getFieldDescriptionDetails'; export default function useDescriptionRenderer(editableSchemaMetadata: EditableSchemaMetadata | null | undefined) { const urn = useMutationUrn(); @@ -21,10 +22,16 @@ export default function useDescriptionRenderer(editableSchemaMetadata: EditableS }; return (description: string, record: SchemaField, index: number): JSX.Element => { - const relevantEditableFieldInfo = editableSchemaMetadata?.editableSchemaFieldInfo.find( - (candidateEditableFieldInfo) => pathMatchesNewPath(candidateEditableFieldInfo.fieldPath, record.fieldPath), + const editableFieldInfo = editableSchemaMetadata?.editableSchemaFieldInfo.find((candidateEditableFieldInfo) => + pathMatchesNewPath(candidateEditableFieldInfo.fieldPath, record.fieldPath), ); - const displayedDescription = relevantEditableFieldInfo?.description || description; + const { schemaFieldEntity } = record; + const { displayedDescription, isPropagated, sourceDetail } = getFieldDescriptionDetails({ + schemaFieldEntity, + editableFieldInfo, + defaultDescription: description, + }); + const sanitizedDescription = DOMPurify.sanitize(displayedDescription); const original = record.description ? 
DOMPurify.sanitize(record.description) : undefined; const businessAttributeDescription = @@ -43,7 +50,7 @@ export default function useDescriptionRenderer(editableSchemaMetadata: EditableS baExpanded={!!expandedBARows[index]} description={sanitizedDescription} original={original} - isEdited={!!relevantEditableFieldInfo?.description} + isEdited={!!editableFieldInfo?.description} onUpdate={(updatedDescription) => updateDescription({ variables: { @@ -56,6 +63,8 @@ export default function useDescriptionRenderer(editableSchemaMetadata: EditableS }, }).then(refresh) } + isPropagated={isPropagated} + sourceDetail={sourceDetail} isReadOnly /> ); diff --git a/datahub-web-react/src/app/entity/shared/useGetEntities.ts b/datahub-web-react/src/app/entity/shared/useGetEntities.ts new file mode 100644 index 00000000000000..9391bc17d7a8a2 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/useGetEntities.ts @@ -0,0 +1,18 @@ +import { useEffect, useState } from 'react'; +import { useGetEntitiesQuery } from '../../../graphql/entity.generated'; +import { Entity } from '../../../types.generated'; + +export function useGetEntities(urns: string[]): Entity[] { + const [verifiedUrns, setVerifiedUrns] = useState([]); + + useEffect(() => { + urns.forEach((urn) => { + if (urn.startsWith('urn:li:') && !verifiedUrns.includes(urn)) { + setVerifiedUrns((prevUrns) => [...prevUrns, urn]); + } + }); + }, [urns, verifiedUrns]); + + const { data } = useGetEntitiesQuery({ variables: { urns: verifiedUrns }, skip: !verifiedUrns.length }); + return (data?.entities || []) as Entity[]; +} diff --git a/datahub-web-react/src/app/lineage/LineageExplorer.tsx b/datahub-web-react/src/app/lineage/LineageExplorer.tsx index 26ffaa26a6ca22..ce0c4bb8f122d4 100644 --- a/datahub-web-react/src/app/lineage/LineageExplorer.tsx +++ b/datahub-web-react/src/app/lineage/LineageExplorer.tsx @@ -221,7 +221,9 @@ export default function LineageExplorer({ urn, type }: Props) { Close {selectedEntity.type !== EntityType.Restricted && ( - )} diff --git a/datahub-web-react/src/app/settings/SettingsPage.tsx b/datahub-web-react/src/app/settings/SettingsPage.tsx index 24bcd17ca7f9c0..e3948349546efb 100644 --- a/datahub-web-react/src/app/settings/SettingsPage.tsx +++ b/datahub-web-react/src/app/settings/SettingsPage.tsx @@ -121,7 +121,7 @@ export const SettingsPage = () => { const showViews = isViewsEnabled || false; const showOwnershipTypes = me && me?.platformPrivileges?.manageOwnershipTypes; const showHomePagePosts = me && me?.platformPrivileges?.manageGlobalAnnouncements && !readOnlyModeEnabled; - const showFeatures = true; // TODO: Add feature flag for this + const showFeatures = me?.platformPrivileges?.manageIngestion; // TODO: Add feature flag for this return ( diff --git a/datahub-web-react/src/app/settings/features/Feature.tsx b/datahub-web-react/src/app/settings/features/Feature.tsx index 2c090aae696f88..13453cf8f73252 100644 --- a/datahub-web-react/src/app/settings/features/Feature.tsx +++ b/datahub-web-react/src/app/settings/features/Feature.tsx @@ -104,6 +104,8 @@ export interface FeatureType { title: string; description: string; isAvailable: boolean; + isDisabled: boolean; + disabledMessage?: string; checked: boolean; onChange?: (checked: boolean) => void; }>; @@ -134,22 +136,6 @@ export const Feature = ({ key, title, description, settings, options, isNew, lea - {settings.map((option) => ( - <> - - - - {option.title} - - - - - - - - ))} {options.map((option, index) => ( <> @@ -165,15 +151,34 @@ export const Feature = ({ key, title, 
description, settings, options, isNew, lea {option.description} - (option.onChange ? option.onChange(checked) : null)} - disabled={!option.isAvailable} - /> + + (option.onChange ? option.onChange(checked) : null)} + disabled={!option.isAvailable || option.isDisabled} + /> + {index !== options.length - 1 && } ))} + {settings.map((option) => ( + <> + + + + {option.title} + Only available on DataHub Cloud + + + + + + + + ))} ); diff --git a/datahub-web-react/src/app/settings/features/Features.tsx b/datahub-web-react/src/app/settings/features/Features.tsx index ee8d7c628c1eff..1d0a0bb469cf86 100644 --- a/datahub-web-react/src/app/settings/features/Features.tsx +++ b/datahub-web-react/src/app/settings/features/Features.tsx @@ -73,18 +73,23 @@ export const Features = () => { setIsColPropagateChecked(checked); updateDocPropagation(checked); }, + isDisabled: false, + disabledMessage: undefined, }, { key: uuidv4(), title: 'Asset Level Propagation', description: 'Propagate new documentation from upstream to downstream assets based on data lineage relationships.', - isAvailable: false, checked: false, + onChange: (_: boolean) => null, + isAvailable: true, + isDisabled: true, + disabledMessage: 'Coming soon!', }, ], isNew: true, - learnMoreLink: 'https://datahubproject.io/docs/automations/doc-propagation', + learnMoreLink: 'https://datahubproject.io/docs/automations/docs-propagation', }, ]; diff --git a/docker/datahub-frontend/Dockerfile b/docker/datahub-frontend/Dockerfile index 2a9354cbf6a04f..89974e56575b07 100644 --- a/docker/datahub-frontend/Dockerfile +++ b/docker/datahub-frontend/Dockerfile @@ -25,7 +25,7 @@ RUN apk --no-cache --update-cache --available upgrade \ ENV LD_LIBRARY_PATH="/lib:/lib64" -FROM base as unpack +FROM base AS unpack COPY ./datahub-frontend.zip / RUN unzip datahub-frontend.zip -d /tmp/out \ @@ -33,16 +33,16 @@ RUN unzip datahub-frontend.zip -d /tmp/out \ COPY ./docker/monitoring/client-prometheus-config.yaml /datahub-frontend/ RUN chown -R datahub:datahub /datahub-frontend && chmod 755 /datahub-frontend -FROM base as prod-install +FROM base AS prod-install COPY --from=unpack /datahub-frontend/ /datahub-frontend/ -FROM base as dev-install +FROM base AS dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. 
# See this excellent thread https://github.com/docker/cli/issues/1134 VOLUME [ "/datahub-frontend" ] -FROM ${APP_ENV}-install as final +FROM ${APP_ENV}-install AS final COPY --chown=datahub:datahub --chmod=755 ./docker/datahub-frontend/start.sh / USER datahub diff --git a/docker/datahub-gms/Dockerfile b/docker/datahub-gms/Dockerfile index d30dbd84930578..b15bf3c6f9f17b 100644 --- a/docker/datahub-gms/Dockerfile +++ b/docker/datahub-gms/Dockerfile @@ -11,7 +11,7 @@ FROM golang:1-alpine3.20 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION v0.6.1 +ENV DOCKERIZE_VERSION=v0.6.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk @@ -52,7 +52,7 @@ COPY --from=binary /go/bin/dockerize /usr/local/bin ENV LD_LIBRARY_PATH="/lib:/lib64" -FROM base as prod-install +FROM base AS prod-install COPY war.war /datahub/datahub-gms/bin/war.war COPY metadata-models/src/main/resources/entity-registry.yml /datahub/datahub-gms/resources/entity-registry.yml COPY docker/datahub-gms/start.sh /datahub/datahub-gms/scripts/start.sh @@ -61,11 +61,11 @@ COPY docker/datahub-gms/jetty-jmx.xml /datahub/datahub-gms/scripts/jetty-jmx.xml COPY docker/monitoring/client-prometheus-config.yaml /datahub/datahub-gms/scripts/prometheus-config.yaml RUN chmod +x /datahub/datahub-gms/scripts/start.sh -FROM base as dev-install +FROM base AS dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. # See this excellent thread https://github.com/docker/cli/issues/1134 -FROM ${APP_ENV}-install as final +FROM ${APP_ENV}-install AS final RUN mkdir -p /etc/datahub/plugins/auth/resources diff --git a/docker/datahub-gms/env/docker-without-neo4j.env b/docker/datahub-gms/env/docker-without-neo4j.env index 37b7ba1797af5b..cc0dd6b4278b56 100644 --- a/docker/datahub-gms/env/docker-without-neo4j.env +++ b/docker/datahub-gms/env/docker-without-neo4j.env @@ -23,8 +23,6 @@ PE_CONSUMER_ENABLED=true UI_INGESTION_ENABLED=true ENTITY_SERVICE_ENABLE_RETENTION=true -ELASTIC_ID_HASH_ALGO=MD5 - # Uncomment to disable persistence of client-side analytics events # DATAHUB_ANALYTICS_ENABLED=false diff --git a/docker/datahub-gms/env/docker.env b/docker/datahub-gms/env/docker.env index 0ecaa32c4cb123..59fc4bdde02ff4 100644 --- a/docker/datahub-gms/env/docker.env +++ b/docker/datahub-gms/env/docker.env @@ -27,8 +27,6 @@ MCE_CONSUMER_ENABLED=true PE_CONSUMER_ENABLED=true UI_INGESTION_ENABLED=true -ELASTIC_ID_HASH_ALGO=MD5 - # Uncomment to enable Metadata Service Authentication METADATA_SERVICE_AUTH_ENABLED=false diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index 8a238c32704bb6..a2686ee8b6557f 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -7,28 +7,13 @@ ARG GITHUB_REPO_URL=https://github.com ARG DEBIAN_REPO_URL=https://deb.debian.org/debian ARG PIP_MIRROR_URL=https://pypi.python.org/simple -FROM golang:1-alpine3.20 AS dockerize-binary +FROM powerman/dockerize:0.19 as dockerize-binary -# Re-declaring arg from above to make it available in this stage (will inherit default value) -ARG ALPINE_REPO_URL - -ENV DOCKERIZE_VERSION v0.6.1 -WORKDIR /go/src/github.com/jwilder - -# Optionally set corporate mirror for apk -RUN if [ "${ALPINE_REPO_URL}" != "http://dl-cdn.alpinelinux.org/alpine" ] ; then sed -i "s#http.*://dl-cdn.alpinelinux.org/alpine#${ALPINE_REPO_URL}#g" 
/etc/apk/repositories ; fi - -RUN apk --no-cache --update add openssl git tar curl - -WORKDIR /go/src/github.com/jwilder/dockerize - -RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION - -FROM python:3.10 as base +FROM python:3.10 AS base ARG GITHUB_REPO_URL -ENV DEBIAN_FRONTEND noninteractive +ENV DEBIAN_FRONTEND=noninteractive # Optionally set corporate mirror for deb ARG DEBIAN_REPO_URL @@ -56,8 +41,7 @@ RUN apt-get update && apt-get install -y -qq \ && python -m pip install --no-cache --upgrade pip uv>=0.1.10 wheel setuptools \ && rm -rf /var/lib/apt/lists/* /var/cache/apk/* -# compiled against newer golang for security fixes -COPY --from=dockerize-binary /go/bin/dockerize /usr/local/bin +COPY --from=dockerize-binary /usr/local/bin/dockerize /usr/local/bin COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh @@ -75,7 +59,7 @@ RUN python3 -m venv $VIRTUAL_ENV && \ ENTRYPOINT [ "/entrypoint.sh" ] -FROM ${BASE_IMAGE} as full-install +FROM ${BASE_IMAGE} AS full-install USER 0 RUN apt-get update && apt-get install -y -qq \ @@ -102,7 +86,7 @@ RUN if [ $(arch) = "x86_64" ]; then \ USER datahub -FROM ${BASE_IMAGE} as slim-install +FROM ${BASE_IMAGE} AS slim-install # Do nothing else on top of base FROM ${APP_ENV}-install diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index b8eda548491224..34ac6ae9eba584 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -5,7 +5,7 @@ ARG DOCKER_VERSION=head-full ARG DEBIAN_REPO_URL=https://deb.debian.org/debian ARG PIP_MIRROR_URL=https://pypi.python.org/simple -FROM $BASE_IMAGE:$DOCKER_VERSION as base +FROM $BASE_IMAGE:$DOCKER_VERSION AS base # Optionally set corporate mirror for deb USER 0 @@ -28,11 +28,11 @@ RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEAS cat src/datahub/__init__.py | grep __version__ && \ cat airflow-plugin/src/datahub_airflow_plugin/__init__.py | grep __version__ -FROM base as slim-install +FROM base AS slim-install RUN uv pip install --no-cache -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" -FROM base as full-install-build +FROM base AS full-install-build USER 0 RUN apt-get update && apt-get install -y -qq maven @@ -44,14 +44,14 @@ RUN uv pip install --no-cache -e ".[base,all]" "./airflow-plugin[plugin-v2]" && datahub --version RUN ./pyspark_jars.sh -FROM base as full-install +FROM base AS full-install COPY --from=full-install-build ${VIRTUAL_ENV} ${VIRTUAL_ENV} -FROM base as dev-install +FROM base AS dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. 
# See this excellent thread https://github.com/docker/cli/issues/1134 -FROM ${APP_ENV}-install as final +FROM ${APP_ENV}-install AS final USER datahub diff --git a/docker/datahub-mae-consumer/Dockerfile b/docker/datahub-mae-consumer/Dockerfile index 0ee55821f2579f..6edaa29ee1a8bb 100644 --- a/docker/datahub-mae-consumer/Dockerfile +++ b/docker/datahub-mae-consumer/Dockerfile @@ -11,7 +11,7 @@ FROM golang:1-alpine3.20 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION v0.6.1 +ENV DOCKERIZE_VERSION=v0.6.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk @@ -47,18 +47,18 @@ COPY --from=binary /go/bin/dockerize /usr/local/bin ENV LD_LIBRARY_PATH="/lib:/lib64" -FROM base as prod-install +FROM base AS prod-install COPY mae-consumer-job.jar /datahub/datahub-mae-consumer/bin/ COPY metadata-models/src/main/resources/entity-registry.yml /datahub/datahub-mae-consumer/resources/entity-registry.yml COPY docker/datahub-mae-consumer/start.sh /datahub/datahub-mae-consumer/scripts/ COPY docker/monitoring/client-prometheus-config.yaml /datahub/datahub-mae-consumer/scripts/prometheus-config.yaml RUN chmod +x /datahub/datahub-mae-consumer/scripts/start.sh -FROM base as dev-install +FROM base AS dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. # See this excellent thread https://github.com/docker/cli/issues/1134 -FROM ${APP_ENV}-install as final +FROM ${APP_ENV}-install AS final RUN addgroup -S datahub && adduser -S datahub -G datahub USER datahub diff --git a/docker/datahub-mae-consumer/env/docker-without-neo4j.env b/docker/datahub-mae-consumer/env/docker-without-neo4j.env index 6a82f235b29711..b6899f7e6d63b2 100644 --- a/docker/datahub-mae-consumer/env/docker-without-neo4j.env +++ b/docker/datahub-mae-consumer/env/docker-without-neo4j.env @@ -13,8 +13,6 @@ ES_BULK_REFRESH_POLICY=WAIT_UNTIL GRAPH_SERVICE_IMPL=elasticsearch ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml -ELASTIC_ID_HASH_ALGO=MD5 - # Uncomment to disable persistence of client-side analytics events # DATAHUB_ANALYTICS_ENABLED=false diff --git a/docker/datahub-mae-consumer/env/docker.env b/docker/datahub-mae-consumer/env/docker.env index 1f0ee4b05b3820..5a6daa6eaeaed7 100644 --- a/docker/datahub-mae-consumer/env/docker.env +++ b/docker/datahub-mae-consumer/env/docker.env @@ -17,8 +17,6 @@ NEO4J_PASSWORD=datahub GRAPH_SERVICE_IMPL=neo4j ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml -ELASTIC_ID_HASH_ALGO=MD5 - # Uncomment to disable persistence of client-side analytics events # DATAHUB_ANALYTICS_ENABLED=false diff --git a/docker/datahub-mce-consumer/Dockerfile b/docker/datahub-mce-consumer/Dockerfile index 8f85b432a10711..1eb56633c561e6 100644 --- a/docker/datahub-mce-consumer/Dockerfile +++ b/docker/datahub-mce-consumer/Dockerfile @@ -11,7 +11,7 @@ FROM golang:1-alpine3.20 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION v0.6.1 +ENV DOCKERIZE_VERSION=v0.6.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk @@ -45,7 +45,7 @@ RUN apk --no-cache --update-cache --available upgrade \ && cp /usr/lib/jvm/java-17-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks COPY --from=binary /go/bin/dockerize /usr/local/bin -FROM base as prod-install 
+FROM base AS prod-install COPY mce-consumer-job.jar /datahub/datahub-mce-consumer/bin/ COPY metadata-models/src/main/resources/entity-registry.yml /datahub/datahub-mce-consumer/resources/entity-registry.yml COPY docker/datahub-mce-consumer/start.sh /datahub/datahub-mce-consumer/scripts/ @@ -54,12 +54,12 @@ RUN chmod +x /datahub/datahub-mce-consumer/scripts/start.sh ENV LD_LIBRARY_PATH="/lib:/lib64" -FROM base as dev-install +FROM base AS dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. # See this excellent thread https://github.com/docker/cli/issues/1134 COPY metadata-models/src/main/resources/entity-registry.yml /datahub/datahub-mce-consumer/resources/entity-registry.yml -FROM ${APP_ENV}-install as final +FROM ${APP_ENV}-install AS final RUN addgroup -S datahub && adduser -S datahub -G datahub USER datahub diff --git a/docker/datahub-mce-consumer/env/docker-without-neo4j.env b/docker/datahub-mce-consumer/env/docker-without-neo4j.env index b0edfc0a75b669..e7be7d8ed4ddc5 100644 --- a/docker/datahub-mce-consumer/env/docker-without-neo4j.env +++ b/docker/datahub-mce-consumer/env/docker-without-neo4j.env @@ -24,8 +24,6 @@ MAE_CONSUMER_ENABLED=false PE_CONSUMER_ENABLED=false UI_INGESTION_ENABLED=false -ELASTIC_ID_HASH_ALGO=MD5 - # Uncomment to configure kafka topic names # Make sure these names are consistent across the whole deployment # METADATA_CHANGE_PROPOSAL_TOPIC_NAME=MetadataChangeProposal_v1 diff --git a/docker/datahub-mce-consumer/env/docker.env b/docker/datahub-mce-consumer/env/docker.env index c0f85ef667546e..8618f3f5f7af7a 100644 --- a/docker/datahub-mce-consumer/env/docker.env +++ b/docker/datahub-mce-consumer/env/docker.env @@ -24,8 +24,6 @@ MAE_CONSUMER_ENABLED=false PE_CONSUMER_ENABLED=false UI_INGESTION_ENABLED=false -ELASTIC_ID_HASH_ALGO=MD5 - # Uncomment to configure kafka topic names # Make sure these names are consistent across the whole deployment # METADATA_CHANGE_PROPOSAL_TOPIC_NAME=MetadataChangeProposal_v1 diff --git a/docker/datahub-upgrade/Dockerfile b/docker/datahub-upgrade/Dockerfile index 675e24ab87109e..3d59a903414b1a 100644 --- a/docker/datahub-upgrade/Dockerfile +++ b/docker/datahub-upgrade/Dockerfile @@ -11,7 +11,7 @@ FROM golang:1-alpine3.20 AS binary # Re-declaring arg from above to make it available in this stage (will inherit default value) ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION v0.6.1 +ENV DOCKERIZE_VERSION=v0.6.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk @@ -51,15 +51,15 @@ COPY --from=binary /go/bin/dockerize /usr/local/bin ENV LD_LIBRARY_PATH="/lib:/lib64" -FROM base as prod-install +FROM base AS prod-install COPY datahub-upgrade.jar /datahub/datahub-upgrade/bin/ COPY metadata-models/src/main/resources/entity-registry.yml /datahub/datahub-gms/resources/entity-registry.yml -FROM base as dev-install +FROM base AS dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. 
# See this excellent thread https://github.com/docker/cli/issues/1134 -FROM ${APP_ENV}-install as final +FROM ${APP_ENV}-install AS final RUN addgroup -S datahub && adduser -S datahub -G datahub USER datahub diff --git a/docker/elasticsearch-setup/Dockerfile b/docker/elasticsearch-setup/Dockerfile index 7390e3579dcf8f..4e64dcbc1e452c 100644 --- a/docker/elasticsearch-setup/Dockerfile +++ b/docker/elasticsearch-setup/Dockerfile @@ -10,7 +10,7 @@ FROM golang:1-alpine3.20 AS binary ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION v0.6.1 +ENV DOCKERIZE_VERSION=v0.6.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile index a68da4e41d4df9..ad1d01c1ce97c0 100644 --- a/docker/kafka-setup/Dockerfile +++ b/docker/kafka-setup/Dockerfile @@ -6,8 +6,8 @@ ARG GITHUB_REPO_URL=https://github.com ARG MAVEN_CENTRAL_REPO_URL=https://repo1.maven.org/maven2 ARG APACHE_DOWNLOAD_URL=null -# Using as a base image because to get the needed jars for confluent utils -FROM confluentinc/cp-base-new:$KAFKA_DOCKER_VERSION as confluent_base +# Using AS a base image because to get the needed jars for confluent utils +FROM confluentinc/cp-base-new:$KAFKA_DOCKER_VERSION AS confluent_base ARG MAVEN_CENTRAL_REPO_URL ARG SNAKEYAML_VERSION="2.0" @@ -22,8 +22,8 @@ ARG ALPINE_REPO_URL ARG APACHE_DOWNLOAD_URL ARG GITHUB_REPO_URL -ENV KAFKA_VERSION 3.7.0 -ENV SCALA_VERSION 2.13 +ENV KAFKA_VERSION=3.7.0 +ENV SCALA_VERSION=2.13 LABEL name="kafka" version=${KAFKA_VERSION} @@ -44,7 +44,7 @@ RUN mkdir -p /opt \ && rm -rf /tmp/* \ && apk del --purge .build-deps -ENV PATH /sbin:/opt/kafka/bin/:$PATH +ENV PATH=/sbin:/opt/kafka/bin/:$PATH WORKDIR /opt/kafka @@ -71,6 +71,7 @@ COPY docker/kafka-setup/kafka-setup.sh ./kafka-setup.sh COPY docker/kafka-setup/kafka-config.sh ./kafka-config.sh COPY docker/kafka-setup/kafka-topic-workers.sh ./kafka-topic-workers.sh COPY docker/kafka-setup/kafka-ready.sh ./kafka-ready.sh +COPY docker/kafka-setup/env_to_properties.py ./env_to_properties.py RUN chmod +x ./kafka-setup.sh ./kafka-topic-workers.sh ./kafka-ready.sh diff --git a/docker/mysql-setup/Dockerfile b/docker/mysql-setup/Dockerfile index 46969352d81746..b0ca45ad8f6f24 100644 --- a/docker/mysql-setup/Dockerfile +++ b/docker/mysql-setup/Dockerfile @@ -5,7 +5,7 @@ FROM golang:1-alpine3.20 AS binary ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION v0.6.1 +ENV DOCKERIZE_VERSION=v0.6.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/postgres-setup/Dockerfile b/docker/postgres-setup/Dockerfile index 8ab211218f2406..e145456e807d4d 100644 --- a/docker/postgres-setup/Dockerfile +++ b/docker/postgres-setup/Dockerfile @@ -5,7 +5,7 @@ FROM golang:1-alpine3.20 AS binary ARG ALPINE_REPO_URL -ENV DOCKERIZE_VERSION v0.6.1 +ENV DOCKERIZE_VERSION=v0.6.1 WORKDIR /go/src/github.com/jwilder # Optionally set corporate mirror for apk diff --git a/docker/profiles/docker-compose.frontend.yml b/docker/profiles/docker-compose.frontend.yml index b43db8297cb1e0..b5b2d50143927f 100644 --- a/docker/profiles/docker-compose.frontend.yml +++ b/docker/profiles/docker-compose.frontend.yml @@ -10,6 +10,7 @@ x-datahub-frontend-service: &datahub-frontend-service - ${DATAHUB_LOCAL_FRONTEND_ENV:-empty2.env} environment: &datahub-frontend-service-env KAFKA_BOOTSTRAP_SERVER: broker:29092 + DATAHUB_GMS_HOST: ${DATAHUB_GMS_HOST:-datahub-gms} volumes: - ${HOME}/.datahub/plugins:/etc/datahub/plugins diff --git 
a/docker/profiles/docker-compose.gms.yml b/docker/profiles/docker-compose.gms.yml index 8cfff2280e2fea..c9448fa34c6870 100644 --- a/docker/profiles/docker-compose.gms.yml +++ b/docker/profiles/docker-compose.gms.yml @@ -40,6 +40,7 @@ x-kafka-env: &kafka-env # KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 SCHEMA_REGISTRY_TYPE: INTERNAL KAFKA_SCHEMAREGISTRY_URL: http://datahub-gms:8080/schema-registry/api/ + SPRING_KAFKA_CONSUMER_AUTO_OFFSET_RESET: ${SPRING_KAFKA_CONSUMER_AUTO_OFFSET_RESET:-earliest} x-datahub-quickstart-telemetry-env: &datahub-quickstart-telemetry-env DATAHUB_SERVER_TYPE: ${DATAHUB_SERVER_TYPE:-quickstart} diff --git a/docker/profiles/docker-compose.prerequisites.yml b/docker/profiles/docker-compose.prerequisites.yml index 7cd9c9039539cc..eed23a749628fe 100644 --- a/docker/profiles/docker-compose.prerequisites.yml +++ b/docker/profiles/docker-compose.prerequisites.yml @@ -234,7 +234,7 @@ services: env_file: kafka-broker/env/docker.env environment: KAFKA_NODE_ID: 1 - KAFKA_ADVERTISED_LISTENERS: BROKER://broker:29092,EXTERNAL://broker:9092 + KAFKA_ADVERTISED_LISTENERS: BROKER://broker:29092,EXTERNAL://localhost:9092 KAFKA_LISTENERS: BROKER://broker:29092,EXTERNAL://broker:9092,CONTROLLER://broker:39092 KAFKA_INTER_BROKER_LISTENER_NAME: BROKER KAFKA_CONTROLLER_LISTENER_NAMES: CONTROLLER diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index a0f60d23710a07..834d55096468f6 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -86,7 +86,6 @@ services: - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_PORT=9200 - - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index 11e33a9950ba9b..47fb50f78e4f0c 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -86,7 +86,6 @@ services: - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_PORT=9200 - - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 2efa8959834183..3fa13a9e56b421 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -86,7 +86,6 @@ services: - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_PORT=9200 - - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml index 4f47a3da24eb1b..a4211acedcf102 100644 --- 
a/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers-without-neo4j.quickstart.yml @@ -19,7 +19,6 @@ services: - ES_BULK_REFRESH_POLICY=WAIT_UNTIL - GRAPH_SERVICE_IMPL=elasticsearch - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml - - ELASTIC_ID_HASH_ALGO=MD5 hostname: datahub-mae-consumer image: ${DATAHUB_MAE_CONSUMER_IMAGE:-acryldata/datahub-mae-consumer}:${DATAHUB_VERSION:-head} ports: @@ -38,7 +37,6 @@ services: - EBEAN_DATASOURCE_USERNAME=datahub - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 - - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mce-consumer/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose.consumers.quickstart.yml b/docker/quickstart/docker-compose.consumers.quickstart.yml index 7dd7388b939884..e7571e4baf8b4e 100644 --- a/docker/quickstart/docker-compose.consumers.quickstart.yml +++ b/docker/quickstart/docker-compose.consumers.quickstart.yml @@ -26,7 +26,6 @@ services: - NEO4J_PASSWORD=datahub - GRAPH_SERVICE_IMPL=neo4j - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mae-consumer/resources/entity-registry.yml - - ELASTIC_ID_HASH_ALGO=MD5 hostname: datahub-mae-consumer image: ${DATAHUB_MAE_CONSUMER_IMAGE:-acryldata/datahub-mae-consumer}:${DATAHUB_VERSION:-head} ports: @@ -48,7 +47,6 @@ services: - EBEAN_DATASOURCE_USERNAME=datahub - ELASTICSEARCH_HOST=elasticsearch - ELASTICSEARCH_PORT=9200 - - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-mce-consumer/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index f42ed1f40c2467..c63b6d1d61b030 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -86,7 +86,6 @@ services: - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_PORT=9200 - - ELASTIC_ID_HASH_ALGO=MD5 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true - ES_BULK_REFRESH_POLICY=WAIT_UNTIL diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index 1a40c986b31671..3b2019f785c1e2 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -170,6 +170,14 @@ module.exports = { value: '', }, { + value: ` + 0.14.0 + + + `, + type: "html", + }, + { value: ` 0.13.0 diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index a3aa54657d0675..1f9c0a4d79a9d8 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -98,6 +98,23 @@ module.exports = { }, ], }, + { + label: "Automations", + type: "category", + items: [ + { + label: "Documentation Propagation", + type: "doc", + id: "docs/automation/docs-propagation", + }, + { + label: "Snowflake Tag Sync", + type: "doc", + id: "docs/automation/snowflake-tag-propagation", + className: "saasOnly", + }, + ], + }, { label: "Business Attributes", type: "doc", @@ -527,7 +544,12 @@ module.exports = { "Advanced Guides": [ "docs/how/delete-metadata", "docs/how/configuring-authorization-with-apache-ranger", - "docs/managed-datahub/configuring-identity-provisioning-with-ms-entra", + { + "SCIM Provisioning": [ + 
"docs/managed-datahub/configuring-identity-provisioning-with-ms-entra", + "docs/managed-datahub/configuring-identity-provisioning-with-okta", + ], + }, "docs/how/backup-datahub", "docs/how/restore-indices", "docs/advanced/db-retention", diff --git a/docs-website/versions.json b/docs-website/versions.json index afd30a317c618b..5288c42437c779 100644 --- a/docs-website/versions.json +++ b/docs-website/versions.json @@ -1,3 +1,4 @@ [ + "0.14.0", "0.13.1" ] diff --git a/docs/authorization/policies.md b/docs/authorization/policies.md index b393c8ffa37577..45d0b59e408337 100644 --- a/docs/authorization/policies.md +++ b/docs/authorization/policies.md @@ -173,12 +173,13 @@ These privileges are for DataHub operators to access & manage the administrative | View Tests | View Asset Tests. | | Manage Tests[^2] | Allow actor to create and remove Asset Tests. | | View Metadata Proposals[^2] | Allow actor to view the requests tab for viewing metadata proposals. | -| Create metadata constraints[^2] | Allow actor to create metadata constraints. | +| Create metadata constraints[^3] | Allow actor to create metadata constraints. | | Manage Platform Settings[^2] | Allow actor to view and change platform-level settings, like integrations & notifications. | | Manage Monitors[^2] | Allow actor to create, update, and delete any data asset monitors, including Custom SQL monitors. Grant with care. | [^1]: Only active if REST_API_AUTHORIZATION_ENABLED is true [^2]: DataHub Cloud only +[^3]: Deprecated feature #### Entity Management diff --git a/docs/authorization/roles.md b/docs/authorization/roles.md index 7c7b4581faffca..a1719438d2941b 100644 --- a/docs/authorization/roles.md +++ b/docs/authorization/roles.md @@ -156,10 +156,12 @@ These privileges are only relevant to DataHub Cloud. |-----------------------------|--------------------|--------------------|--------|-----------------------------------------------------------------------------------------------------| | Manage Tests | :heavy_check_mark: | :heavy_check_mark: | :x: | Create and remove Asset Tests. | | View Metadata Proposals | :heavy_check_mark: | :heavy_check_mark: | :x: | View the requests tab for viewing metadata proposals. | -| Create metadata constraints | :heavy_check_mark: | :heavy_check_mark: | :x: | Create metadata constraints. | +| Create metadata constraints[^1] | :heavy_check_mark: | :heavy_check_mark: | :x: | Create metadata constraints. | | Manage Platform Settings | :heavy_check_mark: | :x: | :x: | View and change platform-level settings, like integrations & notifications. | | Manage Monitors | :heavy_check_mark: | :x: | :x: | Create, update, and delete any data asset monitors, including Custom SQL monitors. Grant with care. | +[^1]: Deprecated feature + ##### Metadata Privileges | Privilege | Admin | Editor | Reader | Description | diff --git a/docs/automation/docs-propagation.md b/docs/automation/docs-propagation.md new file mode 100644 index 00000000000000..a637afcde4dca7 --- /dev/null +++ b/docs/automation/docs-propagation.md @@ -0,0 +1,128 @@ +# Documentation Propagation Automation + +## Introduction + +Documentation Propagation is an automation automatically propagates column and asset (coming soon) descriptions based on downstream column-level lineage and sibling relationships. +It simplifies metadata management by ensuring consistency and reducing the manual effort required for documenting data assets to aid +in Data Governance & Compliance along with Data Discovery. + +This feature is enabled by default in Open Source DataHub. 
+ +## Capabilities + +### Open Source +- **Column-Level Docs Propagation**: Automatically propagate documentation to downstream columns and sibling columns that are derived or dependent on the source column. +- **(Coming Soon) Asset-Level Docs Propagation**: Propagate descriptions to sibling assets. + +### DataHub Cloud (Acryl) +- Includes all the features of Open Source. +- **Propagation Rollback (Undo)**: Offers the ability to undo any propagation changes, providing a safety net against accidental updates. +- **Historical Backfilling**: Automatically backfills historical data for newly documented columns to maintain consistency across time. + +### Comparison of Features + +| Feature | Open Source | DataHub Cloud | +|---------------------------------|-------------|---------------| +| Column-Level Docs Propagation | ✔️ | ✔️ | +| Asset-Level Docs Propagation | ✔️ | ✔️ | +| Downstream Lineage + Siblings | ✔️ | ✔️ | +| Propagation Rollback (Undo) | ❌ | ✔️ | +| Historical Backfilling | ❌ | ✔️ | + +## Enabling Documentation Propagation + +### In Open Source + +Notice that the user must have the `Manage Ingestion` permission to view and enable the feature. + +1. **Navigate to Settings**: Click on the 'Settings' gear in top navigation bar. + +

+ +

+ +2. **Navigate to Features**: Click on the 'Features' tab in the left-hand navigation bar. + +

+ +

+
+3. **Enable Documentation Propagation**: Locate the 'Documentation Propagation' section and toggle the feature to enable it for column-level and asset-level propagation.
+Currently, Column Level propagation is supported, with asset level propagation coming soon.
+

+ +

+ + +### In DataHub Cloud + +1. **Navigate to Automations**: Click on 'Govern' > 'Automations' in the navigation bar. + +

+ +

+ +2. **Create An Automation**: Click on 'Create' and select 'Column Documentation Propagation'. + +

+ +

+ +3. **Configure Automation**: Fill in the required fields, such as the name, description, and category. Finally, click 'Save and Run' to start the automation + +

+ +

+ +## Propagating for Existing Assets (DataHub Cloud Only) + +In DataHub Cloud, you can back-fill historical data for existing assets to ensure that all existing column descriptions are propagated to downstreams +when you start the automation. Note that it may take some time to complete the initial back-filling process, depending on the number of assets and the complexity of your lineage. + +To do this, navigate to the Automation you created in Step 3 above, click the 3-dot "more" menu: + +

+ +

+ +and then click "Initialize". + +

+ +

+ +This one-time step will kick off the back-filling process for existing descriptions. If you only want to begin propagating +descriptions going forward, you can skip this step. + +## Rolling Back Propagated Descriptions (DataHub Cloud Only) + +In DataHub Cloud, you can rollback all descriptions that have been propagated historically. + +This feature allows you to "clean up" or "undo" any accidental propagation that may have occurred automatically, in the case +that you no longer want propagated descriptions to be visible. + +To do this, navigate to the Automation you created in Step 3 above, click the 3-dot "More" menu + +

+ +

+ +and then click "Rollback". + +

+ +

+
+This one-time step will remove all propagated descriptions. To simply stop propagating new descriptions, you can disable the automation.
+
+## Viewing Propagated Descriptions
+
+Once the automation is enabled, you'll be able to recognize propagated descriptions as those with the thunderbolt icon next to them:
+
+The tooltip will provide additional information, including where the description originated and any intermediate hops that were
+used to propagate the description.
+
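To make the tooltip contents concrete, the snippet below is a minimal editorial sketch (not part of this change) of how the UI derives that information from a field's attribution `sourceDetail` entries; the `propagated`, `origin`, and `via` keys mirror the `usePropagationDetails` helper added in this PR, while the URN value is purely illustrative.

```typescript
// Sketch: derive propagation tooltip data from attribution sourceDetail entries.
// Keys mirror usePropagationDetails in this PR; the URN below is an illustrative example.
type StringMapEntry = { key: string; value?: string | null };

function readPropagationDetails(sourceDetail: StringMapEntry[]) {
    const get = (key: string) => sourceDetail.find((entry) => entry.key === key)?.value || '';
    return {
        isPropagated: get('propagated') === 'true', // written by the automation
        originUrn: get('origin'), // column the description was originally authored on
        viaUrn: get('via'), // intermediate hop, when propagation crossed several assets
    };
}

const details = readPropagationDetails([
    { key: 'propagated', value: 'true' },
    { key: 'origin', value: 'urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.users,PROD),user_id)' },
]);
// details.isPropagated === true; details.originUrn identifies the upstream column shown in the tooltip.
```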

+ +

\ No newline at end of file diff --git a/docs/automation/snowflake-tag-propagation.md b/docs/automation/snowflake-tag-propagation.md new file mode 100644 index 00000000000000..bdc80376dfb484 --- /dev/null +++ b/docs/automation/snowflake-tag-propagation.md @@ -0,0 +1,88 @@ + +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + +# Snowflake Tag Propagation Automation + + + +## Introduction + +Snowflake Tag Propagation is an automation that allows you to sync DataHub Glossary Terms and Tags on +both columns and tables back to Snowflake. This automation is available in DataHub Cloud (Acryl) only. + +## Capabilities + +- Automatically Add DataHub Glossary Terms to Snowflake Tables and Columns +- Automatically Add DataHub Tags to Snowflake Tables and Columns +- Automatically Remove DataHub Glossary Terms and Tags from Snowflake Tables and Columns when they are removed in DataHub + +## Enabling Snowflake Tag Sync + +1. **Navigate to Automations**: Click on 'Govern' > 'Automations' in the navigation bar. + +

+ +

+ +2. **Create An Automation**: Click on 'Create' and select 'Snowflake Tag Propagation'. + +

+ +

+ +3. **Configure Automation**: Fill in the required fields to connect to Snowflake, along with the name, description, and category. +Note that you can limit propagation based on specific Tags and Glossary Terms. If none are selected, then ALL Tags or Glossary Terms will be automatically +propagated to Snowflake tables and columns. Finally, click 'Save and Run' to start the automation + +

+ +

+ +## Propagating for Existing Assets + +You can back-fill historical data for existing assets to ensure that all existing column and table Tags and Glossary Terms are propagated to Snowflake. +Note that it may take some time to complete the initial back-filling process, depending on the number of Snowflake assets you have. + +To do so, navigate to the Automation you created in Step 3 above, click the 3-dot "More" menu + +

+ +

+ +and then click "Initialize". + +

+ +

+
+This one-time step will kick off the back-filling process for existing tags and glossary terms. If you only want to begin propagating
+tags and terms going forward, you can skip this step.
+
+## Rolling Back Propagated Tags
+
+You can rollback all tags and glossary terms that have been propagated historically.
+
+This feature allows you to "clean up" or "undo" any accidental propagation that may have occurred automatically, in the case
+that you no longer want propagated tags and terms to be visible.
+
+To do this, navigate to the Automation you created in Step 3 above, click the 3-dot "More" menu
+

+ +

+ +and then click "Rollback". + +

+ +

+ +This one-time step will remove all propagated tags and glossary terms from Snowflake. To simply stop propagating new tags, you can disable the automation. + +## Viewing Propagated Tags + +You can view propagated Tags (and corresponding DataHub URNs) inside the Snowflake UI to confirm the automation is working as expected. + +

+ +

diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 08ababcb5cfce9..2443375099b7b2 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -20,6 +20,16 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ### Breaking Changes +### Potential Downtime + +### Deprecations + +### Other Notable Changes + +## 0.14.0 + +### Breaking Changes + - Protobuf CLI will no longer create binary encoded protoc custom properties. Flag added `-protocProp` in case this behavior is required. - #10814 Data flow info and data job info aspect will produce an additional field that will require a corresponding upgrade of server. Otherwise server can reject the aspects. diff --git a/docs/managed-datahub/configuring-identity-provisioning-with-okta.md b/docs/managed-datahub/configuring-identity-provisioning-with-okta.md new file mode 100644 index 00000000000000..a7939b514166da --- /dev/null +++ b/docs/managed-datahub/configuring-identity-provisioning-with-okta.md @@ -0,0 +1,119 @@ +--- +title: "SCIM Integration: Okta and DataHub" +hide_title: true +--- +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + +## SCIM Integration: Okta and DataHub + + +## Overview +This document covers the steps required to enable SCIM provisioning from Okta to DataHub. + +This document assumes you are using OIDC for SSO with DataHub. +Since Okta doesn't currently support SCIM with OIDC, you would need to create an additional SWA-app-integration to enable SCIM provisioning. + +On completing the steps in this guide, Okta will start automatically pushing changes to users/groups of this SWA-app-integration to DataHub, thereby simplifying provisioning of users/groups in DataHub. + +### Why SCIM provisioning? +Let us look at an example of the flows enabled through SCIM provisioning. + +Consider the following configuration in Okta +- A group `governance-team` +- And it has two members `john` and `sid` +- And the group has role `Reader` + +Through SCIM provisioning, the following are enabled: +* If the `governance-team` group is assigned to the DataHub app in Okta with the role `Reader`, Okta will create the users `john` and `sid` in DataHub with the `Reader` role. +* If you remove `john` from group `governance-team` then `john` would automatically get deactivated in DataHub. +* If you remove `sid` from the DataHub app in Okta, then `sid` would automatically get deactivated in DataHub. + +Generally, any user assignment/unassignment to the app in Okta - directly or through groups - are automatically reflected in the DataHub application. + +This guide also covers other variations such as how to assign a role to a user directly, and how group-information can be pushed to DataHub. + +> Only Admin, Editor and Reader roles are supported in DataHub. These roles are preconfigured/created on DataHub. + +## Configuring SCIM provisioning + +### 1. Create an SWA app integration +a). Create a new [SWA app integration](https://help.okta.com/en-us/content/topics/apps/apps_app_integration_wizard_swa.htm), called say, `DataHub-SCIM-SWA`. + +Note: this app-integration will only be used for SCIM provisioning. You would continue to use the existing OIDC-app-integration for SSO. + +b). In the `General` tab of the `DataHub-SCIM-SWA` application, check the `Enable SCIM provisioning` option + +

+ +

+ +You may also want to configure the other selections as shown in the above image, so that this application isn't visible to your users. + +### 2. Configure SCIM + +a). Generate a personal access token from [DataHub](../../docs/authentication/personal-access-tokens.md#creating-personal-access-tokens). + +b). In the `Provisioning` tab, configure the DataHub-SCIM endpoint as shown in the below image: + +

+ +

+ +**Note**: Set the value of the `Bearer` field to the personal access token obtained in step (a) above. + +c). Configure the `To App` section as shown below: + +

+ +

+ +**Note**: We are not pushing passwords to DataHub over SCIM, since we are assuming SSO with OIDC as mentioned earlier. + +### 3. Add a custom attribute to represent roles +a). Navigate to `Directory` -> `Profile Editor`, and select the user-profile of this new application. + +

+ +

+ +b). Click `Add Attribute` and define a new attribute that will be used to specify the role of a DataHub user. + +
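For orientation, the attribute defined here maps onto the standard SCIM 2.0 `roles` attribute that Okta includes in the user resources it pushes to DataHub. The fragment below is an illustrative sketch only (the user name and role value are example data); the `External name` expression configured below selects the primary entry of this list.

```typescript
// Illustrative fragment of a SCIM 2.0 user resource (RFC 7643 core schema); values are examples.
// The Okta mapping roles.^[primary==true].value selects 'Reader' from the roles list.
const scimUserFragment = {
    schemas: ['urn:ietf:params:scim:schemas:core:2.0:User'],
    userName: 'john@example.com',
    roles: [
        { value: 'Reader', primary: true }, // pushed to DataHub as the user's role
    ],
};
```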

+ +

+ +* Set value of `External name` to `roles.^[primary==true].value` +* Set value of `External namespace` to `urn:ietf:params:scim:schemas:core:2.0:User` +* Define an enumerated list of values as shown in the above image +* Mark this attribute as required +* Select `Attribute type` as `Personal` + +c). Add a similar attribute for groups i.e. repeat step (b) above, but select `Attribute Type` as `Group`. (Specify the variable name as, say, `dataHubGroupRoles`.) + +### 4. Assign users & groups to the app +Assign users and groups to the app from the `Assignments` tab: + +

+ +

+ +While assigning a user/group, choose an appropriate value for the dataHubRoles/dataHubGroupRoles attribute. +Note that when a role is selected for a group, the corresponding role is pushed for all users of that group in DataHub. + +### The provisioning setup is now complete +Once the above steps are completed, user assignments/unassignments to the DataHub-SCIM-SWA app in Okta will get reflected in DataHub automatically. + +> #### A note on user deletion +>Note that when users are unassigned or deactivated in Okta, the corresponding users in DataHub are also deactivated (marked "suspended"). +But when a user is *deleted* in Okta, the corresponding user in DataHub does *not* get deleted. +Refer the Okta documentation on [Delete (Deprovision)](https://developer.okta.com/docs/concepts/scim/#delete-deprovision) for more details. + +### 5. (Optional): Configure push groups +When groups are assigned to the app, Okta pushes the group-members as users to DataHub, but the group itself isn't pushed. +To push group information to DataHub, configure the `Push Groups` tab accordingly as shown below: + +

+ +

+ +Refer to the Okta [Group Push](https://help.okta.com/en-us/content/topics/users-groups-profiles/app-assignments-group-push.htm) documentation for more details. \ No newline at end of file diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/models/graph/Edge.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/models/graph/Edge.java index 3de09e599d99ed..8777be57e1bd8f 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/models/graph/Edge.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/models/graph/Edge.java @@ -13,6 +13,7 @@ import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; +import javax.annotation.Nonnull; import lombok.AllArgsConstructor; import lombok.Data; import lombok.EqualsAndHashCode; @@ -59,7 +60,7 @@ public Edge( null); } - public String toDocId() { + public String toDocId(@Nonnull String idHashAlgo) { StringBuilder rawDocId = new StringBuilder(); rawDocId .append(getSource().toString()) @@ -72,9 +73,8 @@ public String toDocId() { } try { - String hashAlgo = System.getenv("ELASTIC_ID_HASH_ALGO"); byte[] bytesOfRawDocID = rawDocId.toString().getBytes(StandardCharsets.UTF_8); - MessageDigest md = MessageDigest.getInstance(hashAlgo); + MessageDigest md = MessageDigest.getInstance(idHashAlgo); byte[] thedigest = md.digest(bytesOfRawDocID); return Base64.getEncoder().encodeToString(thedigest); } catch (NoSuchAlgorithmException e) { diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/custom_operator_dag.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/custom_operator_dag.py new file mode 100644 index 00000000000000..b31226b7b4ceeb --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/custom_operator_dag.py @@ -0,0 +1,74 @@ +import logging +from datetime import datetime, timedelta +from typing import Any, List, Tuple + +from airflow import DAG +from airflow.models.baseoperator import BaseOperator + +from datahub_airflow_plugin.entities import Dataset + +logger = logging.getLogger(__name__) + + +class CustomOperator(BaseOperator): + def __init__(self, name, **kwargs): + super().__init__(**kwargs) + self.name = name + + def execute(self, context): + """ + Other code.... + """ + logger.info("executing other code here") + + input_tables = ["mydb.schema.tableA", "mydb.schema.tableB"] + output_tables = ["mydb.schema.tableD"] + + inlets, outlets = self._get_sf_lineage(input_tables, output_tables) + + context["ti"].task.inlets = inlets + context["ti"].task.outlets = outlets + + @staticmethod + def _get_sf_lineage( + input_tables: List[str], output_tables: List[str] + ) -> Tuple[List[Any], List[Any]]: + """ + Get lineage tables from Snowflake. 
+ """ + inlets: List[Dataset] = [] + outlets: List[Dataset] = [] + + for table in input_tables: + inlets.append(Dataset(platform="snowflake", name=table)) + + for table in output_tables: + outlets.append(Dataset(platform="snowflake", name=table)) + + return inlets, outlets + + +default_args = { + "owner": "airflow", + "depends_on_past": False, + "start_date": datetime(2023, 1, 1), + "email": ["jdoe@example.com"], + "email_on_failure": False, + "execution_timeout": timedelta(minutes=5), +} + + +with DAG( + "custom_operator_dag", + default_args=default_args, + description="An example dag with custom operator", + schedule_interval=None, + tags=["example_tag"], + catchup=False, + default_view="tree", +) as dag: + custom_task = CustomOperator( + task_id="custom_task_id", + name="custom_name", + dag=dag, + ) diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_custom_operator_dag.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_custom_operator_dag.json new file mode 100644 index 00000000000000..b81466930ed41a --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_custom_operator_dag.json @@ -0,0 +1,365 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,custom_operator_dag,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "'An example dag with custom operator'", + "doc_md": "None", + "fileloc": "", + "is_paused_upon_creation": "None", + "start_date": "None", + "tags": "['example_tag']", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=custom_operator_dag", + "name": "custom_operator_dag", + "description": "An example dag with custom operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,custom_operator_dag,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,custom_operator_dag,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [ + { + "tag": "urn:li:tag:example_tag" + } + ] + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,custom_operator_dag,prod)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:example_tag", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,custom_operator_dag,prod)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "custom_operator_dag" + } + ] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + 
"changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "['jdoe@example.com']", + "label": "'custom_task_id'", + "execution_timeout": "datetime.timedelta(seconds=300)", + "sla": "None", + "task_id": "'custom_task_id'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.18.0/integration/airflow\", \"_schemaURL\": \"https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/BaseFacet\", \"unknownItems\": [{\"name\": \"CustomOperator\", \"properties\": {\"depends_on_past\": false, \"downstream_task_ids\": \"[]\", \"execution_timeout\": \"<>\", \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"is_setup\": false, \"is_teardown\": false, \"mapped\": false, \"operator_class\": \"custom_operator.CustomOperator\", \"owner\": \"airflow\", \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_exponential_backoff\": false, \"task_id\": \"custom_task_id\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": \"[]\", \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=custom_operator_dag&_flt_3_task_id=custom_task_id", + "name": "custom_task_id", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07a4aaeffa3875a24cccd1fec6fc7c8c", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "", + "start_date": "", + "end_date": "", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "CustomOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=custom_task_id&dag_id=custom_operator_dag&map_index=-1", + "orchestrator": "airflow", + "dag_id": "custom_operator_dag", + "task_id": "custom_task_id" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=custom_task_id&dag_id=custom_operator_dag&map_index=-1", + "name": "custom_operator_dag_custom_task_id_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1722943444074, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": 
"dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07a4aaeffa3875a24cccd1fec6fc7c8c", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07a4aaeffa3875a24cccd1fec6fc7c8c", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1722943444074, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetKey", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:snowflake", + "name": "mydb.schema.tableA", + "origin": "PROD" + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "['jdoe@example.com']", + "label": "'custom_task_id'", + "execution_timeout": "datetime.timedelta(seconds=300)", + "sla": "None", + "task_id": "'custom_task_id'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='PROD', platform_instance=None)]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.18.0/integration/airflow\", \"_schemaURL\": \"https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/BaseFacet\", \"unknownItems\": [{\"name\": \"CustomOperator\", \"properties\": {\"depends_on_past\": false, \"downstream_task_ids\": \"[]\", \"execution_timeout\": \"<>\", \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"is_setup\": false, \"is_teardown\": false, \"mapped\": false, \"operator_class\": \"custom_operator.CustomOperator\", \"owner\": \"airflow\", \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_exponential_backoff\": false, \"task_id\": \"custom_task_id\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": \"[]\", \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=custom_operator_dag&_flt_3_task_id=custom_task_id", + "name": "custom_task_id", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetKey", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:snowflake", + "name": "mydb.schema.tableD", + "origin": "PROD" + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": 
"globalTags", + "aspect": { + "json": { + "tags": [ + { + "tag": "urn:li:tag:example_tag" + } + ] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetKey", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:snowflake", + "name": "mydb.schema.tableB", + "origin": "PROD" + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07a4aaeffa3875a24cccd1fec6fc7c8c", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1722943444263, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_custom_operator_dag_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_custom_operator_dag_no_dag_listener.json new file mode 100644 index 00000000000000..019122600aedbc --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_custom_operator_dag_no_dag_listener.json @@ -0,0 +1,404 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,custom_operator_dag,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "'An example dag with custom operator'", + "doc_md": "None", + "fileloc": "", + "is_paused_upon_creation": "None", + "start_date": "None", + "tags": "['example_tag']", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=custom_operator_dag", + "name": "custom_operator_dag", + "description": "An example dag with custom operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,custom_operator_dag,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,custom_operator_dag,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [ + { + "tag": "urn:li:tag:example_tag" + } + ] + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,custom_operator_dag,prod)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:example_tag", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,custom_operator_dag,prod)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "custom_operator_dag" + } + ] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "['jdoe@example.com']", + "label": "'custom_task_id'", + "execution_timeout": "datetime.timedelta(seconds=300)", + "sla": "None", + "task_id": "'custom_task_id'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.18.0/integration/airflow\", \"_schemaURL\": \"https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/BaseFacet\", \"unknownItems\": [{\"name\": \"CustomOperator\", \"properties\": {\"depends_on_past\": false, \"downstream_task_ids\": \"[]\", \"execution_timeout\": \"<>\", \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"mapped\": false, \"operator_class\": \"custom_operator.CustomOperator\", \"owner\": \"airflow\", \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_exponential_backoff\": false, \"task_id\": \"custom_task_id\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": \"[]\", \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=custom_operator_dag&_flt_3_task_id=custom_task_id", + "name": "custom_task_id", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [ + { + "tag": "urn:li:tag:example_tag" + } + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07a4aaeffa3875a24cccd1fec6fc7c8c", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + 
"customProperties": { + "run_id": "manual_run_test", + "duration": "", + "start_date": "", + "end_date": "", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "CustomOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=custom_task_id&dag_id=custom_operator_dag&map_index=-1", + "orchestrator": "airflow", + "dag_id": "custom_operator_dag", + "task_id": "custom_task_id" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=custom_task_id&dag_id=custom_operator_dag&map_index=-1", + "name": "custom_operator_dag_custom_task_id_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1723716446564, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07a4aaeffa3875a24cccd1fec6fc7c8c", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07a4aaeffa3875a24cccd1fec6fc7c8c", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1723716446564, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "['jdoe@example.com']", + "label": "'custom_task_id'", + "execution_timeout": "datetime.timedelta(seconds=300)", + "sla": "None", + "task_id": "'custom_task_id'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='PROD', platform_instance=None)]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.18.0/integration/airflow\", \"_schemaURL\": \"https://openlineage.io/spec/2-0-2/OpenLineage.json#/$defs/BaseFacet\", \"unknownItems\": [{\"name\": \"CustomOperator\", \"properties\": {\"depends_on_past\": false, \"downstream_task_ids\": \"[]\", \"execution_timeout\": \"<>\", \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"mapped\": false, \"operator_class\": \"custom_operator.CustomOperator\", \"owner\": \"airflow\", \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_exponential_backoff\": false, \"task_id\": \"custom_task_id\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": \"[]\", \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=custom_operator_dag&_flt_3_task_id=custom_task_id", + "name": "custom_task_id", + "type": 
{ + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetKey", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:snowflake", + "name": "mydb.schema.tableA", + "origin": "PROD" + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetKey", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:snowflake", + "name": "mydb.schema.tableB", + "origin": "PROD" + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetKey", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:snowflake", + "name": "mydb.schema.tableD", + "origin": "PROD" + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,custom_operator_dag,prod),custom_task_id)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [ + { + "tag": "urn:li:tag:example_tag" + } + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07a4aaeffa3875a24cccd1fec6fc7c8c", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1723716446701, + "partitionSpec": { + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py index 9ea822edeef81f..2b8d4c47f62246 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py @@ -110,7 +110,9 @@ def _wait_for_dag_finish( @contextlib.contextmanager def _run_airflow( - tmp_path: pathlib.Path, dags_folder: pathlib.Path, is_v1: bool + tmp_path: pathlib.Path, + dags_folder: pathlib.Path, + is_v1: bool, ) -> Iterator[AirflowInstance]: airflow_home = tmp_path / "airflow_home" print(f"Using airflow home: {airflow_home}") @@ -272,6 +274,7 @@ class DagTestCase: DagTestCase("basic_iolets"), 
DagTestCase("snowflake_operator", success=False, v2_only=True), DagTestCase("sqlite_operator", v2_only=True), + DagTestCase("custom_operator_dag", v2_only=True), ] diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md index ac6fefc3095741..03a224bcf7da47 100644 --- a/metadata-ingestion/docs/transformer/dataset_transformer.md +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -1207,20 +1207,51 @@ The config, which we’d append to our ingestion recipe YAML, would look like th | Field | Required | Type | Default | Description | |---------------------------------------|----------|----------------------|-------------|---------------------------------------------------------------------------------------------| | `dataset_to_data_product_urns_pattern`| ✅ | map[regx, urn] | | Dataset Entity urn with regular expression and dataproduct urn apply to matching entity urn.| +| `is_container` | | bool | `false` | Whether to also consider a container or not. If true, the data product will be attached to both the dataset and its container. | -Let’s suppose we’d like to append a series of dataproducts with specific datasets as its assets. To do so, we can use the `pattern_add_dataset_dataproduct` module that’s included in the ingestion framework. This will match the regex pattern to `urn` of the dataset and create the data product entity with given urn and matched datasets as its assets. + +Let’s suppose we’d like to append a series of data products with specific datasets or their containers as assets. To do so, we can use the pattern_add_dataset_dataproduct module that’s included in the ingestion framework. This module matches a regex pattern to the urn of the dataset and creates a data product entity with the given urn, associating the matched datasets as its assets. + +If the is_container field is set to true, the module will not only attach the data product to the matching datasets but will also find and attach the containers associated with those datasets. This means that both the datasets and their containers will be associated with the specified data product. The config, which we’d append to our ingestion recipe YAML, would look like this: +- Add Product to dataset + ```yaml + transformers: + - type: "pattern_add_dataset_dataproduct" + config: + dataset_to_data_product_urns_pattern: + rules: + ".*example1.*": "urn:li:dataProduct:first" + ".*example2.*": "urn:li:dataProduct:second" + ``` +- Add Product to dataset container ```yaml transformers: - type: "pattern_add_dataset_dataproduct" config: + is_container: true dataset_to_data_product_urns_pattern: rules: ".*example1.*": "urn:li:dataProduct:first" ".*example2.*": "urn:li:dataProduct:second" ``` +⚠️ Warning: +When working with two datasets in the same container but with different data products, only one data product can be attached to the container. + +For example: +```yaml +transformers: + - type: "pattern_add_dataset_dataproduct" + config: + is_container: true + dataset_to_data_product_urns_pattern: + rules: + ".*example1.*": "urn:li:dataProduct:first" + ".*example2.*": "urn:li:dataProduct:second" +``` +If example1 and example2 are in the same container, only urn:li:dataProduct:first will be added. However, if they are in separate containers, the system works as expected and assigns the correct data product URNs. 
## Add Dataset dataProduct ### Config Details diff --git a/metadata-ingestion/examples/mce_files/bootstrap_mce.json b/metadata-ingestion/examples/mce_files/bootstrap_mce.json index fbe6b9953cb4fa..bc218e5e8c2d53 100644 --- a/metadata-ingestion/examples/mce_files/bootstrap_mce.json +++ b/metadata-ingestion/examples/mce_files/bootstrap_mce.json @@ -3394,7 +3394,7 @@ "changeType":"UPSERT", "aspectName":"datasetProfile", "aspect":{ - "value":"{\"timestampMillis\": 1679515693000, \"rowCount\": 4500, \"columnCount\": 2, \"sizeInBytes\": 842000200000, \"fieldProfiles\": [{\"fieldPath\": \"field_foo\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"true\", \"false\"]}, {\"fieldPath\": \"field_bar\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"false\"]}]}", + "value":"{\"timestampMillis\": 1723488954865, \"rowCount\": 4500, \"columnCount\": 2, \"sizeInBytes\": 842000200000, \"fieldProfiles\": [{\"fieldPath\": \"field_foo\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"true\", \"false\"]}, {\"fieldPath\": \"field_bar\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"false\"]}]}", "contentType":"application/json" }, "systemMetadata":null @@ -3406,7 +3406,7 @@ "changeType":"UPSERT", "aspectName":"datasetProfile", "aspect":{ - "value":"{\"timestampMillis\": 1684786093000, \"rowCount\": 3500, \"columnCount\": 2, \"fieldProfiles\": [{\"fieldPath\": \"field_foo\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00057, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"true\", \"false\"]}, {\"fieldPath\": \"field_bar\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00057, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"true\"]}]}", + "value":"{\"timestampMillis\": 1723488954865, \"rowCount\": 3500, \"columnCount\": 2, \"fieldProfiles\": [{\"fieldPath\": \"field_foo\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00057, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"true\", \"false\"]}, {\"fieldPath\": \"field_bar\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00057, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"true\"]}]}", "contentType":"application/json" }, "systemMetadata":null diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index abb716d2434ac6..03b44401dd2448 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -173,6 +173,7 @@ *sqlglot_lib, "GitPython>2", "python-liquid", + "deepmerge>=1.1.1" } bigquery_common = { diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index e2b5f8378732c5..8d67551b9e1f2f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -276,6 +276,12 @@ class DBTCommonConfig( DBTEntitiesEnabled(), description="Controls for enabling / disabling metadata emission for different dbt entities (models, test definitions, test results, etc.)", ) + prefer_sql_parser_lineage: bool = Field( + default=False, + description="Normally we use dbt's metadata to generate table lineage. When enabled, we prefer results from the SQL parser when generating lineage instead. 
" + "This can be useful when dbt models reference tables directly, instead of using the ref() macro. " + "This requires that `skip_sources_in_lineage` is enabled.", + ) skip_sources_in_lineage: bool = Field( default=False, description="[Experimental] When enabled, dbt sources will not be included in the lineage graph. " @@ -366,13 +372,6 @@ class DBTCommonConfig( description="When enabled, includes the compiled code in the emitted metadata.", ) - prefer_sql_parser_lineage: bool = Field( - default=False, - description="Normally we use dbt's metadata to generate table lineage. When enabled, we prefer results from the SQL parser when generating lineage instead. " - "This can be useful when dbt models reference tables directly, instead of using the ref() macro. " - "This requires that `skip_sources_in_lineage` is enabled.", - ) - @validator("target_platform") def validate_target_platform_value(cls, target_platform: str) -> str: if target_platform.lower() == DBT_PLATFORM: @@ -438,15 +437,27 @@ def validate_include_column_lineage( return include_column_lineage - @validator("skip_sources_in_lineage") + @validator("skip_sources_in_lineage", always=True) def validate_skip_sources_in_lineage( cls, skip_sources_in_lineage: bool, values: Dict ) -> bool: - entites_enabled: Optional[DBTEntitiesEnabled] = values.get("entities_enabled") + entities_enabled: Optional[DBTEntitiesEnabled] = values.get("entities_enabled") + prefer_sql_parser_lineage: Optional[bool] = values.get( + "prefer_sql_parser_lineage" + ) + + if prefer_sql_parser_lineage and not skip_sources_in_lineage: + raise ValueError( + "`prefer_sql_parser_lineage` requires that `skip_sources_in_lineage` is enabled." + ) + if ( skip_sources_in_lineage - and entites_enabled - and entites_enabled.sources == EmitDirective.YES + and entities_enabled + and entities_enabled.sources == EmitDirective.YES + # When `prefer_sql_parser_lineage` is enabled, it's ok to have `skip_sources_in_lineage` enabled + # without also disabling sources. + and not prefer_sql_parser_lineage ): raise ValueError( "When `skip_sources_in_lineage` is enabled, `entities_enabled.sources` must be set to NO." @@ -454,16 +465,6 @@ def validate_skip_sources_in_lineage( return skip_sources_in_lineage - @validator("prefer_sql_parser_lineage") - def validate_prefer_sql_parser_lineage( - cls, prefer_sql_parser_lineage: bool, values: Dict - ) -> bool: - if prefer_sql_parser_lineage and not values.get("skip_sources_in_lineage"): - raise ValueError( - "`prefer_sql_parser_lineage` requires that `skip_sources_in_lineage` is enabled." - ) - return prefer_sql_parser_lineage - @dataclass class DBTColumn: @@ -769,23 +770,30 @@ def make_mapping_upstream_lineage( downstream_urn: str, node: DBTNode, convert_column_urns_to_lowercase: bool, + skip_sources_in_lineage: bool, ) -> UpstreamLineageClass: cll = [] - for column in node.columns or []: - field_name = column.name - if convert_column_urns_to_lowercase: - field_name = field_name.lower() - - cll.append( - FineGrainedLineage( - upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, - upstreams=[mce_builder.make_schema_field_urn(upstream_urn, field_name)], - downstreamType=FineGrainedLineageDownstreamType.FIELD, - downstreams=[ - mce_builder.make_schema_field_urn(downstream_urn, field_name) - ], + if not (node.node_type == "source" and skip_sources_in_lineage): + # If `skip_sources_in_lineage` is enabled, we want to generate table lineage (for siblings) + # but not CLL. 
That's because CLL will make it look like the warehouse node has downstream + # column lineage, but it's really just empty. + for column in node.columns or []: + field_name = column.name + if convert_column_urns_to_lowercase: + field_name = field_name.lower() + + cll.append( + FineGrainedLineage( + upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, + upstreams=[ + mce_builder.make_schema_field_urn(upstream_urn, field_name) + ], + downstreamType=FineGrainedLineageDownstreamType.FIELD, + downstreams=[ + mce_builder.make_schema_field_urn(downstream_urn, field_name) + ], + ) ) - ) return UpstreamLineageClass( upstreams=[ @@ -1476,6 +1484,7 @@ def create_target_platform_mces( downstream_urn=node_datahub_urn, node=node, convert_column_urns_to_lowercase=self.config.convert_column_urns_to_lowercase, + skip_sources_in_lineage=self.config.skip_sources_in_lineage, ) if self.config.incremental_lineage: # We only generate incremental lineage for non-dbt nodes. @@ -1821,6 +1830,7 @@ def _create_lineage_aspect_for_dbt_node( downstream_urn=node_urn, node=node, convert_column_urns_to_lowercase=self.config.convert_column_urns_to_lowercase, + skip_sources_in_lineage=self.config.skip_sources_in_lineage, ) else: upstream_urns = get_upstreams( diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_constant.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_constant.py index 21160cc97d4a62..5f47d361abb37c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_constant.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_constant.py @@ -1 +1,10 @@ IMPORTED_PROJECTS = "imported_projects" +SQL_TABLE_NAME = "sql_table_name" +DATAHUB_TRANSFORMED_SQL_TABLE_NAME = "datahub_transformed_sql_table_name" +DERIVED_TABLE = "derived_table" +SQL = "sql" +DATAHUB_TRANSFORMED_SQL = "datahub_transformed_sql" +prod = "prod" +dev = "dev" +NAME = "name" +DERIVED_DOT_SQL = "derived.sql" diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py index fd670c23ad9cb0..52ebcdde06a279 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py @@ -1,17 +1,18 @@ import logging import pathlib from dataclasses import replace -from typing import Any, Dict, Optional +from typing import Dict, Optional from datahub.ingestion.source.looker.lkml_patched import load_lkml from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition from datahub.ingestion.source.looker.looker_dataclasses import LookerViewFile from datahub.ingestion.source.looker.looker_template_language import ( - resolve_liquid_variable_in_view_dict, + process_lookml_template_language, ) from datahub.ingestion.source.looker.lookml_config import ( _EXPLORE_FILE_EXTENSION, _VIEW_FILE_EXTENSION, + LookMLSourceConfig, LookMLSourceReport, ) @@ -29,13 +30,13 @@ def __init__( root_project_name: Optional[str], base_projects_folder: Dict[str, pathlib.Path], reporter: LookMLSourceReport, - liquid_variable: Dict[Any, Any], + source_config: LookMLSourceConfig, ) -> None: self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {} self._root_project_name = root_project_name self._base_projects_folder = base_projects_folder self.reporter = reporter - self.liquid_variable = liquid_variable + self.source_config = source_config def _load_viewfile( self, project_name: str, path: 
str, reporter: LookMLSourceReport @@ -73,9 +74,9 @@ def _load_viewfile( parsed = load_lkml(path) - resolve_liquid_variable_in_view_dict( - raw_view=parsed, - liquid_variable=self.liquid_variable, + process_lookml_template_language( + view_lkml_file_dict=parsed, + source_config=self.source_config, ) looker_viewfile = LookerViewFile.from_looker_dict( diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py index 99f83b5e922bad..04f9ec081ee680 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py @@ -1,15 +1,31 @@ import logging import re -from typing import Any, ClassVar, Dict, Set +from abc import ABC, abstractmethod +from typing import Any, ClassVar, Dict, List, Optional, Set +from deepmerge import always_merger from liquid import Undefined from liquid.exceptions import LiquidSyntaxError +from datahub.ingestion.source.looker.looker_constant import ( + DATAHUB_TRANSFORMED_SQL, + DATAHUB_TRANSFORMED_SQL_TABLE_NAME, + DERIVED_DOT_SQL, + DERIVED_TABLE, + NAME, + SQL, + SQL_TABLE_NAME, + dev, + prod, +) from datahub.ingestion.source.looker.looker_liquid_tag import ( CustomTagException, create_template, ) -from datahub.ingestion.source.looker.lookml_config import DERIVED_VIEW_PATTERN +from datahub.ingestion.source.looker.lookml_config import ( + DERIVED_VIEW_PATTERN, + LookMLSourceConfig, +) logger = logging.getLogger(__name__) @@ -92,52 +108,311 @@ def resolve_liquid_variable(text: str, liquid_variable: Dict[Any, Any]) -> str: return text -def _drop_derived_view_pattern(value: str) -> str: - # Drop ${ and } - return re.sub(DERIVED_VIEW_PATTERN, r"\1", value) +class LookMLViewTransformer(ABC): + """ + There are many transformations that we need to perform on the LookML view to make it suitable for metadata ingestion. + + These transformations include: + + 1. Evaluating Looker templates, such as `-- if prod --` comments. Example `LookMlIfCommentTransformer`. + + 2. Resolving Liquid templates. Example `LiquidVariableTransformer`. + + 3. Removing ${} from derived view patterns. Example `DropDerivedViewPatternTransformer`. + (e.g., changing ${view_name.SQL_TABLE_NAME} to 4. view_name.SQL_TABLE_NAME). + + 4. Completing incomplete SQL fragments. Example `IncompleteSqlTransformer`. + + Each transformer works on specific attributes of the LookML view. For example, the #4 transformation is only + applicable to the view.derived.sql attribute, while the other transformations apply to both the + view.sql_table_name and view.derived.sql attributes. + + This class contains the logic to ensure that the transformer is applied to specific attributes and returns a + dictionary containing the transformed data. 
+ + For example: + In case of #1 and #2, it returns: + + **transformed derived_table:** + ``` + { + "derived_table": { + "datahub_transformed_sql": "" + } + } + ``` + + **Whereas original was:** + ``` + { + "derived_table": { + "sql": "" + } + } + ``` + + In case #3, it returns: + **transformed sql_table_name:** + ``` + { + "datahub_transformed_sql_table_name": "employee_income_source.SQL_TABLE_NAME" + } + ``` + + **Whereas original was:** + ``` + { + "sql_table_name": "${employee_income_source.SQL_TABLE_NAME}" + } + ``` + + In case #4, it returns: + **transformed derived_table:** + ``` + { + "derived_table": { + "datahub_transformed_sql": "SELECT column_a, column_b FROM foo" + } + } + ``` + + **Whereas original was:** + ``` + { + "derived_table": { + "sql": "column_a, column_b" + } + } + ``` + + Each transformation generates a section of the transformed dictionary with a new attribute named + `datahub_transformed_`. + """ + + source_config: LookMLSourceConfig + + def __init__(self, source_config: LookMLSourceConfig): + self.source_config = source_config + + def transform(self, view: dict) -> dict: + value_to_transform: Optional[str] = None + + # is_attribute_supported check is required because not all transformer works on all attributes in current + # case mostly all transformer works on sql_table_name and derived.sql attributes, + # however IncompleteSqlTransformer only transform the derived.sql attribute + if SQL_TABLE_NAME in view and self.is_attribute_supported(SQL_TABLE_NAME): + # Give precedence to already processed transformed view.sql_table_name to apply more transformation + value_to_transform = view.get( + DATAHUB_TRANSFORMED_SQL_TABLE_NAME, view[SQL_TABLE_NAME] + ) + if ( + DERIVED_TABLE in view + and SQL in view[DERIVED_TABLE] + and self.is_attribute_supported(DERIVED_DOT_SQL) + ): + # Give precedence to already processed transformed view.derived.sql to apply more transformation + value_to_transform = view[DERIVED_TABLE].get( + DATAHUB_TRANSFORMED_SQL, view[DERIVED_TABLE][SQL] + ) -def _complete_incomplete_sql(raw_view: dict, sql: str) -> str: + if value_to_transform is None: + return {} - # Looker supports sql fragments that omit the SELECT and FROM parts of the query - # Add those in if we detect that it is missing - sql_query: str = sql + logger.debug(f"value to transform = {value_to_transform}") - if not re.search(r"SELECT\s", sql_query, flags=re.I): - # add a SELECT clause at the beginning - sql_query = f"SELECT {sql}" + transformed_value: str = self._apply_transformation( + value=value_to_transform, view=view + ) - if not re.search(r"FROM\s", sql_query, flags=re.I): - # add a FROM clause at the end - sql_query = f"{sql_query} FROM {raw_view['name']}" + logger.debug(f"transformed value = {transformed_value}") - return _drop_derived_view_pattern(sql_query) + if SQL_TABLE_NAME in view and value_to_transform: + return {DATAHUB_TRANSFORMED_SQL_TABLE_NAME: transformed_value} + if DERIVED_TABLE in view and SQL in view[DERIVED_TABLE] and value_to_transform: + return {DERIVED_TABLE: {DATAHUB_TRANSFORMED_SQL: transformed_value}} -def resolve_liquid_variable_in_view_dict( - raw_view: dict, liquid_variable: Dict[Any, Any] -) -> None: - if "views" not in raw_view: - return + return {} - for view in raw_view["views"]: - if "sql_table_name" in view: - view["datahub_transformed_sql_table_name"] = resolve_liquid_variable( - text=view["sql_table_name"], - liquid_variable=liquid_variable, - ) # keeping original sql_table_name as is to avoid any visualization issue later + @abstractmethod + 
def _apply_transformation(self, value: str, view: dict) -> str: + pass - view["datahub_transformed_sql_table_name"] = _drop_derived_view_pattern( - value=view["datahub_transformed_sql_table_name"] - ) + def is_attribute_supported(self, attribute: str) -> bool: + return attribute in [DERIVED_DOT_SQL, SQL_TABLE_NAME] + + +class LiquidVariableTransformer(LookMLViewTransformer): + """ + Replace the liquid variables with their values. + """ + + def _apply_transformation(self, value: str, view: dict) -> str: + return resolve_liquid_variable( + text=value, + liquid_variable=self.source_config.liquid_variable, + ) + + +class IncompleteSqlTransformer(LookMLViewTransformer): + """ + lookml view may contain the fragment of sql, however for lineage generation we need a complete sql. + IncompleteSqlTransformer will complete the view's derived.sql. + """ + + def is_attribute_supported(self, attribute: str) -> bool: + return attribute in [DERIVED_DOT_SQL] - if "derived_table" in view and "sql" in view["derived_table"]: - # In sql we don't need to remove the extra spaces as sql parser takes care of extra spaces and \n - # while generating URN from sql - view["derived_table"]["datahub_transformed_sql"] = resolve_liquid_variable( - text=view["derived_table"]["sql"], liquid_variable=liquid_variable - ) # keeping original sql as is, so that on UI sql will be shown same is it is visible on looker portal + def _apply_transformation(self, value: str, view: dict) -> str: + if DERIVED_TABLE not in view or SQL not in view[DERIVED_TABLE]: + # This transformation is only applicable in-case of view contains view.derived.sql + return value - view["derived_table"]["datahub_transformed_sql"] = _complete_incomplete_sql( - raw_view=view, sql=view["derived_table"]["datahub_transformed_sql"] + # Looker supports sql fragments that omit the SELECT and FROM parts of the query + # Add those in if we detect that it is missing + sql_query: str = value + + if not re.search(r"SELECT\s", sql_query, flags=re.I): + # add a SELECT clause at the beginning + sql_query = f"SELECT {sql_query}" + + if not re.search(r"FROM\s", sql_query, flags=re.I): + # add a FROM clause at the end + sql_query = f"{sql_query} FROM {view[NAME]}" + + return sql_query + + +class DropDerivedViewPatternTransformer(LookMLViewTransformer): + """ + drop ${} from datahub_transformed_sql_table_name and view["derived_table"]["datahub_transformed_sql_table_name"] values. + + Example: transform ${employee_income_source.SQL_TABLE_NAME} to employee_income_source.SQL_TABLE_NAME + """ + + def _apply_transformation(self, value: str, view: dict) -> str: + return re.sub( + DERIVED_VIEW_PATTERN, + r"\1", + value, + ) + + +class LookMlIfCommentTransformer(LookMLViewTransformer): + """ + Evaluate the looker -- if -- comments. + """ + + evaluate_to_true_regx: str + remove_if_comment_line_regx: str + + def __init__(self, source_config: LookMLSourceConfig): + super().__init__(source_config=source_config) + + # This regx will keep whatever after -- if looker_environment -- + self.evaluate_to_true_regx = r"-- if {} --".format( + self.source_config.looker_environment + ) + + # It will remove all other lines starts with -- if ... 
-- + self.remove_if_comment_line_regx = r"-- if {} --.*?(?=\n|-- if|$)".format( + dev if self.source_config.looker_environment.lower() == prod else prod + ) + + def _apply_regx(self, value: str) -> str: + result: str = re.sub( + self.remove_if_comment_line_regx, "", value, flags=re.IGNORECASE | re.DOTALL + ) + + # Remove '-- if prod --' but keep the rest of the line + result = re.sub(self.evaluate_to_true_regx, "", result, flags=re.IGNORECASE) + + return result + + def _apply_transformation(self, value: str, view: dict) -> str: + return self._apply_regx(value) + + +class TransformedLookMlView: + """ + TransformedLookMlView is collecting output of LookMLViewTransformer and creating a new transformed LookML view. + TransformedLookMlView creates a copy of the original view dictionary and updates the copy with the transformed output. + The deepmerge library is used because Python's dict.update function doesn't merge nested fields. + + The transformed LookML view will contain the following attributes: + + ``` + { + "derived_table": { + "sql": "" + }, + + dimensions ..... + } + ``` + see documentation of LookMLViewTransformer for output of each transformer. + """ + + transformers: List[LookMLViewTransformer] + view_dict: dict + transformed_dict: dict + + def __init__( + self, + transformers: List[LookMLViewTransformer], + view_dict: dict, + ): + self.transformers = transformers + self.view_dict = view_dict + self.transformed_dict = {} + + def view(self) -> dict: + if self.transformed_dict: + return self.transformed_dict + + self.transformed_dict = {**self.view_dict} + + logger.debug(f"Processing view {self.view_dict[NAME]}") + + for transformer in self.transformers: + logger.debug(f"Applying transformer {transformer.__class__.__name__}") + + self.transformed_dict = always_merger.merge( + self.transformed_dict, transformer.transform(self.transformed_dict) ) + + return self.transformed_dict + + +def process_lookml_template_language( + source_config: LookMLSourceConfig, + view_lkml_file_dict: dict, +) -> None: + if "views" not in view_lkml_file_dict: + return + + transformers: List[LookMLViewTransformer] = [ + LookMlIfCommentTransformer( + source_config=source_config + ), # First evaluate the -- if -- comments. 
Looker does the same + LiquidVariableTransformer( + source_config=source_config + ), # Now resolve liquid variables + DropDerivedViewPatternTransformer( + source_config=source_config + ), # Remove any ${} symbol + IncompleteSqlTransformer( + source_config=source_config + ), # complete any incomplete sql + ] + + transformed_views: List[dict] = [] + + for view in view_lkml_file_dict["views"]: + transformed_views.append( + TransformedLookMlView(transformers=transformers, view_dict=view).view() + ) + + view_lkml_file_dict["views"] = transformed_views diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py index 7805b8b7b7d9a5..69b9f842ac14db 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py @@ -313,9 +313,9 @@ def datahub_transformed_sql_table_name(self) -> str: # remove extra spaces and new lines from sql_table_name if it is not a sql if not self.is_direct_sql_query_case(): - table_name = remove_extra_spaces_and_newlines(table_name) # Some sql_table_name fields contain quotes like: optimizely."group", just remove the quotes table_name = table_name.replace('"', "").replace("`", "").lower() + table_name = remove_extra_spaces_and_newlines(table_name).strip() return table_name diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py index f4fb1316b16a20..0bcee14ec77a1a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py @@ -1,7 +1,7 @@ import logging from dataclasses import dataclass, field as dataclass_field from datetime import timedelta -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union import pydantic from pydantic import root_validator, validator @@ -174,6 +174,13 @@ class LookMLSourceConfig( "view.sql_table_name. Defaults to an empty dictionary.", ) + looker_environment: Literal["prod", "dev"] = Field( + "prod", + description="A looker prod or dev environment. " + "It helps to evaluate looker if comments i.e. -- if prod --. " + "All if comments are evaluated to true for configured looker_environment value", + ) + @validator("connection_to_platform_map", pre=True) def convert_string_to_connection_def(cls, conn_map): # Previous version of config supported strings in connection map. This upconverts strings to ConnectionMap diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index d77e65ac733232..b00291caabbf68 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -669,7 +669,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 self.source_config.project_name, self.base_projects_folder, self.reporter, - self.source_config.liquid_variable, + self.source_config, ) # Some views can be mentioned by multiple 'include' statements and can be included via different connections. 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/salesforce.py b/metadata-ingestion/src/datahub/ingestion/source/salesforce.py index 42128123c61442..7a7f1f30950eb6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/salesforce.py +++ b/metadata-ingestion/src/datahub/ingestion/source/salesforce.py @@ -3,7 +3,7 @@ import time from datetime import datetime from enum import Enum -from typing import Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional import requests from pydantic import Field, validator @@ -124,6 +124,9 @@ class SalesforceConfig(DatasetSourceConfigMixin): default=dict(), description='Regex patterns for tables/schemas to describe domain_key domain key (domain_key can be any string like "sales".) There can be multiple domain keys specified.', ) + api_version: Optional[str] = Field( + description="If specified, overrides default version used by the Salesforce package. Example value: '59.0'" + ) profiling: SalesforceProfilingConfig = SalesforceProfilingConfig() @@ -222,6 +225,12 @@ def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None: self.session = requests.Session() self.platform: str = "salesforce" self.fieldCounts = {} + common_args: Dict[str, Any] = { + "domain": "test" if self.config.is_sandbox else None, + "session": self.session, + } + if self.config.api_version: + common_args["version"] = self.config.api_version try: if self.config.auth is SalesforceAuthType.DIRECT_ACCESS_TOKEN: @@ -236,8 +245,7 @@ def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None: self.sf = Salesforce( instance_url=self.config.instance_url, session_id=self.config.access_token, - session=self.session, - domain="test" if self.config.is_sandbox else None, + **common_args, ) elif self.config.auth is SalesforceAuthType.USERNAME_PASSWORD: logger.debug("Username/Password Provided in Config") @@ -255,8 +263,7 @@ def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None: username=self.config.username, password=self.config.password, security_token=self.config.security_token, - session=self.session, - domain="test" if self.config.is_sandbox else None, + **common_args, ) elif self.config.auth is SalesforceAuthType.JSON_WEB_TOKEN: @@ -275,14 +282,13 @@ def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None: username=self.config.username, consumer_key=self.config.consumer_key, privatekey=self.config.private_key, - session=self.session, - domain="test" if self.config.is_sandbox else None, + **common_args, ) except Exception as e: logger.error(e) raise ConfigurationError("Salesforce login failed") from e - else: + if not self.config.api_version: # List all REST API versions and use latest one versions_url = "https://{instance}/services/data/".format( instance=self.sf.sf_instance, @@ -290,17 +296,22 @@ def __init__(self, config: SalesforceConfig, ctx: PipelineContext) -> None: versions_response = self.sf._call_salesforce("GET", versions_url).json() latest_version = versions_response[-1] version = latest_version["version"] + # we could avoid setting the version like below (after the Salesforce object has been already initiated + # above), since, according to the docs: + # https://developer.salesforce.com/docs/atlas.en-us.api_rest.meta/api_rest/dome_versions.htm + # we don't need to be authenticated to list the versions (so we could perform this call before even + # authenticating) self.sf.sf_version = version - self.base_url = "https://{instance}/services/data/v{sf_version}/".format( - 
instance=self.sf.sf_instance, sf_version=version - ) + self.base_url = "https://{instance}/services/data/v{sf_version}/".format( + instance=self.sf.sf_instance, sf_version=self.sf.sf_version + ) - logger.debug( - "Using Salesforce REST API with {label} version: {version}".format( - label=latest_version["label"], version=latest_version["version"] - ) + logger.debug( + "Using Salesforce REST API version: {version}".format( + version=self.sf.sf_version ) + ) def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: sObjects = self.get_salesforce_objects() diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py index 93f2a0ef2f6a86..f3a9c4a5aa201e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py @@ -502,7 +502,12 @@ def get_tags_from_params(params: List[str] = []) -> GlobalTagsClass: def tableau_field_to_schema_field(field, ingest_tags): - nativeDataType = field.get("dataType", "UNKNOWN") + # The check here makes sure that even if 'dataType' key exists in the 'field' dictionary but has value None, + # it will be set as "UNKNOWN" (nativeDataType field can not be None in the SchemaField). + # Hence, field.get("dataType", "UNKNOWN") is not enough + nativeDataType = field.get("dataType") + if nativeDataType is None: + nativeDataType = "UNKNOWN" TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass) schema_field = SchemaField( diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py index 45e92628430258..c474e423030e05 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/add_dataset_dataproduct.py @@ -11,7 +11,7 @@ from datahub.ingestion.transformer.dataset_transformer import ( DatasetDataproductTransformer, ) -from datahub.metadata.schema_classes import MetadataChangeProposalClass +from datahub.metadata.schema_classes import ContainerClass, MetadataChangeProposalClass from datahub.specific.dataproduct import DataProductPatchBuilder logger = logging.getLogger(__name__) @@ -23,6 +23,8 @@ class AddDatasetDataProductConfig(ConfigModel): _resolve_data_product_fn = pydantic_resolve_key("get_data_product_to_add") + is_container: bool = False + class AddDatasetDataProduct(DatasetDataproductTransformer): """Transformer that adds dataproduct entity for provided dataset as its asset according to a callback function.""" @@ -49,10 +51,11 @@ def handle_end_of_stream( self, ) -> List[Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]]: data_products: Dict[str, DataProductPatchBuilder] = {} - + data_products_container: Dict[str, DataProductPatchBuilder] = {} logger.debug("Generating dataproducts") for entity_urn in self.entity_map.keys(): data_product_urn = self.config.get_data_product_to_add(entity_urn) + is_container = self.config.is_container if data_product_urn: if data_product_urn not in data_products: data_products[data_product_urn] = DataProductPatchBuilder( @@ -63,11 +66,34 @@ def handle_end_of_stream( data_product_urn ].add_asset(entity_urn) + if is_container: + assert self.ctx.graph + container_aspect = self.ctx.graph.get_aspect( + entity_urn, aspect_type=ContainerClass + ) + if not container_aspect: + continue + container_urn = container_aspect.container + if 
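The tableau_common.py comment above hinges on a subtle dict behaviour: dict.get() only falls back to its default when the key is missing, not when the key is present with a None value. A quick runnable illustration of why field.get("dataType", "UNKNOWN") alone was not enough:

field_missing_key = {"name": "amount"}
field_null_value = {"name": "amount", "dataType": None}

# The default only applies when the key is absent entirely.
print(field_missing_key.get("dataType", "UNKNOWN"))  # UNKNOWN
print(field_null_value.get("dataType", "UNKNOWN"))   # None -> would violate SchemaField's non-null nativeDataType

# The pattern adopted in the patch: fall back explicitly when the value is None.
native_data_type = field_null_value.get("dataType")
if native_data_type is None:
    native_data_type = "UNKNOWN"
print(native_data_type)  # UNKNOWN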
data_product_urn not in data_products_container: + container_product = DataProductPatchBuilder( + data_product_urn + ).add_asset(container_urn) + data_products_container[data_product_urn] = container_product + else: + data_products_container[ + data_product_urn + ] = data_products_container[data_product_urn].add_asset( + container_urn + ) + mcps: List[ Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass] ] = [] for data_product in data_products.values(): mcps.extend(list(data_product.build())) + if is_container: + for data_product in data_products_container.values(): + mcps.extend(list(data_product.build())) return mcps @@ -97,6 +123,7 @@ def create( class PatternDatasetDataProductConfig(ConfigModel): dataset_to_data_product_urns_pattern: KeyValuePattern = KeyValuePattern.all() + is_container: bool = False @pydantic.root_validator(pre=True) def validate_pattern_value(cls, values: Dict) -> Dict: @@ -122,6 +149,7 @@ def __init__(self, config: PatternDatasetDataProductConfig, ctx: PipelineContext )[0] if dataset_to_data_product.value(dataset_urn) else None, + is_container=config.is_container, ) super().__init__(generic_config, ctx) diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_prefer_sql_parser_lineage_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_prefer_sql_parser_lineage_golden.json index 81754fd6cbcaca..d2c71659706818 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_prefer_sql_parser_lineage_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_prefer_sql_parser_lineage_golden.json @@ -638,8 +638,8 @@ "json": { "timestampMillis": 1663355198240, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "STARTED" } @@ -659,8 +659,8 @@ "json": { "timestampMillis": 1663355198242, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "COMPLETE", "result": { @@ -1097,8 +1097,8 @@ "json": { "timestampMillis": 1663355198240, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "STARTED" } @@ -1118,8 +1118,8 @@ "json": { "timestampMillis": 1663355198242, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "COMPLETE", "result": { @@ -1420,8 +1420,8 @@ "json": { "timestampMillis": 1663355198240, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "STARTED" } @@ -1441,8 +1441,8 @@ "json": { "timestampMillis": 1663355198242, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "COMPLETE", "result": { @@ -1944,8 +1944,8 @@ "json": { "timestampMillis": 1663355198240, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "STARTED" } @@ -1965,8 +1965,8 @@ "json": { "timestampMillis": 1663355198242, "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" }, "status": "COMPLETE", "result": { @@ -1982,6 +1982,2201 @@ "lastRunId": "no-run-id-provided" } }, +{ + 
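The add_dataset_dataproduct.py change above introduces an is_container mode: in addition to attaching each matched dataset to its data product, the transformer also resolves the dataset's container aspect through the graph and attaches that container as an asset. The sketch below only illustrates the grouping step, with a plain dict standing in for ctx.graph.get_aspect(..., ContainerClass) and simple lists instead of DataProductPatchBuilder; it also deduplicates containers for readability, which is a simplification rather than a copy of the transformer's code.

from typing import Dict, List, Optional

# Hypothetical dataset -> container mapping; in the transformer this comes from
# ctx.graph.get_aspect(entity_urn, aspect_type=ContainerClass).
CONTAINER_OF: Dict[str, Optional[str]] = {
    "urn:li:dataset:(urn:li:dataPlatform:postgres,db.schema.t1,PROD)": "urn:li:container:abc",
    "urn:li:dataset:(urn:li:dataPlatform:postgres,db.schema.t2,PROD)": "urn:li:container:abc",
    "urn:li:dataset:(urn:li:dataPlatform:postgres,db.schema.t3,PROD)": None,
}


def collect_assets(
    entity_urns: List[str], data_product_urn: str, is_container: bool
) -> Dict[str, List[str]]:
    """Group the assets that should be patched onto the data product."""
    assets: Dict[str, List[str]] = {data_product_urn: []}
    for entity_urn in entity_urns:
        # The dataset itself is always added as an asset.
        assets[data_product_urn].append(entity_urn)
        if not is_container:
            continue
        container_urn = CONTAINER_OF.get(entity_urn)
        # Datasets without a container aspect contribute no extra asset.
        if container_urn and container_urn not in assets[data_product_urn]:
            assets[data_product_urn].append(container_urn)
    return assets


print(collect_assets(list(CONTAINER_OF), "urn:li:dataProduct:sales", is_container=True))
# The three dataset URNs plus their shared container URN, with the container listed once.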
"entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.actor,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Source" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.actor,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dbt" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.actor,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "model_maturity": "in dev", + "owner": "@alice", + "some_other_property": "test 1", + "node_type": "source", + "dbt_file_path": "models/base.yml", + "catalog_type": "BASE TABLE", + "language": "sql", + "dbt_unique_id": "source.sample_dbt.pagila.actor", + "dbt_package_name": "sample_dbt", + "manifest_schema": "https://schemas.getdbt.com/dbt/manifest/v11.json", + "manifest_version": "1.7.3", + "manifest_adapter": "postgres", + "catalog_schema": "https://schemas.getdbt.com/dbt/catalog/v1.json", + "catalog_version": "1.7.3" + }, + "name": "actor", + "description": "description for actor table from dbt", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:@alice", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "source.sample_dbt.pagila.actor", + "platform": "urn:li:dataPlatform:dbt", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1581759273000, + "actor": "urn:li:corpuser:dbt_executor" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "actor_id", + "nullable": false, + "description": "description for actor_id column from dbt", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "first_name", + "nullable": false, + "description": "dbt comment: Actors column \u2013 from postgres\n\ndbt model description: description for first_name from dbt", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "last_name", + "nullable": false, + "description": "description for last_name from dbt", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "last_update", + "nullable": false, + "description": "description for last_update from dbt", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + 
"nativeDataType": "timestamp with time zone", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.actor,PROD)", + "type": "COPY" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Source" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dbt" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.address,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "node_type": "source", + "dbt_file_path": "models/base.yml", + "catalog_type": "BASE TABLE", + "language": "sql", + "dbt_unique_id": "source.sample_dbt.pagila.address", + "dbt_package_name": "sample_dbt", + "manifest_schema": "https://schemas.getdbt.com/dbt/manifest/v11.json", + "manifest_version": "1.7.3", + "manifest_adapter": "postgres", + "catalog_schema": "https://schemas.getdbt.com/dbt/catalog/v1.json", + "catalog_version": "1.7.3" + }, + "name": "address", + "description": "a user's address", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "source.sample_dbt.pagila.address", + "platform": "urn:li:dataPlatform:dbt", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1581759930000, + "actor": "urn:li:corpuser:dbt_executor" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "address", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "address2", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "address_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "city_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + 
"fieldPath": "district", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "last_update", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamp with time zone", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "phone", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "postal_code", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.address,PROD)", + "type": "COPY" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.category,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Source" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.category,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dbt" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.category,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "node_type": "source", + "dbt_file_path": "models/base.yml", + "catalog_type": "BASE TABLE", + "language": "sql", + "dbt_unique_id": "source.sample_dbt.pagila.category", + "dbt_package_name": "sample_dbt", + "manifest_schema": "https://schemas.getdbt.com/dbt/manifest/v11.json", + "manifest_version": "1.7.3", + "manifest_adapter": "postgres", + "catalog_schema": "https://schemas.getdbt.com/dbt/catalog/v1.json", + "catalog_version": "1.7.3" + }, + "name": "category", + "description": "a user's category", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "source.sample_dbt.pagila.category", + "platform": "urn:li:dataPlatform:dbt", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1581759987000, + "actor": "urn:li:corpuser:dbt_executor" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "category_id", + "nullable": false, + "type": { + "type": { + 
"com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "last_update", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamp with time zone", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.category,PROD)", + "type": "COPY" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.city,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Source" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.city,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dbt" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.city,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "node_type": "source", + "dbt_file_path": "models/base.yml", + "catalog_type": "BASE TABLE", + "language": "sql", + "dbt_unique_id": "source.sample_dbt.pagila.city", + "dbt_package_name": "sample_dbt", + "manifest_schema": "https://schemas.getdbt.com/dbt/manifest/v11.json", + "manifest_version": "1.7.3", + "manifest_adapter": "postgres", + "catalog_schema": "https://schemas.getdbt.com/dbt/catalog/v1.json", + "catalog_version": "1.7.3" + }, + "name": "city", + "description": "", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "source.sample_dbt.pagila.city", + "platform": "urn:li:dataPlatform:dbt", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1581759925000, + "actor": "urn:li:corpuser:dbt_executor" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "city", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "city_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": 
false + }, + { + "fieldPath": "country_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "last_update", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamp with time zone", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.city,PROD)", + "type": "COPY" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.country,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Source" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.country,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dbt" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.country,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "model_maturity": "in prod", + "owner": "@bob", + "some_other_property": "test 2", + "node_type": "source", + "dbt_file_path": "models/base.yml", + "catalog_type": "BASE TABLE", + "language": "sql", + "dbt_unique_id": "source.sample_dbt.pagila.country", + "dbt_package_name": "sample_dbt", + "manifest_schema": "https://schemas.getdbt.com/dbt/manifest/v11.json", + "manifest_version": "1.7.3", + "manifest_adapter": "postgres", + "catalog_schema": "https://schemas.getdbt.com/dbt/catalog/v1.json", + "catalog_version": "1.7.3" + }, + "name": "country", + "description": "", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:@bob", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "source.sample_dbt.pagila.country", + "platform": "urn:li:dataPlatform:dbt", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1581759840000, + "actor": "urn:li:corpuser:dbt_executor" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "country", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + 
"isPartOfKey": false + }, + { + "fieldPath": "country_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "last_update", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamp with time zone", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.country,PROD)", + "type": "COPY" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Source" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dbt" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.customer,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "node_type": "source", + "dbt_file_path": "models/base.yml", + "catalog_type": "BASE TABLE", + "language": "sql", + "dbt_unique_id": "source.sample_dbt.pagila.customer", + "dbt_package_name": "sample_dbt", + "manifest_schema": "https://schemas.getdbt.com/dbt/manifest/v11.json", + "manifest_version": "1.7.3", + "manifest_adapter": "postgres", + "catalog_schema": "https://schemas.getdbt.com/dbt/catalog/v1.json", + "catalog_version": "1.7.3" + }, + "name": "customer", + "description": "description for customer table from dbt", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "source.sample_dbt.pagila.customer", + "platform": "urn:li:dataPlatform:dbt", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1581760640000, + "actor": "urn:li:corpuser:dbt_executor" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "active", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "activebool", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": 
"address_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "create_date", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "date", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "email", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "first_name", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "last_name", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "text", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "last_update", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamp with time zone", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "store_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.customer,PROD)", + "type": "COPY" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Source" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dbt" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_01,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "node_type": "source", + "dbt_file_path": "models/base.yml", + "catalog_type": "BASE TABLE", + "language": "sql", + "dbt_unique_id": "source.sample_dbt.pagila.payment_p2020_01", + "dbt_package_name": "sample_dbt", + "manifest_schema": 
"https://schemas.getdbt.com/dbt/manifest/v11.json", + "manifest_version": "1.7.3", + "manifest_adapter": "postgres", + "catalog_schema": "https://schemas.getdbt.com/dbt/catalog/v1.json", + "catalog_version": "1.7.3" + }, + "name": "payment_p2020_01", + "description": "", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "source.sample_dbt.pagila.payment_p2020_01", + "platform": "urn:li:dataPlatform:dbt", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1580505371997, + "actor": "urn:li:corpuser:dbt_executor" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "amount", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "numeric(5,2)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "payment_date", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamp with time zone", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "payment_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "rental_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "staff_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.payment_p2020_01,PROD)", + "type": "COPY" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Source" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dbt" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_02,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "an_array_property": "['alpha', 'beta', 'charlie']", + "model_maturity": "in prod", + "owner": "@charles", + "some_other_property": "test 3", + "node_type": "source", + "dbt_file_path": "models/base.yml", + "catalog_type": "BASE TABLE", + "language": "sql", + "dbt_unique_id": "source.sample_dbt.pagila.payment_p2020_02", + "dbt_package_name": "sample_dbt", + "manifest_schema": "https://schemas.getdbt.com/dbt/manifest/v11.json", + "manifest_version": "1.7.3", + "manifest_adapter": "postgres", + "catalog_schema": "https://schemas.getdbt.com/dbt/catalog/v1.json", + "catalog_version": "1.7.3" + }, + "name": "payment_p2020_02", + "description": "", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:@charles", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "source.sample_dbt.pagila.payment_p2020_02", + "platform": "urn:li:dataPlatform:dbt", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1582319845997, + "actor": "urn:li:corpuser:dbt_executor" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "amount", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "numeric(5,2)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "payment_date", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamp with time zone", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "payment_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "rental_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "staff_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.payment_p2020_02,PROD)", + "type": "COPY" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Source" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dbt" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_03,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "node_type": "source", + "dbt_file_path": "models/base.yml", + "catalog_type": "BASE TABLE", + "language": "sql", + "dbt_unique_id": "source.sample_dbt.pagila.payment_p2020_03", + "dbt_package_name": "sample_dbt", + "manifest_schema": "https://schemas.getdbt.com/dbt/manifest/v11.json", + "manifest_version": "1.7.3", + "manifest_adapter": "postgres", + "catalog_schema": "https://schemas.getdbt.com/dbt/catalog/v1.json", + "catalog_version": "1.7.3" + }, + "name": "payment_p2020_03", + "description": "", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "source.sample_dbt.pagila.payment_p2020_03", + "platform": "urn:li:dataPlatform:dbt", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1584998318997, + "actor": "urn:li:corpuser:dbt_executor" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "amount", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "numeric(5,2)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "payment_date", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamp with time zone", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "payment_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "rental_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "staff_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + 
"auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.payment_p2020_03,PROD)", + "type": "COPY" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Source" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dbt" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_04,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "node_type": "source", + "dbt_file_path": "models/base.yml", + "catalog_type": "BASE TABLE", + "language": "sql", + "dbt_unique_id": "source.sample_dbt.pagila.payment_p2020_04", + "dbt_package_name": "sample_dbt", + "manifest_schema": "https://schemas.getdbt.com/dbt/manifest/v11.json", + "manifest_version": "1.7.3", + "manifest_adapter": "postgres", + "catalog_schema": "https://schemas.getdbt.com/dbt/catalog/v1.json", + "catalog_version": "1.7.3" + }, + "name": "payment_p2020_04", + "description": "", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "source.sample_dbt.pagila.payment_p2020_04", + "platform": "urn:li:dataPlatform:dbt", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1588287228997, + "actor": "urn:li:corpuser:dbt_executor" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "amount", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "numeric(5,2)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "payment_date", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamp with time zone", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "payment_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "rental_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": 
{} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "staff_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.payment_p2020_04,PROD)", + "type": "COPY" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Source" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dbt" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_05,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "node_type": "source", + "dbt_file_path": "models/base.yml", + "catalog_type": "BASE TABLE", + "language": "sql", + "dbt_unique_id": "source.sample_dbt.pagila.payment_p2020_05", + "dbt_package_name": "sample_dbt", + "manifest_schema": "https://schemas.getdbt.com/dbt/manifest/v11.json", + "manifest_version": "1.7.3", + "manifest_adapter": "postgres", + "catalog_schema": "https://schemas.getdbt.com/dbt/catalog/v1.json", + "catalog_version": "1.7.3" + }, + "name": "payment_p2020_05", + "description": "a payment", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "source.sample_dbt.pagila.payment_p2020_05", + "platform": "urn:li:dataPlatform:dbt", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 1589460269997, + "actor": "urn:li:corpuser:dbt_executor" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "amount", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "numeric(5,2)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "payment_date", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": 
"timestamp with time zone", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "payment_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "rental_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "staff_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.payment_p2020_05,PROD)", + "type": "COPY" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Source" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:dbt" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:dbt,pagila.public.payment_p2020_06,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "node_type": "source", + "dbt_file_path": "models/base.yml", + "catalog_type": "BASE TABLE", + "language": "sql", + "dbt_unique_id": "source.sample_dbt.pagila.payment_p2020_06", + "dbt_package_name": "sample_dbt", + "manifest_schema": "https://schemas.getdbt.com/dbt/manifest/v11.json", + "manifest_version": "1.7.3", + "manifest_adapter": "postgres", + "catalog_schema": "https://schemas.getdbt.com/dbt/catalog/v1.json", + "catalog_version": "1.7.3" + }, + "name": "payment_p2020_06", + "description": "", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "source.sample_dbt.pagila.payment_p2020_06", + "platform": "urn:li:dataPlatform:dbt", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": -62135596800000, + "actor": "urn:li:corpuser:dbt_executor" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "amount", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "numeric(5,2)", + "recursive": 
false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "payment_date", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "timestamp with time zone", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "payment_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "rental_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "staff_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1643871600000, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.payment_p2020_06,PROD)", + "type": "COPY" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "dbt-prefer-sql-parser-lineage", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,pagila.public.an-aliased-view-for-monthly-billing,PROD)", @@ -2344,8 +4539,8 @@ }, "assertionUrn": "urn:li:assertion:ba2c6ba830d407d539452f4cf46c92a6", "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" } } }, @@ -2426,8 +4621,8 @@ }, "assertionUrn": "urn:li:assertion:10f2a119dedcaab43afc47ff13d9cb5b", "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" } } }, @@ -2507,8 +4702,8 @@ }, "assertionUrn": "urn:li:assertion:c456eccf6440c6e3388c584689a74d91", "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" } } }, @@ -2588,8 +4783,8 @@ }, "assertionUrn": "urn:li:assertion:f812b73477d81e6af283d918cb59e7bf", "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" } } }, @@ -2678,8 +4873,8 @@ }, "assertionUrn": "urn:li:assertion:08c35a6481d3c37c93eaf9e424faa6d5", "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" } } }, @@ -2752,8 +4947,8 @@ }, "assertionUrn": "urn:li:assertion:08c35a6481d3c37c93eaf9e424faa6d5", "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" } } }, @@ -2833,8 +5028,8 @@ }, "assertionUrn": "urn:li:assertion:f6a1fde3ab4919abcc04bdee93144958", "partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" } } }, @@ -2920,8 +5115,8 @@ }, "assertionUrn": "urn:li:assertion:60ce4aad7ff6dbff7004da0f2258c9df", 
"partitionSpec": { - "type": "FULL_TABLE", - "partition": "FULL_TABLE_SNAPSHOT" + "partition": "FULL_TABLE_SNAPSHOT", + "type": "FULL_TABLE" } } }, diff --git a/metadata-ingestion/tests/integration/dbt/test_dbt.py b/metadata-ingestion/tests/integration/dbt/test_dbt.py index a46da9707679c7..d213cffa78045e 100644 --- a/metadata-ingestion/tests/integration/dbt/test_dbt.py +++ b/metadata-ingestion/tests/integration/dbt/test_dbt.py @@ -227,7 +227,7 @@ def set_paths( source_config_modifiers={ "prefer_sql_parser_lineage": True, "skip_sources_in_lineage": True, - "entities_enabled": {"sources": "NO"}, + # "entities_enabled": {"sources": "NO"}, }, ), ], diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml index 2cc6ae994d245b..a87381dd0bf759 100644 --- a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml @@ -6,6 +6,8 @@ include: "employee_total_income.view.lkml" include: "top_10_employee_income_source.view.lkml" include: "employee_tax_report.view.lkml" include: "employee_salary_rating.view.lkml" +include: "environment_activity_logs.view.lkml" +include: "employee_income_source_as_per_env.view.lkml" include: "rent_as_employee_income_source.view.lkml" explore: activity_logs { @@ -26,5 +28,11 @@ explore: employee_tax_report { explore: employee_salary_rating { } +explore: environment_activity_logs { +} + +explore: employee_income_source_as_per_env { +} + explore: rent_as_employee_income_source { } \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_income_source_as_per_env.view.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_income_source_as_per_env.view.lkml new file mode 100644 index 00000000000000..4b8e0dd46a8ce3 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_income_source_as_per_env.view.lkml @@ -0,0 +1,40 @@ +view: employee_income_source_as_per_env { + derived_table: { + sql: SELECT + employee_id, + employee_name, + {% if dw_eff_dt_date._is_selected or finance_dw_eff_dt_date._is_selected %} + prod_core.data.r_metric_summary_v2 + {% elsif dw_eff_dt_week._is_selected or finance_dw_eff_dt_week._is_selected %} + prod_core.data.r_metric_summary_v3 + {% else %} + 'default_table' as source + {% endif %}, + employee_income + FROM -- if dev -- dev_income_source -- if prod -- prod_income_source + WHERE + {% condition source_region %} source_table.region {% endcondition %} + ;; + } + + dimension: id { + type: number + sql: ${TABLE}.employee_id;; + } + + dimension: name { + type: string + sql: ${TABLE}.employee_name;; + } + + dimension: source { + type: string + sql: ${TABLE}.source ;; + } + + dimension: income { + type: number + sql: ${TABLE}.employee_income ;; + } + +} diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/environment_activity_logs.view.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/environment_activity_logs.view.lkml new file mode 100644 index 00000000000000..efc7ba82754b88 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/environment_activity_logs.view.lkml @@ -0,0 +1,12 @@ +view: environment_activity_logs { + 
sql_table_name: -- if prod -- prod.staging_app.stg_app__activity_logs + -- if dev -- {{ _user_attributes['dev_database_prefix'] }}analytics.{{ _user_attributes['dev_schema_prefix'] }}staging_app.stg_app__activity_logs + ;; + + dimension: generated_message_id { + group_label: "IDs" + primary_key: yes + type: number + sql: ${TABLE}."GENERATED_MESSAGE_ID" ;; + } +} diff --git a/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json b/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json index 2e55971b65bd43..b723aff080bc44 100644 --- a/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json +++ b/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json @@ -1580,6 +1580,472 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.environment_activity_logs,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.environment_activity_logs,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "view: environment_activity_logs {\n sql_table_name: -- if prod -- prod.staging_app.stg_app__activity_logs\n -- if dev -- {{ _user_attributes['dev_database_prefix'] }}analytics.{{ _user_attributes['dev_schema_prefix'] }}staging_app.stg_app__activity_logs\n ;;\n\n dimension: generated_message_id {\n group_label: \"IDs\"\n primary_key: yes\n type: number\n sql: ${TABLE}.\"GENERATED_MESSAGE_ID\" ;;\n }\n}\n", + "viewLanguage": "lookml" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.environment_activity_logs,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.environment_activity_logs,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,prod.staging_app.stg_app__activity_logs,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,prod.staging_app.stg_app__activity_logs,PROD),generated_message_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.environment_activity_logs,PROD),generated_message_id)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "environment_activity_logs", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "generated_message_id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": true + } + ], + "primaryKeys": [ + "generated_message_id" + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "environment_activity_logs.view.lkml", + "looker.model": "data" + }, + "name": "environment_activity_logs", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.environment_activity_logs,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source_as_per_env,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source_as_per_env,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "SELECT\n employee_id,\n employee_name,\n {% if dw_eff_dt_date._is_selected or finance_dw_eff_dt_date._is_selected %}\n prod_core.data.r_metric_summary_v2\n {% elsif dw_eff_dt_week._is_selected or finance_dw_eff_dt_week._is_selected %}\n prod_core.data.r_metric_summary_v3\n {% else %}\n 'default_table' as source\n {% endif %},\n employee_income\n FROM -- if dev -- dev_income_source -- if prod -- prod_income_source\n WHERE\n {% condition source_region %} source_table.region {% endcondition %}", + "viewLanguage": "sql" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source_as_per_env,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": 
"urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source_as_per_env,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,prod_income_source,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,prod_income_source,PROD),employee_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source_as_per_env,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,prod_income_source,PROD),employee_name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source_as_per_env,PROD),name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,prod_income_source,PROD),source)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source_as_per_env,PROD),source)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,prod_income_source,PROD),employee_income)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source_as_per_env,PROD),income)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "employee_income_source_as_per_env", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": 
"source", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "income", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "employee_income_source_as_per_env.view.lkml", + "looker.model": "data" + }, + "name": "employee_income_source_as_per_env", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source_as_per_env,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.rent_as_employee_income_source,PROD)", diff --git a/metadata-ingestion/tests/integration/salesforce/test_salesforce.py b/metadata-ingestion/tests/integration/salesforce/test_salesforce.py index 8b6b883b2148d2..89a37a372df843 100644 --- a/metadata-ingestion/tests/integration/salesforce/test_salesforce.py +++ b/metadata-ingestion/tests/integration/salesforce/test_salesforce.py @@ -1,10 +1,12 @@ import json import pathlib from unittest import mock +from unittest.mock import Mock from freezegun import freeze_time from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.source.salesforce import SalesforceConfig, SalesforceSource from tests.test_helpers import mce_helpers FROZEN_TIME = "2022-05-12 11:00:00" @@ -19,15 +21,16 @@ def _read_response(file_name: str) -> dict: return data -def side_effect_call_salesforce(type, url): - class MockResponse: - def __init__(self, json_data, status_code): - self.json_data = json_data - self.status_code = status_code +class MockResponse: + def __init__(self, json_data, status_code): + self.json_data = json_data + self.status_code = status_code + + def json(self): + return self.json_data - def json(self): - return self.json_data +def side_effect_call_salesforce(type, url): if url.endswith("/services/data/"): return MockResponse(_read_response("versions_response.json"), 200) if url.endswith("FROM EntityDefinition WHERE IsCustomizable = true"): @@ -55,9 +58,92 @@ def json(self): return MockResponse({}, 404) +@mock.patch("datahub.ingestion.source.salesforce.Salesforce") +def test_latest_version(mock_sdk): + mock_sf = mock.Mock() + mocked_call = mock.Mock() + mocked_call.side_effect = side_effect_call_salesforce + mock_sf._call_salesforce = mocked_call + mock_sdk.return_value = mock_sf + + config = SalesforceConfig.parse_obj( + { + "auth": "DIRECT_ACCESS_TOKEN", + 
"instance_url": "https://mydomain.my.salesforce.com/", + "access_token": "access_token`", + "ingest_tags": True, + "object_pattern": { + "allow": [ + "^Account$", + "^Property__c$", + ], + }, + "domain": {"sales": {"allow": {"^Property__c$"}}}, + "profiling": {"enabled": True}, + "profile_pattern": { + "allow": [ + "^Property__c$", + ] + }, + } + ) + SalesforceSource(config=config, ctx=Mock()) + calls = mock_sf._call_salesforce.mock_calls + assert ( + len(calls) == 1 + ), "We didn't specify version but source didn't call SF API to get the latest one" + assert calls[0].ends_with( + "/services/data" + ), "Source didn't call proper SF API endpoint to get all versions" + assert ( + mock_sf.sf_version == "54.0" + ), "API version was not correctly set (see versions_responses.json)" + + +@mock.patch("datahub.ingestion.source.salesforce.Salesforce") +def test_custom_version(mock_sdk): + mock_sf = mock.Mock() + mocked_call = mock.Mock() + mocked_call.side_effect = side_effect_call_salesforce + mock_sf._call_salesforce = mocked_call + mock_sdk.return_value = mock_sf + + config = SalesforceConfig.parse_obj( + { + "auth": "DIRECT_ACCESS_TOKEN", + "api_version": "46.0", + "instance_url": "https://mydomain.my.salesforce.com/", + "access_token": "access_token`", + "ingest_tags": True, + "object_pattern": { + "allow": [ + "^Account$", + "^Property__c$", + ], + }, + "domain": {"sales": {"allow": {"^Property__c$"}}}, + "profiling": {"enabled": True}, + "profile_pattern": { + "allow": [ + "^Property__c$", + ] + }, + } + ) + SalesforceSource(config=config, ctx=Mock()) + + calls = mock_sf._call_salesforce.mock_calls + assert ( + len(calls) == 0 + ), "Source called API to get all versions even though we specified proper version" + assert ( + mock_sdk.call_args.kwargs["version"] == "46.0" + ), "API client object was not correctly initialized with the custom version" + + @freeze_time(FROZEN_TIME) def test_salesforce_ingest(pytestconfig, tmp_path): - with mock.patch("simple_salesforce.Salesforce") as mock_sdk: + with mock.patch("datahub.ingestion.source.salesforce.Salesforce") as mock_sdk: mock_sf = mock.Mock() mocked_call = mock.Mock() mocked_call.side_effect = side_effect_call_salesforce diff --git a/metadata-ingestion/tests/unit/test_dbt_source.py b/metadata-ingestion/tests/unit/test_dbt_source.py index 01d7a4809b01b8..90ff78b16f652b 100644 --- a/metadata-ingestion/tests/unit/test_dbt_source.py +++ b/metadata-ingestion/tests/unit/test_dbt_source.py @@ -247,7 +247,6 @@ def test_dbt_config_prefer_sql_parser_lineage(): "catalog_path": "dummy_path", "target_platform": "dummy_platform", "skip_sources_in_lineage": True, - "entities_enabled": {"sources": "NO"}, "prefer_sql_parser_lineage": True, } config = DBTCoreConfig.parse_obj(config_dict) diff --git a/metadata-ingestion/tests/unit/test_tableau_source.py b/metadata-ingestion/tests/unit/test_tableau_source.py index f5410b161ed703..1cd0557d085f19 100644 --- a/metadata-ingestion/tests/unit/test_tableau_source.py +++ b/metadata-ingestion/tests/unit/test_tableau_source.py @@ -1,8 +1,37 @@ +from typing import Any, Dict + import pytest import datahub.ingestion.source.tableau_constant as c from datahub.ingestion.source.tableau import TableauSiteSource -from datahub.ingestion.source.tableau_common import get_filter_pages, make_filter +from datahub.ingestion.source.tableau_common import ( + get_filter_pages, + make_filter, + tableau_field_to_schema_field, +) +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField + + +def 
test_tablea_source_handles_none_nativedatatype(): + field: Dict[str, Any] = { + "__typename": "CalculatedField", + "id": "abcd", + "name": "Test Field", + "description": None, + "isHidden": False, + "folderName": None, + "upstreamFields": [], + "upstreamColumns": [], + "role": None, + "dataType": None, + "defaultFormat": "s", + "aggregation": None, + "formula": "a/b + d", + } + schema_field: SchemaField = tableau_field_to_schema_field( + field=field, ingest_tags=False + ) + assert schema_field.nativeDataType == "UNKNOWN" def test_tableau_source_unescapes_lt(): diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index 17d9cb8cd14fee..ff29cb5fff47d2 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -130,7 +130,6 @@ test { // override, testng controlling parallelization // increasing >1 will merely run all tests extra times maxParallelForks = 1 - environment "ELASTIC_ID_HASH_ALGO", "MD5" } useTestNG() { suites 'src/test/resources/testng.xml' diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java index 0808c29e8ea892..3ec090a3db3a45 100644 --- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java @@ -170,16 +170,22 @@ public AspectsBatchImplBuilder mcps( mcps.stream() .map( mcp -> { - if (mcp.getChangeType().equals(ChangeType.PATCH)) { - return PatchItemImpl.PatchItemImplBuilder.build( - mcp, - auditStamp, - retrieverContext.getAspectRetriever().getEntityRegistry()); - } else { - return ChangeItemImpl.ChangeItemImplBuilder.build( - mcp, auditStamp, retrieverContext.getAspectRetriever()); + try { + if (mcp.getChangeType().equals(ChangeType.PATCH)) { + return PatchItemImpl.PatchItemImplBuilder.build( + mcp, + auditStamp, + retrieverContext.getAspectRetriever().getEntityRegistry()); + } else { + return ChangeItemImpl.ChangeItemImplBuilder.build( + mcp, auditStamp, retrieverContext.getAspectRetriever()); + } + } catch (IllegalArgumentException e) { + log.error("Invalid proposal, skipping and proceeding with batch: " + mcp, e); + return null; } }) + .filter(Objects::nonNull) .collect(Collectors.toList())); return this; } diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java index d2e7243d045604..31dd868b4cb4a3 100644 --- a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java +++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java @@ -1,22 +1,26 @@ package com.linkedin.metadata.entity.ebean.batch; -import static com.linkedin.metadata.Constants.DATASET_ENTITY_NAME; -import static com.linkedin.metadata.Constants.STATUS_ASPECT_NAME; -import static com.linkedin.metadata.Constants.STRUCTURED_PROPERTIES_ASPECT_NAME; +import static com.linkedin.metadata.Constants.*; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; +import com.google.common.collect.ImmutableList; +import com.linkedin.common.FabricType; import com.linkedin.common.Status; +import 
com.linkedin.common.urn.DataPlatformUrn; +import com.linkedin.common.urn.DatasetUrn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.ByteString; import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; +import com.linkedin.dataset.DatasetProperties; import com.linkedin.events.metadata.ChangeType; import com.linkedin.metadata.aspect.AspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.patch.GenericJsonPatch; import com.linkedin.metadata.aspect.patch.PatchOperationType; +import com.linkedin.metadata.aspect.patch.builder.DatasetPropertiesPatchBuilder; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; import com.linkedin.metadata.aspect.plugins.hooks.MutationHook; import com.linkedin.metadata.entity.SearchRetriever; @@ -297,6 +301,38 @@ public void toUpsertBatchItemsProposedItemTest() { "Mutation to status aspect"); } + @Test + public void singleInvalidDoesntBreakBatch() { + MetadataChangeProposal proposal1 = + new DatasetPropertiesPatchBuilder() + .urn(new DatasetUrn(new DataPlatformUrn("platform"), "name", FabricType.PROD)) + .setDescription("something") + .setName("name") + .addCustomProperty("prop1", "propVal1") + .addCustomProperty("prop2", "propVal2") + .build(); + MetadataChangeProposal proposal2 = + new MetadataChangeProposal() + .setEntityType(DATASET_ENTITY_NAME) + .setAspectName(DATASET_PROPERTIES_ASPECT_NAME) + .setAspect(GenericRecordUtils.serializeAspect(new DatasetProperties())) + .setChangeType(ChangeType.UPSERT); + + AspectsBatchImpl testBatch = + AspectsBatchImpl.builder() + .mcps( + ImmutableList.of(proposal1, proposal2), + AuditStampUtils.createDefaultAuditStamp(), + retrieverContext) + .retrieverContext(retrieverContext) + .build(); + + assertEquals( + testBatch.toUpsertBatchItems(Map.of()).getSecond().size(), + 1, + "Expected 1 valid mcp to be passed through."); + } + /** Converts unsupported to status aspect */ @Getter @Setter diff --git a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java index 5b0fb554a4f48d..e1532ea4e26c06 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/graph/elastic/ElasticSearchGraphService.java @@ -64,6 +64,7 @@ public class ElasticSearchGraphService implements GraphService, ElasticSearchInd private final ESGraphWriteDAO _graphWriteDAO; private final ESGraphQueryDAO _graphReadDAO; private final ESIndexBuilder _indexBuilder; + private final String idHashAlgo; public static final String INDEX_NAME = "graph_service_v1"; private static final Map EMPTY_HASH = new HashMap<>(); @@ -125,7 +126,7 @@ public LineageRegistry getLineageRegistry() { @Override public void addEdge(@Nonnull final Edge edge) { - String docId = edge.toDocId(); + String docId = edge.toDocId(idHashAlgo); String edgeDocument = toDocument(edge); _graphWriteDAO.upsertDocument(docId, edgeDocument); } @@ -137,7 +138,7 @@ public void upsertEdge(@Nonnull final Edge edge) { @Override public void removeEdge(@Nonnull final Edge edge) { - String docId = edge.toDocId(); + String docId = edge.toDocId(idHashAlgo); _graphWriteDAO.deleteDocument(docId); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java 
b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java index dff0a99a142b73..2ab9e17f281637 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/service/UpdateIndicesService.java @@ -80,6 +80,7 @@ public class UpdateIndicesService implements SearchIndicesService { private final SystemMetadataService _systemMetadataService; private final SearchDocumentTransformer _searchDocumentTransformer; private final EntityIndexBuilders _entityIndexBuilders; + @Nonnull private final String idHashAlgo; @Value("${featureFlags.graphServiceDiffModeEnabled:true}") private boolean _graphDiffMode; @@ -117,13 +118,15 @@ public UpdateIndicesService( TimeseriesAspectService timeseriesAspectService, SystemMetadataService systemMetadataService, SearchDocumentTransformer searchDocumentTransformer, - EntityIndexBuilders entityIndexBuilders) { + EntityIndexBuilders entityIndexBuilders, + @Nonnull String idHashAlgo) { _graphService = graphService; _entitySearchService = entitySearchService; _timeseriesAspectService = timeseriesAspectService; _systemMetadataService = systemMetadataService; _searchDocumentTransformer = searchDocumentTransformer; _entityIndexBuilders = entityIndexBuilders; + this.idHashAlgo = idHashAlgo; } @Override @@ -601,7 +604,9 @@ private void updateTimeseriesFields( SystemMetadata systemMetadata) { Map documents; try { - documents = TimeseriesAspectTransformer.transform(urn, aspect, aspectSpec, systemMetadata); + documents = + TimeseriesAspectTransformer.transform( + urn, aspect, aspectSpec, systemMetadata, idHashAlgo); } catch (JsonProcessingException e) { log.error("Failed to generate timeseries document from aspect: {}", e.toString()); return; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java index cdfc4e985293f6..fe79ba75cb1d14 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/systemmetadata/ElasticSearchSystemMetadataService.java @@ -52,6 +52,7 @@ public class ElasticSearchSystemMetadataService private final IndexConvention _indexConvention; private final ESSystemMetadataDAO _esDAO; private final ESIndexBuilder _indexBuilder; + @Nonnull private final String elasticIdHashAlgo; private static final String DOC_DELIMETER = "--"; public static final String INDEX_NAME = "system_metadata_service_v1"; @@ -86,10 +87,9 @@ private String toDocument(SystemMetadata systemMetadata, String urn, String aspe private String toDocId(@Nonnull final String urn, @Nonnull final String aspect) { String rawDocId = urn + DOC_DELIMETER + aspect; - String hashAlgo = System.getenv("ELASTIC_ID_HASH_ALGO"); try { byte[] bytesOfRawDocID = rawDocId.getBytes(StandardCharsets.UTF_8); - MessageDigest md = MessageDigest.getInstance(hashAlgo); + MessageDigest md = MessageDigest.getInstance(elasticIdHashAlgo); byte[] thedigest = md.digest(bytesOfRawDocID); return Base64.getEncoder().encodeToString(thedigest); } catch (NoSuchAlgorithmException e) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/transformer/TimeseriesAspectTransformer.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/transformer/TimeseriesAspectTransformer.java index cf0a3f1466d254..c353e601a31b70 100644 --- 
a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/transformer/TimeseriesAspectTransformer.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/transformer/TimeseriesAspectTransformer.java @@ -54,7 +54,8 @@ public static Map transform( @Nonnull final Urn urn, @Nonnull final RecordTemplate timeseriesAspect, @Nonnull final AspectSpec aspectSpec, - @Nullable final SystemMetadata systemMetadata) + @Nullable final SystemMetadata systemMetadata, + @Nonnull final String idHashAlgo) throws JsonProcessingException { ObjectNode commonDocument = getCommonDocument(urn, timeseriesAspect, systemMetadata); Map finalDocuments = new HashMap<>(); @@ -74,7 +75,7 @@ public static Map transform( final Map> timeseriesFieldValueMap = FieldExtractor.extractFields(timeseriesAspect, aspectSpec.getTimeseriesFieldSpecs()); timeseriesFieldValueMap.forEach((k, v) -> setTimeseriesField(document, k, v)); - finalDocuments.put(getDocId(document, null), document); + finalDocuments.put(getDocId(document, null, idHashAlgo), document); // Create new rows for the member collection fields. final Map> timeseriesFieldCollectionValueMap = @@ -83,7 +84,7 @@ public static Map transform( timeseriesFieldCollectionValueMap.forEach( (key, values) -> finalDocuments.putAll( - getTimeseriesFieldCollectionDocuments(key, values, commonDocument))); + getTimeseriesFieldCollectionDocuments(key, values, commonDocument, idHashAlgo))); return finalDocuments; } @@ -216,12 +217,13 @@ private static void setTimeseriesField( private static Map getTimeseriesFieldCollectionDocuments( final TimeseriesFieldCollectionSpec fieldSpec, final List values, - final ObjectNode commonDocument) { + final ObjectNode commonDocument, + @Nonnull final String idHashAlgo) { return values.stream() .map(value -> getTimeseriesFieldCollectionDocument(fieldSpec, value, commonDocument)) .collect( Collectors.toMap( - keyDocPair -> getDocId(keyDocPair.getSecond(), keyDocPair.getFirst()), + keyDocPair -> getDocId(keyDocPair.getSecond(), keyDocPair.getFirst(), idHashAlgo), Pair::getSecond)); } @@ -257,9 +259,9 @@ private static Pair getTimeseriesFieldCollectionDocument( finalDocument); } - private static String getDocId(@Nonnull JsonNode document, String collectionId) + private static String getDocId( + @Nonnull JsonNode document, String collectionId, @Nonnull String idHashAlgo) throws IllegalArgumentException { - String hashAlgo = System.getenv("ELASTIC_ID_HASH_ALGO"); String docId = document.get(MappingsBuilder.TIMESTAMP_MILLIS_FIELD).toString(); JsonNode eventGranularity = document.get(MappingsBuilder.EVENT_GRANULARITY); if (eventGranularity != null) { @@ -278,9 +280,9 @@ private static String getDocId(@Nonnull JsonNode document, String collectionId) docId += partitionSpec.toString(); } - if (hashAlgo.equalsIgnoreCase("SHA-256")) { + if (idHashAlgo.equalsIgnoreCase("SHA-256")) { return DigestUtils.sha256Hex(docId); - } else if (hashAlgo.equalsIgnoreCase("MD5")) { + } else if (idHashAlgo.equalsIgnoreCase("MD5")) { return DigestUtils.md5Hex(docId); } throw new IllegalArgumentException("Hash function not handled !"); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java index 06f1369ff0670c..d1a51b1d69b2c3 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java @@ 
-62,7 +62,7 @@ public abstract class SearchGraphServiceTestBase extends GraphServiceTestBase { @Nonnull protected abstract ESIndexBuilder getIndexBuilder(); - private final IndexConvention _indexConvention = IndexConventionImpl.NO_PREFIX; + private final IndexConvention _indexConvention = IndexConventionImpl.noPrefix("MD5"); private final String _indexName = _indexConvention.getIndexName(INDEX_NAME); private ElasticSearchGraphService _client; @@ -108,7 +108,8 @@ private ElasticSearchGraphService buildService(boolean enableMultiPathSearch) { _indexConvention, writeDAO, readDAO, - getIndexBuilder()); + getIndexBuilder(), + "MD5"); } @Override diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java index a9d84ae1f3aea1..99e4923885a41d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java @@ -122,7 +122,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { operationContext = TestOperationContexts.systemContextNoSearchAuthorization( new SnapshotEntityRegistry(new Snapshot()), - new IndexConventionImpl("lineage_search_service_test")) + new IndexConventionImpl("lineage_search_service_test", "MD5")) .asSession(RequestContext.TEST, Authorizer.EMPTY, TestOperationContexts.TEST_USER_AUTH); settingsBuilder = new SettingsBuilder(null); elasticSearchService = buildEntitySearchService(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTestBase.java index 445b71b2eaff62..5e30e01a8ea690 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTestBase.java @@ -79,7 +79,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { operationContext = TestOperationContexts.systemContextNoSearchAuthorization( new SnapshotEntityRegistry(new Snapshot()), - new IndexConventionImpl("search_service_test")) + new IndexConventionImpl("search_service_test", "MD5")) .asSession(RequestContext.TEST, Authorizer.EMPTY, TestOperationContexts.TEST_USER_AUTH); settingsBuilder = new SettingsBuilder(null); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java index ab5e90f77c21aa..282a3d8e3ea6ae 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java @@ -62,7 +62,8 @@ public abstract class TestEntityTestBase extends AbstractTestNGSpringContextTest public void setup() { opContext = TestOperationContexts.systemContextNoSearchAuthorization( - new SnapshotEntityRegistry(new Snapshot()), new IndexConventionImpl("es_service_test")); + new SnapshotEntityRegistry(new Snapshot()), + new IndexConventionImpl("es_service_test", "MD5")); settingsBuilder = new SettingsBuilder(null); elasticSearchService = buildService(); elasticSearchService.reindexAll(Collections.emptySet()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/BrowseDAOTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/BrowseDAOTest.java index a0288d019644bd..8044515e3dc6a7 
100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/BrowseDAOTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/BrowseDAOTest.java @@ -45,7 +45,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { mockClient = mock(RestHighLevelClient.class); opContext = TestOperationContexts.systemContextNoSearchAuthorization( - new IndexConventionImpl("es_browse_dao_test")); + new IndexConventionImpl("es_browse_dao_test", "MD5")); browseDAO = new ESBrowseDAO(mockClient, searchConfiguration, customSearchConfiguration); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/SystemMetadataServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/SystemMetadataServiceTestBase.java index d843191bed7413..1b9d8c57b4cad3 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/SystemMetadataServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/SystemMetadataServiceTestBase.java @@ -32,7 +32,7 @@ public abstract class SystemMetadataServiceTestBase extends AbstractTestNGSpring protected abstract ESIndexBuilder getIndexBuilder(); private final IndexConvention _indexConvention = - new IndexConventionImpl("es_system_metadata_service_test"); + new IndexConventionImpl("es_system_metadata_service_test", "MD5"); private ElasticSearchSystemMetadataService _client; @@ -54,7 +54,7 @@ private ElasticSearchSystemMetadataService buildService() { ESSystemMetadataDAO dao = new ESSystemMetadataDAO(getSearchClient(), _indexConvention, getBulkProcessor(), 1); return new ElasticSearchSystemMetadataService( - getBulkProcessor(), _indexConvention, dao, getIndexBuilder()); + getBulkProcessor(), _indexConvention, dao, getIndexBuilder(), "MD5"); } @Test diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java index 10c6f09cb8f8d6..414183c8882f9c 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java @@ -126,7 +126,7 @@ public void setup() throws RemoteInvocationException, URISyntaxException { opContext = TestOperationContexts.systemContextNoSearchAuthorization( - entityRegistry, new IndexConventionImpl("es_timeseries_aspect_service_test")); + entityRegistry, new IndexConventionImpl("es_timeseries_aspect_service_test", "MD5")); elasticSearchTimeseriesAspectService = buildService(); elasticSearchTimeseriesAspectService.reindexAll(Collections.emptySet()); @@ -152,7 +152,7 @@ private ElasticSearchTimeseriesAspectService buildService() { private void upsertDocument(TestEntityProfile dp, Urn urn) throws JsonProcessingException { Map documents = - TimeseriesAspectTransformer.transform(urn, dp, aspectSpec, null); + TimeseriesAspectTransformer.transform(urn, dp, aspectSpec, null, "MD5"); assertEquals(documents.size(), 3); documents.forEach( (key, value) -> diff --git a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java index 28a4a2b00cd6f1..6a95d16c254370 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java +++ 
b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SampleDataFixtureConfiguration.java @@ -86,12 +86,12 @@ protected String longTailIndexPrefix() { @Bean(name = "sampleDataIndexConvention") protected IndexConvention indexConvention(@Qualifier("sampleDataPrefix") String prefix) { - return new IndexConventionImpl(prefix); + return new IndexConventionImpl(prefix, "MD5"); } @Bean(name = "longTailIndexConvention") protected IndexConvention longTailIndexConvention(@Qualifier("longTailPrefix") String prefix) { - return new IndexConventionImpl(prefix); + return new IndexConventionImpl(prefix, "MD5"); } @Bean(name = "sampleDataFixtureName") diff --git a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java index e783c011de6d0e..33e04af83c0a3a 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java +++ b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java @@ -71,7 +71,7 @@ protected String indexPrefix() { @Bean(name = "searchLineageIndexConvention") protected IndexConvention indexConvention(@Qualifier("searchLineagePrefix") String prefix) { - return new IndexConventionImpl(prefix); + return new IndexConventionImpl(prefix, "MD5"); } @Bean(name = "searchLineageFixtureName") @@ -173,7 +173,8 @@ protected ElasticSearchGraphService graphService( new ESGraphWriteDAO(indexConvention, bulkProcessor, 1), new ESGraphQueryDAO( searchClient, lineageRegistry, indexConvention, getGraphQueryConfiguration()), - indexBuilder); + indexBuilder, + indexConvention.getIdHashAlgo()); graphService.reindexAll(Collections.emptySet()); return graphService; } diff --git a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/MetadataChangeLogHook.java b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/MetadataChangeLogHook.java index 06a184c9f89f9c..876df4279b7b8a 100644 --- a/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/MetadataChangeLogHook.java +++ b/metadata-jobs/mae-consumer/src/main/java/com/linkedin/metadata/kafka/hook/MetadataChangeLogHook.java @@ -30,9 +30,7 @@ default MetadataChangeLogHook init(@Nonnull OperationContext systemOperationCont * Return whether the hook is enabled or not. 
If not enabled, the below invoke method is not * triggered */ - default boolean isEnabled() { - return true; - } + boolean isEnabled(); /** Invoke the hook when a MetadataChangeLog is received */ void invoke(@Nonnull MetadataChangeLog log) throws Exception; diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHookTest.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHookTest.java index 411fe02260bb1b..4cd59992eb2f00 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHookTest.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/UpdateIndicesHookTest.java @@ -125,7 +125,8 @@ public void setupTest() { mockTimeseriesAspectService, mockSystemMetadataService, searchDocumentTransformer, - mockEntityIndexBuilders); + mockEntityIndexBuilders, + "MD5"); OperationContext systemOperationContext = TestOperationContexts.systemContextNoSearchAuthorization(); @@ -235,7 +236,8 @@ public void testInputFieldsEdgesAreAdded() throws Exception { mockTimeseriesAspectService, mockSystemMetadataService, searchDocumentTransformer, - mockEntityIndexBuilders); + mockEntityIndexBuilders, + "MD5"); updateIndicesHook = new UpdateIndicesHook(updateIndicesService, true, false); updateIndicesHook.init( diff --git a/metadata-jobs/pe-consumer/src/main/java/com/datahub/event/hook/PlatformEventHook.java b/metadata-jobs/pe-consumer/src/main/java/com/datahub/event/hook/PlatformEventHook.java index 37241861f2e5e6..7fcc2a07b950bf 100644 --- a/metadata-jobs/pe-consumer/src/main/java/com/datahub/event/hook/PlatformEventHook.java +++ b/metadata-jobs/pe-consumer/src/main/java/com/datahub/event/hook/PlatformEventHook.java @@ -20,9 +20,7 @@ default void init() {} * Return whether the hook is enabled or not. 
If not enabled, the below invoke method is not * triggered */ - default boolean isEnabled() { - return true; - } + boolean isEnabled(); /** Invoke the hook when a PlatformEvent is received */ void invoke(@Nonnull OperationContext opContext, @Nonnull PlatformEvent event); diff --git a/metadata-models/src/main/pegasus/com/linkedin/settings/global/GlobalSettingsInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/settings/global/GlobalSettingsInfo.pdl index 8d4121b767dc38..6c6f4d0036ce03 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/settings/global/GlobalSettingsInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/settings/global/GlobalSettingsInfo.pdl @@ -12,16 +12,17 @@ record GlobalSettingsInfo { * SSO integrations between DataHub and identity providers */ sso: optional SsoSettings + /** * Settings related to the Views Feature */ views: optional GlobalViewsSettings + /** * Settings related to the documentation propagation feature */ - docPropagation: DocPropagationFeatureSettings = { + docPropagation: optional DocPropagationFeatureSettings = { "enabled": true "columnPropagationEnabled": true } - } \ No newline at end of file diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/SearchContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/SearchContext.java index c067e91c3524cf..5ad7bdc14820c3 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/SearchContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/SearchContext.java @@ -21,7 +21,7 @@ public class SearchContext implements ContextInterface { public static SearchContext EMPTY = - SearchContext.builder().indexConvention(IndexConventionImpl.NO_PREFIX).build(); + SearchContext.builder().indexConvention(IndexConventionImpl.noPrefix("")).build(); public static SearchContext withFlagDefaults( @Nonnull SearchContext searchContext, diff --git a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java index e54c040fe13b58..76f58fb4751085 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/test/metadata/context/TestOperationContexts.java @@ -191,7 +191,7 @@ public static OperationContext systemContext( IndexConvention indexConvention = Optional.ofNullable(indexConventionSupplier) .map(Supplier::get) - .orElse(IndexConventionImpl.NO_PREFIX); + .orElse(IndexConventionImpl.noPrefix("MD5")); ServicesRegistryContext servicesRegistryContext = Optional.ofNullable(servicesRegistrySupplier).orElse(() -> null).get(); diff --git a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/SearchContextTest.java b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/SearchContextTest.java index 4858bb342258a5..2e0585cc82a4fd 100644 --- a/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/SearchContextTest.java +++ b/metadata-operation-context/src/test/java/io/datahubproject/metadata/context/SearchContextTest.java @@ -12,26 +12,26 @@ public class SearchContextTest { @Test public void searchContextId() { SearchContext testNoFlags = - SearchContext.builder().indexConvention(IndexConventionImpl.NO_PREFIX).build(); + 
SearchContext.builder().indexConvention(IndexConventionImpl.noPrefix("MD5")).build(); assertEquals( testNoFlags.getCacheKeyComponent(), SearchContext.builder() - .indexConvention(IndexConventionImpl.NO_PREFIX) + .indexConvention(IndexConventionImpl.noPrefix("MD5")) .build() .getCacheKeyComponent(), "Expected consistent context ids across instances"); SearchContext testWithFlags = SearchContext.builder() - .indexConvention(IndexConventionImpl.NO_PREFIX) + .indexConvention(IndexConventionImpl.noPrefix("MD5")) .searchFlags(new SearchFlags().setFulltext(true)) .build(); assertEquals( testWithFlags.getCacheKeyComponent(), SearchContext.builder() - .indexConvention(IndexConventionImpl.NO_PREFIX) + .indexConvention(IndexConventionImpl.noPrefix("MD5")) .searchFlags(new SearchFlags().setFulltext(true)) .build() .getCacheKeyComponent(), @@ -44,7 +44,7 @@ public void searchContextId() { assertNotEquals( testWithFlags.getCacheKeyComponent(), SearchContext.builder() - .indexConvention(IndexConventionImpl.NO_PREFIX) + .indexConvention(IndexConventionImpl.noPrefix("MD5")) .searchFlags(new SearchFlags().setFulltext(true).setIncludeRestricted(true)) .build() .getCacheKeyComponent(), @@ -53,7 +53,7 @@ public void searchContextId() { assertNotEquals( testNoFlags.getCacheKeyComponent(), SearchContext.builder() - .indexConvention(new IndexConventionImpl("Some Prefix")) + .indexConvention(new IndexConventionImpl("Some Prefix", "MD5")) .searchFlags(null) .build() .getCacheKeyComponent(), @@ -61,7 +61,7 @@ public void searchContextId() { assertNotEquals( SearchContext.builder() - .indexConvention(IndexConventionImpl.NO_PREFIX) + .indexConvention(IndexConventionImpl.noPrefix("MD5")) .searchFlags( new SearchFlags() .setFulltext(false) @@ -70,7 +70,7 @@ public void searchContextId() { .build() .getCacheKeyComponent(), SearchContext.builder() - .indexConvention(IndexConventionImpl.NO_PREFIX) + .indexConvention(IndexConventionImpl.noPrefix("MD5")) .searchFlags(new SearchFlags().setFulltext(true).setIncludeRestricted(true)) .build() .getCacheKeyComponent(), @@ -80,7 +80,7 @@ public void searchContextId() { @Test public void testImmutableSearchFlags() { SearchContext initial = - SearchContext.builder().indexConvention(IndexConventionImpl.NO_PREFIX).build(); + SearchContext.builder().indexConvention(IndexConventionImpl.noPrefix("MD5")).build(); assertEquals(initial.getSearchFlags(), new SearchFlags().setSkipCache(false)); SearchContext mutated = initial.withFlagDefaults(flags -> flags.setSkipCache(true)); diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/ElasticSearchConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/ElasticSearchConfiguration.java index 130620a9ab918c..7d68e18940401e 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/ElasticSearchConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/ElasticSearchConfiguration.java @@ -8,4 +8,5 @@ public class ElasticSearchConfiguration { private BuildIndicesConfiguration buildIndices; public String implementation; private SearchConfiguration search; + private String idHashAlgo; } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java index eb56e8d42c158e..55eb931625fecc 100644 --- 
a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchGraphServiceFactory.java @@ -11,6 +11,7 @@ import javax.annotation.Nonnull; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Import; @@ -30,7 +31,8 @@ public class ElasticSearchGraphServiceFactory { @Bean(name = "elasticSearchGraphService") @Nonnull - protected ElasticSearchGraphService getInstance() { + protected ElasticSearchGraphService getInstance( + @Value("${elasticsearch.idHashAlgo}") final String idHashAlgo) { LineageRegistry lineageRegistry = new LineageRegistry(entityRegistry); return new ElasticSearchGraphService( lineageRegistry, @@ -45,6 +47,7 @@ protected ElasticSearchGraphService getInstance() { lineageRegistry, components.getIndexConvention(), configurationProvider.getElasticSearch().getSearch().getGraph()), - components.getIndexBuilder()); + components.getIndexBuilder(), + idHashAlgo); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java index d560fba399f340..fb48d64ce7ba9e 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/ElasticSearchSystemMetadataServiceFactory.java @@ -6,6 +6,7 @@ import javax.annotation.Nonnull; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Import; @@ -19,7 +20,8 @@ public class ElasticSearchSystemMetadataServiceFactory { @Bean(name = "elasticSearchSystemMetadataService") @Nonnull - protected ElasticSearchSystemMetadataService getInstance() { + protected ElasticSearchSystemMetadataService getInstance( + @Value("${elasticsearch.idHashAlgo}") final String elasticIdHashAlgo) { return new ElasticSearchSystemMetadataService( components.getBulkProcessor(), components.getIndexConvention(), @@ -28,6 +30,7 @@ protected ElasticSearchSystemMetadataService getInstance() { components.getIndexConvention(), components.getBulkProcessor(), components.getNumRetries()), - components.getIndexBuilder()); + components.getIndexBuilder(), + elasticIdHashAlgo); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/IndexConventionFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/IndexConventionFactory.java index 5b76a3f2cb833f..2288c8d4ecd50d 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/IndexConventionFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/IndexConventionFactory.java @@ -19,7 +19,8 @@ public class IndexConventionFactory { private String indexPrefix; @Bean(name = 
INDEX_CONVENTION_BEAN) - protected IndexConvention createInstance() { - return new IndexConventionImpl(indexPrefix); + protected IndexConvention createInstance( + @Value("${elasticsearch.idHashAlgo}") final String isHashAlgo) { + return new IndexConventionImpl(indexPrefix, isHashAlgo); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/update/indices/UpdateIndicesServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/update/indices/UpdateIndicesServiceFactory.java index fad9d0eaf3b45c..38a344f8be8e92 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/update/indices/UpdateIndicesServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entity/update/indices/UpdateIndicesServiceFactory.java @@ -9,6 +9,7 @@ import com.linkedin.metadata.service.UpdateIndicesService; import com.linkedin.metadata.systemmetadata.SystemMetadataService; import com.linkedin.metadata.timeseries.TimeseriesAspectService; +import org.springframework.beans.factory.annotation.Value; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; @@ -30,7 +31,8 @@ public UpdateIndicesService searchIndicesServiceNonGMS( TimeseriesAspectService timeseriesAspectService, SystemMetadataService systemMetadataService, SearchDocumentTransformer searchDocumentTransformer, - EntityIndexBuilders entityIndexBuilders) { + EntityIndexBuilders entityIndexBuilders, + @Value("${elasticsearch.idHashAlgo}") final String idHashAlgo) { return new UpdateIndicesService( graphService, @@ -38,7 +40,8 @@ public UpdateIndicesService searchIndicesServiceNonGMS( timeseriesAspectService, systemMetadataService, searchDocumentTransformer, - entityIndexBuilders); + entityIndexBuilders, + idHashAlgo); } @Bean @@ -50,7 +53,8 @@ public UpdateIndicesService searchIndicesServiceGMS( final SystemMetadataService systemMetadataService, final SearchDocumentTransformer searchDocumentTransformer, final EntityIndexBuilders entityIndexBuilders, - final EntityService entityService) { + final EntityService entityService, + @Value("${elasticsearch.idHashAlgo}") final String idHashAlgo) { UpdateIndicesService updateIndicesService = new UpdateIndicesService( @@ -59,7 +63,8 @@ public UpdateIndicesService searchIndicesServiceGMS( timeseriesAspectService, systemMetadataService, searchDocumentTransformer, - entityIndexBuilders); + entityIndexBuilders, + idHashAlgo); entityService.setUpdateIndicesService(updateIndicesService); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java index 518dfecd576808..1b003fec82e8b8 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java @@ -2,25 +2,20 @@ import static com.datahub.authorization.AuthUtil.isAPIAuthorized; import static com.datahub.authorization.AuthUtil.isAPIAuthorizedEntityUrns; -import static com.linkedin.metadata.Constants.*; import static com.linkedin.metadata.authorization.ApiOperation.UPDATE; import static com.linkedin.metadata.timeseries.elastic.UsageServiceUtil.USAGE_STATS_ASPECT_NAME; import static 
com.linkedin.metadata.timeseries.elastic.UsageServiceUtil.USAGE_STATS_ENTITY_NAME; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.core.StreamReadConstraints; import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; import com.linkedin.common.WindowDuration; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; -import com.linkedin.data.template.StringArray; import com.linkedin.dataset.DatasetFieldUsageCounts; import com.linkedin.dataset.DatasetFieldUsageCountsArray; import com.linkedin.dataset.DatasetUsageStatistics; @@ -29,17 +24,10 @@ import com.linkedin.metadata.authorization.PoliciesConfig; import com.linkedin.metadata.models.AspectSpec; import com.linkedin.metadata.models.registry.EntityRegistry; -import com.linkedin.metadata.query.filter.Condition; -import com.linkedin.metadata.query.filter.ConjunctiveCriterion; -import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray; -import com.linkedin.metadata.query.filter.Criterion; -import com.linkedin.metadata.query.filter.CriterionArray; -import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.restli.RestliUtil; import com.linkedin.metadata.timeseries.TimeseriesAspectService; import com.linkedin.metadata.timeseries.elastic.UsageServiceUtil; import com.linkedin.metadata.timeseries.transformer.TimeseriesAspectTransformer; -import com.linkedin.metadata.utils.metrics.MetricUtils; import com.linkedin.parseq.Task; import com.linkedin.restli.common.HttpStatus; import com.linkedin.restli.server.RestLiServiceException; @@ -47,35 +35,20 @@ import com.linkedin.restli.server.annotations.ActionParam; import com.linkedin.restli.server.annotations.RestLiSimpleResource; import com.linkedin.restli.server.resources.SimpleResourceTemplate; -import com.linkedin.timeseries.AggregationSpec; -import com.linkedin.timeseries.AggregationType; -import com.linkedin.timeseries.CalendarInterval; -import com.linkedin.timeseries.GenericTable; -import com.linkedin.timeseries.GroupingBucket; -import com.linkedin.timeseries.GroupingBucketType; import com.linkedin.timeseries.TimeWindowSize; import com.linkedin.usage.FieldUsageCounts; -import com.linkedin.usage.FieldUsageCountsArray; import com.linkedin.usage.UsageAggregation; -import com.linkedin.usage.UsageAggregationArray; import com.linkedin.usage.UsageAggregationMetrics; import com.linkedin.usage.UsageQueryResult; -import com.linkedin.usage.UsageQueryResultAggregations; import com.linkedin.usage.UsageTimeRange; import com.linkedin.usage.UserUsageCounts; -import com.linkedin.usage.UserUsageCountsArray; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RequestContext; import io.opentelemetry.extension.annotations.WithSpan; -import java.net.URISyntaxException; -import java.time.Instant; -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; -import java.util.concurrent.TimeUnit; import javax.annotation.Nonnull; import javax.inject.Inject; import javax.inject.Named; @@ -255,7 +228,8 @@ private void ingest(@Nonnull OperationContext opContext, 
@Nonnull UsageAggregati try { documents = TimeseriesAspectTransformer.transform( - bucket.getResource(), datasetUsageStatistics, getUsageStatsAspectSpec(), null); + bucket.getResource(), datasetUsageStatistics, getUsageStatsAspectSpec(), null, + systemOperationContext.getSearchContext().getIndexConvention().getIdHashAlgo()); } catch (JsonProcessingException e) { log.error("Failed to generate timeseries document from aspect: {}", e.toString()); return; diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/DeleteEntityService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/DeleteEntityService.java index aed9b97411ff68..ed14dec4ed940a 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/DeleteEntityService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/DeleteEntityService.java @@ -729,11 +729,11 @@ private MetadataChangeProposal updateFormsAspect( .collect(Collectors.toList()); List<FormAssociation> completedForms = formsAspect.getCompletedForms().stream() - .filter(completedForm -> completedForm.getUrn() != deletedUrn) + .filter(completedForm -> !completedForm.getUrn().equals(deletedUrn)) .collect(Collectors.toList()); final List<FormVerificationAssociation> verifications = formsAspect.getVerifications().stream() - .filter(verification -> verification.getForm() != deletedUrn) + .filter(verification -> !verification.getForm().equals(deletedUrn)) .collect(Collectors.toList()); updatedAspect.get().setIncompleteForms(new FormAssociationArray(incompleteForms)); diff --git a/metadata-service/war/src/main/resources/boot/global_settings.json b/metadata-service/war/src/main/resources/boot/global_settings.json index 129783afd6df49..35145b85202a7b 100644 --- a/metadata-service/war/src/main/resources/boot/global_settings.json +++ b/metadata-service/war/src/main/resources/boot/global_settings.json @@ -1,4 +1,8 @@ { "views": { + }, + "docPropagation": { + "enabled": true, + "columnPropagationEnabled": true } } \ No newline at end of file diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/utils/elasticsearch/IndexConvention.java b/metadata-utils/src/main/java/com/linkedin/metadata/utils/elasticsearch/IndexConvention.java index 4a3f78fcef7bd6..87aebabf643666 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/utils/elasticsearch/IndexConvention.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/utils/elasticsearch/IndexConvention.java @@ -47,4 +47,7 @@ public interface IndexConvention { * if one cannot be extracted */ Optional<Pair<String, String>> getEntityAndAspectName(String timeseriesAspectIndexName); + + @Nonnull + String getIdHashAlgo(); } diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/utils/elasticsearch/IndexConventionImpl.java b/metadata-utils/src/main/java/com/linkedin/metadata/utils/elasticsearch/IndexConventionImpl.java index 47801cd2054fa4..2c9c927cd8c347 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/utils/elasticsearch/IndexConventionImpl.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/utils/elasticsearch/IndexConventionImpl.java @@ -8,25 +8,30 @@ import java.util.concurrent.ConcurrentHashMap; import javax.annotation.Nonnull; import javax.annotation.Nullable; +import lombok.Getter; import org.apache.commons.lang3.StringUtils; // Default implementation of search index naming convention public class IndexConventionImpl implements IndexConvention { - public static final IndexConvention NO_PREFIX = new IndexConventionImpl(null); + public static IndexConvention 
noPrefix(@Nonnull String idHashAlgo) { + return new IndexConventionImpl(null, idHashAlgo); + } // Map from Entity name -> Index name private final Map<String, String> indexNameMapping = new ConcurrentHashMap<>(); private final Optional<String> _prefix; private final String _getAllEntityIndicesPattern; private final String _getAllTimeseriesIndicesPattern; + @Getter private final String idHashAlgo; private static final String ENTITY_INDEX_VERSION = "v2"; private static final String ENTITY_INDEX_SUFFIX = "index"; private static final String TIMESERIES_INDEX_VERSION = "v1"; private static final String TIMESERIES_ENTITY_INDEX_SUFFIX = "aspect"; - public IndexConventionImpl(@Nullable String prefix) { + public IndexConventionImpl(@Nullable String prefix, String idHashAlgo) { _prefix = StringUtils.isEmpty(prefix) ? Optional.empty() : Optional.of(prefix); + this.idHashAlgo = idHashAlgo; _getAllEntityIndicesPattern = _prefix.map(p -> p + "_").orElse("") + "*" diff --git a/metadata-utils/src/test/java/com/linkedin/metadata/utils/elasticsearch/IndexConventionImplTest.java b/metadata-utils/src/test/java/com/linkedin/metadata/utils/elasticsearch/IndexConventionImplTest.java index 8074f344cd2441..2f6c7138d3c4fb 100644 --- a/metadata-utils/src/test/java/com/linkedin/metadata/utils/elasticsearch/IndexConventionImplTest.java +++ b/metadata-utils/src/test/java/com/linkedin/metadata/utils/elasticsearch/IndexConventionImplTest.java @@ -10,7 +10,7 @@ public class IndexConventionImplTest { @Test public void testIndexConventionNoPrefix() { - IndexConvention indexConventionNoPrefix = IndexConventionImpl.NO_PREFIX; + IndexConvention indexConventionNoPrefix = IndexConventionImpl.noPrefix("MD5"); String entityName = "dataset"; String expectedIndexName = "datasetindex_v2"; assertEquals(indexConventionNoPrefix.getEntityIndexName(entityName), expectedIndexName); @@ -25,7 +25,7 @@ public void testIndexConventionNoPrefix() { @Test public void testIndexConventionPrefix() { - IndexConvention indexConventionPrefix = new IndexConventionImpl("prefix"); + IndexConvention indexConventionPrefix = new IndexConventionImpl("prefix", "MD5"); String entityName = "dataset"; String expectedIndexName = "prefix_datasetindex_v2"; assertEquals(indexConventionPrefix.getEntityIndexName(entityName), expectedIndexName); @@ -42,7 +42,7 @@ public void testIndexConventionPrefix() { @Test public void testTimeseriesIndexConventionNoPrefix() { - IndexConvention indexConventionNoPrefix = IndexConventionImpl.NO_PREFIX; + IndexConvention indexConventionNoPrefix = IndexConventionImpl.noPrefix("MD5"); String entityName = "dataset"; String aspectName = "datasetusagestatistics"; String expectedIndexName = "dataset_datasetusagestatisticsaspect_v1"; @@ -64,7 +64,7 @@ public void testTimeseriesIndexConventionNoPrefix() { @Test public void testTimeseriesIndexConventionPrefix() { - IndexConvention indexConventionPrefix = new IndexConventionImpl("prefix"); + IndexConvention indexConventionPrefix = new IndexConventionImpl("prefix", "MD5"); String entityName = "dataset"; String aspectName = "datasetusagestatistics"; String expectedIndexName = "prefix_dataset_datasetusagestatisticsaspect_v1"; diff --git a/smoke-test/build.gradle b/smoke-test/build.gradle index 95f3ba8ed56d64..a9e5a8942b71ec 100644 --- a/smoke-test/build.gradle +++ b/smoke-test/build.gradle @@ -44,12 +44,19 @@ task yarnInstall(type: YarnTask) { environment = ['NODE_OPTIONS': '--openssl-legacy-provider'] args = ['install', '--cwd', "${project.rootDir}/smoke-test/tests/cypress"] } + task cypressLint(type: YarnTask, dependsOn: 
yarnInstall) { environment = ['NODE_OPTIONS': '--openssl-legacy-provider'] // TODO: Run a full lint instead of just format. args = ['--cwd', "${project.rootDir}/smoke-test/tests/cypress", 'run', 'format'] } +task cypressLintFix(type: YarnTask, dependsOn: yarnInstall) { + environment = ['NODE_OPTIONS': '--openssl-legacy-provider'] + // TODO: Run a full lint instead of just format. + args = ['--cwd', "${project.rootDir}/smoke-test/tests/cypress", 'run', 'format', '--write'] +} + task installDev(type: Exec) { inputs.file file('pyproject.toml') inputs.file file('requirements.txt') @@ -86,10 +93,7 @@ task pythonLintFix(type: Exec, dependsOn: installDev) { */ task noCypressSuite0(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { environment 'RUN_QUICKSTART', 'false' - environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' - environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'TEST_STRATEGY', 'no_cypress_suite0' - environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -99,10 +103,7 @@ task noCypressSuite0(type: Exec, dependsOn: [installDev, ':metadata-ingestion:in task noCypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { environment 'RUN_QUICKSTART', 'false' - environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' - environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'TEST_STRATEGY', 'no_cypress_suite1' - environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -112,10 +113,7 @@ task noCypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:in task cypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { environment 'RUN_QUICKSTART', 'false' - environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' - environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'TEST_STRATEGY', 'cypress_suite1' - environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -125,10 +123,7 @@ task cypressSuite1(type: Exec, dependsOn: [installDev, ':metadata-ingestion:inst task cypressRest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { environment 'RUN_QUICKSTART', 'false' - environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' - environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'TEST_STRATEGY', 'cypress_rest' - environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -141,9 +136,6 @@ task cypressRest(type: Exec, dependsOn: [installDev, ':metadata-ingestion:instal */ task cypressDev(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { environment 'RUN_QUICKSTART', 'false' - environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' - environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' - environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', @@ -156,13 +148,18 @@ task cypressDev(type: Exec, dependsOn: [installDev, ':metadata-ingestion:install */ task cypressData(type: Exec, dependsOn: [installDev, ':metadata-ingestion:installDev']) { environment 'RUN_QUICKSTART', 'false' - environment 'DATAHUB_KAFKA_SCHEMA_REGISTRY_URL', 'http://localhost:8080/schema-registry/api/' - 
environment 'KAFKA_BROKER_CONTAINER', 'datahub-kafka-broker-1' environment 'RUN_UI', 'false' - environment "ELASTIC_ID_HASH_ALGO", "MD5" workingDir = project.projectDir commandLine 'bash', '-c', "source ${venv_name}/bin/activate && set -x && " + "./cypress-dev.sh" -} \ No newline at end of file +} + +task lint { + dependsOn pythonLint, cypressLint +} + +task lintFix { + dependsOn pythonLintFix +} diff --git a/smoke-test/cypress-dev.sh b/smoke-test/cypress-dev.sh index 59346b26069059..bce2d794b18691 100755 --- a/smoke-test/cypress-dev.sh +++ b/smoke-test/cypress-dev.sh @@ -10,9 +10,9 @@ fi source venv/bin/activate -export KAFKA_BROKER_CONTAINER="datahub-kafka-broker-1" -export KAFKA_BOOTSTRAP_SERVER="broker:9092" -export ELASTIC_ID_HASH_ALGO="MD5" +# set environment variables for the test +source ./set-test-env-vars.sh + python -c 'from tests.cypress.integration_test import ingest_data; ingest_data()' cd tests/cypress diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh index 1923d42eb5e939..eb0d46b3172442 100755 --- a/smoke-test/run-quickstart.sh +++ b/smoke-test/run-quickstart.sh @@ -10,17 +10,17 @@ source venv/bin/activate mkdir -p ~/.datahub/plugins/frontend/auth/ echo "test_user:test_pass" >> ~/.datahub/plugins/frontend/auth/user.props +echo "DATAHUB_VERSION = $DATAHUB_VERSION" DATAHUB_SEARCH_IMAGE="${DATAHUB_SEARCH_IMAGE:=opensearchproject/opensearch}" DATAHUB_SEARCH_TAG="${DATAHUB_SEARCH_TAG:=2.9.0}" XPACK_SECURITY_ENABLED="${XPACK_SECURITY_ENABLED:=plugins.security.disabled=true}" ELASTICSEARCH_USE_SSL="${ELASTICSEARCH_USE_SSL:=false}" USE_AWS_ELASTICSEARCH="${USE_AWS_ELASTICSEARCH:=true}" -ELASTIC_ID_HASH_ALGO="${ELASTIC_ID_HASH_ALGO:=MD5}" -echo "DATAHUB_VERSION = $DATAHUB_VERSION" DATAHUB_TELEMETRY_ENABLED=false \ DOCKER_COMPOSE_BASE="file://$( dirname "$DIR" )" \ DATAHUB_SEARCH_IMAGE="$DATAHUB_SEARCH_IMAGE" DATAHUB_SEARCH_TAG="$DATAHUB_SEARCH_TAG" \ XPACK_SECURITY_ENABLED="$XPACK_SECURITY_ENABLED" ELASTICSEARCH_USE_SSL="$ELASTICSEARCH_USE_SSL" \ USE_AWS_ELASTICSEARCH="$USE_AWS_ELASTICSEARCH" \ -datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers --dump-logs-on-failure --kafka-setup +DATAHUB_VERSION=${DATAHUB_VERSION} \ +docker compose --project-directory ../docker/profiles --profile quickstart-consumers up -d --quiet-pull --wait --wait-timeout 900 diff --git a/smoke-test/set-cypress-creds.sh b/smoke-test/set-cypress-creds.sh index 82fe736b0a7e18..fc6e7dd42f5dea 100644 --- a/smoke-test/set-cypress-creds.sh +++ b/smoke-test/set-cypress-creds.sh @@ -2,4 +2,4 @@ export CYPRESS_ADMIN_USERNAME=${ADMIN_USERNAME:-datahub} export CYPRESS_ADMIN_PASSWORD=${ADMIN_PASSWORD:-datahub} -export CYPRESS_ADMIN_DISPLAYNAME=${ADMIN_DISPLAYNAME:-DataHub} \ No newline at end of file +export CYPRESS_ADMIN_DISPLAYNAME=${ADMIN_DISPLAYNAME:-DataHub} diff --git a/smoke-test/set-test-env-vars.sh b/smoke-test/set-test-env-vars.sh new file mode 100644 index 00000000000000..4668721f80de08 --- /dev/null +++ b/smoke-test/set-test-env-vars.sh @@ -0,0 +1,2 @@ +export DATAHUB_KAFKA_SCHEMA_REGISTRY_URL=http://localhost:8080/schema-registry/api +export DATAHUB_GMS_URL=http://localhost:8080 \ No newline at end of file diff --git a/smoke-test/smoke.sh b/smoke-test/smoke.sh index c16865fe1e71ef..5b3e8a9377a6ca 100755 --- a/smoke-test/smoke.sh +++ b/smoke-test/smoke.sh @@ -16,16 +16,23 @@ cd "$DIR" if [ "${RUN_QUICKSTART:-true}" == "true" ]; then source ./run-quickstart.sh +else + mkdir -p ~/.datahub/plugins/frontend/auth/ + echo "test_user:test_pass" >> 
~/.datahub/plugins/frontend/auth/user.props + echo "datahub:datahub" > ~/.datahub/plugins/frontend/auth/user.props + + python3 -m venv venv + source venv/bin/activate + python -m pip install --upgrade pip 'uv>=0.1.10' wheel setuptools + uv pip install -r requirements.txt fi -source venv/bin/activate - (cd ..; ./gradlew :smoke-test:yarnInstall) source ./set-cypress-creds.sh -export DATAHUB_GMS_URL=http://localhost:8080 -export ELASTIC_ID_HASH_ALGO="MD5" +# set environment variables for the test +source ./set-test-env-vars.sh # no_cypress_suite0, no_cypress_suite1, cypress_suite1, cypress_rest if [[ -z "${TEST_STRATEGY}" ]]; then diff --git a/smoke-test/test_e2e.py b/smoke-test/test_e2e.py index abb4841314c4af..74d64a8193173a 100644 --- a/smoke-test/test_e2e.py +++ b/smoke-test/test_e2e.py @@ -21,6 +21,7 @@ get_frontend_session, get_admin_credentials, get_root_urn, + wait_for_writes_to_sync, ) bootstrap_sample_data = "../metadata-ingestion/examples/mce_files/bootstrap_mce.json" @@ -150,11 +151,13 @@ def _ensure_group_not_present(urn: str, frontend_session) -> Any: def test_ingestion_via_rest(wait_for_healthchecks): ingest_file_via_rest(bootstrap_sample_data) _ensure_user_present(urn=get_root_urn()) + wait_for_writes_to_sync() @pytest.mark.dependency(depends=["test_healthchecks"]) def test_ingestion_usage_via_rest(wait_for_healthchecks): ingest_file_via_rest(usage_sample_data) + wait_for_writes_to_sync() @pytest.mark.dependency(depends=["test_healthchecks"]) @@ -185,6 +188,7 @@ def test_ingestion_via_kafka(wait_for_healthchecks): # Since Kafka emission is asynchronous, we must wait a little bit so that # the changes are actually processed. time.sleep(kafka_post_ingestion_wait_sec) + wait_for_writes_to_sync() @pytest.mark.dependency( @@ -196,6 +200,7 @@ def test_ingestion_via_kafka(wait_for_healthchecks): ) def test_run_ingestion(wait_for_healthchecks): # Dummy test so that future ones can just depend on this one. + wait_for_writes_to_sync() pass @@ -1384,7 +1389,9 @@ def test_native_user_endpoints(frontend_session): unauthenticated_get_invite_token_response = unauthenticated_session.post( f"{get_frontend_url()}/api/v2/graphql", json=get_invite_token_json ) - assert unauthenticated_get_invite_token_response.status_code == HTTPStatus.UNAUTHORIZED + assert ( + unauthenticated_get_invite_token_response.status_code == HTTPStatus.UNAUTHORIZED + ) unauthenticated_create_reset_token_json = { "query": """mutation createNativeUserResetToken($input: CreateNativeUserResetTokenInput!) 
{\n @@ -1399,7 +1406,10 @@ def test_native_user_endpoints(frontend_session): f"{get_frontend_url()}/api/v2/graphql", json=unauthenticated_create_reset_token_json, ) - assert unauthenticated_create_reset_token_response.status_code == HTTPStatus.UNAUTHORIZED + assert ( + unauthenticated_create_reset_token_response.status_code + == HTTPStatus.UNAUTHORIZED + ) # cleanup steps json = { diff --git a/smoke-test/tests/consistency_utils.py b/smoke-test/tests/consistency_utils.py index 4335e2a874c1e7..1eddc46bb220b7 100644 --- a/smoke-test/tests/consistency_utils.py +++ b/smoke-test/tests/consistency_utils.py @@ -8,14 +8,31 @@ ELASTICSEARCH_REFRESH_INTERVAL_SECONDS: int = int( os.getenv("ELASTICSEARCH_REFRESH_INTERVAL_SECONDS", 5) ) -KAFKA_BROKER_CONTAINER: str = str( - os.getenv("KAFKA_BROKER_CONTAINER", "datahub-broker-1") -) KAFKA_BOOTSTRAP_SERVER: str = str(os.getenv("KAFKA_BOOTSTRAP_SERVER", "broker:29092")) logger = logging.getLogger(__name__) +def infer_kafka_broker_container() -> str: + cmd = "docker ps --format '{{.Names}}' | grep broker" + completed_process = subprocess.run( + cmd, + capture_output=True, + shell=True, + text=True, + ) + result = str(completed_process.stdout) + lines = result.splitlines() + if len(lines) == 0: + raise ValueError("No Kafka broker containers found") + return lines[0] + + +KAFKA_BROKER_CONTAINER: str = str( + os.getenv("KAFKA_BROKER_CONTAINER", infer_kafka_broker_container()) +) + + def wait_for_writes_to_sync(max_timeout_in_sec: int = 120) -> None: if USE_STATIC_SLEEP: time.sleep(ELASTICSEARCH_REFRESH_INTERVAL_SECONDS) @@ -44,7 +61,9 @@ def wait_for_writes_to_sync(max_timeout_in_sec: int = 120) -> None: if maximum_lag == 0: lag_zero = True except ValueError: - logger.warning(f"Error reading kafka lag using command: {cmd}") + logger.warning( + f"Error reading kafka lag using command: {cmd}", exc_info=True + ) if not lag_zero: logger.warning( diff --git a/smoke-test/tests/cypress/cypress/e2e/actions/docPropagation.js b/smoke-test/tests/cypress/cypress/e2e/actions/docPropagation.js new file mode 100644 index 00000000000000..3d7e14195ab64f --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/actions/docPropagation.js @@ -0,0 +1,27 @@ +const testId = '[data-testid="docPropagationIndicator"]'; + +describe("docPropagation", () => { + it("logs in and navigates to the schema page and checks for docPropagationIndicator", () => { + cy.login(); + cy.visit( + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_deleted,PROD)/Schema?is_lineage_mode=false&schemaFilter=", + "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_deleted,PROD)/Schema?is_lineage_mode=false&schemaFilter=", + ); + + // verify that the indicator exists in the table + cy.get(testId).should("exist"); + + // click on the table row + cy.get('[data-row-key="user_id"]').click(); + + // verify that the indicator exists in id="entity-profile-sidebar" + cy.get('[id="entity-profile-sidebar"]') + .then(($sidebar) => { + if ($sidebar.find(testId).length) return testId; + return null; + }) + .then((selector) => { + cy.get(selector).should("exist"); + }); + }); +}); diff --git a/smoke-test/tests/cypress/data.json b/smoke-test/tests/cypress/data.json index 5253b7a33b085f..ce61f7c83a0389 100644 --- a/smoke-test/tests/cypress/data.json +++ b/smoke-test/tests/cypress/data.json @@ -96,7 +96,11 @@ }, "nativeDataType": "varchar(100)", "globalTags": { - "tags": [{ "tag": "urn:li:tag:NeedsDocumentation" }] + "tags": [ + { + "tag": "urn:li:tag:NeedsDocumentation" + } + ] }, "recursive": 
false }, @@ -137,7 +141,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Cypress" }] + "tags": [ + { + "tag": "urn:li:tag:Cypress" + } + ] } } ] @@ -246,7 +254,13 @@ "editableSchemaFieldInfo": [ { "fieldPath": "shipment_info", - "globalTags": { "tags": [{ "tag": "urn:li:tag:Legacy" }] }, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Legacy" + } + ] + }, "glossaryTerms": { "terms": [ { @@ -401,8 +415,12 @@ { "com.linkedin.pegasus2avro.common.GlobalTags": { "tags": [ - { "tag": "urn:li:tag:Cypress" }, - { "tag": "urn:li:tag:Cypress2" } + { + "tag": "urn:li:tag:Cypress" + }, + { + "tag": "urn:li:tag:Cypress2" + } ] } } @@ -542,7 +560,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Cypress" }] + "tags": [ + { + "tag": "urn:li:tag:Cypress" + } + ] } } ] @@ -718,7 +740,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Cypress" }] + "tags": [ + { + "tag": "urn:li:tag:Cypress" + } + ] } } ] @@ -1011,7 +1037,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Cypress" }] + "tags": [ + { + "tag": "urn:li:tag:Cypress" + } + ] } } ] @@ -1229,7 +1259,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Cypress" }] + "tags": [ + { + "tag": "urn:li:tag:Cypress" + } + ] } } ] @@ -1279,7 +1313,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Cypress" }] + "tags": [ + { + "tag": "urn:li:tag:Cypress" + } + ] } } ] @@ -1332,7 +1370,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Cypress" }] + "tags": [ + { + "tag": "urn:li:tag:Cypress" + } + ] } } ] @@ -1371,7 +1413,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Cypress" }] + "tags": [ + { + "tag": "urn:li:tag:Cypress" + } + ] } } ] @@ -1413,7 +1459,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Cypress" }] + "tags": [ + { + "tag": "urn:li:tag:Cypress" + } + ] } } ] @@ -1459,7 +1509,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Cypress" }] + "tags": [ + { + "tag": "urn:li:tag:Cypress" + } + ] } } ] @@ -1521,7 +1575,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:Cypress" }] + "tags": [ + { + "tag": "urn:li:tag:Cypress" + } + ] } } ] @@ -1758,7 +1816,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:CypressFeatureTag" }] + "tags": [ + { + "tag": "urn:li:tag:CypressFeatureTag" + } + ] } } ] @@ -1785,7 +1847,11 @@ }, { "com.linkedin.pegasus2avro.common.GlobalTags": { - "tags": [{ "tag": "urn:li:tag:CypressPrimaryKeyTag" }] + "tags": [ + { + "tag": "urn:li:tag:CypressPrimaryKeyTag" + } + ] } } ] @@ -2137,5 +2203,17 @@ "contentType": "application/json" }, "systemMetadata": null + }, + { + "auditHeader": null, + "entityType": "schemaField", + "entityUrn": "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_deleted,PROD),user_id)", + "changeType": "UPSERT", + "aspectName": "documentation", + "aspect": { + "value": 
"{\"documentations\":[{\"attribution\":{\"actor\":\"urn:li:corpuser:__datahub_system\",\"source\":\"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_created,PROD),user_id)\",\"sourceDetail\":{\"actor\":\"urn:li:corpuser:shirshanka@acryl.io\",\"origin\":\"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_created,PROD),user_id)\",\"propagated\":\"true\"},\"time\":1721422917808},\"documentation\":\"Unique identifier of user profile.\"}]}", + "contentType": "application/json" + }, + "systemMetadata": null } ] diff --git a/smoke-test/tests/openapi/test_openapi.py b/smoke-test/tests/openapi/test_openapi.py index 6561ee6d5c5cc9..20398e0e581685 100644 --- a/smoke-test/tests/openapi/test_openapi.py +++ b/smoke-test/tests/openapi/test_openapi.py @@ -64,6 +64,7 @@ def evaluate_test(test_name, test_data): actual_resp.json(), req_resp["response"]["json"], exclude_regex_paths=exclude_regex_paths, + ignore_order=True, ) assert not diff else: @@ -81,11 +82,12 @@ def evaluate_test(test_name, test_data): raise e -def run_tests(fixture_glob, num_workers=3): +def run_tests(fixture_globs, num_workers=3): with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: futures = [] - for test_fixture, test_data in load_tests(fixture_glob=fixture_glob): - futures.append(executor.submit(evaluate_test, test_fixture, test_data)) + for fixture_glob in fixture_globs: + for test_fixture, test_data in load_tests(fixture_glob=fixture_glob): + futures.append(executor.submit(evaluate_test, test_fixture, test_data)) for future in concurrent.futures.as_completed(futures): logger.info(future.result()) @@ -93,7 +95,7 @@ def run_tests(fixture_glob, num_workers=3): @pytest.mark.dependency(depends=["test_healthchecks"]) def test_openapi_all(): - run_tests(fixture_glob="tests/openapi/**/*.json", num_workers=10) + run_tests(fixture_globs=["tests/openapi/*/*.json"], num_workers=10) # @pytest.mark.dependency(depends=["test_healthchecks"])