diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 532ba1102ed579..412c962cb6e36f 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -83,6 +83,7 @@ jobs: - uses: gradle/actions/setup-gradle@v3 - name: Gradle build (and test) for NOT metadata ingestion if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }} + # datahub-schematron:cli excluded due to dependency on metadata-ingestion run: | ./gradlew build \ -x :metadata-ingestion:build \ @@ -100,6 +101,7 @@ jobs: -x :metadata-ingestion-modules:gx-plugin:check \ -x :datahub-frontend:build \ -x :datahub-web-react:build \ + -x :metadata-integration:java:datahub-schematron:cli:test \ --parallel - name: Gradle build (and test) for frontend if: ${{ matrix.command == 'frontend' && needs.setup.outputs.frontend_change == 'true' }} diff --git a/.github/workflows/check-datahub-jars.yml b/.github/workflows/check-datahub-jars.yml index becf8126dc45ba..7a49f32729ec1f 100644 --- a/.github/workflows/check-datahub-jars.yml +++ b/.github/workflows/check-datahub-jars.yml @@ -40,4 +40,5 @@ jobs: - name: check ${{ matrix.command }} jar run: | ./gradlew :metadata-integration:java:${{ matrix.command }}:build --info + ./gradlew :metadata-integration:java:${{ matrix.command }}:checkShadowJar ./gradlew :metadata-integration:java:${{ matrix.command }}:javadoc diff --git a/build.gradle b/build.gradle index 9ee756d41e11ef..be4d7ee8a562b9 100644 --- a/build.gradle +++ b/build.gradle @@ -48,6 +48,7 @@ buildscript { // see also datahub-frontend/play.gradle ext.playVersion = '2.8.22' ext.playScalaVersion = '2.13' + ext.akkaVersion = '2.6.21' // 2.7.0+ has incompatible license ext.log4jVersion = '2.23.1' ext.slf4jVersion = '1.7.36' ext.logbackClassic = '1.4.14' @@ -105,7 +106,14 @@ project.ext.spec = [ ] project.ext.externalDependency = [ - 'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10", + 'akkaHttp': "com.typesafe.akka:akka-http-core_$playScalaVersion:10.2.10", // max version due to licensing + 'akkaActor': "com.typesafe.akka:akka-actor_$playScalaVersion:$akkaVersion", + 'akkaStream': "com.typesafe.akka:akka-stream_$playScalaVersion:$akkaVersion", + 'akkaActorTyped': "com.typesafe.akka:akka-actor-typed_$playScalaVersion:$akkaVersion", + 'akkaSlf4j': "com.typesafe.akka:akka-slf4j_$playScalaVersion:$akkaVersion", + 'akkaJackson': "com.typesafe.akka:akka-serialization-jackson_$playScalaVersion:$akkaVersion", + 'akkaParsing': "com.typesafe.akka:akka-parsing_$playScalaVersion:$akkaVersion", + 'akkaProtobuf': "com.typesafe.akka:akka-protobuf-v3_$playScalaVersion:$akkaVersion", 'antlr4Runtime': 'org.antlr:antlr4-runtime:4.9.3', 'antlr4': 'org.antlr:antlr4:4.9.3', 'assertJ': 'org.assertj:assertj-core:3.11.1', @@ -350,6 +358,7 @@ allprojects { } } } + } configure(subprojects.findAll {! 
it.name.startsWith('spark-lineage')}) { diff --git a/datahub-frontend/play.gradle b/datahub-frontend/play.gradle index 266962721a80a8..d513c3c232d9a0 100644 --- a/datahub-frontend/play.gradle +++ b/datahub-frontend/play.gradle @@ -55,6 +55,13 @@ dependencies { implementation externalDependency.antlr4Runtime implementation externalDependency.antlr4 implementation externalDependency.akkaHttp + implementation externalDependency.akkaActor + implementation externalDependency.akkaStream + implementation externalDependency.akkaActorTyped + implementation externalDependency.akkaSlf4j + implementation externalDependency.akkaJackson + implementation externalDependency.akkaParsing + implementation externalDependency.akkaProtobuf implementation externalDependency.jerseyCore implementation externalDependency.jerseyGuava diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index 5b265b67144523..d1da55268a50d5 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -63,6 +63,7 @@ import com.linkedin.datahub.graphql.generated.Domain; import com.linkedin.datahub.graphql.generated.ERModelRelationship; import com.linkedin.datahub.graphql.generated.ERModelRelationshipProperties; +import com.linkedin.datahub.graphql.generated.Entity; import com.linkedin.datahub.graphql.generated.EntityPath; import com.linkedin.datahub.graphql.generated.EntityRelationship; import com.linkedin.datahub.graphql.generated.EntityRelationshipLegacy; @@ -312,6 +313,7 @@ import com.linkedin.datahub.graphql.resolvers.type.HyperParameterValueTypeResolver; import com.linkedin.datahub.graphql.resolvers.type.PlatformSchemaUnionTypeResolver; import com.linkedin.datahub.graphql.resolvers.type.PropertyValueResolver; +import com.linkedin.datahub.graphql.resolvers.type.ResolvedActorResolver; import com.linkedin.datahub.graphql.resolvers.type.ResultsTypeResolver; import com.linkedin.datahub.graphql.resolvers.type.TimeSeriesAspectInterfaceTypeResolver; import com.linkedin.datahub.graphql.resolvers.user.CreateNativeUserResetTokenResolver; @@ -1730,12 +1732,22 @@ private void configureDatasetResolvers(final RuntimeWiring.Builder builder) { .type( "InstitutionalMemoryMetadata", typeWiring -> - typeWiring.dataFetcher( - "author", - new LoadableTypeResolver<>( - corpUserType, - (env) -> - ((InstitutionalMemoryMetadata) env.getSource()).getAuthor().getUrn()))) + typeWiring + .dataFetcher( + "author", + new LoadableTypeResolver<>( + corpUserType, + (env) -> + ((InstitutionalMemoryMetadata) env.getSource()) + .getAuthor() + .getUrn())) + .dataFetcher( + "actor", + new EntityTypeResolver( + this.entityTypes, + (env) -> + (Entity) + ((InstitutionalMemoryMetadata) env.getSource()).getActor()))) .type( "DatasetStatsSummary", typeWiring -> @@ -2242,6 +2254,7 @@ private void configureTypeResolvers(final RuntimeWiring.Builder builder) { "HyperParameterValueType", typeWiring -> typeWiring.typeResolver(new HyperParameterValueTypeResolver())) .type("PropertyValue", typeWiring -> typeWiring.typeResolver(new PropertyValueResolver())) + .type("ResolvedActor", typeWiring -> typeWiring.typeResolver(new ResolvedActorResolver())) .type("Aspect", typeWiring -> typeWiring.typeResolver(new AspectInterfaceTypeResolver())) .type( "TimeSeriesAspect", diff --git 
a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java index 0fe6e5de0cac68..197ac87c1e22d8 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/analytics/resolver/GetChartsResolver.java @@ -84,8 +84,21 @@ private TimeSeriesChart getActiveUsersTimeSeriesChart( final DateTime end, final String title, final DateInterval interval) { - final DateRange dateRange = - new DateRange(String.valueOf(beginning.getMillis()), String.valueOf(end.getMillis())); + + final DateRange dateRange; + + // adjust month to show 1st of month rather than last day of previous month + if (interval == DateInterval.MONTH) { + dateRange = + new DateRange( + String.valueOf(beginning.plusDays(1).getMillis()), // Shift start by 1 day + String.valueOf(end.plusDays(1).getMillis()) // Shift end by 1 day + ); + } else { + // week display starting Sundays + dateRange = + new DateRange(String.valueOf(beginning.getMillis()), String.valueOf(end.getMillis())); + } final List timeSeriesLines = _analyticsService.getTimeseriesChart( @@ -96,6 +109,7 @@ private TimeSeriesChart getActiveUsersTimeSeriesChart( ImmutableMap.of(), Collections.emptyMap(), Optional.of("browserId")); + return TimeSeriesChart.builder() .setTitle(title) .setDateRange(dateRange) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/type/ResolvedActorResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/type/ResolvedActorResolver.java new file mode 100644 index 00000000000000..7ae719a23b00ad --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/type/ResolvedActorResolver.java @@ -0,0 +1,25 @@ +package com.linkedin.datahub.graphql.resolvers.type; + +import com.linkedin.datahub.graphql.generated.CorpGroup; +import com.linkedin.datahub.graphql.generated.CorpUser; +import graphql.TypeResolutionEnvironment; +import graphql.schema.GraphQLObjectType; +import graphql.schema.TypeResolver; + +public class ResolvedActorResolver implements TypeResolver { + + public static final String CORP_USER = "CorpUser"; + public static final String CORP_GROUP = "CorpGroup"; + + @Override + public GraphQLObjectType getType(TypeResolutionEnvironment env) { + if (env.getObject() instanceof CorpUser) { + return env.getSchema().getObjectType(CORP_USER); + } else if (env.getObject() instanceof CorpGroup) { + return env.getSchema().getObjectType(CORP_GROUP); + } else { + throw new RuntimeException( + "Unrecognized object type provided to type resolver, Type:" + env.getObject().toString()); + } + } +} diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/InstitutionalMemoryMetadataMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/InstitutionalMemoryMetadataMapper.java index 7c6de02ecc8767..9781643c414c81 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/InstitutionalMemoryMetadataMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/InstitutionalMemoryMetadataMapper.java @@ -28,6 +28,7 @@ public InstitutionalMemoryMetadata apply( result.setDescription(input.getDescription()); // deprecated field 
result.setLabel(input.getDescription()); result.setAuthor(getAuthor(input.getCreateStamp().getActor().toString())); + result.setActor(ResolvedActorMapper.map(input.getCreateStamp().getActor())); result.setCreated(AuditStampMapper.map(context, input.getCreateStamp())); result.setAssociatedUrn(entityUrn.toString()); return result; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/ResolvedActorMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/ResolvedActorMapper.java new file mode 100644 index 00000000000000..c00ffd0b828b18 --- /dev/null +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/common/mappers/ResolvedActorMapper.java @@ -0,0 +1,31 @@ +package com.linkedin.datahub.graphql.types.common.mappers; + +import com.linkedin.common.urn.Urn; +import com.linkedin.datahub.graphql.generated.CorpGroup; +import com.linkedin.datahub.graphql.generated.CorpUser; +import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.datahub.graphql.generated.ResolvedActor; +import com.linkedin.metadata.Constants; +import javax.annotation.Nonnull; + +public class ResolvedActorMapper { + + public static final ResolvedActorMapper INSTANCE = new ResolvedActorMapper(); + + public static ResolvedActor map(@Nonnull final Urn actorUrn) { + return INSTANCE.apply(actorUrn); + } + + public ResolvedActor apply(@Nonnull final Urn actorUrn) { + if (actorUrn.getEntityType().equals(Constants.CORP_GROUP_ENTITY_NAME)) { + CorpGroup partialGroup = new CorpGroup(); + partialGroup.setUrn(actorUrn.toString()); + partialGroup.setType(EntityType.CORP_GROUP); + return partialGroup; + } + CorpUser partialUser = new CorpUser(); + partialUser.setUrn(actorUrn.toString()); + partialUser.setType(EntityType.CORP_USER); + return (ResolvedActor) partialUser; + } +} diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 732a782139b616..049527e5d77e3b 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -3005,8 +3005,14 @@ type InstitutionalMemoryMetadata { """ The author of this metadata + Deprecated! Use actor instead for users or groups. """ - author: CorpUser! + author: CorpUser! @deprecated(reason: "Use `actor`") + + """ + The author of this metadata + """ + actor: ResolvedActor! 
""" An AuditStamp corresponding to the creation of this resource @@ -3834,6 +3840,8 @@ enum CorpUserStatus { ACTIVE } +union ResolvedActor = CorpUser | CorpGroup + """ A DataHub User entity, which represents a Person on the Metadata Entity Graph """ diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/AggregateAcrossEntitiesResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/AggregateAcrossEntitiesResolverTest.java index 1b33118bd154af..0a8e4e8b4fa5f8 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/AggregateAcrossEntitiesResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/AggregateAcrossEntitiesResolverTest.java @@ -386,7 +386,11 @@ private static EntityClient initMockEntityClient( Mockito.when( client.searchAcrossEntities( any(), - Mockito.eq(entityTypes), + Mockito.argThat( + argument -> + argument != null + && argument.containsAll(entityTypes) + && entityTypes.containsAll(argument)), Mockito.eq(query), Mockito.eq(filter), Mockito.eq(start), @@ -409,7 +413,11 @@ private static void verifyMockEntityClient( Mockito.verify(mockClient, Mockito.times(1)) .searchAcrossEntities( any(), - Mockito.eq(entityTypes), + Mockito.argThat( + argument -> + argument != null + && argument.containsAll(entityTypes) + && entityTypes.containsAll(argument)), Mockito.eq(query), Mockito.eq(filter), Mockito.eq(start), diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossEntitiesResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossEntitiesResolverTest.java index a601a815453b2f..42768b8a2de21b 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossEntitiesResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/search/SearchAcrossEntitiesResolverTest.java @@ -462,7 +462,11 @@ private static EntityClient initMockEntityClient( Mockito.when( client.searchAcrossEntities( any(), - Mockito.eq(entityTypes), + Mockito.argThat( + argument -> + argument != null + && argument.containsAll(entityTypes) + && entityTypes.containsAll(argument)), Mockito.eq(query), Mockito.eq(filter), Mockito.eq(start), @@ -483,7 +487,11 @@ private static void verifyMockEntityClient( Mockito.verify(mockClient, Mockito.times(1)) .searchAcrossEntities( any(), - Mockito.eq(entityTypes), + Mockito.argThat( + argument -> + argument != null + && argument.containsAll(entityTypes) + && entityTypes.containsAll(argument)), Mockito.eq(query), Mockito.eq(filter), Mockito.eq(start), diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/DateUtilTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/DateUtilTest.java index 6ecbc8d015b29a..4383df9d46a4bc 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/DateUtilTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/utils/DateUtilTest.java @@ -47,4 +47,26 @@ public void testStartOfNextWeek() { Mockito.when(dateUtil.getNow()).thenReturn(setTimeParts(8, false)); assertEqualStartOfNextWeek(dateUtil, 9); } + + // validates logic to display correct dates in MAU chart + @Test + public void testDateAdjustmentsForMonth() { + DateUtil dateUtil = Mockito.spy(DateUtil.class); + + Mockito.when(dateUtil.getNow()).thenReturn(new 
DateTime(2024, 11, 15, 0, 0, 0));
+
+    // start date should be next month minus a day
+    // but we want to display Dec 1 instead of Nov 30, so add a day and verify it's Dec
+    DateTime startOfNextMonthMinus12 = dateUtil.getStartOfNextMonth().minusMonths(12);
+    DateTime adjustedStart = startOfNextMonthMinus12.minusMillis(1).plusDays(1);
+    assertEquals(12, adjustedStart.getMonthOfYear()); // Verify it is December
+    assertEquals(2023, adjustedStart.getYear()); // Verify it is 2023
+
+    // verify that the end date displays correctly
+    // the chart will display Oct 1 as the last month because we don't show current month
+    DateTime startOfThisMonth = dateUtil.getStartOfThisMonth();
+    DateTime adjustedEnd = startOfThisMonth.minusMillis(1).plusDays(1);
+    assertEquals(11, adjustedEnd.getMonthOfYear()); // Verify it is November
+    assertEquals(2024, adjustedEnd.getYear()); // Verify it is 2024
+  }
 }
diff --git a/datahub-web-react/src/Mocks.tsx b/datahub-web-react/src/Mocks.tsx
index aed672a34e7caf..329d6250e576ab 100644
--- a/datahub-web-react/src/Mocks.tsx
+++ b/datahub-web-react/src/Mocks.tsx
@@ -566,6 +566,12 @@ export const dataset3 = {
             username: 'datahub',
             type: EntityType.CorpUser,
         },
+        actor: {
+            __typename: 'CorpUser',
+            urn: 'urn:li:corpuser:datahub',
+            username: 'datahub',
+            type: EntityType.CorpUser,
+        },
         description: 'This only points to Google',
         label: 'This only points to Google',
         created: {
diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/LinkButton.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/LinkButton.tsx
index 0ce3c9641d5597..c3896baedace79 100644
--- a/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/LinkButton.tsx
+++ b/datahub-web-react/src/app/entity/shared/containers/profile/sidebar/LinkButton.tsx
@@ -29,7 +29,7 @@ export default function LinkButton({ link }: Props) {
            href={link.url}
            target="_blank"
            rel="noreferrer"
-            key={`${link.label}-${link.url}-${link.author}`}
+            key={`${link.label}-${link.url}-${link.actor.urn}`}
        >
            {link.description || link.label}
diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx
index 7212198bbf61ca..6eb680785599e1 100644
--- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx
+++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/components/LinkList.tsx
@@ -3,7 +3,7 @@ import { Link } from 'react-router-dom';
 import styled from 'styled-components/macro';
 import { message, Button, List, Typography, Modal, Form, Input } from 'antd';
 import { LinkOutlined, DeleteOutlined, EditOutlined } from '@ant-design/icons';
-import { EntityType, InstitutionalMemoryMetadata } from '../../../../../../types.generated';
+import { InstitutionalMemoryMetadata } from '../../../../../../types.generated';
 import { useEntityData, useMutationUrn } from '../../../EntityContext';
 import { useEntityRegistry } from '../../../../../useEntityRegistry';
 import { ANTD_GRAY } from '../../../constants';
@@ -182,10 +182,8 @@ export const LinkList = ({ refetch }: LinkListProps) => {
                            description={
                                <>
                                    Added {formatDateString(link.created.time)} by{' '}
-                                    <Link to={`${entityRegistry.getEntityUrl(EntityType.CorpUser, link.author.urn)}`}>
-                                        {link.author.username}
+                                    <Link to={`${entityRegistry.getEntityUrl(link.actor.type, link.actor.urn)}`}>
+                                        {entityRegistry.getDisplayName(link.actor.type, link.actor)}
                                    </Link>
                                </>
                            }
diff --git a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx
index ccfa200fab630f..1990a3d7798973 100644
---
a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx +++ b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx @@ -193,7 +193,9 @@ export const IngestionSourceList = () => { const formatExtraArgs = (extraArgs): StringMapEntryInput[] => { if (extraArgs === null || extraArgs === undefined) return []; - return extraArgs.map((entry) => ({ key: entry.key, value: entry.value })); + return extraArgs + .filter((entry) => entry.value !== null && entry.value !== undefined && entry.value !== '') + .map((entry) => ({ key: entry.key, value: entry.value })); }; const createOrUpdateIngestionSource = ( diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index c20869a1c849c2..70d9baabdb4bc6 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -284,7 +284,7 @@ "name": "csv-enricher", "displayName": "CSV", "description": "Import metadata from a formatted CSV.", - "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/csv'", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/csv-enricher", "recipe": "source: \n type: csv-enricher \n config: \n # URL of your csv file to ingest \n filename: \n array_delimiter: '|' \n delimiter: ',' \n write_semantics: PATCH" }, { @@ -317,5 +317,13 @@ "displayName": "CassandraDB", "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/cassandra", "recipe": "source:\n type: cassandra\n config:\n # Credentials for on prem cassandra\n contact_point: localhost\n port: 9042\n username: admin\n password: password\n\n # Or\n # Credentials Astra Cloud\n #cloud_config:\n # secure_connect_bundle: Path to Secure Connect Bundle (.zip)\n # token: Application Token\n\n # Optional Allow / Deny extraction of particular keyspaces.\n keyspace_pattern:\n allow: [.*]\n\n # Optional Allow / Deny extraction of particular tables.\n table_pattern:\n allow: [.*]" + }, + { + "urn": "urn:li:dataPlatform:iceberg", + "name": "iceberg", + "displayName": "Iceberg", + "description": "Ingest databases and tables from any Iceberg catalog implementation", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/iceberg", + "recipe": "source:\n type: \"iceberg\"\n config:\n env: dev\n # each thread will open internet connections to fetch manifest files independently, \n # this value needs to be adjusted with ulimit\n processing_threads: 1 \n # a single catalog definition with a form of a dictionary\n catalog: \n demo: # name of the catalog\n type: \"rest\" # other types are available\n uri: \"uri\"\n s3.access-key-id: \"access-key\"\n s3.secret-access-key: \"secret-access-key\"\n s3.region: \"aws-region\"\n profiling:\n enabled: false\n" } ] diff --git a/datahub-web-react/src/app/ingest/source/conf/csv/csv.ts b/datahub-web-react/src/app/ingest/source/conf/csv/csv.ts index e1dc22c086fb43..e4cdee717923c2 100644 --- a/datahub-web-react/src/app/ingest/source/conf/csv/csv.ts +++ b/datahub-web-react/src/app/ingest/source/conf/csv/csv.ts @@ -15,7 +15,7 @@ const csvConfig: SourceConfig = { type: 'csv-enricher', placeholderRecipe, displayName: 'CSV', - docsUrl: 'https://datahubproject.io/docs/generated/ingestion/sources/csv', + docsUrl: 'https://datahubproject.io/docs/generated/ingestion/sources/csv-enricher', logoUrl: csvLogo, }; diff --git a/datahub-web-react/src/graphql-mock/mutationHelper.ts b/datahub-web-react/src/graphql-mock/mutationHelper.ts index 
a97b41b53bc656..0cf4f5f87f29ca 100644 --- a/datahub-web-react/src/graphql-mock/mutationHelper.ts +++ b/datahub-web-react/src/graphql-mock/mutationHelper.ts @@ -99,6 +99,7 @@ export const updateEntityLink = ({ entity, institutionalMemory }: UpdateEntityLi description: e.description as string, label: e.description as string, author: { urn: e.author, username: '', type: EntityType.CorpUser }, + actor: { urn: e.author, username: '', type: EntityType.CorpUser }, created: { time: Date.now(), actor: getActor(), __typename: 'AuditStamp' }, associatedUrn: dataEntity.urn, }; diff --git a/datahub-web-react/src/graphql/domain.graphql b/datahub-web-react/src/graphql/domain.graphql index 3897a2ced85b8f..2e96a78b0f44b4 100644 --- a/datahub-web-react/src/graphql/domain.graphql +++ b/datahub-web-react/src/graphql/domain.graphql @@ -19,9 +19,8 @@ query getDomain($urn: String!) { institutionalMemory { elements { url - author { - urn - username + actor { + ...resolvedActorFields } description created { diff --git a/datahub-web-react/src/graphql/fragments.graphql b/datahub-web-react/src/graphql/fragments.graphql index 7ce4082c42f61d..67dbdbbb22f309 100644 --- a/datahub-web-react/src/graphql/fragments.graphql +++ b/datahub-web-react/src/graphql/fragments.graphql @@ -202,12 +202,22 @@ fragment embedFields on Embed { renderUrl } +fragment resolvedActorFields on ResolvedActor { + ... on CorpUser { + urn + ...entityDisplayNameFields + } + ... on CorpGroup { + urn + ...entityDisplayNameFields + } +} + fragment institutionalMemoryFields on InstitutionalMemory { elements { url - author { - urn - username + actor { + ...resolvedActorFields } description created { diff --git a/docker/profiles/docker-compose.gms.yml b/docker/profiles/docker-compose.gms.yml index 824c8024b05d63..e8b2d4cd1f29d3 100644 --- a/docker/profiles/docker-compose.gms.yml +++ b/docker/profiles/docker-compose.gms.yml @@ -101,6 +101,7 @@ x-datahub-gms-service: &datahub-gms-service <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env] ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-search_config.yaml} ALTERNATE_MCP_VALIDATION: ${ALTERNATE_MCP_VALIDATION:-true} + STRICT_URN_VALIDATION_ENABLED: ${STRICT_URN_VALIDATION_ENABLED:-true} healthcheck: test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health start_period: 90s @@ -183,6 +184,7 @@ x-datahub-mce-consumer-service: &datahub-mce-consumer-service environment: &datahub-mce-consumer-env <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env] ALTERNATE_MCP_VALIDATION: ${ALTERNATE_MCP_VALIDATION:-true} + STRICT_URN_VALIDATION_ENABLED: ${STRICT_URN_VALIDATION_ENABLED:-true} x-datahub-mce-consumer-service-dev: &datahub-mce-consumer-service-dev <<: *datahub-mce-consumer-service diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index 0470723c1adb79..3a9d6e10ea8d42 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -989,6 +989,7 @@ module.exports = { // "metadata-ingestion/examples/structured_properties/README" // "smoke-test/tests/openapi/README" // "docs/SECURITY_STANCE" + // "metadata-integration/java/datahub-schematron/README" // ], ], }; diff --git a/docs/automations/snowflake-tag-propagation.md b/docs/automations/snowflake-tag-propagation.md index b72224642b0f07..8eded451644cce 100644 --- a/docs/automations/snowflake-tag-propagation.md +++ 
b/docs/automations/snowflake-tag-propagation.md
@@ -4,6 +4,8 @@

 import FeatureAvailability from '@site/src/components/FeatureAvailability';

+> Note that this Automation is currently in open **Beta**. With any questions or issues, please reach out to your Acryl representative.
+
 ## Introduction

 Snowflake Tag Propagation is an automation that allows you to sync DataHub Glossary Terms and Tags on
@@ -15,6 +17,41 @@ both columns and tables back to Snowflake. This automation is available in DataH

 - Automatically Add DataHub Tags to Snowflake Tables and Columns
 - Automatically Remove DataHub Glossary Terms and Tags from Snowflake Tables and Columns when they are removed in DataHub

+## Prerequisites
+
+### Permissions Required for Tag Management
+
+- `CREATE TAG`: Required to create new tags in Snowflake.
+Ensure the user or role has this privilege on the specific schema or database where tags will be created.
+- `APPLY TAG`: Required to assign tags to Snowflake objects such as tables, columns, or other database objects.
+This permission must be granted at the database, schema, or object level depending on the scope.
+
+
+### Permissions Required for Object Access
+
+- `USAGE` on the database and schema: Allows access to the database and schema to view and apply changes.
+- `SELECT` on the objects (tables, views, etc.): Enables the automation to read metadata and verify existing tags.
+
+### Example Permission Grant Statements
+
+To grant the necessary permissions for a specific role (`DATAHUB_AUTOMATION_ROLE`), you can use the following SQL commands:
+
+```sql
+-- Tag management permissions
+GRANT CREATE TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
+GRANT APPLY TAG ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
+
+-- Object access for metadata operations
+GRANT USAGE ON DATABASE your_database TO ROLE DATAHUB_AUTOMATION_ROLE;
+GRANT USAGE ON SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
+GRANT SELECT ON ALL TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
+
+-- Future privileges for tagging
+GRANT SELECT ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
+GRANT APPLY TAG ON FUTURE TABLES IN SCHEMA your_database.your_schema TO ROLE DATAHUB_AUTOMATION_ROLE;
+```
+
+
 ## Enabling Snowflake Tag Sync

 1. **Navigate to Automations**: Click on 'Govern' > 'Automations' in the navigation bar.
diff --git a/docs/deploy/environment-vars.md b/docs/deploy/environment-vars.md
index 6429996c088b4a..13d7410397533d 100644
--- a/docs/deploy/environment-vars.md
+++ b/docs/deploy/environment-vars.md
@@ -9,12 +9,13 @@ DataHub works.

 ## Feature Flags

-| Variable                                         | Default | Unit/Type | Components                              | Description                                                                                                                  |
-|--------------------------------------------------|---------|-----------|-----------------------------------------|------------------------------------------------------------------------------------------------------------------------------|
-| `UI_INGESTION_ENABLED`                           | `true`  | boolean   | [`GMS`, `MCE Consumer`]                 | Enable UI based ingestion.                                                                                                   |
-| `DATAHUB_ANALYTICS_ENABLED`                      | `true`  | boolean   | [`Frontend`, `GMS`]                     | Collect DataHub usage to populate the analytics dashboard.                                                                   |
-| `BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE` | `true`  | boolean   | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Do not wait for the `system-update` to complete before starting. This should typically only be disabled during development. |
-| `ER_MODEL_RELATIONSHIP_FEATURE_ENABLED`          | `false` | boolean   | [`Frontend`, `GMS`]                     | Enable ER Model Relation Feature that shows Relationships Tab within a Dataset UI.                                           |
+| Variable                                         | Default  | Unit/Type | Components                               | Description                                                                                                                   |
+|--------------------------------------------------|----------|-----------|------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
+| `UI_INGESTION_ENABLED`                           | `true`   | boolean   | [`GMS`, `MCE Consumer`]                  | Enable UI based ingestion.                                                                                                    |
+| `DATAHUB_ANALYTICS_ENABLED`                      | `true`   | boolean   | [`Frontend`, `GMS`]                      | Collect DataHub usage to populate the analytics dashboard.                                                                    |
+| `BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE` | `true`   | boolean   | [`GMS`, `MCE Consumer`, `MAE Consumer`]  | Do not wait for the `system-update` to complete before starting. This should typically only be disabled during development.  |
+| `ER_MODEL_RELATIONSHIP_FEATURE_ENABLED`          | `false`  | boolean   | [`Frontend`, `GMS`]                      | Enable ER Model Relation Feature that shows Relationships Tab within a Dataset UI.                                            |
+| `STRICT_URN_VALIDATION_ENABLED`                  | `false`  | boolean   | [`GMS`, `MCE Consumer`, `MAE Consumer`]  | Enable stricter URN validation logic.                                                                                         |

 ## Ingestion
diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md
index c21d197de29f08..087e30c2e541ad 100644
--- a/docs/how/updating-datahub.md
+++ b/docs/how/updating-datahub.md
@@ -38,7 +38,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe

 ### Breaking Changes

-- #11486 - Deprecated Criterion filters using `value`. Use `values` instead. This also deprecates the ability to use comma delimited string to represent multiple values using `value`.
+- #11486 - Criterion's `value` parameter was previously deprecated. Use of `value` instead of `values` is no longer supported and will be removed entirely in the next major version.
 - #11484 - Metadata service authentication enabled by default
 - #11484 - Rest API authorization enabled by default
 - #10472 - `SANDBOX` added as a FabricType. No rollbacks allowed once metadata with this fabric type is added without manual cleanups in databases.
@@ -88,6 +88,9 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
 ### Other Notable Changes

 - Downgrade to previous version is not automatically supported.
+- Data Product Properties Unset side effect introduced
+  - Previously, Data Products could be set as linked to multiple Datasets if modified directly via the REST API rather than linked through the UI or GraphQL. This side effect aligns the REST API behavior with the GraphQL behavior by introducing a side effect that enforces the 1-to-1 constraint between Data Products and Datasets.
+  - NOTE: There is a pathological write pattern for Data Products that can cause issues with write processing under this side effect. If you constantly change all of the Datasets associated with a Data Product back and forth between multiple Data Products, it will result in a high volume of writes due to the need to unset previous associations.
## 0.14.0.2
diff --git a/docs/managed-datahub/release-notes/v_0_3_7.md b/docs/managed-datahub/release-notes/v_0_3_7.md
index 19cb04e9f56039..94cbdd79dbf5ef 100644
--- a/docs/managed-datahub/release-notes/v_0_3_7.md
+++ b/docs/managed-datahub/release-notes/v_0_3_7.md
@@ -7,7 +7,7 @@ Release Availability Date

 Recommended CLI/SDK
 ---
-- `v0.14.1.11` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.11
+- `v0.14.1.12` with release notes at https://github.com/datahub/datahub/releases/tag/v0.14.1.12

 If you are using an older CLI/SDK version, then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, GitHub Actions, Airflow, in Python SDK somewhere, Java SDK, etc. This is a strong recommendation to upgrade, as we keep on pushing fixes in the CLI, and it helps us support you better.
@@ -19,6 +19,26 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies

 ## Release Changelog
 ---
+### v0.3.7.4
+
+- [#11935](https://github.com/datahub-project/datahub/pull/11935) - Added environment variable for enabling stricter URN validation rules `STRICT_URN_VALIDATION_ENABLED` [[1](https://datahubproject.io/docs/what/urn/#restrictions)].
+- [Automations] Filter out self-nodes in glossary term propagation
+- [Remote Executor] Allow dashes in executor ids.
+- [Search] Fix Nested Filter Counts in Primary Search
+- [Search] Fix white screen of death on empty search result
+- [Columns Tab] Support searching nested struct columns correctly in V2 UI.
+- [Logo] Fix fit of custom logo for V2 UI nav bar.
+- [Structured Properties] Better handling for special characters in structured properties
+- [Lineage] Improvements to handling lineage cycles
+- [Metadata Tests] Improve Reliability of Metadata Tests Action Application
+- [Slack Integration] Minor improvement in authentication redirect to integrate with Slack
+- [Columns Tab] Properly display nullable status in column sidebar (bug fix)
+- [Columns Tab] Fixing merging of sibling schemas between V2 and V1 field paths.
+- [Documentation] Support group authors for institutional memory aspect
+
+
+### v0.3.7
+
 - All changes in https://github.com/datahub-project/datahub/releases/tag/v0.14.1
   - Note Breaking Changes: https://datahubproject.io/docs/how/updating-datahub/#0141
@@ -32,7 +52,7 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
     datahub:
       timezone: 'America/Los_Angeles'
     ```
-  - #11486 - Deprecated Criterion filters using `value`. Use `values` instead. This also deprecates the ability to use comma delimited string to represent multiple values using `value`.
+  - #11486 - Criterion's `value` parameter was previously deprecated. Use of `value` instead of `values` is no longer supported and will be removed entirely in the next major version.
   - #10472 - `SANDBOX` added as a FabricType. No rollbacks allowed once metadata with this fabric type is added without manual cleanups in databases.
   - #11619 - schema field/column paths can no longer be empty strings
   - #11619 - schema field/column paths can no longer be duplicated within the schema
@@ -96,7 +116,7 @@ If you are using an older CLI/SDK version, then please upgrade it.
This applies for all CLI/SDK usages, if you are using it through your terminal, GitHub Actions, Airflow, in Python SDK somewhere, Java SDK, etc. This is a strong recommendation to upgrade, as we keep on pushing fixes in the CLI, and it helps us support you better.

 - Improved UX for setting up and managing SSO

 - Ingestion changes
-  - In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.11
+  - In addition to the improvements listed here: https://github.com/acryldata/datahub/releases/tag/v0.14.1.12
   - PowerBI: Support for PowerBI Apps and cross-workspace lineage
   - Fivetran: Major improvements to configurability and improved reliability with large Fivetran setups
   - Snowflake & BigQuery: Improved handling of temporary tables and swap statements when generating lineage
@@ -120,3 +140,6 @@ If you are using an older CLI/SDK version, then please upgrade it. This applies
   - (system / internal) Exclude form-prompt tests in live Metadata Tests evaluation
   - (system / internal) Exclude form-prompt tests in stored Metadata Test results
   - Elasticsearch reindex time limit of 8h removed
+  - Data Product Properties Unset side effect introduced
+    - Previously, Data Products could be set as linked to multiple Datasets if modified directly via the REST API rather than linked through the UI or GraphQL. This side effect aligns the REST API behavior with the GraphQL behavior by introducing a side effect that enforces the 1-to-1 constraint between Data Products and Datasets.
+    - NOTE: There is a pathological write pattern for Data Products that can cause issues with write processing under this side effect. If you constantly change all of the Datasets associated with a Data Product back and forth between multiple Data Products, it will result in a high volume of writes due to the need to unset previous associations.
diff --git a/docs/what/urn.md b/docs/what/urn.md
index e35ca7fbaca4bc..c7fb0555cd992f 100644
--- a/docs/what/urn.md
+++ b/docs/what/urn.md
@@ -35,10 +35,17 @@ urn:li:dataset:(urn:li:dataPlatform:hdfs,PageViewEvent,EI)

 ## Restrictions

-There are a few restrictions when creating an urn:
+There are a few restrictions when creating a URN:

-1. Commas are reserved character in URN fields: `,`
-2. Parentheses are reserved characters in URN fields: `( , )`
-3. Colons are reserved characters in URN fields: `:`
+The following characters are not allowed anywhere in a URN:
+
+1. Parentheses are reserved characters in URN fields: `(` or `)`
+2. The "unit separator" unicode character `␟` (U+241F)
+
+The following characters are not allowed within a URN tuple:
+
+1. Commas are reserved characters in URN tuples: `,`
+
+Example: `urn:li:dashboard:(looker,dashboards.thelook)` is a valid URN, but `urn:li:dashboard:(looker,dashboards.the,look)` is invalid.

 Please do not use these characters when creating or generating urns. One approach is to use URL encoding for the characters.
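+
+For example, a producer could percent-encode reserved characters before assembling the URN. A minimal sketch in Python (illustrative only; `urllib.parse.quote` is one possible encoding choice, not a DataHub API):
+
+```python
+from urllib.parse import quote
+
+# A dashboard name that contains a reserved comma.
+raw_name = "dashboards.the,look"
+
+# Percent-encode everything outside [A-Za-z0-9_.~-], so reserved URN
+# characters such as "," "(" ")" and ":" never appear unescaped.
+safe_name = quote(raw_name, safe="")
+
+urn = f"urn:li:dashboard:(looker,{safe_name})"
+print(urn)  # urn:li:dashboard:(looker,dashboards.the%2Clook)
+```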
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/AspectsBatch.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/AspectsBatch.java
index dc7934ad5cc193..30f5dce379a077 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/AspectsBatch.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/AspectsBatch.java
@@ -9,6 +9,7 @@ import com.linkedin.util.Pair;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -49,7 +50,8 @@ default List<MCPItem> getMCPItems() {
    * various hooks
    */
   Pair<Map<String, Set<String>>, List<ChangeMCP>> toUpsertBatchItems(
-      Map<String, Map<String, SystemAspect>> latestAspects);
+      Map<String, Map<String, SystemAspect>> latestAspects,
+      Map<String, Map<String, Long>> nextVersions);

   /**
    * Apply read mutations to batch
@@ -227,4 +229,39 @@ static String toAbbreviatedString(Collection<? extends BatchItem> items, int max
         + StringUtils.abbreviate(itemsAbbreviated.toString(), maxWidth)
         + '}';
   }
+
+  /**
+   * Increment an aspect within a batch, tracking both the next aspect version and the most recent
+   * value
+   *
+   * @param changeMCP changeMCP to be incremented
+   * @param latestAspects latest aspects within the batch
+   * @param nextVersions next version for the aspects in the batch
+   * @return the incremented changeMCP
+   */
+  static ChangeMCP incrementBatchVersion(
+      ChangeMCP changeMCP,
+      Map<String, Map<String, SystemAspect>> latestAspects,
+      Map<String, Map<String, Long>> nextVersions) {
+    long nextVersion =
+        nextVersions
+            .getOrDefault(changeMCP.getUrn().toString(), Collections.emptyMap())
+            .getOrDefault(changeMCP.getAspectName(), 0L);
+
+    changeMCP.setPreviousSystemAspect(
+        latestAspects
+            .getOrDefault(changeMCP.getUrn().toString(), Collections.emptyMap())
+            .getOrDefault(changeMCP.getAspectName(), null));
+
+    changeMCP.setNextAspectVersion(nextVersion);
+
+    // support inner-batch upserts
+    latestAspects
+        .computeIfAbsent(changeMCP.getUrn().toString(), key -> new HashMap<>())
+        .put(changeMCP.getAspectName(), changeMCP.getSystemAspect(nextVersion));
+    nextVersions
+        .computeIfAbsent(changeMCP.getUrn().toString(), key -> new HashMap<>())
+        .put(changeMCP.getAspectName(), nextVersion + 1);
+
+    return changeMCP;
+  }
 }
diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java
index d4e94e1e82e8f6..2423e37e6d5419 100644
--- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java
+++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/patch/template/TemplateUtil.java
@@ -81,14 +81,13 @@ public static JsonNode populateTopLevelKeys(JsonNode transformedNode, JsonPatch
           PatchOperationType.REMOVE.equals(operationPath.getFirst())
               ?
keys.length : keys.length - 1; - // Skip first as it will always be blank due to path starting with / for (int i = 1; i < endIdx; i++) { + String decodedKey = decodeValue(keys[i]); if (parent.get(keys[i]) == null) { - String decodedKey = decodeValue(keys[i]); ((ObjectNode) parent).set(decodedKey, instance.objectNode()); } - parent = parent.get(keys[i]); + parent = parent.get(decodedKey); } } diff --git a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java index 5042c35d2f5d47..f9af15a3d4cc6c 100644 --- a/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java +++ b/entity-registry/src/test/java/com/linkedin/metadata/aspect/patch/template/UpstreamLineageTemplateTest.java @@ -185,6 +185,29 @@ public void testPatchUpstream() throws Exception { // New entry in array because of new transformation type assertEquals(result4.getFineGrainedLineages().get(3), fineGrainedLineage4); + JsonPatchBuilder patchOperations5 = Json.createPatchBuilder(); + String urn4 = + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket/hive/folder_1/folder_2/my_dataset,DEV),c2)"; + UrnArray downstreamUrns5 = new UrnArray(); + downstreamUrns5.add(Urn.createFromString(urn4)); + patchOperations5.add( + "/fineGrainedLineages/TRANSFORM/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:s3,test-bucket~1hive~1folder_1~1folder_2~1my_dataset,DEV),c2)/urn:li:query:anotherQuery/urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,upstream_table_1,PROD),c2)", + finegrainedLineageNode5.build()); + JsonPatch jsonPatch5 = patchOperations5.build(); + UpstreamLineage result5 = upstreamLineageTemplate.applyPatch(result4, jsonPatch5); + // Hack because Jackson parses values to doubles instead of floats + DataMap dataMap5 = new DataMap(); + dataMap5.put("confidenceScore", 1.0); + FineGrainedLineage fineGrainedLineage5 = new FineGrainedLineage(dataMap5); + fineGrainedLineage5.setUpstreams(upstreamUrns3); + fineGrainedLineage5.setDownstreams(downstreamUrns5); + fineGrainedLineage5.setTransformOperation("TRANSFORM"); + fineGrainedLineage5.setUpstreamType(FineGrainedLineageUpstreamType.FIELD_SET); + fineGrainedLineage5.setDownstreamType(FineGrainedLineageDownstreamType.FIELD); + fineGrainedLineage5.setQuery(UrnUtils.getUrn("urn:li:query:anotherQuery")); + // New entry in array because of new transformation type + assertEquals(result5.getFineGrainedLineages().get(4), fineGrainedLineage5); + // Remove JsonPatchBuilder removeOperations = Json.createPatchBuilder(); removeOperations.remove( diff --git a/li-utils/src/main/java/com/linkedin/metadata/Constants.java b/li-utils/src/main/java/com/linkedin/metadata/Constants.java index f1f096640bc216..077e0e2b666be1 100644 --- a/li-utils/src/main/java/com/linkedin/metadata/Constants.java +++ b/li-utils/src/main/java/com/linkedin/metadata/Constants.java @@ -10,6 +10,7 @@ public class Constants { public static final String INTERNAL_DELEGATED_FOR_ACTOR_HEADER_NAME = "X-DataHub-Delegated-For"; public static final String INTERNAL_DELEGATED_FOR_ACTOR_TYPE = "X-DataHub-Delegated-For-"; + public static final String URN_LI_PREFIX = "urn:li:"; public static final String DATAHUB_ACTOR = "urn:li:corpuser:datahub"; // Super user. public static final String SYSTEM_ACTOR = "urn:li:corpuser:__datahub_system"; // DataHub internal service principal. 
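The escaped paths exercised in the `UpstreamLineageTemplateTest` change above (`test-bucket~1hive~1...`) follow JSON Pointer escaping (RFC 6901): a `/` inside a key is written as `~1` and a `~` as `~0`, which is what the `decodeValue` call in `TemplateUtil` reverses. A minimal round-trip sketch (illustrative Python, not code from this patch):

```python
def encode_pointer_token(token: str) -> str:
    # RFC 6901: escape "~" first so it cannot collide with the "~1" produced for "/".
    return token.replace("~", "~0").replace("/", "~1")


def decode_pointer_token(token: str) -> str:
    # Decode in the opposite order: "~1" back to "/", then "~0" back to "~".
    return token.replace("~1", "/").replace("~0", "~")


path = "test-bucket/hive/folder_1/folder_2/my_dataset"
encoded = encode_pointer_token(path)
assert encoded == "test-bucket~1hive~1folder_1~1folder_2~1my_dataset"
assert decode_pointer_token(encoded) == path
```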
diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_ol_adapter.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_ol_adapter.py
index 7d35791bf1db42..69de61aced0a59 100644
--- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_ol_adapter.py
+++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_ol_adapter.py
@@ -8,7 +8,6 @@

 OL_SCHEME_TWEAKS = {
     "sqlserver": "mssql",
-    "trino": "presto",
     "awsathena": "athena",
 }
diff --git a/metadata-ingestion/docs/sources/iceberg/iceberg.md b/metadata-ingestion/docs/sources/iceberg/iceberg.md
index 7e40315a2e3193..92aac5ffa6ce51 100644
--- a/metadata-ingestion/docs/sources/iceberg/iceberg.md
+++ b/metadata-ingestion/docs/sources/iceberg/iceberg.md
@@ -18,6 +18,8 @@ This ingestion source maps the following Source System Concepts to DataHub Conce

 ## Troubleshooting

-### [Common Issue]
+### Exceptions while increasing `processing_threads`

-[Provide description of common issues with this integration and steps to resolve]
+Each processing thread will open several files/sockets to download manifest files from blob storage. If you experience
+exceptions when increasing the `processing_threads` configuration parameter, try increasing the limit of open
+files (e.g. using `ulimit` on Linux).
diff --git a/metadata-ingestion/docs/sources/sigma/sigma_pre.md b/metadata-ingestion/docs/sources/sigma/sigma_pre.md
index 382a2fe67b944d..433f85a69f907c 100644
--- a/metadata-ingestion/docs/sources/sigma/sigma_pre.md
+++ b/metadata-ingestion/docs/sources/sigma/sigma_pre.md
@@ -16,7 +16,7 @@ This source extracts the following:

 | Sigma                  | Datahub                                                        | Notes                            |
 |------------------------|---------------------------------------------------------------|----------------------------------|
 | `Workspace`            | [Container](../../metamodel/entities/container.md)             | SubType `"Sigma Workspace"`      |
-| `Workbook`             | [Container](../../metamodel/entities/container.md)             | SubType `"Sigma Workbook"`       |
+| `Workbook`             | [Dashboard](../../metamodel/entities/dashboard.md)             | SubType `"Sigma Workbook"`       |
 | `Page`                 | [Dashboard](../../metamodel/entities/dashboard.md)             |                                  |
 | `Element`              | [Chart](../../metamodel/entities/chart.md)                     |                                  |
 | `Dataset`              | [Dataset](../../metamodel/entities/dataset.md)                 | SubType `"Sigma Dataset"`        |
diff --git a/metadata-ingestion/examples/mce_files/bootstrap_mce.json b/metadata-ingestion/examples/mce_files/bootstrap_mce.json
index f0c4e7ff996ed3..bc218e5e8c2d53 100644
--- a/metadata-ingestion/examples/mce_files/bootstrap_mce.json
+++ b/metadata-ingestion/examples/mce_files/bootstrap_mce.json
@@ -3613,33 +3613,6 @@
       },
       "systemMetadata": null
     },
-    {
-        "entityType": "post",
-        "entityUrn": "urn:li:post:f3a68539-f7e4-4c41-a4fd-9e57c085d8de",
-        "changeType": "UPSERT",
-        "aspectName": "postInfo",
-        "aspect": {
-          "json": {
-            "type": "HOME_PAGE_ANNOUNCEMENT",
-            "content": {
-              "title": "Join Metadata & AI Summit 2024",
-              "type": "LINK",
-              "link": "http://www.acryldata.io/conference?utm_source=datahub_quickstart&utm_medium=metadata_ai_2024&utm_campaign=pinned_announcement",
-              "media": {
-                "type": "IMAGE",
-                "location": "https://formulatedby.com/wp-content/uploads/2024/07/0193320a6d93e7508d1598f7b24662f75a87e92f-352x456-1.svg"
-              }
-            },
-            "created": 1712547125049,
-            "lastModified": 1712547125049
-          }
-        },
-        "systemMetadata": {
-          "lastObserved": 1712548844816,
-          "runId": "datahub-2024_04_08-13_00_44",
-          "lastRunId": "no-run-id-provided"
-        }
-    },
     {
       "entityType": "post",
       "entityUrn":
"urn:li:post:f3a68539-f7e4-4c41-a4fd-9e57c085d8dd", diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 8ae112c0ab0b2b..d7e056b31370df 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -14,8 +14,8 @@ ) base_requirements = { - # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to a Airflow 2.1 dependency conflict. - "typing_extensions>=3.7.4.3", + # Our min version of typing_extensions is somewhat constrained by Airflow. + "typing_extensions>=3.10.0.2", # Actual dependencies. "typing-inspect", # pydantic 1.8.2 is incompatible with mypy 0.910. @@ -249,7 +249,8 @@ iceberg_common = { # Iceberg Python SDK - "pyiceberg>=0.4,<0.7", + # Kept at 0.4.0 due to higher versions requiring pydantic>2, as soon as we are fine with it, bump this dependency + "pyiceberg>=0.4.0", } mssql_common = { @@ -773,7 +774,7 @@ "trino = datahub.ingestion.source.sql.trino:TrinoSource", "starburst-trino-usage = datahub.ingestion.source.usage.starburst_trino_usage:TrinoUsageSource", "nifi = datahub.ingestion.source.nifi:NifiSource", - "powerbi = datahub.ingestion.source.powerbi:PowerBiDashboardSource", + "powerbi = datahub.ingestion.source.powerbi.powerbi:PowerBiDashboardSource", "powerbi-report-server = datahub.ingestion.source.powerbi_report_server:PowerBiReportServerDashboardSource", "iceberg = datahub.ingestion.source.iceberg.iceberg:IcebergSource", "vertica = datahub.ingestion.source.sql.vertica:VerticaSource", diff --git a/metadata-ingestion/src/datahub/configuration/kafka_consumer_config.py b/metadata-ingestion/src/datahub/configuration/kafka_consumer_config.py index d3ff5998d3e790..cac6bb4996391f 100644 --- a/metadata-ingestion/src/datahub/configuration/kafka_consumer_config.py +++ b/metadata-ingestion/src/datahub/configuration/kafka_consumer_config.py @@ -30,6 +30,9 @@ def _resolve_oauth_callback(self) -> None: call_back = self.get_call_back_attribute() - assert call_back # to silent lint + assert isinstance(call_back, str), ( + "oauth_cb must be a string representing python function reference " + "in the format :." 
+ ) # Set the callback self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = import_path(call_back) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 759aebcfd46b0a..4aa937639e9590 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -67,6 +67,7 @@ SystemMetadataClass, TelemetryClientIdClass, ) +from datahub.telemetry.telemetry import telemetry_instance from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.str_enum import StrEnum from datahub.utilities.urns.urn import Urn, guess_entity_type @@ -1819,4 +1820,5 @@ def get_default_graph() -> DataHubGraph: graph_config = config_utils.load_client_config() graph = DataHubGraph(graph_config) graph.test_connection() + telemetry_instance.set_context(server=graph) return graph diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index 7c3a42c3e08931..667129ff83584a 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -44,7 +44,8 @@ ) from datahub.ingestion.transformer.transform_registry import transform_registry from datahub.metadata.schema_classes import MetadataChangeProposalClass -from datahub.telemetry import stats, telemetry +from datahub.telemetry import stats +from datahub.telemetry.telemetry import telemetry_instance from datahub.utilities._custom_package_loader import model_version_name from datahub.utilities.global_warning_util import ( clear_global_warnings, @@ -273,8 +274,9 @@ def __init__( if self.graph is None and isinstance(self.sink, DatahubRestSink): with _add_init_error_context("setup default datahub client"): self.graph = self.sink.emitter.to_graph() + self.graph.test_connection() self.ctx.graph = self.graph - telemetry.telemetry_instance.update_capture_exception_context(server=self.graph) + telemetry_instance.set_context(server=self.graph) with set_graph_context(self.graph): with _add_init_error_context("configure reporters"): @@ -615,7 +617,7 @@ def log_ingestion_stats(self) -> None: sink_warnings = len(self.sink.get_report().warnings) global_warnings = len(get_global_warnings()) - telemetry.telemetry_instance.ping( + telemetry_instance.ping( "ingest_stats", { "source_type": self.source_type, @@ -637,7 +639,6 @@ def log_ingestion_stats(self) -> None: ), "has_pipeline_name": bool(self.config.pipeline_name), }, - self.ctx.graph, ) def _approx_all_vals(self, d: LossyList[Any]) -> int: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 76c2fbf48ccaba..16a5268a2dea76 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -95,6 +95,10 @@ def cleanup(config: BigQueryV2Config) -> None: "Optionally enabled via `classification.enabled`", supported=True, ) +@capability( + SourceCapability.PARTITION_SUPPORT, + "Enabled by default, partition keys and clustering keys are supported.", +) class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource): def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): super().__init__(config, ctx) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py 
b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 7e8b2931282fff..06842da67f76ca 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -8,7 +8,7 @@ from datahub.ingestion.api.report import Report from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin -from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.ingestion.source_report.time_window import BaseTimeWindowReport from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport @@ -77,7 +77,7 @@ class BigQueryQueriesExtractorReport(Report): @dataclass class BigQueryV2Report( - ProfilingSqlReport, + SQLSourceReport, IngestionStageReport, BaseTimeWindowReport, ClassificationReportMixin, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index 58317b108bef4c..3ce34be8dc89df 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -118,7 +118,6 @@ class BigqueryTable(BaseTable): active_billable_bytes: Optional[int] = None long_term_billable_bytes: Optional[int] = None partition_info: Optional[PartitionInfo] = None - columns_ignore_from_profiling: List[str] = field(default_factory=list) external: bool = False constraints: List[BigqueryTableConstraint] = field(default_factory=list) table_type: Optional[str] = None @@ -152,6 +151,21 @@ class BigqueryDataset: snapshots: List[BigqueryTableSnapshot] = field(default_factory=list) columns: List[BigqueryColumn] = field(default_factory=list) + # Some INFORMATION_SCHEMA views are not available for BigLake tables + # based on Amazon S3 and Blob Storage data. 
+ # https://cloud.google.com/bigquery/docs/omni-introduction#limitations + # Omni Locations - https://cloud.google.com/bigquery/docs/omni-introduction#locations + def is_biglake_dataset(self) -> bool: + return self.location is not None and self.location.lower().startswith( + ("aws-", "azure-") + ) + + def supports_table_constraints(self) -> bool: + return not self.is_biglake_dataset() + + def supports_table_partitions(self) -> bool: + return not self.is_biglake_dataset() + @dataclass class BigqueryProject: @@ -541,18 +555,26 @@ def get_table_constraints_for_dataset( table_name=constraint.table_name, type=constraint.constraint_type, field_path=constraint.column_name, - referenced_project_id=constraint.referenced_catalog - if constraint.constraint_type == "FOREIGN KEY" - else None, - referenced_dataset=constraint.referenced_schema - if constraint.constraint_type == "FOREIGN KEY" - else None, - referenced_table_name=constraint.referenced_table - if constraint.constraint_type == "FOREIGN KEY" - else None, - referenced_column_name=constraint.referenced_column - if constraint.constraint_type == "FOREIGN KEY" - else None, + referenced_project_id=( + constraint.referenced_catalog + if constraint.constraint_type == "FOREIGN KEY" + else None + ), + referenced_dataset=( + constraint.referenced_schema + if constraint.constraint_type == "FOREIGN KEY" + else None + ), + referenced_table_name=( + constraint.referenced_table + if constraint.constraint_type == "FOREIGN KEY" + else None + ), + referenced_column_name=( + constraint.referenced_column + if constraint.constraint_type == "FOREIGN KEY" + else None + ), ) ) self.report.num_get_table_constraints_for_dataset_api_requests += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 6f3008ccfd6923..4a3b47f6b543a6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -498,7 +498,10 @@ def _process_schema( report=self.report, rate_limiter=rate_limiter, ) - if self.config.include_table_constraints: + if ( + self.config.include_table_constraints + and bigquery_dataset.supports_table_constraints() + ): constraints = self.schema_api.get_table_constraints_for_dataset( project_id=project_id, dataset_name=dataset_name, report=self.report ) @@ -595,18 +598,6 @@ def _process_schema( dataset_name=dataset_name, ) - # This method is used to generate the ignore list for datatypes the profiler doesn't support we have to do it here - # because the profiler doesn't have access to columns - def generate_profile_ignore_list(self, columns: List[BigqueryColumn]) -> List[str]: - ignore_list: List[str] = [] - for column in columns: - if not column.data_type or any( - word in column.data_type.lower() - for word in ["array", "struct", "geography", "json"] - ): - ignore_list.append(column.field_path) - return ignore_list - def _process_table( self, table: BigqueryTable, @@ -628,15 +619,6 @@ def _process_table( ) table.column_count = len(columns) - # We only collect profile ignore list if profiling is enabled and profile_table_level_only is false - if ( - self.config.is_profiling_enabled() - and not self.config.profiling.profile_table_level_only - ): - table.columns_ignore_from_profiling = self.generate_profile_ignore_list( - columns - ) - if not table.column_count: logger.warning( f"Table doesn't have any column or 
unable to get columns for table: {table_identifier}" @@ -1157,9 +1139,11 @@ def gen_schema_metadata( # fields=[], fields=self.gen_schema_fields( columns, - table.constraints - if (isinstance(table, BigqueryTable) and table.constraints) - else [], + ( + table.constraints + if (isinstance(table, BigqueryTable) and table.constraints) + else [] + ), ), foreignKeys=foreign_keys if foreign_keys else None, ) @@ -1180,13 +1164,9 @@ def get_tables_for_dataset( ) -> Iterable[BigqueryTable]: # In bigquery there is no way to query all tables in a Project id with PerfTimer() as timer: - # PARTITIONS INFORMATION_SCHEMA view is not available for BigLake tables - # based on Amazon S3 and Blob Storage data. - # https://cloud.google.com/bigquery/docs/omni-introduction#limitations - # Omni Locations - https://cloud.google.com/bigquery/docs/omni-introduction#locations - with_partitions = self.config.have_table_data_read_permission and not ( - dataset.location - and dataset.location.lower().startswith(("aws-", "azure-")) + with_partitions = ( + self.config.have_table_data_read_permission + and dataset.supports_table_partitions() ) # Partitions view throw exception if we try to query partition info for too many tables diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py index 6af8166fbf70c3..182ae2265cb162 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py @@ -166,12 +166,6 @@ def get_workunits( normalized_table_name = BigqueryTableIdentifier( project_id=project_id, dataset=dataset, table=table.name ).get_table_name() - for column in table.columns_ignore_from_profiling: - # Profiler has issues with complex types (array, struct, geography, json), so we deny those types from profiling - # We also filter columns without data type as it means that column is part of a complex type. 
- self.config.profile_pattern.deny.append( - f"^{normalized_table_name}.{column}$" - ) if table.external and not self.config.profiling.profile_external_tables: self.report.profiling_skipped_other[f"{project_id}.{dataset}"] += 1 diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 4598ae388b827d..499e7e1231d050 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -53,19 +53,7 @@ make_assertion_from_test, make_assertion_result_from_test, ) -from datahub.ingestion.source.sql.sql_types import ( - ATHENA_SQL_TYPES_MAP, - BIGQUERY_TYPES_MAP, - POSTGRES_TYPES_MAP, - SNOWFLAKE_TYPES_MAP, - SPARK_SQL_TYPES_MAP, - TRINO_SQL_TYPES_MAP, - VERTICA_SQL_TYPES_MAP, - resolve_athena_modified_type, - resolve_postgres_modified_type, - resolve_trino_modified_type, - resolve_vertica_modified_type, -) +from datahub.ingestion.source.sql.sql_types import resolve_sql_type from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -89,17 +77,11 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent from datahub.metadata.com.linkedin.pegasus2avro.schema import ( - BooleanTypeClass, - DateTypeClass, MySqlDDL, NullTypeClass, - NumberTypeClass, - RecordType, SchemaField, SchemaFieldDataType, SchemaMetadata, - StringTypeClass, - TimeTypeClass, ) from datahub.metadata.schema_classes import ( DataPlatformInstanceClass, @@ -804,28 +786,6 @@ def make_mapping_upstream_lineage( ) -# See https://github.com/fishtown-analytics/dbt/blob/master/core/dbt/adapters/sql/impl.py -_field_type_mapping = { - "boolean": BooleanTypeClass, - "date": DateTypeClass, - "time": TimeTypeClass, - "numeric": NumberTypeClass, - "text": StringTypeClass, - "timestamp with time zone": DateTypeClass, - "timestamp without time zone": DateTypeClass, - "integer": NumberTypeClass, - "float8": NumberTypeClass, - "struct": RecordType, - **POSTGRES_TYPES_MAP, - **SNOWFLAKE_TYPES_MAP, - **BIGQUERY_TYPES_MAP, - **SPARK_SQL_TYPES_MAP, - **TRINO_SQL_TYPES_MAP, - **ATHENA_SQL_TYPES_MAP, - **VERTICA_SQL_TYPES_MAP, -} - - def get_column_type( report: DBTSourceReport, dataset_name: str, @@ -835,24 +795,10 @@ def get_column_type( """ Maps known DBT types to datahub types """ - TypeClass: Any = _field_type_mapping.get(column_type) if column_type else None - - if TypeClass is None and column_type: - # resolve a modified type - if dbt_adapter == "trino": - TypeClass = resolve_trino_modified_type(column_type) - elif dbt_adapter == "athena": - TypeClass = resolve_athena_modified_type(column_type) - elif dbt_adapter == "postgres" or dbt_adapter == "redshift": - # Redshift uses a variant of Postgres, so we can use the same logic. - TypeClass = resolve_postgres_modified_type(column_type) - elif dbt_adapter == "vertica": - TypeClass = resolve_vertica_modified_type(column_type) - elif dbt_adapter == "snowflake": - # Snowflake types are uppercase, so we check that. 
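All of the adapter-specific branching being deleted in this hunk collapses into the single `resolve_sql_type` call added below. A hedged usage sketch, with the semantics inferred from this diff (a platform-aware lookup that returns an instantiated type class, or None for unmapped types; the adapter names are the ones previously special-cased):

    from datahub.ingestion.source.sql.sql_types import resolve_sql_type

    # e.g. a NumberTypeClass instance for a plain numeric column...
    numeric_type = resolve_sql_type("numeric", "postgres")
    # ...including modified types such as trino's decimal(16, 4).
    decimal_type = resolve_sql_type("decimal(16, 4)", "trino")
    # Unknown types come back as None; the caller below falls back to NullTypeClass().
    unknown_type = resolve_sql_type("no-such-type", "postgres")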
- TypeClass = _field_type_mapping.get(column_type.upper()) - - # if still not found, report the warning + + TypeClass = resolve_sql_type(column_type, dbt_adapter) + + # if still not found, report a warning if TypeClass is None: if column_type: report.info( @@ -861,9 +807,9 @@ def get_column_type( context=f"{dataset_name} - {column_type}", log=False, ) - TypeClass = NullTypeClass + TypeClass = NullTypeClass() - return SchemaFieldDataType(type=TypeClass()) + return SchemaFieldDataType(type=TypeClass) @platform_name("dbt") diff --git a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py index ccc685382f374d..926dbd42eb2673 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dremio/dremio_reporting.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from datetime import datetime -from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, ) @@ -10,7 +10,7 @@ @dataclass class DremioSourceReport( - ProfilingSqlReport, StaleEntityRemovalSourceReport, IngestionStageReport + SQLSourceReport, StaleEntityRemovalSourceReport, IngestionStageReport ): num_containers_failed: int = 0 num_datasets_failed: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/feast.py b/metadata-ingestion/src/datahub/ingestion/source/feast.py index e097fd1f221ea5..6330fe0291660d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/feast.py +++ b/metadata-ingestion/src/datahub/ingestion/source/feast.py @@ -42,10 +42,14 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent from datahub.metadata.schema_classes import ( BrowsePathsClass, + GlobalTagsClass, MLFeaturePropertiesClass, MLFeatureTablePropertiesClass, MLPrimaryKeyPropertiesClass, + OwnerClass, + OwnershipClass, StatusClass, + TagAssociationClass, ) # FIXME: ValueType module cannot be used as a type @@ -91,6 +95,24 @@ class FeastRepositorySourceConfig(ConfigModel): environment: str = Field( default=DEFAULT_ENV, description="Environment to use when constructing URNs" ) + # owner_mappings example: + # This must be added to the recipe in order to extract owners, otherwise NO owners will be extracted + # owner_mappings: + # - feast_owner_name: "" + # datahub_owner_urn: "urn:li:corpGroup:" + # datahub_ownership_type: "BUSINESS_OWNER" + owner_mappings: Optional[List[Dict[str, str]]] = Field( + default=None, description="Mapping of owner names to owner types" + ) + enable_owner_extraction: bool = Field( + default=False, + description="If this is disabled, then we NEVER try to map owners. 
" + "If this is enabled, then owner_mappings is REQUIRED to extract ownership.", + ) + enable_tag_extraction: bool = Field( + default=False, + description="If this is disabled, then we NEVER try to extract tags.", + ) @platform_name("Feast") @@ -215,10 +237,15 @@ def _get_entity_workunit( """ feature_view_name = f"{self.feature_store.project}.{feature_view.name}" + aspects = ( + [StatusClass(removed=False)] + + self._get_tags(entity) + + self._get_owners(entity) + ) entity_snapshot = MLPrimaryKeySnapshot( urn=builder.make_ml_primary_key_urn(feature_view_name, entity.name), - aspects=[StatusClass(removed=False)], + aspects=aspects, ) entity_snapshot.aspects.append( @@ -243,10 +270,11 @@ def _get_feature_workunit( Generate an MLFeature work unit for a Feast feature. """ feature_view_name = f"{self.feature_store.project}.{feature_view.name}" + aspects = [StatusClass(removed=False)] + self._get_tags(field) feature_snapshot = MLFeatureSnapshot( urn=builder.make_ml_feature_urn(feature_view_name, field.name), - aspects=[StatusClass(removed=False)], + aspects=aspects, ) feature_sources = [] @@ -295,13 +323,18 @@ def _get_feature_view_workunit(self, feature_view: FeatureView) -> MetadataWorkU """ feature_view_name = f"{self.feature_store.project}.{feature_view.name}" + aspects = ( + [ + BrowsePathsClass(paths=[f"/feast/{self.feature_store.project}"]), + StatusClass(removed=False), + ] + + self._get_tags(feature_view) + + self._get_owners(feature_view) + ) feature_view_snapshot = MLFeatureTableSnapshot( urn=builder.make_ml_feature_table_urn("feast", feature_view_name), - aspects=[ - BrowsePathsClass(paths=[f"/feast/{self.feature_store.project}"]), - StatusClass(removed=False), - ], + aspects=aspects, ) feature_view_snapshot.aspects.append( @@ -360,6 +393,64 @@ def _get_on_demand_feature_view_workunit( return MetadataWorkUnit(id=on_demand_feature_view_name, mce=mce) + # If a tag is specified in a Feast object, then the tag will be ingested into Datahub if enable_tag_extraction is + # True, otherwise NO tags will be ingested + def _get_tags(self, obj: Union[Entity, FeatureView, FeastField]) -> list: + """ + Extracts tags from the given object and returns a list of aspects. + """ + aspects: List[Union[GlobalTagsClass]] = [] + + # Extract tags + if self.source_config.enable_tag_extraction: + if obj.tags.get("name"): + tag_name: str = obj.tags["name"] + tag_association = TagAssociationClass( + tag=builder.make_tag_urn(tag_name) + ) + global_tags_aspect = GlobalTagsClass(tags=[tag_association]) + aspects.append(global_tags_aspect) + + return aspects + + # If an owner is specified in a Feast object, it will only be ingested into Datahub if owner_mappings is specified + # and enable_owner_extraction is True in FeastRepositorySourceConfig, otherwise NO owners will be ingested + def _get_owners(self, obj: Union[Entity, FeatureView, FeastField]) -> list: + """ + Extracts owners from the given object and returns a list of aspects. 
+ """ + aspects: List[Union[OwnershipClass]] = [] + + # Extract owner + if self.source_config.enable_owner_extraction: + owner = getattr(obj, "owner", None) + if owner: + # Create owner association, skipping if None + owner_association = self._create_owner_association(owner) + if owner_association: # Only add valid owner associations + owners_aspect = OwnershipClass(owners=[owner_association]) + aspects.append(owners_aspect) + + return aspects + + def _create_owner_association(self, owner: str) -> Optional[OwnerClass]: + """ + Create an OwnerClass instance for the given owner using the owner mappings. + """ + if self.source_config.owner_mappings is not None: + for mapping in self.source_config.owner_mappings: + if mapping["feast_owner_name"] == owner: + ownership_type_class: str = mapping.get( + "datahub_ownership_type", "TECHNICAL_OWNER" + ) + datahub_owner_urn = mapping.get("datahub_owner_urn") + if datahub_owner_urn: + return OwnerClass( + owner=datahub_owner_urn, + type=ownership_type_class, + ) + return None + @classmethod def create(cls, config_dict, ctx): config = FeastRepositorySourceConfig.parse_obj(config_dict) diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py index c4b4186f45fc38..52807ca2a3f026 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/datahub_gc.py @@ -144,15 +144,32 @@ def get_workunits_internal( self, ) -> Iterable[MetadataWorkUnit]: if self.config.cleanup_expired_tokens: - self.revoke_expired_tokens() + try: + self.revoke_expired_tokens() + except Exception as e: + self.report.failure("While trying to cleanup expired token ", exc=e) if self.config.truncate_indices: - self.truncate_indices() + try: + self.truncate_indices() + except Exception as e: + self.report.failure("While trying to truncate indices ", exc=e) if self.dataprocess_cleanup: - yield from self.dataprocess_cleanup.get_workunits_internal() + try: + yield from self.dataprocess_cleanup.get_workunits_internal() + except Exception as e: + self.report.failure("While trying to cleanup data process ", exc=e) if self.soft_deleted_entities_cleanup: - self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() + try: + self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities() + except Exception as e: + self.report.failure( + "While trying to cleanup soft deleted entities ", exc=e + ) if self.execution_request_cleanup: - self.execution_request_cleanup.run() + try: + self.execution_request_cleanup.run() + except Exception as e: + self.report.failure("While trying to cleanup execution request ", exc=e) yield from [] def truncate_indices(self) -> None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py index 130f2c9c2e12fc..0f35e1a67fede7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gc/dataprocess_cleanup.py @@ -404,7 +404,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: try: self.delete_dpi_from_datajobs(datajob_entity) except Exception as e: - logger.error(f"While trying to delete {datajob_entity} got {e}") + self.report.failure( + f"While trying to delete {datajob_entity} ", exc=e + ) if ( datajob_entity.total_runs == 0 and self.config.delete_empty_data_jobs diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index d175fce04a52c2..f7d783cd3dec0b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -7,6 +7,7 @@ import functools import json import logging +import re import threading import traceback import unittest.mock @@ -55,7 +56,7 @@ Cardinality, convert_to_cardinality, ) -from datahub.ingestion.source.sql.sql_common import SQLSourceReport +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.metadata.com.linkedin.pegasus2avro.schema import EditableSchemaMetadata from datahub.metadata.schema_classes import ( DatasetFieldProfileClass, @@ -123,6 +124,8 @@ _datasource_connection_injection_lock = threading.Lock() +NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$") + @contextlib.contextmanager def _inject_connection_into_datasource(conn: Connection) -> Iterator[None]: @@ -165,11 +168,9 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in return convert_to_json_serializable(element_values.fetchone()[0]) elif self.engine.dialect.name.lower() == BIGQUERY: element_values = self.engine.execute( - sa.select( - [ - sa.func.coalesce(sa.text(f"APPROX_COUNT_DISTINCT(`{column}`)")), - ] - ).select_from(self._table) + sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from( + self._table + ) ) return convert_to_json_serializable(element_values.fetchone()[0]) elif self.engine.dialect.name.lower() == SNOWFLAKE: @@ -378,6 +379,9 @@ def _get_columns_to_profile(self) -> List[str]: f"{self.dataset_name}.{col}" ): ignored_columns_by_pattern.append(col) + # We try to ignore nested columns as well + elif not self.config.profile_nested_fields and "." 
in col:
+                ignored_columns_by_pattern.append(col)
             elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                 ignored_columns_by_type.append(col)
             else:
@@ -407,9 +411,18 @@ def _get_columns_to_profile(self) -> List[str]:
         return columns_to_profile

     def _should_ignore_column(self, sqlalchemy_type: sa.types.TypeEngine) -> bool:
-        return str(sqlalchemy_type) in _get_column_types_to_ignore(
-            self.dataset.engine.dialect.name
-        )
+        # We don't profile columns with None types
+        if str(sqlalchemy_type) == "NULL":
+            return True
+
+        sql_type = str(sqlalchemy_type)
+
+        match = re.match(NORMALIZE_TYPE_PATTERN, sql_type)
+
+        if match:
+            sql_type = match.group(1)
+
+        return sql_type in _get_column_types_to_ignore(self.dataset.engine.dialect.name)

     @_run_with_query_combiner
     def _get_column_type(self, column_spec: _SingleColumnSpec, column: str) -> None:
@@ -1397,6 +1410,8 @@ def _get_ge_dataset(
 def _get_column_types_to_ignore(dialect_name: str) -> List[str]:
     if dialect_name.lower() == POSTGRESQL:
         return ["JSON"]
+    elif dialect_name.lower() == BIGQUERY:
+        return ["ARRAY", "STRUCT", "GEOGRAPHY", "JSON"]
     return []
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
index c20506e36a844f..42d0def0a46e7d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
@@ -125,12 +125,16 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     profile_table_size_limit: Optional[int] = Field(
         default=5,
-        description="Profile tables only if their size is less then specified GBs. If set to `null`, no limit on the size of tables to profile. Supported only in `snowflake` and `BigQuery`",
+        description="Profile tables only if their size is less than the specified GBs. If set to `null`, "
+        "no limit on the size of tables to profile. Supported only in `snowflake` and `BigQuery`. "
+        "Supported for `oracle` based on calculated size from gathered stats.",
     )

     profile_table_row_limit: Optional[int] = Field(
         default=5000000,
-        description="Profile tables only if their row count is less then specified count. If set to `null`, no limit on the row count of tables to profile. Supported only in `snowflake` and `BigQuery`",
+        description="Profile tables only if their row count is less than the specified count. If set to `null`, "
+        "no limit on the row count of tables to profile. Supported only in `snowflake` and `BigQuery`. "
+        "Supported for `oracle` based on gathered stats.",
     )

     profile_table_row_count_estimate_only: bool = Field(
@@ -184,6 +188,11 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         ),
     )

+    profile_nested_fields: bool = Field(
+        default=False,
+        description="Whether to profile complex types like structs, arrays, and maps.",
+    )
+
     @pydantic.root_validator(pre=True)
     def deprecate_bigquery_temp_table_schema(cls, values):
         # TODO: Update docs to remove mention of this field.
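For reference, the `NORMALIZE_TYPE_PATTERN` added above strips any parameter or element-type suffix before the ignore-list lookup, which is what lets the bare `ARRAY`/`STRUCT`/`GEOGRAPHY`/`JSON` entries match parameterized BigQuery types. A quick stdlib-only check of that behavior:

    import re

    NORMALIZE_TYPE_PATTERN = re.compile(r"^(.*?)(?:[\[<(].*)?$")

    def normalize(sql_type: str) -> str:
        # Keep everything before the first "[", "<" or "(" (if any).
        match = NORMALIZE_TYPE_PATTERN.match(sql_type)
        return match.group(1) if match else sql_type

    assert normalize("ARRAY<STRING>") == "ARRAY"
    assert normalize("STRUCT<a INT64, b STRING>") == "STRUCT"
    assert normalize("NUMERIC(10, 2)") == "NUMERIC"
    assert normalize("JSON") == "JSON"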
diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index 258a4b9ad6daf6..5931873f54236d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -9,6 +9,7 @@ NoSuchIcebergTableError, NoSuchNamespaceError, NoSuchPropertyException, + NoSuchTableError, ) from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit from pyiceberg.table import Table @@ -104,7 +105,7 @@ @capability(SourceCapability.DESCRIPTIONS, "Enabled by default.") @capability( SourceCapability.OWNERSHIP, - "Optionally enabled via configuration by specifying which Iceberg table property holds user or group ownership.", + "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`", ) @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") class IcebergSource(StatefulIngestionSourceBase): @@ -192,9 +193,7 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]: table = thread_local.local_catalog.load_table(dataset_path) time_taken = timer.elapsed_seconds() self.report.report_table_load_time(time_taken) - LOGGER.debug( - f"Loaded table: {table.identifier}, time taken: {time_taken}" - ) + LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}") yield from self._create_iceberg_workunit(dataset_name, table) except NoSuchPropertyException as e: self.report.report_warning( @@ -206,12 +205,20 @@ def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]: ) except NoSuchIcebergTableError as e: self.report.report_warning( - "no-iceberg-table", + "not-an-iceberg-table", f"Failed to create workunit for {dataset_name}. {e}", ) LOGGER.warning( f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.", ) + except NoSuchTableError as e: + self.report.report_warning( + "no-such-table", + f"Failed to create workunit for {dataset_name}. 
{e}", + ) + LOGGER.warning( + f"NoSuchTableError while processing table {dataset_path}, skipping it.", + ) except Exception as e: self.report.report_failure("general", f"Failed to create workunit: {e}") LOGGER.exception( diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py index 06d929774240ba..709ba431f0f87b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka/kafka.py @@ -148,7 +148,7 @@ def get_kafka_consumer( ) -> confluent_kafka.Consumer: consumer = confluent_kafka.Consumer( { - "group.id": "test", + "group.id": "datahub-kafka-ingestion", "bootstrap.servers": connection.bootstrap, **connection.consumer_config, } @@ -157,11 +157,32 @@ def get_kafka_consumer( if CallableConsumerConfig.is_callable_config(connection.consumer_config): # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration + logger.debug("Initiating polling for kafka consumer") consumer.poll(timeout=30) + logger.debug("Initiated polling for kafka consumer") return consumer +def get_kafka_admin_client( + connection: KafkaConsumerConnectionConfig, +) -> AdminClient: + client = AdminClient( + { + "group.id": "datahub-kafka-ingestion", + "bootstrap.servers": connection.bootstrap, + **connection.consumer_config, + } + ) + if CallableConsumerConfig.is_callable_config(connection.consumer_config): + # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed + # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration + logger.debug("Initiating polling for kafka admin client") + client.poll(timeout=30) + logger.debug("Initiated polling for kafka admin client") + return client + + @dataclass class KafkaSourceReport(StaleEntityRemovalSourceReport): topics_scanned: int = 0 @@ -276,13 +297,7 @@ def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext): def init_kafka_admin_client(self) -> None: try: # TODO: Do we require separate config than existing consumer_config ? 
-            self.admin_client = AdminClient(
-                {
-                    "group.id": "test",
-                    "bootstrap.servers": self.source_config.connection.bootstrap,
-                    **self.source_config.connection.consumer_config,
-                }
-            )
+            self.admin_client = get_kafka_admin_client(self.source_config.connection)
         except Exception as e:
             logger.debug(e, exc_info=e)
             self.report.report_warning(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py
index 1068f335e8f8e5..e69de29bb2d1d6 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py
@@ -1 +0,0 @@
-from datahub.ingestion.source.powerbi.powerbi import PowerBiDashboardSource
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
index 91fa2e96be2cce..f7458c4eb4d5b5 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
@@ -173,7 +173,7 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="redshift",
     )

-    DATABRICK_SQL = DataPlatformPair(
+    DATABRICKS_SQL = DataPlatformPair(
         powerbi_data_platform_name="Databricks", datahub_data_platform_name="databricks"
     )

@@ -313,8 +313,8 @@ class PowerBiDashboardSourceConfig(
         " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
     )

-    # Dataset type mapping PowerBI support many type of data-sources. Here user need to define what type of PowerBI
-    # DataSource need to be mapped to corresponding DataHub Platform DataSource. For example PowerBI `Snowflake` is
+    # Dataset type mapping: PowerBI supports many types of data-sources. Here the user needs to define which type of
+    # PowerBI DataSource maps to the corresponding DataHub platform DataSource. For example, PowerBI `Snowflake` is
     # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
     dataset_type_mapping: Union[
         Dict[str, str], Dict[str, PlatformDetail]
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py
index bb0c0c2f79bbdd..f1691b5df68a94 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py
@@ -1,10 +1,14 @@
 import os
 from abc import ABC
 from dataclasses import dataclass
-from typing import Any, Dict, Optional
+from enum import Enum
+from typing import Any, Dict, List, Optional

 from lark import Tree

+from datahub.ingestion.source.powerbi.config import DataPlatformPair
+from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo
+
 TRACE_POWERBI_MQUERY_PARSER = os.getenv("DATAHUB_TRACE_POWERBI_MQUERY_PARSER", False)

@@ -30,7 +34,7 @@ class IdentifierAccessor(AbstractIdentifierAccessor):
     "[Schema="public",Item="order_date"]" is "items" in ItemSelector. Data of items varies as per DataSource

-    "public_order_date" is in "next" of ItemSelector. The "next" will be None if this identifier is leaf i.e. table
+    "public_order_date" is in "next" of ItemSelector.
The "next" will be None if this identifier is leaf i.e., table """ @@ -53,3 +57,31 @@ class ReferencedTable: database: str schema: str table: str + + +@dataclass +class DataPlatformTable: + data_platform_pair: DataPlatformPair + urn: str + + +@dataclass +class Lineage: + upstreams: List[DataPlatformTable] + column_lineage: List[ColumnLineageInfo] + + @staticmethod + def empty() -> "Lineage": + return Lineage(upstreams=[], column_lineage=[]) + + +class FunctionName(Enum): + NATIVE_QUERY = "Value.NativeQuery" + POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database" + ORACLE_DATA_ACCESS = "Oracle.Database" + SNOWFLAKE_DATA_ACCESS = "Snowflake.Databases" + MSSQL_DATA_ACCESS = "Sql.Database" + DATABRICK_DATA_ACCESS = "Databricks.Catalogs" + GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database" + AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database" + DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs" diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 97698a3d0d56c1..2a5de7494920b2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -7,6 +7,7 @@ import lark from lark import Lark, Tree +import datahub.ingestion.source.powerbi.m_query.data_classes from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( PowerBiDashboardSourceConfig, @@ -65,7 +66,7 @@ def get_upstream_tables( ctx: PipelineContext, config: PowerBiDashboardSourceConfig, parameters: Dict[str, str] = {}, -) -> List[resolver.Lineage]: +) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]: if table.expression is None: logger.debug(f"There is no M-Query expression in table {table.full_name}") return [] @@ -127,12 +128,14 @@ def get_upstream_tables( reporter.m_query_parse_successes += 1 try: - lineage: List[resolver.Lineage] = resolver.MQueryResolver( + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = resolver.MQueryResolver( table=table, parse_tree=parse_tree, reporter=reporter, parameters=parameters, - ).resolve_to_data_platform_table_list( + ).resolve_to_lineage( ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py new file mode 100644 index 00000000000000..13d97a70290298 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/pattern_handler.py @@ -0,0 +1,920 @@ +import logging +from abc import ABC, abstractmethod +from enum import Enum +from typing import Dict, List, Optional, Tuple, Type, Union, cast + +from lark import Tree + +from datahub.emitter import mce_builder as builder +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.powerbi.config import ( + Constant, + DataBricksPlatformDetail, + DataPlatformPair, + PlatformDetail, + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, + PowerBIPlatformDetail, + SupportedDataPlatform, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, +) +from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function +from datahub.ingestion.source.powerbi.m_query.data_classes import 
(
+    AbstractIdentifierAccessor,
+    DataAccessFunctionDetail,
+    DataPlatformTable,
+    FunctionName,
+    IdentifierAccessor,
+    Lineage,
+    ReferencedTable,
+)
+from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
+from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
+
+logger = logging.getLogger(__name__)
+
+
+def get_next_item(items: List[str], item: str) -> Optional[str]:
+    if item in items:
+        try:
+            index = items.index(item)
+            return items[index + 1]
+        except IndexError:
+            logger.debug(f'item:"{item}", not found in item-list: {items}')
+    return None
+
+
+def urn_to_lowercase(value: str, flag: bool) -> str:
+    if flag is True:
+        return value.lower()
+
+    return value
+
+
+def make_urn(
+    config: PowerBiDashboardSourceConfig,
+    platform_instance_resolver: AbstractDataPlatformInstanceResolver,
+    data_platform_pair: DataPlatformPair,
+    server: str,
+    qualified_table_name: str,
+) -> str:
+    platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance(
+        PowerBIPlatformDetail(
+            data_platform_pair=data_platform_pair,
+            data_platform_server=server,
+        )
+    )
+
+    return builder.make_dataset_urn_with_platform_instance(
+        platform=data_platform_pair.datahub_data_platform_name,
+        platform_instance=platform_detail.platform_instance,
+        env=platform_detail.env,
+        name=urn_to_lowercase(
+            qualified_table_name, config.convert_lineage_urns_to_lowercase
+        ),
+    )
+
+
+class AbstractLineage(ABC):
+    """
+    Base class that shares common functionality among the different data platforms for M-Query parsing.
+
+    To create a qualified table name we need to parse the M-Query data-access functions
+    (https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions). Each data-access function
+    follows a defined pattern for accessing the database, schema, and table names; for example, see the M-Query below.
+
+        let
+            Source = Sql.Database("localhost", "library"),
+            dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
+        in
+            dbo_book_issue
+
+    This is MSSQL M-Query, and Sql.Database is the data-access function for MSSQL. If this function is present in an
+    M-Query, the database name is in the second argument of the first statement, while the schema name and table name
+    are in the second statement. The second statement can be repeated to access different tables from MSSQL.
+
+    TwoStepDataAccessPattern extends AbstractLineage and provides the common functionality for data platforms
+    that follow the above type of M-Query pattern.
+
+    The data-access function varies per data platform, for example MySQL.Database for MySQL, PostgreSQL.Database for
+    Postgres, and Oracle.Database for Oracle; the number of statements needed to find the database, schema, and table
+    names also varies per data platform.
+
+    Value.NativeQuery is one of the functions used to execute a native query inside M-Query, for example:
+
+        let
+            Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true])
+        in
+            Source
+
+    In this M-Query the database name is in the first argument, and the rest of the detail, i.e. the database and
+    schema, is in the native query itself.
+
+    NativeQueryLineage extends AbstractLineage to support Redshift, Snowflake, and Databricks multi-cloud native query parsing.
+ + """ + + ctx: PipelineContext + table: Table + config: PowerBiDashboardSourceConfig + reporter: PowerBiDashboardSourceReport + platform_instance_resolver: AbstractDataPlatformInstanceResolver + + def __init__( + self, + ctx: PipelineContext, + table: Table, + config: PowerBiDashboardSourceConfig, + reporter: PowerBiDashboardSourceReport, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> None: + super().__init__() + self.ctx = ctx + self.table = table + self.config = config + self.reporter = reporter + self.platform_instance_resolver = platform_instance_resolver + + @abstractmethod + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + pass + + @abstractmethod + def get_platform_pair(self) -> DataPlatformPair: + pass + + @staticmethod + def get_db_detail_from_argument( + arg_list: Tree, + ) -> Tuple[Optional[str], Optional[str]]: + arguments: List[str] = tree_function.strip_char_from_list( + values=tree_function.remove_whitespaces_from_list( + tree_function.token_values(arg_list) + ), + ) + + if len(arguments) < 2: + logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}") + return None, None + + return arguments[0], arguments[1] + + @staticmethod + def create_reference_table( + arg_list: Tree, + table_detail: Dict[str, str], + ) -> Optional[ReferencedTable]: + arguments: List[str] = tree_function.strip_char_from_list( + values=tree_function.remove_whitespaces_from_list( + tree_function.token_values(arg_list) + ), + ) + + logger.debug(f"Processing arguments {arguments}") + + if ( + len(arguments) + >= 4 # [0] is warehouse FQDN. + # [1] is endpoint, we are not using it. + # [2] is "Catalog" key + # [3] is catalog's value + ): + return ReferencedTable( + warehouse=arguments[0], + catalog=arguments[3], + # As per my observation, database and catalog names are same in M-Query + database=table_detail["Database"] + if table_detail.get("Database") + else arguments[3], + schema=table_detail["Schema"], + table=table_detail.get("Table") or table_detail["View"], + ) + elif len(arguments) == 2: + return ReferencedTable( + warehouse=arguments[0], + database=table_detail["Database"], + schema=table_detail["Schema"], + table=table_detail.get("Table") or table_detail["View"], + catalog=None, + ) + + return None + + def parse_custom_sql( + self, query: str, server: str, database: Optional[str], schema: Optional[str] + ) -> Lineage: + dataplatform_tables: List[DataPlatformTable] = [] + + platform_detail: PlatformDetail = ( + self.platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=self.get_platform_pair(), + data_platform_server=server, + ) + ) + ) + + query = native_sql_parser.remove_drop_statement( + native_sql_parser.remove_special_characters(query) + ) + + parsed_result: Optional[ + "SqlParsingResult" + ] = native_sql_parser.parse_custom_sql( + ctx=self.ctx, + query=query, + platform=self.get_platform_pair().datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + database=database, + schema=schema, + ) + + if parsed_result is None: + self.reporter.info( + title=Constant.SQL_PARSING_FAILURE, + message="Fail to parse native sql present in PowerBI M-Query", + context=f"table-name={self.table.full_name}, sql={query}", + ) + return Lineage.empty() + + if parsed_result.debug_info and parsed_result.debug_info.table_error: + self.reporter.warning( + title=Constant.SQL_PARSING_FAILURE, + message="Fail to parse native sql present 
in PowerBI M-Query", + context=f"table-name={self.table.full_name}, error={parsed_result.debug_info.table_error},sql={query}", + ) + return Lineage.empty() + + for urn in parsed_result.in_tables: + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Native Query parsed result={parsed_result}") + logger.debug(f"Generated dataplatform_tables={dataplatform_tables}") + + return Lineage( + upstreams=dataplatform_tables, + column_lineage=( + parsed_result.column_lineage + if parsed_result.column_lineage is not None + else [] + ), + ) + + +class AmazonRedshiftLineage(AbstractLineage): + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.AMAZON_REDSHIFT.value + + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + logger.debug( + f"Processing AmazonRedshift data-access function detail {data_access_func_detail}" + ) + + server, db_name = self.get_db_detail_from_argument( + data_access_func_detail.arg_list + ) + if db_name is None or server is None: + return Lineage.empty() # Return an empty list + + schema_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor + ).items["Name"] + + table_name: str = cast( + IdentifierAccessor, + cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, + ).items["Name"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) + + +class OracleLineage(AbstractLineage): + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.ORACLE.value + + @staticmethod + def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]: + error_message: str = ( + f"The target argument ({value}) should in the format of :/[" + ".]" + ) + splitter_result: List[str] = value.split("/") + if len(splitter_result) != 2: + logger.debug(error_message) + return None, None + + db_name = splitter_result[1].split(".")[0] + + return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name + + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + logger.debug( + f"Processing Oracle data-access function detail {data_access_func_detail}" + ) + + arguments: List[str] = tree_function.remove_whitespaces_from_list( + tree_function.token_values(data_access_func_detail.arg_list) + ) + + server, db_name = self._get_server_and_db_name(arguments[0]) + + if db_name is None or server is None: + return Lineage.empty() + + schema_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor + ).items["Schema"] + + table_name: str = cast( + IdentifierAccessor, + cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, + ).items["Name"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + return Lineage( + upstreams=[ + DataPlatformTable( + 
data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) + + +class DatabricksLineage(AbstractLineage): + def form_qualified_table_name( + self, + table_reference: ReferencedTable, + data_platform_pair: DataPlatformPair, + ) -> str: + platform_detail: PlatformDetail = ( + self.platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=data_platform_pair, + data_platform_server=table_reference.warehouse, + ) + ) + ) + + metastore: Optional[str] = None + + qualified_table_name: str = f"{table_reference.database}.{table_reference.schema}.{table_reference.table}" + + if isinstance(platform_detail, DataBricksPlatformDetail): + metastore = platform_detail.metastore + + if metastore is not None: + return f"{metastore}.{qualified_table_name}" + + return qualified_table_name + + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + logger.debug( + f"Processing Databrick data-access function detail {data_access_func_detail}" + ) + table_detail: Dict[str, str] = {} + temp_accessor: Optional[ + Union[IdentifierAccessor, AbstractIdentifierAccessor] + ] = data_access_func_detail.identifier_accessor + + while temp_accessor: + if isinstance(temp_accessor, IdentifierAccessor): + # Condition to handle databricks M-query pattern where table, schema and database all are present in + # the same invoke statement + if all( + element in temp_accessor.items + for element in ["Item", "Schema", "Catalog"] + ): + table_detail["Schema"] = temp_accessor.items["Schema"] + table_detail["Table"] = temp_accessor.items["Item"] + else: + table_detail[temp_accessor.items["Kind"]] = temp_accessor.items[ + "Name" + ] + + if temp_accessor.next is not None: + temp_accessor = temp_accessor.next + else: + break + else: + logger.debug( + "expecting instance to be IdentifierAccessor, please check if parsing is done properly" + ) + return Lineage.empty() + + table_reference = self.create_reference_table( + arg_list=data_access_func_detail.arg_list, + table_detail=table_detail, + ) + + if table_reference: + qualified_table_name: str = self.form_qualified_table_name( + table_reference=table_reference, + data_platform_pair=self.get_platform_pair(), + ) + + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=table_reference.warehouse, + qualified_table_name=qualified_table_name, + ) + + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) + + return Lineage.empty() + + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.DATABRICKS_SQL.value + + +class TwoStepDataAccessPattern(AbstractLineage, ABC): + """ + These are the DataSource for which PowerBI Desktop generates default M-Query of the following pattern + let + Source = Sql.Database("localhost", "library"), + dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] + in + dbo_book_issue + """ + + def two_level_access_pattern( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + logger.debug( + f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}" + ) + + server, db_name = self.get_db_detail_from_argument( + data_access_func_detail.arg_list + ) + if server is None or db_name is None: + return Lineage.empty() # Return an empty list + + schema_name: str = cast( + 
IdentifierAccessor, data_access_func_detail.identifier_accessor + ).items["Schema"] + + table_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor + ).items["Item"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + logger.debug( + f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}" + ) + + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) + + +class PostgresLineage(TwoStepDataAccessPattern): + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + return self.two_level_access_pattern(data_access_func_detail) + + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.POSTGRES_SQL.value + + +class MSSqlLineage(TwoStepDataAccessPattern): + # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 + DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo + + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.MS_SQL.value + + def create_urn_using_old_parser( + self, query: str, db_name: str, server: str + ) -> List[DataPlatformTable]: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for parsed_table in tables: + # components: List[str] = [v.strip("[]") for v in parsed_table.split(".")] + components = [v.strip("[]") for v in parsed_table.split(".")] + if len(components) == 3: + database, schema, table = components + elif len(components) == 2: + schema, table = components + database = db_name + elif len(components) == 1: + (table,) = components + database = db_name + schema = MSSqlLineage.DEFAULT_SCHEMA + else: + self.reporter.warning( + title="Invalid table format", + message="The advanced SQL lineage feature (enable_advance_lineage_sql_construct) is disabled. Please either enable this feature or ensure the table is referenced as .. 
in the SQL.", + context=f"table-name={self.table.full_name}", + ) + continue + + qualified_table_name = f"{database}.{schema}.{table}" + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated upstream tables = {dataplatform_tables}") + + return dataplatform_tables + + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + arguments: List[str] = tree_function.strip_char_from_list( + values=tree_function.remove_whitespaces_from_list( + tree_function.token_values(data_access_func_detail.arg_list) + ), + ) + + server, database = self.get_db_detail_from_argument( + data_access_func_detail.arg_list + ) + if server is None or database is None: + return Lineage.empty() # Return an empty list + + assert server + assert database # to silent the lint + + query: Optional[str] = get_next_item(arguments, "Query") + if query: + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return Lineage( + upstreams=self.create_urn_using_old_parser( + query=query, + db_name=database, + server=server, + ), + column_lineage=[], + ) + + return self.parse_custom_sql( + query=query, + database=database, + server=server, + schema=MSSqlLineage.DEFAULT_SCHEMA, + ) + + # It is a regular case of MS-SQL + logger.debug("Handling with regular case") + return self.two_level_access_pattern(data_access_func_detail) + + +class ThreeStepDataAccessPattern(AbstractLineage, ABC): + def get_datasource_server( + self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail + ) -> str: + return tree_function.strip_char_from_list([arguments[0]])[0] + + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + logger.debug( + f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}" + ) + + arguments: List[str] = tree_function.remove_whitespaces_from_list( + tree_function.token_values(data_access_func_detail.arg_list) + ) + # First is database name + db_name: str = data_access_func_detail.identifier_accessor.items["Name"] # type: ignore + # Second is schema name + schema_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor.next # type: ignore + ).items["Name"] + # Third is table name + table_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore + ).items["Name"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + logger.debug( + f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}" + ) + + server: str = self.get_datasource_server(arguments, data_access_func_detail) + + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) + + +class SnowflakeLineage(ThreeStepDataAccessPattern): + def get_platform_pair(self) -> DataPlatformPair: + return 
SupportedDataPlatform.SNOWFLAKE.value + + +class GoogleBigQueryLineage(ThreeStepDataAccessPattern): + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.GOOGLE_BIGQUERY.value + + def get_datasource_server( + self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail + ) -> str: + # In Google BigQuery server is project-name + # condition to silent lint, it is not going to be None + return ( + data_access_func_detail.identifier_accessor.items["Name"] + if data_access_func_detail.identifier_accessor is not None + else "" + ) + + +class NativeQueryLineage(AbstractLineage): + SUPPORTED_NATIVE_QUERY_DATA_PLATFORM: dict = { + SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: SupportedDataPlatform.SNOWFLAKE, + SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name: SupportedDataPlatform.AMAZON_REDSHIFT, + SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name: SupportedDataPlatform.DatabricksMultiCloud_SQL, + } + current_data_platform: SupportedDataPlatform = SupportedDataPlatform.SNOWFLAKE + + def get_platform_pair(self) -> DataPlatformPair: + return self.current_data_platform.value + + @staticmethod + def is_native_parsing_supported(data_access_function_name: str) -> bool: + return ( + data_access_function_name + in NativeQueryLineage.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM + ) + + def create_urn_using_old_parser(self, query: str, server: str) -> Lineage: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for qualified_table_name in tables: + if len(qualified_table_name.split(".")) != 3: + logger.debug( + f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format" + ) + continue + + urn = make_urn( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated dataplatform_tables {dataplatform_tables}") + + return Lineage( + upstreams=dataplatform_tables, + column_lineage=[], + ) + + def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]: + if ( + data_access_tokens[0] + != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name + ): + return None + + database: Optional[str] = get_next_item(data_access_tokens, "Database") + + if ( + database and database != Constant.M_QUERY_NULL + ): # database name is explicitly set + return database + + return get_next_item( # database name is set in Name argument + data_access_tokens, "Name" + ) or get_next_item( # If both above arguments are not available, then try Catalog + data_access_tokens, "Catalog" + ) + + def create_lineage( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> Lineage: + t1: Tree = cast( + Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list) + ) + flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1) + + if len(flat_argument_list) != 2: + logger.debug( + f"Expecting 2 argument, actual argument count is {len(flat_argument_list)}" + ) + logger.debug(f"Flat argument list = {flat_argument_list}") + return Lineage.empty() + + data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list( + tree_function.token_values(flat_argument_list[0]) + ) + + if not 
self.is_native_parsing_supported(data_access_tokens[0]): + logger.debug( + f"Unsupported native-query data-platform = {data_access_tokens[0]}" + ) + logger.debug( + f"NativeQuery is supported only for {self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM}" + ) + + return Lineage.empty() + + if len(data_access_tokens[0]) < 3: + logger.debug( + f"Server is not available in argument list for data-platform {data_access_tokens[0]}. Returning empty " + "list" + ) + return Lineage.empty() + + self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[ + data_access_tokens[0] + ] + # The First argument is the query + sql_query: str = tree_function.strip_char_from_list( + values=tree_function.remove_whitespaces_from_list( + tree_function.token_values(flat_argument_list[1]) + ), + )[ + 0 + ] # Remove any whitespaces and double quotes character + + server = tree_function.strip_char_from_list([data_access_tokens[2]])[0] + + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return self.create_urn_using_old_parser( + query=sql_query, + server=server, + ) + + database_name: Optional[str] = self.get_db_name(data_access_tokens) + + return self.parse_custom_sql( + query=sql_query, + server=server, + database=database_name, + schema=None, + ) + + +class SupportedPattern(Enum): + DATABRICKS_QUERY = ( + DatabricksLineage, + FunctionName.DATABRICK_DATA_ACCESS, + ) + + DATABRICKS_MULTI_CLOUD = ( + DatabricksLineage, + FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS, + ) + + POSTGRES_SQL = ( + PostgresLineage, + FunctionName.POSTGRESQL_DATA_ACCESS, + ) + + ORACLE = ( + OracleLineage, + FunctionName.ORACLE_DATA_ACCESS, + ) + + SNOWFLAKE = ( + SnowflakeLineage, + FunctionName.SNOWFLAKE_DATA_ACCESS, + ) + + MS_SQL = ( + MSSqlLineage, + FunctionName.MSSQL_DATA_ACCESS, + ) + + GOOGLE_BIG_QUERY = ( + GoogleBigQueryLineage, + FunctionName.GOOGLE_BIGQUERY_DATA_ACCESS, + ) + + AMAZON_REDSHIFT = ( + AmazonRedshiftLineage, + FunctionName.AMAZON_REDSHIFT_DATA_ACCESS, + ) + + NATIVE_QUERY = ( + NativeQueryLineage, + FunctionName.NATIVE_QUERY, + ) + + def handler(self) -> Type[AbstractLineage]: + return self.value[0] + + def function_name(self) -> str: + return self.value[1].value + + @staticmethod + def get_function_names() -> List[str]: + functions: List[str] = [] + for supported_resolver in SupportedPattern: + functions.append(supported_resolver.function_name()) + + return functions + + @staticmethod + def get_pattern_handler(function_name: str) -> Optional["SupportedPattern"]: + logger.debug(f"Looking for pattern-handler for {function_name}") + for supported_resolver in SupportedPattern: + if function_name == supported_resolver.function_name(): + return supported_resolver + logger.debug(f"pattern-handler not found for function_name {function_name}") + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index a40e67d08da5b2..81a0e1ef2d79b1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -1,286 +1,33 @@ import logging from abc import ABC, abstractmethod -from dataclasses import dataclass -from enum import Enum -from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast +from typing import Any, Dict, List, Optional, Tuple, Union, cast from lark import Tree -import 
datahub.emitter.mce_builder as builder from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( - Constant, - DataBricksPlatformDetail, - DataPlatformPair, - PlatformDetail, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, - PowerBIPlatformDetail, - SupportedDataPlatform, ) from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( AbstractDataPlatformInstanceResolver, ) -from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function +from datahub.ingestion.source.powerbi.m_query import tree_function from datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, - AbstractIdentifierAccessor, DataAccessFunctionDetail, IdentifierAccessor, - ReferencedTable, + Lineage, +) +from datahub.ingestion.source.powerbi.m_query.pattern_handler import ( + AbstractLineage, + SupportedPattern, ) from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table -from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult logger = logging.getLogger(__name__) -@dataclass -class DataPlatformTable: - data_platform_pair: DataPlatformPair - urn: str - - -@dataclass -class Lineage: - upstreams: List[DataPlatformTable] - column_lineage: List[ColumnLineageInfo] - - @staticmethod - def empty() -> "Lineage": - return Lineage(upstreams=[], column_lineage=[]) - - -def urn_to_lowercase(value: str, flag: bool) -> str: - if flag is True: - return value.lower() - - return value - - -def urn_creator( - config: PowerBiDashboardSourceConfig, - platform_instance_resolver: AbstractDataPlatformInstanceResolver, - data_platform_pair: DataPlatformPair, - server: str, - qualified_table_name: str, -) -> str: - platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance( - PowerBIPlatformDetail( - data_platform_pair=data_platform_pair, - data_platform_server=server, - ) - ) - - return builder.make_dataset_urn_with_platform_instance( - platform=data_platform_pair.datahub_data_platform_name, - platform_instance=platform_detail.platform_instance, - env=platform_detail.env, - name=urn_to_lowercase( - qualified_table_name, config.convert_lineage_urns_to_lowercase - ), - ) - - -def get_next_item(items: List[str], item: str) -> Optional[str]: - if item in items: - try: - index = items.index(item) - return items[index + 1] - except IndexError: - logger.debug(f'item:"{item}", not found in item-list: {items}') - return None - - -class AbstractDataPlatformTableCreator(ABC): - """ - Base class to share common functionalities among different dataplatform for M-Query parsing. - - To create qualified table name we need to parse M-Query data-access-functions(https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions) and - the data-access-functions has some define pattern to access database-name, schema-name and table-name, for example see below M-Query. - - let - Source = Sql.Database("localhost", "library"), - dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] - in - dbo_book_issue - - It is MSSQL M-Query and Sql.Database is the data-access-function to access MSSQL. If this function is available in M-Query then database name is available in second argument - of first statement and schema-name and table-name is available in second statement. second statement can be repeated to access different tables from MSSQL. 
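To make the two-step pattern concrete, assembling the qualified table name amounts to the following (an illustrative sketch only, using the values from the M-Query above; this is not code from this module):

    # Hypothetical values parsed from Sql.Database("localhost", "library")
    server, db_name = "localhost", "library"
    # ...and from Source{[Schema="dbo",Item="book_issue"]}
    items = {"Schema": "dbo", "Item": "book_issue"}

    qualified_table_name = f"{db_name}.{items['Schema']}.{items['Item']}"
    assert qualified_table_name == "library.dbo.book_issue"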
- - DefaultTwoStepDataAccessSources extends the AbstractDataPlatformTableCreator and provides the common functionalities for data-platform which has above type of M-Query pattern - - data-access-function varies as per data-platform for example for MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle and number of statement to - find out database-name , schema-name and table-name also varies as per dataplatform. - - Value.NativeQuery is one of the function which is used to execute native query inside M-Query, for example see below M-Query - - let - Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true]) - in - Source - - In this M-Query database-name is available in first argument and rest of the detail i.e database & schema is available in native query. - - NativeQueryDataPlatformTableCreator extends AbstractDataPlatformTableCreator to support Redshift and Snowflake native query parsing. - - """ - - ctx: PipelineContext - table: Table - config: PowerBiDashboardSourceConfig - reporter: PowerBiDashboardSourceReport - platform_instance_resolver: AbstractDataPlatformInstanceResolver - - def __init__( - self, - ctx: PipelineContext, - table: Table, - config: PowerBiDashboardSourceConfig, - reporter: PowerBiDashboardSourceReport, - platform_instance_resolver: AbstractDataPlatformInstanceResolver, - ) -> None: - super().__init__() - self.ctx = ctx - self.table = table - self.config = config - self.reporter = reporter - self.platform_instance_resolver = platform_instance_resolver - - @abstractmethod - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - pass - - @abstractmethod - def get_platform_pair(self) -> DataPlatformPair: - pass - - @staticmethod - def get_db_detail_from_argument( - arg_list: Tree, - ) -> Tuple[Optional[str], Optional[str]]: - arguments: List[str] = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(arg_list) - ), - ) - - if len(arguments) < 2: - logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}") - return None, None - - return arguments[0], arguments[1] - - @staticmethod - def create_reference_table( - arg_list: Tree, - table_detail: Dict[str, str], - ) -> Optional[ReferencedTable]: - arguments: List[str] = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(arg_list) - ), - ) - - logger.debug(f"Processing arguments {arguments}") - - if ( - len(arguments) - >= 4 # [0] is warehouse FQDN. - # [1] is endpoint, we are not using it. 
- # [2] is "Catalog" key - # [3] is catalog's value - ): - return ReferencedTable( - warehouse=arguments[0], - catalog=arguments[3], - # As per my observation, database and catalog names are same in M-Query - database=table_detail["Database"] - if table_detail.get("Database") - else arguments[3], - schema=table_detail["Schema"], - table=table_detail.get("Table") or table_detail["View"], - ) - elif len(arguments) == 2: - return ReferencedTable( - warehouse=arguments[0], - database=table_detail["Database"], - schema=table_detail["Schema"], - table=table_detail.get("Table") or table_detail["View"], - catalog=None, - ) - - return None - - def parse_custom_sql( - self, query: str, server: str, database: Optional[str], schema: Optional[str] - ) -> Lineage: - dataplatform_tables: List[DataPlatformTable] = [] - - platform_detail: PlatformDetail = ( - self.platform_instance_resolver.get_platform_instance( - PowerBIPlatformDetail( - data_platform_pair=self.get_platform_pair(), - data_platform_server=server, - ) - ) - ) - - query = native_sql_parser.remove_drop_statement( - native_sql_parser.remove_special_characters(query) - ) - - parsed_result: Optional[ - "SqlParsingResult" - ] = native_sql_parser.parse_custom_sql( - ctx=self.ctx, - query=query, - platform=self.get_platform_pair().datahub_data_platform_name, - platform_instance=platform_detail.platform_instance, - env=platform_detail.env, - database=database, - schema=schema, - ) - - if parsed_result is None: - self.reporter.info( - title=Constant.SQL_PARSING_FAILURE, - message="Fail to parse native sql present in PowerBI M-Query", - context=f"table-name={self.table.full_name}, sql={query}", - ) - return Lineage.empty() - - if parsed_result.debug_info and parsed_result.debug_info.table_error: - self.reporter.warning( - title=Constant.SQL_PARSING_FAILURE, - message="Fail to parse native sql present in PowerBI M-Query", - context=f"table-name={self.table.full_name}, error={parsed_result.debug_info.table_error},sql={query}", - ) - return Lineage.empty() - - for urn in parsed_result.in_tables: - dataplatform_tables.append( - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ) - - logger.debug(f"Native Query parsed result={parsed_result}") - logger.debug(f"Generated dataplatform_tables={dataplatform_tables}") - - return Lineage( - upstreams=dataplatform_tables, - column_lineage=( - parsed_result.column_lineage - if parsed_result.column_lineage is not None - else [] - ), - ) - - class AbstractDataAccessMQueryResolver(ABC): table: Table parse_tree: Tree @@ -299,10 +46,10 @@ def __init__( self.parse_tree = parse_tree self.reporter = reporter self.parameters = parameters - self.data_access_functions = SupportedResolver.get_function_names() + self.data_access_functions = SupportedPattern.get_function_names() @abstractmethod - def resolve_to_data_platform_table_list( + def resolve_to_lineage( self, ctx: PipelineContext, config: PowerBiDashboardSourceConfig, @@ -318,7 +65,7 @@ class MQueryResolver(AbstractDataAccessMQueryResolver, ABC): This class has generic code to process M-Query tokens and create instance of DataAccessFunctionDetail. Once DataAccessFunctionDetail instance is initialized thereafter MQueryResolver generates the DataPlatformTable with the help of AbstractDataPlatformTableCreator - (see method resolve_to_data_platform_table_list). + (see method resolve_to_lineage). 
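Concretely, that hand-off is the dispatch loop shown later in this diff; schematically (a sketch reusing the names from this module and pattern_handler.py, where f_detail is one parsed DataAccessFunctionDetail):

    pattern = SupportedPattern.get_pattern_handler(f_detail.data_access_function_name)
    if pattern is not None:
        handler = pattern.handler()(
            ctx=ctx,
            table=self.table,
            config=config,
            reporter=self.reporter,
            platform_instance_resolver=platform_instance_resolver,
        )
        lineage.append(handler.create_lineage(f_detail))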
Classes which extended from AbstractDataPlatformTableCreator know how to convert generated DataAccessFunctionDetail instance to the respective DataPlatformTable instance as per dataplatform. @@ -602,7 +349,7 @@ def internal( return table_links - def resolve_to_data_platform_table_list( + def resolve_to_lineage( self, ctx: PipelineContext, config: PowerBiDashboardSourceConfig, @@ -630,7 +377,7 @@ def resolve_to_data_platform_table_list( # Each item is data-access function for f_detail in table_links: # Get & Check if we support data-access-function available in M-Query - supported_resolver = SupportedResolver.get_resolver( + supported_resolver = SupportedPattern.get_pattern_handler( f_detail.data_access_function_name ) if supported_resolver is None: @@ -643,11 +390,9 @@ def resolve_to_data_platform_table_list( ) continue - # From supported_resolver enum get respective resolver like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it - # & also pass additional information that will be need to generate urn - table_qualified_name_creator: ( - AbstractDataPlatformTableCreator - ) = supported_resolver.get_table_full_name_creator()( + # From supported_resolver enum get respective handler like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it + # & also pass additional information that will be need to generate lineage + pattern_handler: (AbstractLineage) = supported_resolver.handler()( ctx=ctx, table=self.table, config=config, @@ -655,673 +400,6 @@ def resolve_to_data_platform_table_list( platform_instance_resolver=platform_instance_resolver, ) - lineage.append(table_qualified_name_creator.create_lineage(f_detail)) + lineage.append(pattern_handler.create_lineage(f_detail)) return lineage - - -class DefaultTwoStepDataAccessSources(AbstractDataPlatformTableCreator, ABC): - """ - These are the DataSource for which PowerBI Desktop generates default M-Query of following pattern - let - Source = Sql.Database("localhost", "library"), - dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] - in - dbo_book_issue - """ - - def two_level_access_pattern( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - logger.debug( - f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}" - ) - - server, db_name = self.get_db_detail_from_argument( - data_access_func_detail.arg_list - ) - if server is None or db_name is None: - return Lineage.empty() # Return an empty list - - schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Schema"] - - table_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Item"] - - qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" - - logger.debug( - f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}" - ) - - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=server, - qualified_table_name=qualified_table_name, - ) - return Lineage( - upstreams=[ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ], - column_lineage=[], - ) - - -class PostgresDataPlatformTableCreator(DefaultTwoStepDataAccessSources): - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - return 
self.two_level_access_pattern(data_access_func_detail) - - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.POSTGRES_SQL.value - - -class MSSqlDataPlatformTableCreator(DefaultTwoStepDataAccessSources): - # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 - DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo - - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.MS_SQL.value - - def create_urn_using_old_parser( - self, query: str, db_name: str, server: str - ) -> List[DataPlatformTable]: - dataplatform_tables: List[DataPlatformTable] = [] - - tables: List[str] = native_sql_parser.get_tables(query) - - for parsed_table in tables: - # components: List[str] = [v.strip("[]") for v in parsed_table.split(".")] - components = [v.strip("[]") for v in parsed_table.split(".")] - if len(components) == 3: - database, schema, table = components - elif len(components) == 2: - schema, table = components - database = db_name - elif len(components) == 1: - (table,) = components - database = db_name - schema = MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA - else: - self.reporter.warning( - title="Invalid table format", - message="The advanced SQL lineage feature (enable_advance_lineage_sql_construct) is disabled. Please either enable this feature or ensure the table is referenced as .. in the SQL.", - context=f"table-name={self.table.full_name}", - ) - continue - - qualified_table_name = f"{database}.{schema}.{table}" - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=server, - qualified_table_name=qualified_table_name, - ) - dataplatform_tables.append( - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ) - - logger.debug(f"Generated upstream tables = {dataplatform_tables}") - - return dataplatform_tables - - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - arguments: List[str] = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(data_access_func_detail.arg_list) - ), - ) - - server, database = self.get_db_detail_from_argument( - data_access_func_detail.arg_list - ) - if server is None or database is None: - return Lineage.empty() # Return an empty list - - assert server - assert database # to silent the lint - - query: Optional[str] = get_next_item(arguments, "Query") - if query: - if self.config.enable_advance_lineage_sql_construct is False: - # Use previous parser to generate URN to keep backward compatibility - return Lineage( - upstreams=self.create_urn_using_old_parser( - query=query, - db_name=database, - server=server, - ), - column_lineage=[], - ) - - return self.parse_custom_sql( - query=query, - database=database, - server=server, - schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, - ) - - # It is a regular case of MS-SQL - logger.debug("Handling with regular case") - return self.two_level_access_pattern(data_access_func_detail) - - -class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator): - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.ORACLE.value - - @staticmethod - def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]: - error_message: str = ( - f"The target argument ({value}) should in the format 
of :/[" - ".]" - ) - splitter_result: List[str] = value.split("/") - if len(splitter_result) != 2: - logger.debug(error_message) - return None, None - - db_name = splitter_result[1].split(".")[0] - - return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name - - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - logger.debug( - f"Processing Oracle data-access function detail {data_access_func_detail}" - ) - - arguments: List[str] = tree_function.remove_whitespaces_from_list( - tree_function.token_values(data_access_func_detail.arg_list) - ) - - server, db_name = self._get_server_and_db_name(arguments[0]) - - if db_name is None or server is None: - return Lineage.empty() - - schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Schema"] - - table_name: str = cast( - IdentifierAccessor, - cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, - ).items["Name"] - - qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" - - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=server, - qualified_table_name=qualified_table_name, - ) - - return Lineage( - upstreams=[ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ], - column_lineage=[], - ) - - -class DatabrickDataPlatformTableCreator(AbstractDataPlatformTableCreator): - def form_qualified_table_name( - self, - table_reference: ReferencedTable, - data_platform_pair: DataPlatformPair, - ) -> str: - platform_detail: PlatformDetail = ( - self.platform_instance_resolver.get_platform_instance( - PowerBIPlatformDetail( - data_platform_pair=data_platform_pair, - data_platform_server=table_reference.warehouse, - ) - ) - ) - - metastore: Optional[str] = None - - qualified_table_name: str = f"{table_reference.database}.{table_reference.schema}.{table_reference.table}" - - if isinstance(platform_detail, DataBricksPlatformDetail): - metastore = platform_detail.metastore - - if metastore is not None: - return f"{metastore}.{qualified_table_name}" - - return qualified_table_name - - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - logger.debug( - f"Processing Databrick data-access function detail {data_access_func_detail}" - ) - table_detail: Dict[str, str] = {} - temp_accessor: Optional[ - Union[IdentifierAccessor, AbstractIdentifierAccessor] - ] = data_access_func_detail.identifier_accessor - - while temp_accessor: - if isinstance(temp_accessor, IdentifierAccessor): - # Condition to handle databricks M-query pattern where table, schema and database all are present in - # the same invoke statement - if all( - element in temp_accessor.items - for element in ["Item", "Schema", "Catalog"] - ): - table_detail["Schema"] = temp_accessor.items["Schema"] - table_detail["Table"] = temp_accessor.items["Item"] - else: - table_detail[temp_accessor.items["Kind"]] = temp_accessor.items[ - "Name" - ] - - if temp_accessor.next is not None: - temp_accessor = temp_accessor.next - else: - break - else: - logger.debug( - "expecting instance to be IdentifierAccessor, please check if parsing is done properly" - ) - return Lineage.empty() - - table_reference = self.create_reference_table( - arg_list=data_access_func_detail.arg_list, - table_detail=table_detail, - ) - - if table_reference: - qualified_table_name: str = self.form_qualified_table_name( - 
table_reference=table_reference, - data_platform_pair=self.get_platform_pair(), - ) - - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=table_reference.warehouse, - qualified_table_name=qualified_table_name, - ) - - return Lineage( - upstreams=[ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ], - column_lineage=[], - ) - - return Lineage.empty() - - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.DATABRICK_SQL.value - - -class DefaultThreeStepDataAccessSources(AbstractDataPlatformTableCreator, ABC): - def get_datasource_server( - self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail - ) -> str: - return tree_function.strip_char_from_list([arguments[0]])[0] - - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - logger.debug( - f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}" - ) - - arguments: List[str] = tree_function.remove_whitespaces_from_list( - tree_function.token_values(data_access_func_detail.arg_list) - ) - # First is database name - db_name: str = data_access_func_detail.identifier_accessor.items["Name"] # type: ignore - # Second is schema name - schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor.next # type: ignore - ).items["Name"] - # Third is table name - table_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore - ).items["Name"] - - qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" - - logger.debug( - f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}" - ) - - server: str = self.get_datasource_server(arguments, data_access_func_detail) - - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=server, - qualified_table_name=qualified_table_name, - ) - - return Lineage( - upstreams=[ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ], - column_lineage=[], - ) - - -class SnowflakeDataPlatformTableCreator(DefaultThreeStepDataAccessSources): - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.SNOWFLAKE.value - - -class GoogleBigQueryDataPlatformTableCreator(DefaultThreeStepDataAccessSources): - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.GOOGLE_BIGQUERY.value - - def get_datasource_server( - self, arguments: List[str], data_access_func_detail: DataAccessFunctionDetail - ) -> str: - # In Google BigQuery server is project-name - # condition to silent lint, it is not going to be None - return ( - data_access_func_detail.identifier_accessor.items["Name"] - if data_access_func_detail.identifier_accessor is not None - else "" - ) - - -class AmazonRedshiftDataPlatformTableCreator(AbstractDataPlatformTableCreator): - def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.AMAZON_REDSHIFT.value - - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - logger.debug( - f"Processing AmazonRedshift data-access function detail {data_access_func_detail}" - ) - - server, db_name = self.get_db_detail_from_argument( - data_access_func_detail.arg_list - ) - if 
db_name is None or server is None: - return Lineage.empty() # Return empty list - - schema_name: str = cast( - IdentifierAccessor, data_access_func_detail.identifier_accessor - ).items["Name"] - - table_name: str = cast( - IdentifierAccessor, - cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, - ).items["Name"] - - qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" - - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=server, - qualified_table_name=qualified_table_name, - ) - - return Lineage( - upstreams=[ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ], - column_lineage=[], - ) - - -class NativeQueryDataPlatformTableCreator(AbstractDataPlatformTableCreator): - SUPPORTED_NATIVE_QUERY_DATA_PLATFORM: dict = { - SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: SupportedDataPlatform.SNOWFLAKE, - SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name: SupportedDataPlatform.AMAZON_REDSHIFT, - SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name: SupportedDataPlatform.DatabricksMultiCloud_SQL, - } - current_data_platform: SupportedDataPlatform = SupportedDataPlatform.SNOWFLAKE - - def get_platform_pair(self) -> DataPlatformPair: - return self.current_data_platform.value - - @staticmethod - def is_native_parsing_supported(data_access_function_name: str) -> bool: - return ( - data_access_function_name - in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM - ) - - def create_urn_using_old_parser(self, query: str, server: str) -> Lineage: - dataplatform_tables: List[DataPlatformTable] = [] - - tables: List[str] = native_sql_parser.get_tables(query) - - for qualified_table_name in tables: - if len(qualified_table_name.split(".")) != 3: - logger.debug( - f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format" - ) - continue - - urn = urn_creator( - config=self.config, - platform_instance_resolver=self.platform_instance_resolver, - data_platform_pair=self.get_platform_pair(), - server=server, - qualified_table_name=qualified_table_name, - ) - - dataplatform_tables.append( - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ) - - logger.debug(f"Generated dataplatform_tables {dataplatform_tables}") - - return Lineage( - upstreams=dataplatform_tables, - column_lineage=[], - ) - - def get_db_name(self, data_access_tokens: List[str]) -> Optional[str]: - if ( - data_access_tokens[0] - != SupportedDataPlatform.DatabricksMultiCloud_SQL.value.powerbi_data_platform_name - ): - return None - - database: Optional[str] = get_next_item(data_access_tokens, "Database") - - if ( - database and database != Constant.M_QUERY_NULL - ): # database name is explicitly set - return database - - return get_next_item( # database name is set in Name argument - data_access_tokens, "Name" - ) or get_next_item( # If both above arguments are not available, then try Catalog - data_access_tokens, "Catalog" - ) - - def create_lineage( - self, data_access_func_detail: DataAccessFunctionDetail - ) -> Lineage: - t1: Tree = cast( - Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list) - ) - flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1) - - if len(flat_argument_list) != 2: - logger.debug( - f"Expecting 2 argument, actual argument count is {len(flat_argument_list)}" 
- ) - logger.debug(f"Flat argument list = {flat_argument_list}") - return Lineage.empty() - - data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list( - tree_function.token_values(flat_argument_list[0]) - ) - - if not self.is_native_parsing_supported(data_access_tokens[0]): - logger.debug( - f"Unsupported native-query data-platform = {data_access_tokens[0]}" - ) - logger.debug( - f"NativeQuery is supported only for {self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM}" - ) - - return Lineage.empty() - - if len(data_access_tokens[0]) < 3: - logger.debug( - f"Server is not available in argument list for data-platform {data_access_tokens[0]}. Returning empty " - "list" - ) - return Lineage.empty() - - self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[ - data_access_tokens[0] - ] - # The First argument is the query - sql_query: str = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(flat_argument_list[1]) - ), - )[ - 0 - ] # Remove any whitespaces and double quotes character - - server = tree_function.strip_char_from_list([data_access_tokens[2]])[0] - - if self.config.enable_advance_lineage_sql_construct is False: - # Use previous parser to generate URN to keep backward compatibility - return self.create_urn_using_old_parser( - query=sql_query, - server=server, - ) - - database_name: Optional[str] = self.get_db_name(data_access_tokens) - - return self.parse_custom_sql( - query=sql_query, - server=server, - database=database_name, - schema=None, - ) - - -class FunctionName(Enum): - NATIVE_QUERY = "Value.NativeQuery" - POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database" - ORACLE_DATA_ACCESS = "Oracle.Database" - SNOWFLAKE_DATA_ACCESS = "Snowflake.Databases" - MSSQL_DATA_ACCESS = "Sql.Database" - DATABRICK_DATA_ACCESS = "Databricks.Catalogs" - GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database" - AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database" - DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs" - - -class SupportedResolver(Enum): - DATABRICKS_QUERY = ( - DatabrickDataPlatformTableCreator, - FunctionName.DATABRICK_DATA_ACCESS, - ) - - DATABRICKS_MULTI_CLOUD = ( - DatabrickDataPlatformTableCreator, - FunctionName.DATABRICK_MULTI_CLOUD_DATA_ACCESS, - ) - - POSTGRES_SQL = ( - PostgresDataPlatformTableCreator, - FunctionName.POSTGRESQL_DATA_ACCESS, - ) - - ORACLE = ( - OracleDataPlatformTableCreator, - FunctionName.ORACLE_DATA_ACCESS, - ) - - SNOWFLAKE = ( - SnowflakeDataPlatformTableCreator, - FunctionName.SNOWFLAKE_DATA_ACCESS, - ) - - MS_SQL = ( - MSSqlDataPlatformTableCreator, - FunctionName.MSSQL_DATA_ACCESS, - ) - - GOOGLE_BIG_QUERY = ( - GoogleBigQueryDataPlatformTableCreator, - FunctionName.GOOGLE_BIGQUERY_DATA_ACCESS, - ) - - AMAZON_REDSHIFT = ( - AmazonRedshiftDataPlatformTableCreator, - FunctionName.AMAZON_REDSHIFT_DATA_ACCESS, - ) - - NATIVE_QUERY = ( - NativeQueryDataPlatformTableCreator, - FunctionName.NATIVE_QUERY, - ) - - def get_table_full_name_creator(self) -> Type[AbstractDataPlatformTableCreator]: - return self.value[0] - - def get_function_name(self) -> str: - return self.value[1].value - - @staticmethod - def get_function_names() -> List[str]: - functions: List[str] = [] - for supported_resolver in SupportedResolver: - functions.append(supported_resolver.get_function_name()) - - return functions - - @staticmethod - def get_resolver(function_name: str) -> Optional["SupportedResolver"]: - logger.debug(f"Looking for resolver {function_name}") - for 
supported_resolver in SupportedResolver: - if function_name == supported_resolver.get_function_name(): - return supported_resolver - logger.debug(f"Resolver not found for function_name {function_name}") - return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py index ca2abf97c9f303..b52977aaa41fbe 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py @@ -1,7 +1,7 @@ import logging from typing import Optional, Tuple -from datahub.ingestion.source.powerbi.m_query import resolver +import datahub.ingestion.source.powerbi.m_query.data_classes logger = logging.getLogger(__name__) @@ -14,12 +14,18 @@ def validate_parse_tree( :param native_query_enabled: Whether user want to extract lineage from native query :return: True or False. """ - function_names = [fun.value for fun in resolver.FunctionName] + function_names = [ + fun.value + for fun in datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName + ] if not any(fun in expression for fun in function_names): return False, "DataAccess function is not present in M-Query expression." if native_query_enabled is False: - if resolver.FunctionName.NATIVE_QUERY.value in function_names: + if ( + datahub.ingestion.source.powerbi.m_query.data_classes.FunctionName.NATIVE_QUERY.value + in function_names + ): return ( False, "Lineage extraction from native query is disabled. Enable native_query_parsing in recipe", diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index cef2d098aebc40..044946a5d308d1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -10,6 +10,7 @@ import more_itertools import datahub.emitter.mce_builder as builder +import datahub.ingestion.source.powerbi.m_query.data_classes import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ContainerKey, gen_containers @@ -42,12 +43,13 @@ Constant, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, + SupportedDataPlatform, ) from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( AbstractDataPlatformInstanceResolver, create_dataplatform_instance_resolver, ) -from datahub.ingestion.source.powerbi.m_query import parser, resolver +from datahub.ingestion.source.powerbi.m_query import parser from datahub.ingestion.source.powerbi.rest_api_wrapper.powerbi_api import PowerBiAPI from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, @@ -182,7 +184,9 @@ def extract_dataset_schema( return [schema_mcp] def make_fine_grained_lineage_class( - self, lineage: resolver.Lineage, dataset_urn: str + self, + lineage: datahub.ingestion.source.powerbi.m_query.data_classes.Lineage, + dataset_urn: str, ) -> List[FineGrainedLineage]: fine_grained_lineages: List[FineGrainedLineage] = [] @@ -234,7 +238,9 @@ def extract_lineage( upstream: List[UpstreamClass] = [] cll_lineage: List[FineGrainedLineage] = [] - upstream_lineage: List[resolver.Lineage] = parser.get_upstream_tables( + upstream_lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = 
parser.get_upstream_tables( table=table, reporter=self.__reporter, platform_instance_resolver=self.__dataplatform_instance_resolver, @@ -1294,7 +1300,7 @@ def get_allowed_workspaces(self) -> List[powerbi_data_classes.Workspace]: def validate_dataset_type_mapping(self): powerbi_data_platforms: List[str] = [ data_platform.value.powerbi_data_platform_name - for data_platform in resolver.SupportedDataPlatform + for data_platform in SupportedDataPlatform ] for key in self.source_config.dataset_type_mapping.keys(): @@ -1481,7 +1487,7 @@ def _get_dashboard_patch_work_unit( def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: # As modified_workspaces is not idempotent, hence workunit processors are run later for each workspace_id - # This will result in creating checkpoint for each workspace_id + # This will result in creating a checkpoint for each workspace_id if self.source_config.modified_since: return [] # Handle these in get_workunits_internal else: @@ -1492,7 +1498,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: """ - Datahub Ingestion framework invoke this method + Datahub Ingestion framework invokes this method """ logger.info("PowerBi plugin execution is started") # Validate dataset type mapping diff --git a/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/data_classes.py index 672fcbceb0603b..a43f5f32493f2d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/qlik_sense/data_classes.py @@ -15,6 +15,7 @@ TimeType, ) +# TODO: Replace with standardized types in sql_types.py FIELD_TYPE_MAPPING: Dict[ str, Type[ diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index 4bc4c1451c262f..06cbb7fbae27cc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -222,6 +222,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource): ``` """ + # TODO: Replace with standardized types in sql_types.py REDSHIFT_FIELD_TYPE_MAPPINGS: Dict[ str, Type[ diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py index ff28ed2c5e849c..2748f2a588a930 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/report.py @@ -3,7 +3,7 @@ from typing import Dict, Optional from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin -from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.ingestion.source_report.time_window import BaseTimeWindowReport from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport @@ -14,7 +14,7 @@ @dataclass class RedshiftReport( - ProfilingSqlReport, + SQLSourceReport, IngestionStageReport, BaseTimeWindowReport, ClassificationReportMixin, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sigma/data_classes.py 
b/metadata-ingestion/src/datahub/ingestion/source/sigma/data_classes.py index 922b0be3b4a93c..5a657d804cb7bf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sigma/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sigma/data_classes.py @@ -80,6 +80,7 @@ class Workbook(BaseModel): path: str latestVersion: int workspaceId: Optional[str] = None + description: Optional[str] = None pages: List[Page] = [] badge: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma.py b/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma.py index dd4b65a2cbdf29..e96eeb58d96efe 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sigma/sigma.py @@ -4,7 +4,12 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigurationError from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.emitter.mcp_builder import add_entity_to_container, gen_containers +from datahub.emitter.mcp_builder import ( + add_entity_to_container, + add_owner_to_entity_wu, + add_tags_to_entity_wu, + gen_containers, +) from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SourceCapability, @@ -59,12 +64,14 @@ UpstreamLineage, ) from datahub.metadata.schema_classes import ( + AuditStampClass, BrowsePathEntryClass, BrowsePathsV2Class, ChangeAuditStampsClass, ChartInfoClass, DashboardInfoClass, DataPlatformInstanceClass, + EdgeClass, GlobalTagsClass, InputFieldClass, InputFieldsClass, @@ -74,6 +81,7 @@ SchemaFieldClass, SchemaFieldDataTypeClass, StringTypeClass, + SubTypesClass, TagAssociationClass, ) from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result @@ -257,11 +265,6 @@ def _gen_entity_browsepath_aspect( entries = [ BrowsePathEntryClass(id=parent_entity_urn, urn=parent_entity_urn) ] + [BrowsePathEntryClass(id=path) for path in paths] - if self.config.platform_instance: - urn = builder.make_dataplatform_instance_urn( - self.platform, self.config.platform_instance - ) - entries = [BrowsePathEntryClass(id=urn, urn=urn)] + entries return MetadataChangeProposalWrapper( entityUrn=entity_urn, aspect=BrowsePathsV2Class(entries), @@ -424,11 +427,11 @@ def _gen_elements_workunit( elements: List[Element], workbook: Workbook, all_input_fields: List[InputFieldClass], + paths: List[str], ) -> Iterable[MetadataWorkUnit]: """ Map Sigma page element to Datahub Chart """ - for element in elements: chart_urn = builder.make_chart_urn( platform=self.platform, @@ -459,11 +462,14 @@ def _gen_elements_workunit( ), ).as_workunit() - yield from add_entity_to_container( - container_key=self._gen_workbook_key(workbook.workbookId), - entity_type="chart", - entity_urn=chart_urn, - ) + if workbook.workspaceId: + yield self._gen_entity_browsepath_aspect( + entity_urn=chart_urn, + parent_entity_urn=builder.make_container_urn( + self._gen_workspace_key(workbook.workspaceId) + ), + paths=paths + [workbook.name], + ) # Add sigma dataset's upstream dataset urn mapping for dataset_urn, upstream_dataset_urns in inputs.items(): @@ -494,7 +500,9 @@ def _gen_elements_workunit( all_input_fields.extend(element_input_fields) - def _gen_pages_workunit(self, workbook: Workbook) -> Iterable[MetadataWorkUnit]: + def _gen_pages_workunit( + self, workbook: Workbook, paths: List[str] + ) -> Iterable[MetadataWorkUnit]: """ Map Sigma workbook page to Datahub dashboard """ @@ -505,20 +513,23 @@ def 
_gen_pages_workunit(self, workbook: Workbook) -> Iterable[MetadataWorkUnit]: yield self._gen_dashboard_info_workunit(page) - yield from add_entity_to_container( - container_key=self._gen_workbook_key(workbook.workbookId), - entity_type="dashboard", - entity_urn=dashboard_urn, - ) - dpi_aspect = self._gen_dataplatform_instance_aspect(dashboard_urn) if dpi_aspect: yield dpi_aspect all_input_fields: List[InputFieldClass] = [] + if workbook.workspaceId: + yield self._gen_entity_browsepath_aspect( + entity_urn=dashboard_urn, + parent_entity_urn=builder.make_container_urn( + self._gen_workspace_key(workbook.workspaceId) + ), + paths=paths + [workbook.name], + ) + yield from self._gen_elements_workunit( - page.elements, workbook, all_input_fields + page.elements, workbook, all_input_fields, paths ) yield MetadataChangeProposalWrapper( @@ -531,42 +542,89 @@ def _gen_workbook_workunit(self, workbook: Workbook) -> Iterable[MetadataWorkUni Map Sigma Workbook to Datahub container """ owner_username = self.sigma_api.get_user_name(workbook.createdBy) - workbook_key = self._gen_workbook_key(workbook.workbookId) - yield from gen_containers( - container_key=workbook_key, - name=workbook.name, - sub_types=[BIContainerSubTypes.SIGMA_WORKBOOK], - parent_container_key=( - self._gen_workspace_key(workbook.workspaceId) - if workbook.workspaceId - else None + + dashboard_urn = self._gen_dashboard_urn(workbook.workbookId) + + yield self._gen_entity_status_aspect(dashboard_urn) + + lastModified = AuditStampClass( + time=int(workbook.updatedAt.timestamp() * 1000), + actor="urn:li:corpuser:datahub", + ) + created = AuditStampClass( + time=int(workbook.createdAt.timestamp() * 1000), + actor="urn:li:corpuser:datahub", + ) + + dashboard_info_cls = DashboardInfoClass( + title=workbook.name, + description=workbook.description if workbook.description else "", + dashboards=[ + EdgeClass( + destinationUrn=self._gen_dashboard_urn(page.get_urn_part()), + sourceUrn=dashboard_urn, + ) + for page in workbook.pages + ], + externalUrl=workbook.url, + lastModified=ChangeAuditStampsClass( + created=created, lastModified=lastModified ), - extra_properties={ + customProperties={ "path": workbook.path, "latestVersion": str(workbook.latestVersion), }, - owner_urn=( - builder.make_user_urn(owner_username) - if self.config.ingest_owner and owner_username - else None - ), - external_url=workbook.url, - tags=[workbook.badge] if workbook.badge else None, - created=int(workbook.createdAt.timestamp() * 1000), - last_modified=int(workbook.updatedAt.timestamp() * 1000), ) + yield MetadataChangeProposalWrapper( + entityUrn=dashboard_urn, aspect=dashboard_info_cls + ).as_workunit() + + # Set subtype + yield MetadataChangeProposalWrapper( + entityUrn=dashboard_urn, + aspect=SubTypesClass(typeNames=[BIContainerSubTypes.SIGMA_WORKBOOK]), + ).as_workunit() + + # Ownership + owner_urn = ( + builder.make_user_urn(owner_username) + if self.config.ingest_owner and owner_username + else None + ) + if owner_urn: + yield from add_owner_to_entity_wu( + entity_type="dashboard", + entity_urn=dashboard_urn, + owner_urn=owner_urn, + ) + + # Tags + tags = [workbook.badge] if workbook.badge else None + if tags: + yield from add_tags_to_entity_wu( + entity_type="dashboard", + entity_urn=dashboard_urn, + tags=sorted(tags), + ) paths = workbook.path.split("/")[1:] - if len(paths) > 0 and workbook.workspaceId: + if workbook.workspaceId: yield self._gen_entity_browsepath_aspect( - entity_urn=builder.make_container_urn(workbook_key), + entity_urn=dashboard_urn, 
parent_entity_urn=builder.make_container_urn( self._gen_workspace_key(workbook.workspaceId) ), - paths=paths, + paths=paths + [workbook.name], ) - yield from self._gen_pages_workunit(workbook) + if len(paths) == 0: + yield from add_entity_to_container( + container_key=self._gen_workspace_key(workbook.workspaceId), + entity_type="dashboard", + entity_urn=dashboard_urn, + ) + + yield from self._gen_pages_workunit(workbook, paths) def _gen_sigma_dataset_upstream_lineage_workunit( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py index 80b6be36e5ffa1..b5f56f99431f91 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_report.py @@ -5,7 +5,7 @@ from datahub.ingestion.api.report import Report from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin from datahub.ingestion.source.snowflake.constants import SnowflakeEdition -from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionReport, ) @@ -59,7 +59,7 @@ class SnowflakeUsageReport: @dataclass -class SnowflakeReport(ProfilingSqlReport, BaseTimeWindowReport): +class SnowflakeReport(SQLSourceReport, BaseTimeWindowReport): num_table_to_table_edges_scanned: int = 0 num_table_to_view_edges_scanned: int = 0 num_view_to_table_edges_scanned: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index d4442749a06224..2bd8e8017f5492 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -103,6 +103,7 @@ logger = logging.getLogger(__name__) # https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html +# TODO: Move to the standardized types in sql_types.py SNOWFLAKE_FIELD_TYPE_MAPPINGS = { "DATE": DateType, "BIGINT": NumberType, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index 71cfd0268ee6b5..6f7decc79b1df2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -26,6 +26,7 @@ platform_name, support_status, ) +from datahub.ingestion.api.source import StructuredLogLevel from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_util import make_s3_urn from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes @@ -35,6 +36,7 @@ register_custom_type, ) from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, gen_database_container, @@ -48,6 +50,15 @@ get_schema_fields_for_sqlalchemy_column, ) +try: + from typing_extensions import override +except ImportError: + _F = typing.TypeVar("_F", bound=typing.Callable[..., typing.Any]) + + def override(f: _F, /) -> _F: # noqa: F811 + return f + + logger = logging.getLogger(__name__) 
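An aside on the fallback shim above: @override is a marker for static type checkers only, so a no-op identity decorator preserves runtime behavior when typing_extensions is unavailable. A minimal self-contained sketch (not part of this diff):

    try:
        from typing_extensions import override
    except ImportError:
        def override(f):  # no-op stand-in; @override has no runtime effect anyway
            return f

    class Base:
        def get_schema_names(self) -> list:
            return []

    class Derived(Base):
        @override  # a type checker flags this if Base stops defining get_schema_names
        def get_schema_names(self) -> list:
            return ["default"]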
assert STRUCT, "required type modules are not available" @@ -322,12 +333,15 @@ class AthenaSource(SQLAlchemySource): - Profiling when enabled. """ - table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {} + config: AthenaConfig + report: SQLSourceReport def __init__(self, config, ctx): super().__init__(config, ctx, "athena") self.cursor: Optional[BaseCursor] = None + self.table_partition_cache: Dict[str, Dict[str, Partitionitem]] = {} + @classmethod def create(cls, config_dict, ctx): config = AthenaConfig.parse_obj(config_dict) @@ -452,6 +466,7 @@ def add_table_to_schema_container( ) # It seems like database/schema filter in the connection string does not work and this to work around that + @override def get_schema_names(self, inspector: Inspector) -> List[str]: athena_config = typing.cast(AthenaConfig, self.config) schemas = inspector.get_schema_names() @@ -459,34 +474,42 @@ def get_schema_names(self, inspector: Inspector) -> List[str]: return [schema for schema in schemas if schema == athena_config.database] return schemas - # Overwrite to get partitions + @classmethod + def _casted_partition_key(cls, key: str) -> str: + # We need to cast the partition keys to a VARCHAR, since otherwise + # Athena may throw an error during concatenation / comparison. + return f"CAST({key} as VARCHAR)" + + @override def get_partitions( self, inspector: Inspector, schema: str, table: str - ) -> List[str]: - partitions = [] - - athena_config = typing.cast(AthenaConfig, self.config) - - if not athena_config.extract_partitions: - return [] + ) -> Optional[List[str]]: + if not self.config.extract_partitions: + return None if not self.cursor: - return [] + return None metadata: AthenaTableMetadata = self.cursor.get_table_metadata( table_name=table, schema_name=schema ) - if metadata.partition_keys: - for key in metadata.partition_keys: - if key.name: - partitions.append(key.name) - - if not partitions: - return [] + partitions = [] + for key in metadata.partition_keys: + if key.name: + partitions.append(key.name) + if not partitions: + return [] - # We create an artiificaial concatenated partition key to be able to query max partition easier - part_concat = "|| '-' ||".join(partitions) + with self.report.report_exc( + message="Failed to extract partition details", + context=f"{schema}.{table}", + level=StructuredLogLevel.WARN, + ): + # We create an artificial concatenated partition key to be able to query the max partition more easily + part_concat = " || '-' || ".join( + self._casted_partition_key(key) for key in partitions + ) max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")' ret = self.cursor.execute(max_partition_query) max_partition: Dict[str, str] = {} @@ -500,9 +523,8 @@ def get_partitions( partitions=partitions, max_partition=max_partition, ) - return partitions - return [] + return partitions # Overwrite to modify the creation of schema fields def get_schema_fields_for_column( @@ -551,7 +573,9 @@ def generate_partition_profiler_query( if partition and partition.max_partition: max_partition_filters = [] for key, value in partition.max_partition.items(): - max_partition_filters.append(f"CAST({key} as VARCHAR) = '{value}'") + max_partition_filters.append( + f"{self._casted_partition_key(key)} = '{value}'" + ) max_partition = str(partition.max_partition) return ( max_partition, diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py
b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py index 21e7fad3343314..5107a4e38f64de 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/job_models.py @@ -101,6 +101,7 @@ class StoredProcedure: flow: Union[MSSQLJob, MSSQLProceduresContainer] type: str = "STORED_PROCEDURE" source: str = "mssql" + code: Optional[str] = None @property def full_type(self) -> str: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index c19b22a8622ca2..7a2dbda8b4a939 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -24,6 +24,8 @@ platform_name, support_status, ) +from datahub.ingestion.api.source import StructuredLogLevel +from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.sql.mssql.job_models import ( JobStep, @@ -36,6 +38,9 @@ ProcedureParameter, StoredProcedure, ) +from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import ( + generate_procedure_lineage, +) from datahub.ingestion.source.sql.sql_common import ( SQLAlchemySource, SqlWorkUnit, @@ -45,12 +50,14 @@ BasicSQLAlchemyConfig, make_sqlalchemy_uri, ) +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.metadata.schema_classes import ( BooleanTypeClass, NumberTypeClass, StringTypeClass, UnionTypeClass, ) +from datahub.utilities.file_backed_collections import FileBackedList logger: logging.Logger = logging.getLogger(__name__) @@ -72,6 +79,11 @@ class SQLServerConfig(BasicSQLAlchemyConfig): include_stored_procedures_code: bool = Field( default=True, description="Include information about object code." ) + procedure_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for stored procedures to filter in ingestion. " + "Specify a regex that matches the entire procedure name in database.schema.procedure_name format. For example, to match all procedures starting with 'customer' in the Customer database and public schema, use the regex 'Customer.public.customer.*'", + ) include_jobs: bool = Field( default=True, description="Include ingest of MSSQL Jobs. Requires access to the 'msdb' and 'sys' schema.", @@ -99,6 +111,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig): default=False, description="Enable to convert the SQL Server assets urns to lowercase", ) + include_lineage: bool = Field( + default=True, + description="Enable lineage extraction for stored procedures", + ) @pydantic.validator("uri_args") def passwords_match(cls, v, values, **kwargs): @@ -154,6 +170,8 @@ class SQLServerSource(SQLAlchemySource): If you do use pyodbc, make sure to change the source type from `mssql` to `mssql-odbc` so that we pull in the right set of dependencies. This will be needed in most cases where encryption is required, such as managed SQL Server services in Azure.
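As an illustration of the new procedure_pattern option (hypothetical names; matching follows the usual AllowDenyPattern semantics against database.schema.procedure_name):

    from datahub.configuration.common import AllowDenyPattern

    # Hypothetical: keep only procedures under Customer.public named customer*
    pattern = AllowDenyPattern(allow=["Customer\\.public\\.customer.*"])
    assert pattern.allowed("Customer.public.customer_refresh")
    assert not pattern.allowed("Sales.dbo.nightly_rollup")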
""" + report: SQLSourceReport + def __init__(self, config: SQLServerConfig, ctx: PipelineContext): super().__init__(config, ctx, "mssql") # Cache the table and column descriptions @@ -161,6 +179,7 @@ def __init__(self, config: SQLServerConfig, ctx: PipelineContext): self.current_database = None self.table_descriptions: Dict[str, str] = {} self.column_descriptions: Dict[str, str] = {} + self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList() if self.config.include_descriptions: for inspector in self.get_inspectors(): db_name: str = self.get_db_name(inspector) @@ -374,7 +393,7 @@ def loop_jobs( def loop_job_steps( self, job: MSSQLJob, job_steps: Dict[str, Any] ) -> Iterable[MetadataWorkUnit]: - for step_id, step_data in job_steps.items(): + for _step_id, step_data in job_steps.items(): step = JobStep( job_name=job.formatted_name, step_name=step_data["step_name"], @@ -405,44 +424,57 @@ def loop_stored_procedures( # noqa: C901 data_flow = MSSQLDataFlow(entity=mssql_default_job) with inspector.engine.connect() as conn: procedures_data_list = self._get_stored_procedures(conn, db_name, schema) - procedures = [ - StoredProcedure(flow=mssql_default_job, **procedure_data) - for procedure_data in procedures_data_list - ] + procedures: List[StoredProcedure] = [] + for procedure_data in procedures_data_list: + procedure_full_name = f"{db_name}.{schema}.{procedure_data['name']}" + if not self.config.procedure_pattern.allowed(procedure_full_name): + self.report.report_dropped(procedure_full_name) + continue + procedures.append( + StoredProcedure(flow=mssql_default_job, **procedure_data) + ) + if procedures: yield from self.construct_flow_workunits(data_flow=data_flow) for procedure in procedures: - upstream = self._get_procedure_upstream(conn, procedure) - downstream = self._get_procedure_downstream(conn, procedure) - data_job = MSSQLDataJob( - entity=procedure, - ) - # TODO: because of this upstream and downstream are more dependencies, - # can't be used as DataJobInputOutput. - # Should be reorganized into lineage. - data_job.add_property("procedure_depends_on", str(upstream.as_property)) - data_job.add_property( - "depending_on_procedure", str(downstream.as_property) - ) - procedure_definition, procedure_code = self._get_procedure_code( - conn, procedure - ) - if procedure_definition: - data_job.add_property("definition", procedure_definition) - if sql_config.include_stored_procedures_code and procedure_code: - data_job.add_property("code", procedure_code) - procedure_inputs = self._get_procedure_inputs(conn, procedure) - properties = self._get_procedure_properties(conn, procedure) - data_job.add_property( - "input parameters", str([param.name for param in procedure_inputs]) - ) - for param in procedure_inputs: - data_job.add_property( - f"parameter {param.name}", str(param.properties) - ) - for property_name, property_value in properties.items(): - data_job.add_property(property_name, str(property_value)) - yield from self.construct_job_workunits(data_job) + yield from self._process_stored_procedure(conn, procedure) + + def _process_stored_procedure( + self, conn: Connection, procedure: StoredProcedure + ) -> Iterable[MetadataWorkUnit]: + upstream = self._get_procedure_upstream(conn, procedure) + downstream = self._get_procedure_downstream(conn, procedure) + data_job = MSSQLDataJob( + entity=procedure, + ) + # TODO: because of this upstream and downstream are more dependencies, + # can't be used as DataJobInputOutput. + # Should be reorganized into lineage. 
+ data_job.add_property("procedure_depends_on", str(upstream.as_property)) + data_job.add_property("depending_on_procedure", str(downstream.as_property)) + procedure_definition, procedure_code = self._get_procedure_code(conn, procedure) + procedure.code = procedure_code + if procedure_definition: + data_job.add_property("definition", procedure_definition) + if procedure_code and self.config.include_stored_procedures_code: + data_job.add_property("code", procedure_code) + procedure_inputs = self._get_procedure_inputs(conn, procedure) + properties = self._get_procedure_properties(conn, procedure) + data_job.add_property( + "input parameters", str([param.name for param in procedure_inputs]) + ) + for param in procedure_inputs: + data_job.add_property(f"parameter {param.name}", str(param.properties)) + for property_name, property_value in properties.items(): + data_job.add_property(property_name, str(property_value)) + if self.config.include_lineage: + # These will be used to construct lineage + self.stored_procedures.append(procedure) + yield from self.construct_job_workunits( + data_job, + # For stored procedures, lineage is ingested later + include_lineage=False, + ) @staticmethod def _get_procedure_downstream( @@ -546,8 +578,8 @@ def _get_procedure_code( code_list.append(row["Text"]) if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip(): code_slice_index = index - definition = "\n".join(code_list[:code_slice_index]) - code = "\n".join(code_list[code_slice_index:]) + definition = "".join(code_list[:code_slice_index]) + code = "".join(code_list[code_slice_index:]) except ResourceClosedError: logger.warning( "Connection was closed from procedure '%s'", @@ -602,16 +634,18 @@ def _get_stored_procedures( def construct_job_workunits( self, data_job: MSSQLDataJob, + include_lineage: bool = True, ) -> Iterable[MetadataWorkUnit]: yield MetadataChangeProposalWrapper( entityUrn=data_job.urn, aspect=data_job.as_datajob_info_aspect, ).as_workunit() - yield MetadataChangeProposalWrapper( - entityUrn=data_job.urn, - aspect=data_job.as_datajob_input_output_aspect, - ).as_workunit() + if include_lineage: + yield MetadataChangeProposalWrapper( + entityUrn=data_job.urn, + aspect=data_job.as_datajob_input_output_aspect, + ).as_workunit() # TODO: Add SubType when it appears def construct_flow_workunits( @@ -664,3 +698,58 @@ def get_identifier( if self.config.convert_urns_to_lowercase else qualified_table_name ) + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + yield from super().get_workunits_internal() + + # This is done at the end so that we will have access to tables + # from all databases in schema_resolver and discovered_tables + for procedure in self.stored_procedures: + with self.report.report_exc( + message="Failed to parse stored procedure lineage", + context=procedure.full_name, + level=StructuredLogLevel.WARN, + ): + yield from auto_workunit( + generate_procedure_lineage( + schema_resolver=self.schema_resolver, + procedure=procedure, + procedure_job_urn=MSSQLDataJob(entity=procedure).urn, + is_temp_table=self.is_temp_table, + ) + ) + + def is_temp_table(self, name: str) -> bool: + try: + parts = name.split(".") + table_name = parts[-1] + schema_name = parts[-2] + db_name = parts[-3] + + if table_name.startswith("#"): + return True + + # This is also a temp table if + # 1. this name would be allowed by the dataset patterns, and + # 2. we have a list of discovered tables, and + # 3.
it's not in the discovered tables list + if ( + self.config.database_pattern.allowed(db_name) + and self.config.schema_pattern.allowed(schema_name) + and self.config.table_pattern.allowed(name) + and self.standardize_identifier_case(name) + not in self.discovered_datasets + ): + logger.debug(f"inferred as temp table {name}") + return True + + except Exception: + logger.warning(f"Error parsing table name {name} ") + return False + + def standardize_identifier_case(self, table_ref_str: str) -> str: + return ( + table_ref_str.lower() + if self.config.convert_urns_to_lowercase + else table_ref_str + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py new file mode 100644 index 00000000000000..b979a270a55282 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py @@ -0,0 +1,84 @@ +import logging +from typing import Callable, Iterable, Optional + +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure +from datahub.metadata.schema_classes import DataJobInputOutputClass +from datahub.sql_parsing.datajob import to_datajob_input_output +from datahub.sql_parsing.schema_resolver import SchemaResolver +from datahub.sql_parsing.split_statements import split_statements +from datahub.sql_parsing.sql_parsing_aggregator import ( + ObservedQuery, + SqlParsingAggregator, +) + +logger = logging.getLogger(__name__) + + +def parse_procedure_code( + *, + schema_resolver: SchemaResolver, + default_db: Optional[str], + default_schema: Optional[str], + code: str, + is_temp_table: Callable[[str], bool], + raise_: bool = False, +) -> Optional[DataJobInputOutputClass]: + aggregator = SqlParsingAggregator( + platform=schema_resolver.platform, + env=schema_resolver.env, + schema_resolver=schema_resolver, + generate_lineage=True, + generate_queries=False, + generate_usage_statistics=False, + generate_operations=False, + generate_query_subject_fields=False, + generate_query_usage_statistics=False, + is_temp_table=is_temp_table, + ) + for query in split_statements(code): + # TODO: We should take into account `USE x` statements. + aggregator.add_observed_query( + observed=ObservedQuery( + default_db=default_db, + default_schema=default_schema, + query=query, + ) + ) + if aggregator.report.num_observed_queries_failed and raise_: + logger.info(aggregator.report.as_string()) + raise ValueError( + f"Failed to parse {aggregator.report.num_observed_queries_failed} queries." + ) + + mcps = list(aggregator.gen_metadata()) + return to_datajob_input_output( + mcps=mcps, + ignore_extra_mcps=True, + ) + + +# Is procedure handling generic enough to be added to SqlParsingAggregator? 
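+# A hedged usage sketch (illustrative SQL and names; the real entry point is
+# generate_procedure_lineage below):
+#
+#   io = parse_procedure_code(
+#       schema_resolver=SchemaResolver(platform="mssql"),
+#       default_db="Customer",
+#       default_schema="public",
+#       code="INSERT INTO orders_summary SELECT customer_id, COUNT(*) FROM orders GROUP BY customer_id;",
+#       is_temp_table=lambda name: name.split(".")[-1].startswith("#"),
+#   )
+#   # io is a DataJobInputOutputClass with orders as input and orders_summary
+#   # as output, or None if no lineage could be parsed.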
+def generate_procedure_lineage( + *, + schema_resolver: SchemaResolver, + procedure: StoredProcedure, + procedure_job_urn: str, + is_temp_table: Callable[[str], bool] = lambda _: False, + raise_: bool = False, +) -> Iterable[MetadataChangeProposalWrapper]: + if procedure.code: + datajob_input_output = parse_procedure_code( + schema_resolver=schema_resolver, + default_db=procedure.db, + default_schema=procedure.schema, + code=procedure.code, + is_temp_table=is_temp_table, + raise_=raise_, + ) + + if datajob_input_output: + yield MetadataChangeProposalWrapper( + entityUrn=procedure_job_urn, + aspect=datajob_input_output, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py index 766b704d6ffafe..52db3cd11a759d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/oracle.py @@ -1,3 +1,4 @@ +import datetime import logging import re @@ -631,3 +632,52 @@ def get_workunits(self): clear=False, ): return super().get_workunits() + + def generate_profile_candidates( + self, + inspector: Inspector, + threshold_time: Optional[datetime.datetime], + schema: str, + ) -> Optional[List[str]]: + tables_table_name = ( + "ALL_TABLES" if self.config.data_dictionary_mode == "ALL" else "DBA_TABLES" + ) + + # If stats are available, they are used even if they are stale. + # Assuming that a table typically grows over time, this ensures that large tables + # known at stats collection time are filtered out of the profiling candidates. + # If stats are not available (NULL), such tables are not filtered out and remain + # profiling candidates. + cursor = inspector.bind.execute( + sql.text( + f"""SELECT + t.OWNER, + t.TABLE_NAME, + t.NUM_ROWS, + t.LAST_ANALYZED, + COALESCE(t.NUM_ROWS * t.AVG_ROW_LEN, 0) / (1024 * 1024 * 1024) AS SIZE_GB + FROM {tables_table_name} t + WHERE t.OWNER = :owner + AND (t.NUM_ROWS < :table_row_limit OR t.NUM_ROWS IS NULL) + AND COALESCE(t.NUM_ROWS * t.AVG_ROW_LEN, 0) / (1024 * 1024 * 1024) < :table_size_limit + """ + ), + dict( + owner=inspector.dialect.denormalize_name(schema), + table_row_limit=self.config.profiling.profile_table_row_limit, + table_size_limit=self.config.profiling.profile_table_size_limit, + ), + ) + + TABLE_NAME_COL_LOC = 1 + return [ + self.get_identifier( + schema=schema, + entity=inspector.dialect.normalize_name(row[TABLE_NAME_COL_LOC]) + or _raise_err( + ValueError(f"Invalid table name: {row[TABLE_NAME_COL_LOC]}") + ), + inspector=inspector, + ) + for row in cursor + ] diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index e5779791ed4120..41ffcb95a7cc43 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -51,7 +51,6 @@ from datahub.ingestion.glossary.classification_mixin import ( SAMPLE_SIZE_MULTIPLIER, ClassificationHandler, - ClassificationReportMixin, ) from datahub.ingestion.source.common.data_reader import DataReader from datahub.ingestion.source.common.subtypes import ( @@ -59,6 +58,7 @@ DatasetSubTypes, ) from datahub.ingestion.source.sql.sql_config import SQLCommonConfig +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, downgrade_schema_from_v2, @@ -74,7 +74,6 @@ ) from 
datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, - StaleEntityRemovalSourceReport, ) from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, @@ -118,9 +117,7 @@ ) from datahub.telemetry import telemetry from datahub.utilities.file_backed_collections import FileBackedDict -from datahub.utilities.lossy_collections import LossyList from datahub.utilities.registries.domain_registry import DomainRegistry -from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport from datahub.utilities.sqlalchemy_type_converter import ( get_native_data_type_for_sqlalchemy_type, ) @@ -134,43 +131,6 @@ logger: logging.Logger = logging.getLogger(__name__) -@dataclass -class SQLSourceReport(StaleEntityRemovalSourceReport, ClassificationReportMixin): - tables_scanned: int = 0 - views_scanned: int = 0 - entities_profiled: int = 0 - filtered: LossyList[str] = field(default_factory=LossyList) - - query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None - - num_view_definitions_parsed: int = 0 - num_view_definitions_failed_parsing: int = 0 - num_view_definitions_failed_column_parsing: int = 0 - view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList) - - def report_entity_scanned(self, name: str, ent_type: str = "table") -> None: - """ - Entity could be a view or a table - """ - if ent_type == "table": - self.tables_scanned += 1 - elif ent_type == "view": - self.views_scanned += 1 - else: - raise KeyError(f"Unknown entity {ent_type}.") - - def report_entity_profiled(self, name: str) -> None: - self.entities_profiled += 1 - - def report_dropped(self, ent_name: str) -> None: - self.filtered.append(ent_name) - - def report_from_query_combiner( - self, query_combiner_report: SQLAlchemyQueryCombinerReport - ) -> None: - self.query_combiner = query_combiner_report - - class SqlWorkUnit(MetadataWorkUnit): pass @@ -352,7 +312,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource): def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str): super().__init__(config, ctx) - self.config = config + self.config: SQLCommonConfig = config self.platform = platform self.report: SQLSourceReport = SQLSourceReport() self.profile_metadata_info: ProfileMetadata = ProfileMetadata() @@ -392,6 +352,7 @@ def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str) platform_instance=self.config.platform_instance, env=self.config.env, ) + self.discovered_datasets: Set[str] = set() self._view_definition_cache: MutableMapping[str, str] if self.config.use_file_backed_cache: self._view_definition_cache = FileBackedDict[str]() @@ -831,8 +792,9 @@ def _process_table( self._classify(dataset_name, schema, table, data_reader, schema_metadata) dataset_snapshot.aspects.append(schema_metadata) - if self.config.include_view_lineage: + if self._save_schema_to_resolver(): self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata) + self.discovered_datasets.add(dataset_name) db_name = self.get_db_name(inspector) yield from self.add_table_to_schema_container( @@ -1126,8 +1088,9 @@ def _process_view( columns, canonical_schema=schema_fields, ) - if self.config.include_view_lineage: + if self._save_schema_to_resolver(): self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata) + self.discovered_datasets.add(dataset_name) description, properties, _ = self.get_table_properties(inspector, schema, view) try: view_definition = 
inspector.get_view_definition(view, schema) @@ -1190,6 +1153,11 @@ def _process_view( domain_registry=self.domain_registry, ) + def _save_schema_to_resolver(self): + return self.config.include_view_lineage or ( + hasattr(self.config, "include_lineage") and self.config.include_lineage + ) + def _run_sql_parser( self, view_identifier: str, query: str, schema_resolver: SchemaResolver ) -> Optional[SqlParsingResult]: @@ -1274,17 +1242,22 @@ def generate_profile_candidates( def is_dataset_eligible_for_profiling( self, dataset_name: str, - sql_config: SQLCommonConfig, + schema: str, inspector: Inspector, profile_candidates: Optional[List[str]], ) -> bool: - return ( - sql_config.table_pattern.allowed(dataset_name) - and sql_config.profile_pattern.allowed(dataset_name) - ) and ( - profile_candidates is None - or (profile_candidates is not None and dataset_name in profile_candidates) - ) + if not ( + self.config.table_pattern.allowed(dataset_name) + and self.config.profile_pattern.allowed(dataset_name) + ): + self.report.profiling_skipped_table_profile_pattern[schema] += 1 + return False + + if profile_candidates is not None and dataset_name not in profile_candidates: + self.report.profiling_skipped_other[schema] += 1 + return False + + return True def loop_profiler_requests( self, @@ -1299,7 +1272,7 @@ def loop_profiler_requests( if ( sql_config.profiling.profile_if_updated_since_days is not None or sql_config.profiling.profile_table_size_limit is not None - or sql_config.profiling.profile_table_row_limit is None + or sql_config.profiling.profile_table_row_limit is not None ): try: threshold_time: Optional[datetime.datetime] = None @@ -1320,8 +1293,9 @@ def loop_profiler_requests( schema=schema, entity=table, inspector=inspector ) if not self.is_dataset_eligible_for_profiling( - dataset_name, sql_config, inspector, profile_candidates + dataset_name, schema, inspector, profile_candidates ): + self.report.num_tables_not_eligible_profiling[schema] += 1 if self.config.profiling.report_dropped_profiles: self.report.report_dropped(f"profile of {dataset_name}") continue diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py index 9c8e475e7b3074..bd6c23cc2d4644 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py @@ -1,6 +1,6 @@ import logging from abc import abstractmethod -from dataclasses import dataclass, field +from dataclasses import dataclass from datetime import datetime, timedelta, timezone from typing import Dict, Iterable, List, Optional, Union, cast @@ -14,42 +14,13 @@ DatahubGEProfiler, GEProfilerRequest, ) -from datahub.ingestion.source.sql.sql_common import SQLSourceReport from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.sql.sql_generic import BaseTable, BaseView +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.ingestion.source.sql.sql_utils import check_table_with_profile_pattern from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile from datahub.metadata.com.linkedin.pegasus2avro.timeseries import PartitionType -from datahub.utilities.stats_collections import TopKDict, int_top_k_dict - - -@dataclass -class DetailedProfilerReportMixin: - profiling_skipped_not_updated: 
TopKDict[str, int] = field( - default_factory=int_top_k_dict - ) - profiling_skipped_size_limit: TopKDict[str, int] = field( - default_factory=int_top_k_dict - ) - - profiling_skipped_row_limit: TopKDict[str, int] = field( - default_factory=int_top_k_dict - ) - - profiling_skipped_table_profile_pattern: TopKDict[str, int] = field( - default_factory=int_top_k_dict - ) - - profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict) - - num_tables_not_eligible_profiling: Dict[str, int] = field( - default_factory=int_top_k_dict - ) - - -class ProfilingSqlReport(DetailedProfilerReportMixin, SQLSourceReport): - pass @dataclass @@ -65,7 +36,7 @@ class GenericProfiler: def __init__( self, config: SQLCommonConfig, - report: ProfilingSqlReport, + report: SQLSourceReport, platform: str, state_handler: Optional[ProfilingHandler] = None, ) -> None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_report.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_report.py new file mode 100644 index 00000000000000..c1f722b5d1e783 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_report.py @@ -0,0 +1,75 @@ +from dataclasses import dataclass, field +from typing import Dict, Optional + +from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalSourceReport, +) +from datahub.utilities.lossy_collections import LossyList +from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport +from datahub.utilities.stats_collections import TopKDict, int_top_k_dict + + +@dataclass +class DetailedProfilerReportMixin: + profiling_skipped_not_updated: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) + profiling_skipped_size_limit: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) + + profiling_skipped_row_limit: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) + + profiling_skipped_table_profile_pattern: TopKDict[str, int] = field( + default_factory=int_top_k_dict + ) + + profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict) + + num_tables_not_eligible_profiling: Dict[str, int] = field( + default_factory=int_top_k_dict + ) + + +@dataclass +class SQLSourceReport( + StaleEntityRemovalSourceReport, + ClassificationReportMixin, + DetailedProfilerReportMixin, +): + tables_scanned: int = 0 + views_scanned: int = 0 + entities_profiled: int = 0 + filtered: LossyList[str] = field(default_factory=LossyList) + + query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None + + num_view_definitions_parsed: int = 0 + num_view_definitions_failed_parsing: int = 0 + num_view_definitions_failed_column_parsing: int = 0 + view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList) + + def report_entity_scanned(self, name: str, ent_type: str = "table") -> None: + """ + Entity could be a view or a table + """ + if ent_type == "table": + self.tables_scanned += 1 + elif ent_type == "view": + self.views_scanned += 1 + else: + raise KeyError(f"Unknown entity {ent_type}.") + + def report_entity_profiled(self, name: str) -> None: + self.entities_profiled += 1 + + def report_dropped(self, ent_name: str) -> None: + self.filtered.append(ent_name) + + def report_from_query_combiner( + self, query_combiner_report: SQLAlchemyQueryCombinerReport + ) -> None: + self.query_combiner = query_combiner_report diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py index 8ea4209784063f..89ca160ba1f487 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py @@ -1,5 +1,5 @@ import re -from typing import Any, Dict, ValuesView +from typing import Any, Dict, Optional, Type, Union, ValuesView from datahub.metadata.com.linkedin.pegasus2avro.schema import ( ArrayType, @@ -16,14 +16,28 @@ UnionType, ) -# these can be obtained by running `select format_type(oid, null),* from pg_type;` -# we've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.) -# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV) +DATAHUB_FIELD_TYPE = Union[ + ArrayType, + BooleanType, + BytesType, + DateType, + EnumType, + MapType, + NullType, + NumberType, + RecordType, + StringType, + TimeType, + UnionType, +] -# we map from format_type since this is what dbt uses -# see https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22 -# see https://www.npgsql.org/dev/types.html for helpful type annotations +# These can be obtained by running `select format_type(oid, null),* from pg_type;` +# We've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.) +# (run `\copy (select format_type(oid, null),* from pg_type) to 'pg_type.csv' csv header;` to get a CSV) +# We map from format_type since this is what dbt uses. +# See https://github.com/fishtown-analytics/dbt/blob/master/plugins/postgres/dbt/include/postgres/macros/catalog.sql#L22 +# See https://www.npgsql.org/dev/types.html for helpful type annotations POSTGRES_TYPES_MAP: Dict[str, Any] = { "boolean": BooleanType, "bytea": BytesType, @@ -430,3 +444,54 @@ def resolve_vertica_modified_type(type_string: str) -> Any: "geography": None, "uuid": StringType, } + + +_merged_mapping = { + "boolean": BooleanType, + "date": DateType, + "time": TimeType, + "numeric": NumberType, + "text": StringType, + "timestamp with time zone": DateType, + "timestamp without time zone": DateType, + "integer": NumberType, + "float8": NumberType, + "struct": RecordType, + **POSTGRES_TYPES_MAP, + **SNOWFLAKE_TYPES_MAP, + **BIGQUERY_TYPES_MAP, + **SPARK_SQL_TYPES_MAP, + **TRINO_SQL_TYPES_MAP, + **ATHENA_SQL_TYPES_MAP, + **VERTICA_SQL_TYPES_MAP, +} + + +def resolve_sql_type( + column_type: Optional[str], + platform: Optional[str] = None, +) -> Optional[DATAHUB_FIELD_TYPE]: + # In theory, we should use the platform-specific mapping where available. + # However, the types don't ever conflict, so the merged mapping is fine. + TypeClass: Optional[Type[DATAHUB_FIELD_TYPE]] = ( + _merged_mapping.get(column_type) if column_type else None + ) + + if TypeClass is None and column_type: + # resolve a modified type + if platform == "trino": + TypeClass = resolve_trino_modified_type(column_type) + elif platform == "athena": + TypeClass = resolve_athena_modified_type(column_type) + elif platform == "postgres" or platform == "redshift": + # Redshift uses a variant of Postgres, so we can use the same logic. + TypeClass = resolve_postgres_modified_type(column_type) + elif platform == "vertica": + TypeClass = resolve_vertica_modified_type(column_type) + elif platform == "snowflake": + # Snowflake types are uppercase, so we check that. 
+ TypeClass = _merged_mapping.get(column_type.upper()) + + if TypeClass: + return TypeClass() + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py index 53b1ddfcde5952..e42564975c3d19 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py @@ -44,7 +44,7 @@ from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type from datahub.ingestion.source.sql.sql_config import SQLCommonConfig -from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.ingestion.source.sql.two_tier_sql_source import ( TwoTierSQLAlchemyConfig, TwoTierSQLAlchemySource, @@ -330,7 +330,7 @@ def optimized_get_view_definition( @dataclass -class TeradataReport(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport): +class TeradataReport(SQLSourceReport, IngestionStageReport, BaseTimeWindowReport): num_queries_parsed: int = 0 num_view_ddl_parsed: int = 0 num_table_parse_failures: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py index a340f049731c46..92487d48b99e63 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py @@ -27,7 +27,6 @@ from datahub.ingestion.source.common.data_reader import DataReader from datahub.ingestion.source.sql.sql_common import ( SQLAlchemySource, - SQLSourceReport, SqlWorkUnit, get_schema_metadata, ) @@ -35,6 +34,7 @@ BasicSQLAlchemyConfig, SQLCommonConfig, ) +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.ingestion.source.sql.sql_utils import get_domain_wu from datahub.metadata.com.linkedin.pegasus2avro.common import StatusClass from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage @@ -536,7 +536,7 @@ def loop_profiler_requests( ) if not self.is_dataset_eligible_for_profiling( - dataset_name, sql_config, inspector, profile_candidates + dataset_name, schema, inspector, profile_candidates ): if self.config.profiling.report_dropped_profiles: self.report.report_dropped(f"profile of {dataset_name}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py index 8d6746b6433a4e..ac917c5f128ed2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau_common.py @@ -643,8 +643,11 @@ def create( cls, d: dict, default_schema_map: Optional[Dict[str, str]] = None ) -> "TableauUpstreamReference": # Values directly from `table` object from Tableau - database = t_database = d.get(c.DATABASE, {}).get(c.NAME) - database_id = d.get(c.DATABASE, {}).get(c.ID) + database_dict = ( + d.get(c.DATABASE) or {} + ) # this sometimes is None, so we need the `or {}` + database = t_database = database_dict.get(c.NAME) + database_id = database_dict.get(c.ID) schema = t_schema = d.get(c.SCHEMA) table = t_table = d.get(c.NAME) or "" t_full_name = d.get(c.FULL_NAME) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py 
b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py index f84f6c1b0c08d6..9c5752c518df14 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py @@ -33,6 +33,7 @@ logger = logging.getLogger(__name__) +# TODO: (maybe) Replace with standardized types in sql_types.py DATA_TYPE_REGISTRY: dict = { ColumnTypeName.BOOLEAN: BooleanTypeClass, ColumnTypeName.BYTE: BytesTypeClass, diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py index f4579376a3b3a4..f16769341853a1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py @@ -2,7 +2,7 @@ from typing import Optional, Tuple from datahub.ingestion.api.report import EntityFilterReport, Report -from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport +from datahub.ingestion.source.sql.sql_report import SQLSourceReport from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.utilities.lossy_collections import LossyDict, LossyList from datahub.utilities.perf_timer import PerfTimer @@ -19,7 +19,7 @@ class UnityCatalogUsagePerfReport(Report): @dataclass -class UnityCatalogReport(IngestionStageReport, ProfilingSqlReport): +class UnityCatalogReport(IngestionStageReport, SQLSourceReport): metastores: EntityFilterReport = EntityFilterReport.field(type="metastore") catalogs: EntityFilterReport = EntityFilterReport.field(type="catalog") schemas: EntityFilterReport = EntityFilterReport.field(type="schema") diff --git a/metadata-ingestion/src/datahub/sql_parsing/datajob.py b/metadata-ingestion/src/datahub/sql_parsing/datajob.py new file mode 100644 index 00000000000000..215b207c3dcf51 --- /dev/null +++ b/metadata-ingestion/src/datahub/sql_parsing/datajob.py @@ -0,0 +1,50 @@ +import logging +from typing import Iterable, List, Optional + +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import ( + DataJobInputOutputClass, + FineGrainedLineageClass, + UpstreamLineageClass, +) + +logger = logging.getLogger(__name__) + + +def to_datajob_input_output( + *, mcps: Iterable[MetadataChangeProposalWrapper], ignore_extra_mcps: bool = True +) -> Optional[DataJobInputOutputClass]: + inputDatasets: List[str] = [] + outputDatasets: List[str] = [] + fineGrainedLineages: List[FineGrainedLineageClass] = [] + for mcp in mcps: + # TODO: Represent simple write operations without lineage as outputDatasets. 
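+ # Each MCP that carries an upstreamLineage aspect is folded in: the MCP's
+ # entity becomes an output dataset, its upstreams become input datasets, and
+ # any fine-grained (column-level) lineage entries are carried over as-is.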
+ + upstream_lineage = mcp.as_workunit().get_aspect_of_type(UpstreamLineageClass) + if upstream_lineage is not None: + if mcp.entityUrn and mcp.entityUrn not in outputDatasets: + outputDatasets.append(mcp.entityUrn) + + for upstream in upstream_lineage.upstreams: + if upstream.dataset not in inputDatasets: + inputDatasets.append(upstream.dataset) + + if upstream_lineage.fineGrainedLineages: + for fineGrainedLineage in upstream_lineage.fineGrainedLineages: + fineGrainedLineages.append(fineGrainedLineage) + + elif ignore_extra_mcps: + pass + else: + raise ValueError( + f"Expected an upstreamLineage aspect, got {mcp.aspectName} for {mcp.entityUrn}" + ) + + if not inputDatasets and not outputDatasets: + return None + + return DataJobInputOutputClass( + inputDatasets=inputDatasets, + outputDatasets=outputDatasets, + fineGrainedLineages=fineGrainedLineages, + ) diff --git a/metadata-ingestion/src/datahub/sql_parsing/query_types.py b/metadata-ingestion/src/datahub/sql_parsing/query_types.py index 2acad19418c113..802fb3e993f428 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/query_types.py +++ b/metadata-ingestion/src/datahub/sql_parsing/query_types.py @@ -14,7 +14,16 @@ def _is_temp_table(table: sqlglot.exp.Table, dialect: sqlglot.Dialect) -> bool: identifier: sqlglot.exp.Identifier = table.this return identifier.args.get("temporary") or ( - is_dialect_instance(dialect, "redshift") and identifier.name.startswith("#") + # These dialects use # as a prefix for temp tables. + is_dialect_instance( + dialect, + [ + "redshift", + "mssql", + # sybase is another one, but we don't support that dialect yet. + ], + ) + and identifier.name.startswith("#") ) diff --git a/metadata-ingestion/src/datahub/sql_parsing/split_statements.py b/metadata-ingestion/src/datahub/sql_parsing/split_statements.py new file mode 100644 index 00000000000000..42dda4e62158b0 --- /dev/null +++ b/metadata-ingestion/src/datahub/sql_parsing/split_statements.py @@ -0,0 +1,163 @@ +import re +from enum import Enum +from typing import Generator, List, Tuple + +CONTROL_FLOW_KEYWORDS = [ + "GO", + r"BEGIN\w+TRY", + r"BEGIN\w+CATCH", + "BEGIN", + r"END\w+TRY", + r"END\w+CATCH", + "END", +] + +# There's an exception to this rule, which is when the statement +# is preceded by a CTE. +FORCE_NEW_STATEMENT_KEYWORDS = [ + # SELECT is used inside queries as well, so we can't include it here. + "INSERT", + "UPDATE", + "DELETE", + "MERGE", +] + + +class ParserState(Enum): + NORMAL = 1 + STRING = 2 + COMMENT = 3 + MULTILINE_COMMENT = 4 + + +def _is_keyword_at_position(sql: str, pos: int, keyword: str) -> bool: + """ + Check if a keyword exists at the given position using regex word boundaries. + """ + if pos + len(keyword) > len(sql): + return False + + # If we're not at a word boundary, this position can't start a keyword. + if pos > 0 and not ( + bool(re.match(r"\w\W", sql[pos - 1 : pos + 1])) + or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1])) + ): + return False + + pattern = rf"^{re.escape(keyword)}\b" + match = re.match(pattern, sql[pos:], re.IGNORECASE) + return bool(match) + + +def _look_ahead_for_keywords( + sql: str, pos: int, keywords: List[str] +) -> Tuple[bool, str, int]: + """ + Look ahead for SQL keywords at the current position. + """ + + for keyword in keywords: + if _is_keyword_at_position(sql, pos, keyword): + return True, keyword, len(keyword) + return False, "", 0 + + +def split_statements(sql: str) -> Generator[str, None, None]: + """ + Split T-SQL code into individual statements, handling various SQL constructs.
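+
+ Illustrative behavior (a sketch, not an exhaustive spec): semicolons terminate
+ statements, and control-flow keywords like GO are yielded as standalone
+ statements:
+
+ >>> list(split_statements("SELECT 1; SELECT 2 GO UPDATE t SET x = 1"))
+ ['SELECT 1', 'SELECT 2', 'GO', 'UPDATE t SET x = 1']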
+ """ + if not sql or not sql.strip(): + return + + current_statement: List[str] = [] + state = ParserState.NORMAL + i = 0 + + def yield_if_complete() -> Generator[str, None, None]: + statement = "".join(current_statement).strip() + if statement: + yield statement + current_statement.clear() + + prev_real_char = "\0" # the most recent non-whitespace, non-comment character + while i < len(sql): + c = sql[i] + next_char = sql[i + 1] if i < len(sql) - 1 else "\0" + + if state == ParserState.NORMAL: + if c == "'": + state = ParserState.STRING + current_statement.append(c) + prev_real_char = c + elif c == "-" and next_char == "-": + state = ParserState.COMMENT + current_statement.append(c) + current_statement.append(next_char) + i += 1 + elif c == "/" and next_char == "*": + state = ParserState.MULTILINE_COMMENT + current_statement.append(c) + current_statement.append(next_char) + i += 1 + else: + most_recent_real_char = prev_real_char + if not c.isspace(): + prev_real_char = c + + is_control_keyword, keyword, keyword_len = _look_ahead_for_keywords( + sql, i, keywords=CONTROL_FLOW_KEYWORDS + ) + if is_control_keyword: + # Yield current statement if any + yield from yield_if_complete() + # Yield keyword as its own statement + yield keyword + i += keyword_len + continue + + ( + is_force_new_statement_keyword, + keyword, + keyword_len, + ) = _look_ahead_for_keywords( + sql, i, keywords=FORCE_NEW_STATEMENT_KEYWORDS + ) + if ( + is_force_new_statement_keyword and most_recent_real_char != ")" + ): # usually we'd have a close paren that closes a CTE + # Force termination of current statement + yield from yield_if_complete() + + current_statement.append(keyword) + i += keyword_len + continue + + elif c == ";": + yield from yield_if_complete() + else: + current_statement.append(c) + + elif state == ParserState.STRING: + current_statement.append(c) + if c == "'" and next_char == "'": + current_statement.append(next_char) + i += 1 + elif c == "'": + state = ParserState.NORMAL + + elif state == ParserState.COMMENT: + current_statement.append(c) + if c == "\n": + state = ParserState.NORMAL + + elif state == ParserState.MULTILINE_COMMENT: + current_statement.append(c) + if c == "*" and next_char == "/": + current_statement.append(next_char) + i += 1 + state = ParserState.NORMAL + + i += 1 + + # Handle the last statement + yield from yield_if_complete() diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 360ccd7bf35073..44f0d7be7aad9a 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -762,7 +762,6 @@ def add_observed_query( This assumes that queries come in order of increasing timestamps. """ - self.report.num_observed_queries += 1 # All queries with no session ID are assumed to be part of the same session. 
diff --git a/metadata-ingestion/src/datahub/telemetry/telemetry.py b/metadata-ingestion/src/datahub/telemetry/telemetry.py index 4faf04ee2d2c76..22b2cb6a101af9 100644 --- a/metadata-ingestion/src/datahub/telemetry/telemetry.py +++ b/metadata-ingestion/src/datahub/telemetry/telemetry.py @@ -7,7 +7,7 @@ import uuid from functools import wraps from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, TypeVar from mixpanel import Consumer, Mixpanel from typing_extensions import ParamSpec @@ -16,10 +16,12 @@ from datahub.cli.config_utils import DATAHUB_ROOT_FOLDER from datahub.cli.env_utils import get_boolean_env_variable from datahub.configuration.common import ExceptionWithProps -from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.schema_classes import _custom_package_path from datahub.utilities.perf_timer import PerfTimer +if TYPE_CHECKING: + from datahub.ingestion.graph.client import DataHubGraph + logger = logging.getLogger(__name__) DATAHUB_FOLDER = Path(DATAHUB_ROOT_FOLDER) @@ -117,7 +119,11 @@ class Telemetry: tracking_init: bool = False sentry_enabled: bool = False + context_properties: Dict[str, Any] = {} + def __init__(self): + self.context_properties = {} + if SENTRY_DSN: self.sentry_enabled = True try: @@ -157,6 +163,9 @@ def __init__(self): except Exception as e: logger.debug(f"Error connecting to mixpanel: {e}") + # Initialize the default properties for all events. + self.set_context() + def update_config(self) -> bool: """ Update the config file with the current client ID and enabled status. @@ -238,18 +247,22 @@ def load_config(self) -> bool: return False - def update_capture_exception_context( + def set_context( self, - server: Optional[DataHubGraph] = None, + server: Optional["DataHubGraph"] = None, properties: Optional[Dict[str, Any]] = None, ) -> None: + self.context_properties = { + **self._server_props(server), + **(properties or {}), + } + if self.sentry_enabled: from sentry_sdk import set_tag properties = { **_default_telemetry_properties(), - **self._server_props(server), - **(properties or {}), + **self.context_properties, } for key in properties: @@ -297,7 +310,6 @@ def ping( self, event_name: str, properties: Optional[Dict[str, Any]] = None, - server: Optional[DataHubGraph] = None, ) -> None: """ Send a single telemetry event. @@ -323,14 +335,15 @@ def ping( properties = { **_default_telemetry_properties(), - **self._server_props(server), + **self.context_properties, **properties, } self.mp.track(self.client_id, event_name, properties) except Exception as e: logger.debug(f"Error reporting telemetry: {e}") - def _server_props(self, server: Optional[DataHubGraph]) -> Dict[str, str]: + @classmethod + def _server_props(cls, server: Optional["DataHubGraph"]) -> Dict[str, str]: if not server: return { "server_type": "n/a", @@ -435,6 +448,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _T: **call_props, "status": "error", **_error_props(e), + "code": e.code, }, ) telemetry_instance.capture_exception(e) diff --git a/metadata-ingestion/src/datahub/utilities/urn_encoder.py b/metadata-ingestion/src/datahub/utilities/urn_encoder.py index 88c0a128b8e468..4f19eeff3e70f0 100644 --- a/metadata-ingestion/src/datahub/utilities/urn_encoder.py +++ b/metadata-ingestion/src/datahub/utilities/urn_encoder.py @@ -4,7 +4,8 @@ # NOTE: Frontend relies on encoding these three characters. 
Specifically, we decode and encode schema fields for column level lineage. # If this changes, make appropriate changes to datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts # We also rely on encoding these exact three characters when generating schemaField urns in our graphQL layer. Update SchemaFieldUtils if this changes. -RESERVED_CHARS = {",", "(", ")"} +# Also see https://datahubproject.io/docs/what/urn/#restrictions +RESERVED_CHARS = {",", "(", ")", "␟"} RESERVED_CHARS_EXTENDED = RESERVED_CHARS.union({"%"}) diff --git a/metadata-ingestion/tests/integration/dbt/test_dbt.py b/metadata-ingestion/tests/integration/dbt/test_dbt.py index 390d8d7698dd4c..c6a3dc4fd590bd 100644 --- a/metadata-ingestion/tests/integration/dbt/test_dbt.py +++ b/metadata-ingestion/tests/integration/dbt/test_dbt.py @@ -11,12 +11,6 @@ from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig from datahub.ingestion.source.dbt.dbt_common import DBTEntitiesEnabled, EmitDirective from datahub.ingestion.source.dbt.dbt_core import DBTCoreConfig, DBTCoreSource -from datahub.ingestion.source.sql.sql_types import ( - ATHENA_SQL_TYPES_MAP, - TRINO_SQL_TYPES_MAP, - resolve_athena_modified_type, - resolve_trino_modified_type, -) from tests.test_helpers import mce_helpers, test_connection_helpers FROZEN_TIME = "2022-02-03 07:00:00" @@ -362,69 +356,6 @@ def test_dbt_tests(test_resources_dir, pytestconfig, tmp_path, mock_time, **kwar ) -@pytest.mark.parametrize( - "data_type, expected_data_type", - [ - ("boolean", "boolean"), - ("tinyint", "tinyint"), - ("smallint", "smallint"), - ("int", "int"), - ("integer", "integer"), - ("bigint", "bigint"), - ("real", "real"), - ("double", "double"), - ("decimal(10,0)", "decimal"), - ("varchar(20)", "varchar"), - ("char", "char"), - ("varbinary", "varbinary"), - ("json", "json"), - ("date", "date"), - ("time", "time"), - ("time(12)", "time"), - ("timestamp", "timestamp"), - ("timestamp(3)", "timestamp"), - ("row(x bigint, y double)", "row"), - ("array(row(x bigint, y double))", "array"), - ("map(varchar, varchar)", "map"), - ], -) -def test_resolve_trino_modified_type(data_type, expected_data_type): - assert ( - resolve_trino_modified_type(data_type) - == TRINO_SQL_TYPES_MAP[expected_data_type] - ) - - -@pytest.mark.parametrize( - "data_type, expected_data_type", - [ - ("boolean", "boolean"), - ("tinyint", "tinyint"), - ("smallint", "smallint"), - ("int", "int"), - ("integer", "integer"), - ("bigint", "bigint"), - ("float", "float"), - ("double", "double"), - ("decimal(10,0)", "decimal"), - ("varchar(20)", "varchar"), - ("char", "char"), - ("binary", "binary"), - ("date", "date"), - ("timestamp", "timestamp"), - ("timestamp(3)", "timestamp"), - ("struct", "struct"), - ("array>", "array"), - ("map", "map"), - ], -) -def test_resolve_athena_modified_type(data_type, expected_data_type): - assert ( - resolve_athena_modified_type(data_type) - == ATHENA_SQL_TYPES_MAP[expected_data_type] - ) - - @pytest.mark.integration @freeze_time(FROZEN_TIME) def test_dbt_tests_only_assertions( diff --git a/metadata-ingestion/tests/integration/feast/feast_repository_mces_golden.json b/metadata-ingestion/tests/integration/feast/feast_repository_mces_golden.json index 1b91925289845b..a4fd9843c5cf49 100644 --- a/metadata-ingestion/tests/integration/feast/feast_repository_mces_golden.json +++ b/metadata-ingestion/tests/integration/feast/feast_repository_mces_golden.json @@ -9,8 +9,33 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + 
"tag": "urn:li:tag:deprecated" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpGroup:MOCK_OWNER", + "type": "BUSINESS_OWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, { "com.linkedin.pegasus2avro.ml.metadata.MLPrimaryKeyProperties": { + "customProperties": {}, "description": "Driver ID", "dataType": "ORDINAL", "sources": [ @@ -23,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -36,8 +62,18 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:needs_documentation" + } + ] + } + }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "customProperties": {}, "description": "Conv rate", "dataType": "CONTINUOUS", "sources": [ @@ -50,7 +86,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -65,6 +102,7 @@ }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "customProperties": {}, "description": "Acc rate", "dataType": "CONTINUOUS", "sources": [ @@ -77,7 +115,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -92,6 +131,7 @@ }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "customProperties": {}, "description": "Avg daily trips", "dataType": "ORDINAL", "sources": [ @@ -104,7 +144,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -119,6 +160,7 @@ }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "customProperties": {}, "description": "String feature", "dataType": "TEXT", "sources": [ @@ -131,7 +173,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -151,6 +194,30 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.common.GlobalTags": { + "tags": [ + { + "tag": "urn:li:tag:deprecated" + } + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpGroup:MOCK_OWNER", + "type": "BUSINESS_OWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureTableProperties": { "customProperties": {}, @@ -170,7 +237,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -189,7 +257,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -204,6 +273,7 @@ }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "customProperties": {}, "dataType": "CONTINUOUS", "sources": [ "urn:li:dataset:(urn:li:dataPlatform:request,vals_to_add,PROD)", @@ -216,7 +286,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } 
}, { @@ -231,6 +302,7 @@ }, { "com.linkedin.pegasus2avro.ml.metadata.MLFeatureProperties": { + "customProperties": {}, "dataType": "CONTINUOUS", "sources": [ "urn:li:dataset:(urn:li:dataPlatform:request,vals_to_add,PROD)", @@ -243,7 +315,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -278,7 +351,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } }, { @@ -297,7 +371,40 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "feast-repository-test" + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:deprecated", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "deprecated" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:needs_documentation", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "needs_documentation" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "feast-repository-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/feast/feature_store/data/registry.db b/metadata-ingestion/tests/integration/feast/feature_store/data/registry.db index a511ff56c97705..5dca29d92afe53 100644 Binary files a/metadata-ingestion/tests/integration/feast/feature_store/data/registry.db and b/metadata-ingestion/tests/integration/feast/feature_store/data/registry.db differ diff --git a/metadata-ingestion/tests/integration/feast/feature_store/features.py b/metadata-ingestion/tests/integration/feast/feature_store/features.py index a6e6cd3616e924..dcfd417637958c 100644 --- a/metadata-ingestion/tests/integration/feast/feature_store/features.py +++ b/metadata-ingestion/tests/integration/feast/feature_store/features.py @@ -19,6 +19,8 @@ join_keys=["driver_id"], value_type=ValueType.INT64, description="Driver ID", + owner="MOCK_OWNER", + tags={"name": "deprecated"}, ) driver_hourly_stats_view = FeatureView( @@ -29,7 +31,7 @@ Field( name="conv_rate", dtype=feast.types.Float64, - tags=dict(description="Conv rate"), + tags={"name": "needs_documentation", "description": "Conv rate"}, ), Field( name="acc_rate", @@ -49,7 +51,8 @@ ], online=True, source=driver_hourly_stats_source, - tags={}, + tags={"name": "deprecated"}, + owner="MOCK_OWNER", ) input_request = RequestSource( diff --git a/metadata-ingestion/tests/integration/feast/test_feast_repository.py b/metadata-ingestion/tests/integration/feast/test_feast_repository.py index a6bdce67222896..7f04337145dc36 100644 --- a/metadata-ingestion/tests/integration/feast/test_feast_repository.py +++ b/metadata-ingestion/tests/integration/feast/test_feast_repository.py @@ -19,6 +19,15 @@ def test_feast_repository_ingest(pytestconfig, tmp_path, mock_time): "config": { "path": str(test_resources_dir / "feature_store"), "environment": "PROD", + "enable_tag_extraction": True, + "enable_owner_extraction": True, + "owner_mappings": [ + { + "feast_owner_name": "MOCK_OWNER", + "datahub_owner_urn": "urn:li:corpGroup:MOCK_OWNER", + "datahub_ownership_type": "BUSINESS_OWNER", + } + ], }, }, "sink": { diff --git 
a/metadata-ingestion/tests/integration/kafka/test_kafka.py b/metadata-ingestion/tests/integration/kafka/test_kafka.py index 597889c8440b7a..7462f177684b7e 100644 --- a/metadata-ingestion/tests/integration/kafka/test_kafka.py +++ b/metadata-ingestion/tests/integration/kafka/test_kafka.py @@ -128,11 +128,32 @@ def test_kafka_oauth_callback( pipeline.run() - is_found: bool = False + # Initialize flags to track oauth events + checks = { + "consumer_polling": False, + "consumer_oauth_callback": False, + "admin_polling": False, + "admin_oauth_callback": False, + } + + # Read log file and check for oauth events with open(log_file, "r") as file: - for line_number, line in enumerate(file, 1): + for line in file: + # Check for polling events + if "Initiating polling for kafka admin client" in line: + checks["admin_polling"] = True + elif "Initiating polling for kafka consumer" in line: + checks["consumer_polling"] = True + + # Check for oauth callbacks if oauth.MESSAGE in line: - is_found = True - break - - assert is_found + if checks["consumer_polling"] and not checks["admin_polling"]: + checks["consumer_oauth_callback"] = True + elif checks["consumer_polling"] and checks["admin_polling"]: + checks["admin_oauth_callback"] = True + + # Verify all oauth events occurred + assert checks["consumer_polling"], "Consumer polling was not initiated" + assert checks["consumer_oauth_callback"], "Consumer oauth callback not found" + assert checks["admin_polling"], "Admin polling was not initiated" + assert checks["admin_oauth_callback"], "Admin oauth callback not found" diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index f22998b47b9008..63821f9038a88c 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -7,6 +7,7 @@ import pytest from lark import Tree +import datahub.ingestion.source.powerbi.m_query.data_classes import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.source import StructuredLogLevel @@ -18,8 +19,11 @@ AbstractDataPlatformInstanceResolver, create_dataplatform_instance_resolver, ) -from datahub.ingestion.source.powerbi.m_query import parser, resolver, tree_function -from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable, Lineage +from datahub.ingestion.source.powerbi.m_query import parser, tree_function +from datahub.ingestion.source.powerbi.m_query.data_classes import ( + DataPlatformTable, + Lineage, +) pytestmark = pytest.mark.integration_batch_2 @@ -62,7 +66,9 @@ ] -def get_data_platform_tables_with_dummy_table(q: str) -> List[resolver.Lineage]: +def get_data_platform_tables_with_dummy_table( + q: str, +) -> List[datahub.ingestion.source.powerbi.m_query.data_classes.Lineage]: table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], measures=[], @@ -759,7 +765,9 @@ def test_sqlglot_parser(): } ) - lineage: List[resolver.Lineage] = parser.get_upstream_tables( + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = parser.get_upstream_tables( table, reporter, ctx=ctx, @@ -806,7 +814,9 @@ def test_sqlglot_parser(): def test_databricks_multi_cloud(): q = M_QUERIES[25] - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + 
datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -823,7 +833,9 @@ def test_databricks_multi_cloud(): def test_databricks_catalog_pattern_1(): q = M_QUERIES[26] - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -892,7 +904,9 @@ def test_sqlglot_parser_2(): } ) - lineage: List[resolver.Lineage] = parser.get_upstream_tables( + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = parser.get_upstream_tables( table, reporter, ctx=ctx, @@ -951,7 +965,9 @@ def test_databricks_regular_case_with_view(): def test_snowflake_double_double_quotes(): q = M_QUERIES[30] - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -968,7 +984,9 @@ def test_snowflake_double_double_quotes(): def test_databricks_multicloud(): q = M_QUERIES[31] - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -985,7 +1003,9 @@ def test_databricks_multicloud(): def test_snowflake_multi_function_call(): q = M_QUERIES[32] - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -1002,7 +1022,9 @@ def test_snowflake_multi_function_call(): def test_mssql_drop_with_select(): q = M_QUERIES[33] - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ -1062,7 +1084,9 @@ def test_empty_string_in_m_query(): # TRIM(TRIM(TRIM(AGENT_NAME, '\"\"'), '+'), '\\'') is in Query q = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu10758.ap-unknown-2.fakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TRIM(TRIM(TRIM(AGENT_NAME, '\"\"'), '+'), '\\'') AS TRIM_AGENT_NAME,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS inner join OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT #(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source" - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 @@ 
-1084,7 +1108,9 @@ def test_double_quotes_in_alias(): # SELECT CAST(sales_date AS DATE) AS \"\"Date\"\" in query q = 'let \n Source = Sql.Database("abc.com", "DB", [Query="SELECT CAST(sales_date AS DATE) AS ""Date"",#(lf) SUM(cshintrpret) / 60.0 AS ""Total Order All Items"",#(lf)#(tab)#(tab)#(tab) SUM(cshintrpret) / 60.0 - LAG(SUM(cshintrpret) / 60.0, 1) OVER (ORDER BY CAST(sales_date AS DATE)) AS ""Total minute difference"",#(lf)#(tab)#(tab)#(tab) SUM(sale_price) / 60.0 - LAG(SUM(sale_price) / 60.0, 1) OVER (ORDER BY CAST(sales_date AS DATE)) AS ""Normal minute difference""#(lf) FROM [DB].[dbo].[sales_t]#(lf) WHERE sales_date >= GETDATE() - 365#(lf) GROUP BY CAST(sales_date AS DATE),#(lf)#(tab)#(tab)CAST(sales_date AS TIME);"]) \n in \n Source' - lineage: List[resolver.Lineage] = get_data_platform_tables_with_dummy_table(q=q) + lineage: List[ + datahub.ingestion.source.powerbi.m_query.data_classes.Lineage + ] = get_data_platform_tables_with_dummy_table(q=q) assert len(lineage) == 1 diff --git a/metadata-ingestion/tests/integration/sigma/golden_test_platform_instance_ingest.json b/metadata-ingestion/tests/integration/sigma/golden_test_platform_instance_ingest.json index 12bb7734f30a63..645e710309b0da 100644 --- a/metadata-ingestion/tests/integration/sigma/golden_test_platform_instance_ingest.json +++ b/metadata-ingestion/tests/integration/sigma/golden_test_platform_instance_ingest.json @@ -10,23 +10,7 @@ } }, "systemMetadata": { - "lastObserved": 1713794496006, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" - } - }, - "systemMetadata": { - "lastObserved": 1713794496008, + "lastObserved": 1732608523763, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -56,7 +40,7 @@ } }, "systemMetadata": { - "lastObserved": 1713794496007, + "lastObserved": 1732608523764, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -65,40 +49,30 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "container", "aspect": { "json": { - "platform": "urn:li:dataPlatform:sigma", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" + "container": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" } }, "systemMetadata": { - "lastObserved": 1713794496008, + "lastObserved": 1732608523764, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)", + "entityType": "chart", + "entityUrn": "urn:li:chart:(sigma,cloud_instance.Ml9C5ezT5W)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "status", "aspect": { "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" - }, - { - "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", - "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" - } - ] + "removed": false } }, "systemMetadata": { - "lastObserved": 1713794496010, + "lastObserved": 1732608523833, "runId": "sigma-test", 
"lastRunId": "no-run-id-provided" } @@ -107,16 +81,15 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "typeNames": [ - "Sigma Dataset" - ] + "platform": "urn:li:dataPlatform:sigma", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" } }, "systemMetadata": { - "lastObserved": 1713794496009, + "lastObserved": 1732608523764, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -125,40 +98,44 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)", "changeType": "UPSERT", - "aspectName": "ownership", + "aspectName": "subTypes", "aspect": { "json": { - "owners": [ - { - "owner": "urn:li:corpuser:Shubham_Jagtap", - "type": "DATAOWNER" - } - ], - "ownerTypes": {}, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } + "typeNames": [ + "Sigma Dataset" + ] } }, "systemMetadata": { - "lastObserved": 1713794496009, + "lastObserved": 1732608523765, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.5LqGLu14qUnqh3cN6wRJBd,PROD)", + "entityType": "chart", + "entityUrn": "urn:li:chart:(sigma,cloud_instance.Ml9C5ezT5W)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "browsePathsV2", "aspect": { "json": { - "removed": false + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" + }, + { + "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" + }, + { + "id": "Acryl Workbook" + } + ] } }, "systemMetadata": { - "lastObserved": 1713794496011, + "lastObserved": 1732608523835, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -167,14 +144,14 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.5LqGLu14qUnqh3cN6wRJBd,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "status", "aspect": { "json": { - "container": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" + "removed": false } }, "systemMetadata": { - "lastObserved": 1713794496012, + "lastObserved": 1732608523781, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -206,33 +183,7 @@ } }, "systemMetadata": { - "lastObserved": 1713794496012, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", - "changeType": "UPSERT", - "aspectName": "ownership", - "aspect": { - "json": { - "owners": [ - { - "owner": "urn:li:corpuser:Shubham_Jagtap", - "type": "DATAOWNER" - } - ], - "ownerTypes": {}, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, - "systemMetadata": { - "lastObserved": 1713794496201, + "lastObserved": 1732608523781, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -241,49 +192,26 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.5LqGLu14qUnqh3cN6wRJBd,PROD)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:sigma", - "instance": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" - } - }, - "systemMetadata": { - "lastObserved": 1713794496013, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", - "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "container", "aspect": { "json": { - "typeNames": [ - "Sigma Workspace" - ] + "container": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" } }, "systemMetadata": { - "lastObserved": 1713794496200, + "lastObserved": 1732608523782, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.5LqGLu14qUnqh3cN6wRJBd,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" - }, { "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" @@ -291,22 +219,19 @@ { "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" - }, - { - "id": "New Folder" } ] } }, "systemMetadata": { - "lastObserved": 1713794496015, + "lastObserved": 1732608523765, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.5LqGLu14qUnqh3cN6wRJBd,PROD)", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { @@ -316,7 +241,7 @@ } }, "systemMetadata": { - "lastObserved": 1713794496200, + "lastObserved": 1732608523782, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -334,34 +259,51 @@ } }, "systemMetadata": { - "lastObserved": 1713794496014, + "lastObserved": 1732608523783, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.5LqGLu14qUnqh3cN6wRJBd,PROD)", + "entityType": "chart", + "entityUrn": "urn:li:chart:(sigma,cloud_instance.Ml9C5ezT5W)", "changeType": "UPSERT", - "aspectName": "globalTags", + "aspectName": "chartInfo", "aspect": { "json": { - "tags": [ + "customProperties": { + "VizualizationType": "bar", + "type": "visualization" + }, + "externalUrl": "https://app.sigmacomputing.com/acryldata/workbook/4JRFW1HThPI1K3YTjouXI7?:nodeId=Ml9C5ezT5W&:fullScreen=true", + "title": "Count of Profile Id by Status", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "inputs": [ { - "tag": "urn:li:tag:Deprecated" + "string": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)" } ] } }, "systemMetadata": { - "lastObserved": 1713794496015, + "lastObserved": 1732608523833, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.5LqGLu14qUnqh3cN6wRJBd,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -380,14 +322,14 @@ } }, "systemMetadata": { - "lastObserved": 1713794496014, + "lastObserved": 1732608523765, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -396,57 +338,76 @@ } }, "systemMetadata": { - "lastObserved": 1713794496199, + "lastObserved": 1732608523784, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "containerProperties", + "aspectName": "dashboardInfo", "aspect": { "json": { "customProperties": { - "platform": "sigma", - "instance": "cloud_instance", - "workspaceId": "3ee61405-3be2-4000-ba72-60d36757b95b" - }, - "name": "Acryl Data", - "created": { - "time": 1710232264826 + "path": "Acryl Data", + "latestVersion": "2" }, + "externalUrl": "https://app.sigmacomputing.com/acryldata/workbook/4JRFW1HThPI1K3YTjouXI7", + "title": "Acryl Workbook", + "description": "", + "charts": [], + "datasets": [], + "dashboards": [ + { + "sourceUrn": "urn:li:dashboard:(sigma,cloud_instance.9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", + "destinationUrn": "urn:li:dashboard:(sigma,cloud_instance.OSnGLBzL1i)" + }, + { + "sourceUrn": "urn:li:dashboard:(sigma,cloud_instance.9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", + "destinationUrn": "urn:li:dashboard:(sigma,cloud_instance.DFSieiAcgo)" + } + ], "lastModified": { - "time": 1710232264826 + "created": { + "time": 1713188691477, + "actor": "urn:li:corpuser:datahub" + }, + "lastModified": { + "time": 1713189117302, + "actor": "urn:li:corpuser:datahub" + } } } }, "systemMetadata": { - "lastObserved": 1713794496199, + "lastObserved": 1732608523785, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,cloud_instance.kH0MeihtGs)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "subTypes", "aspect": { "json": { - "removed": false + "typeNames": [ + "Sigma Workbook" + ] } }, "systemMetadata": { - "lastObserved": 1713794496053, + "lastObserved": 1732608523785, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.5LqGLu14qUnqh3cN6wRJBd,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -455,227 +416,81 @@ { "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" + }, + { + "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" + }, + { + "id": "New Folder" } ] } }, "systemMetadata": { - "lastObserved": 1713794496202, + "lastObserved": 
1732608523783, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,cloud_instance.kH0MeihtGs)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "ownership", "aspect": { "json": { - "container": "urn:li:container:084a2e283eddfc576ce70989b395a7d8" + "owners": [ + { + "owner": "urn:li:corpuser:Shubham_Jagtap", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } } }, "systemMetadata": { - "lastObserved": 1713794496055, + "lastObserved": 1732608523786, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,cloud_instance.kH0MeihtGs)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "inputFields", + "aspectName": "globalTags", "aspect": { "json": { - "fields": [ + "tags": [ { - "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.kH0MeihtGs),Pk)", - "schemaField": { - "fieldPath": "Pk", - "nullable": false, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "String", - "recursive": false, - "isPartOfKey": false - } - }, - { - "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.kH0MeihtGs),Pet Fk)", - "schemaField": { - "fieldPath": "Pet Fk", - "nullable": false, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "String", - "recursive": false, - "isPartOfKey": false - } - }, - { - "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.kH0MeihtGs),Human Fk)", - "schemaField": { - "fieldPath": "Human Fk", - "nullable": false, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "String", - "recursive": false, - "isPartOfKey": false - } - }, - { - "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.kH0MeihtGs),Status)", - "schemaField": { - "fieldPath": "Status", - "nullable": false, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "String", - "recursive": false, - "isPartOfKey": false - } - }, - { - "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.kH0MeihtGs),Created At)", - "schemaField": { - "fieldPath": "Created At", - "nullable": false, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "String", - "recursive": false, - "isPartOfKey": false - } - }, - { - "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.kH0MeihtGs),Updated At)", - "schemaField": { - "fieldPath": "Updated At", - "nullable": false, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "String", - "recursive": false, - "isPartOfKey": false - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1713794496055, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,cloud_instance.kH0MeihtGs)", - "changeType": "UPSERT", - "aspectName": "chartInfo", - "aspect": { - "json": { - "customProperties": { - "VizualizationType": "levelTable", - "type": "table" - }, - "externalUrl": 
"https://app.sigmacomputing.com/acryldata/workbook/4JRFW1HThPI1K3YTjouXI7?:nodeId=kH0MeihtGs&:fullScreen=true", - "title": "ADOPTIONS", - "description": "", - "lastModified": { - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - }, - "inputs": [ - { - "string": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev_instance.long_tail_companions.adoption.adoptions,DEV)" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1718348049212, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,cloud_instance.kH0MeihtGs)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" - }, - { - "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", - "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" - }, - { - "id": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", - "urn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8" + "tag": "urn:li:tag:Warning" } ] } }, "systemMetadata": { - "lastObserved": 1713794496058, + "lastObserved": 1732608523786, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "ownership", + "aspectName": "container", "aspect": { "json": { - "owners": [ - { - "owner": "urn:li:corpuser:Shubham_Jagtap", - "type": "DATAOWNER" - } - ], - "ownerTypes": {}, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } + "container": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" } }, "systemMetadata": { - "lastObserved": 1713794496019, + "lastObserved": 1732608523786, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -691,7 +506,7 @@ } }, "systemMetadata": { - "lastObserved": 1713794496021, + "lastObserved": 1732608523787, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -713,6 +528,7 @@ "urn:li:chart:(sigma,cloud_instance.Ml9C5ezT5W)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 0, @@ -726,87 +542,7 @@ } }, "systemMetadata": { - "lastObserved": 1713794496022, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "sigma", - "instance": "cloud_instance", - "workbookId": "9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b", - "path": "Acryl Data", - "latestVersion": "2" - }, - "externalUrl": "https://app.sigmacomputing.com/acryldata/workbook/4JRFW1HThPI1K3YTjouXI7", - "name": "Acryl Workbook", - "created": { - "time": 1713188691477 - }, - "lastModified": { - "time": 1713189117302 - } - } - }, - "systemMetadata": { - "lastObserved": 1713794496016, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.OSnGLBzL1i)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": 
"urn:li:container:084a2e283eddfc576ce70989b395a7d8" - } - }, - "systemMetadata": { - "lastObserved": 1713794496023, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Sigma Workbook" - ] - } - }, - "systemMetadata": { - "lastObserved": 1713794496018, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1713794496017, + "lastObserved": 1732608523788, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -823,7 +559,7 @@ } }, "systemMetadata": { - "lastObserved": 1713794496023, + "lastObserved": 1732608523788, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -845,108 +581,87 @@ "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" }, { - "id": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", - "urn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8" + "id": "Acryl Workbook" } ] } }, "systemMetadata": { - "lastObserved": 1713794496024, + "lastObserved": 1732608523788, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "globalTags", + "aspectName": "browsePathsV2", "aspect": { "json": { - "tags": [ + "path": [ { - "tag": "urn:li:tag:Warning" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" + }, + { + "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" + }, + { + "id": "Acryl Workbook" } ] } }, "systemMetadata": { - "lastObserved": 1713794496019, + "lastObserved": 1732608523787, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", - "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", - "aspect": { - "json": { - "platform": "urn:li:dataPlatform:sigma", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" - } - }, - "systemMetadata": { - "lastObserved": 1713794496018, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,cloud_instance.Ml9C5ezT5W)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1713794496114, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.5LqGLu14qUnqh3cN6wRJBd,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "globalTags", "aspect": { "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", - "urn": 
"urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" - }, + "tags": [ { - "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", - "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" + "tag": "urn:li:tag:Deprecated" } ] } }, "systemMetadata": { - "lastObserved": 1713794496020, + "lastObserved": 1732608523783, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,cloud_instance.Ml9C5ezT5W)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.5LqGLu14qUnqh3cN6wRJBd,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "ownership", "aspect": { "json": { - "container": "urn:li:container:084a2e283eddfc576ce70989b395a7d8" + "owners": [ + { + "owner": "urn:li:corpuser:Shubham_Jagtap", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } } }, "systemMetadata": { - "lastObserved": 1713794496116, + "lastObserved": 1732608523782, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -1047,208 +762,20 @@ "nativeDataType": "String", "recursive": false, "isPartOfKey": false - } - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1713794496117, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,cloud_instance.Ml9C5ezT5W)", - "changeType": "UPSERT", - "aspectName": "chartInfo", - "aspect": { - "json": { - "customProperties": { - "VizualizationType": "bar", - "type": "visualization" - }, - "externalUrl": "https://app.sigmacomputing.com/acryldata/workbook/4JRFW1HThPI1K3YTjouXI7?:nodeId=Ml9C5ezT5W&:fullScreen=true", - "title": "Count of Profile Id by Status", - "description": "", - "lastModified": { - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - }, - "inputs": [ - { - "string": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1718348049268, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,cloud_instance.Ml9C5ezT5W)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" - }, - { - "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", - "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" - }, - { - "id": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", - "urn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1713794496119, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.DFSieiAcgo)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1713794496124, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.DFSieiAcgo)", - "changeType": "UPSERT", - "aspectName": "dashboardInfo", - "aspect": { - "json": { - "customProperties": 
{ - "ElementsCount": "1" - }, - "title": "Page 2", - "description": "", - "charts": [ - "urn:li:chart:(sigma,cloud_instance.tQJu5N1l81)" - ], - "datasets": [], - "lastModified": { - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - } - }, - "systemMetadata": { - "lastObserved": 1713794496125, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" - } - }, - "systemMetadata": { - "lastObserved": 1713794496020, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.DFSieiAcgo)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:084a2e283eddfc576ce70989b395a7d8" - } - }, - "systemMetadata": { - "lastObserved": 1713794496125, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,cloud_instance.tQJu5N1l81)", - "changeType": "UPSERT", - "aspectName": "chartInfo", - "aspect": { - "json": { - "customProperties": { - "VizualizationType": "levelTable", - "type": "table" - }, - "externalUrl": "https://app.sigmacomputing.com/acryldata/workbook/4JRFW1HThPI1K3YTjouXI7?:nodeId=tQJu5N1l81&:fullScreen=true", - "title": "PETS ADOPTIONS JOIN", - "description": "", - "lastModified": { - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - }, - "inputs": [ - { - "string": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)" - }, - { - "string": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev_instance.long_tail_companions.adoption.adoptions,DEV)" + } } ] } }, "systemMetadata": { - "lastObserved": 1718348049351, + "lastObserved": 1732608523834, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,cloud_instance.tQJu5N1l81)", + "entityUrn": "urn:li:chart:(sigma,cloud_instance.kH0MeihtGs)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1257,69 +784,179 @@ } }, "systemMetadata": { - "lastObserved": 1713794496188, + "lastObserved": 1732608523803, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.DFSieiAcgo)", + "entityType": "chart", + "entityUrn": "urn:li:chart:(sigma,cloud_instance.kH0MeihtGs)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "browsePathsV2", "aspect": { "json": { - "platform": "urn:li:dataPlatform:sigma", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" + }, + { + "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" + }, + { + "id": "Acryl Workbook" + } + ] } }, "systemMetadata": { - "lastObserved": 1713794496126, + "lastObserved": 1732608523806, "runId": "sigma-test", "lastRunId": 
"no-run-id-provided" } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.DFSieiAcgo)", + "entityType": "chart", + "entityUrn": "urn:li:chart:(sigma,cloud_instance.kH0MeihtGs)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "chartInfo", "aspect": { "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" - }, - { - "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", - "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" + "customProperties": { + "VizualizationType": "levelTable", + "type": "table" + }, + "externalUrl": "https://app.sigmacomputing.com/acryldata/workbook/4JRFW1HThPI1K3YTjouXI7?:nodeId=kH0MeihtGs&:fullScreen=true", + "title": "ADOPTIONS", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "inputs": [ { - "id": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", - "urn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8" + "string": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev_instance.long_tail_companions.adoption.adoptions,DEV)" } ] } }, "systemMetadata": { - "lastObserved": 1713794496126, + "lastObserved": 1732608523804, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,cloud_instance.tQJu5N1l81)", + "entityUrn": "urn:li:chart:(sigma,cloud_instance.kH0MeihtGs)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "inputFields", "aspect": { "json": { - "container": "urn:li:container:084a2e283eddfc576ce70989b395a7d8" + "fields": [ + { + "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.kH0MeihtGs),Pk)", + "schemaField": { + "fieldPath": "Pk", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + } + }, + { + "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.kH0MeihtGs),Pet Fk)", + "schemaField": { + "fieldPath": "Pet Fk", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + } + }, + { + "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.kH0MeihtGs),Human Fk)", + "schemaField": { + "fieldPath": "Human Fk", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + } + }, + { + "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.kH0MeihtGs),Status)", + "schemaField": { + "fieldPath": "Status", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + } + }, + { + "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.kH0MeihtGs),Created At)", + "schemaField": { + "fieldPath": "Created At", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + } + }, + { + "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.kH0MeihtGs),Updated At)", + 
"schemaField": { + "fieldPath": "Updated At", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + } + } + ] } }, "systemMetadata": { - "lastObserved": 1713794496189, + "lastObserved": 1732608523804, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -1483,40 +1120,219 @@ } }, { - "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.Ml9C5ezT5W),Updated At)", - "schemaField": { - "fieldPath": "Updated At", - "nullable": false, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "String", - "recursive": false, - "isPartOfKey": false - } + "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.Ml9C5ezT5W),Updated At)", + "schemaField": { + "fieldPath": "Updated At", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + } + }, + { + "schemaFieldUrn": "urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.Ml9C5ezT5W),Count of Profile Id)", + "schemaField": { + "fieldPath": "Count of Profile Id", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "String", + "recursive": false, + "isPartOfKey": false + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1732608523836, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.DFSieiAcgo)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1732608523838, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.DFSieiAcgo)", + "changeType": "UPSERT", + "aspectName": "dashboardInfo", + "aspect": { + "json": { + "customProperties": { + "ElementsCount": "1" + }, + "title": "Page 2", + "description": "", + "charts": [ + "urn:li:chart:(sigma,cloud_instance.tQJu5N1l81)" + ], + "datasets": [], + "dashboards": [], + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + }, + "systemMetadata": { + "lastObserved": 1732608523838, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.DFSieiAcgo)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:sigma", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1732608523839, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,cloud_instance.DFSieiAcgo)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" + }, + { + "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" }, { - "schemaFieldUrn": 
"urn:li:schemaField:(urn:li:chart:(sigma,cloud_instance.Ml9C5ezT5W),Count of Profile Id)", - "schemaField": { - "fieldPath": "Count of Profile Id", - "nullable": false, - "type": { - "type": { - "com.linkedin.schema.StringType": {} - } - }, - "nativeDataType": "String", - "recursive": false, - "isPartOfKey": false - } + "id": "Acryl Workbook" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1732608523839, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev_instance.long_tail_companions.adoption.pets,DEV)", + "type": "COPY" } ] } }, "systemMetadata": { - "lastObserved": 1713794496120, + "lastObserved": 1732608523874, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Deprecated", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Deprecated" + } + }, + "systemMetadata": { + "lastObserved": 1732608523874, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1732608523872, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:Shubham_Jagtap", + "type": "DATAOWNER" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1732608523873, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -1698,7 +1514,7 @@ } }, "systemMetadata": { - "lastObserved": 1713794496194, + "lastObserved": 1732608523870, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -1707,27 +1523,54 @@ "entityType": "chart", "entityUrn": "urn:li:chart:(sigma,cloud_instance.tQJu5N1l81)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "status", "aspect": { "json": { - "path": [ - { - "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", - "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1732608523866, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(sigma,cloud_instance.tQJu5N1l81)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "json": { + "customProperties": { + "VizualizationType": "levelTable", + "type": "table" + }, + "externalUrl": "https://app.sigmacomputing.com/acryldata/workbook/4JRFW1HThPI1K3YTjouXI7?:nodeId=tQJu5N1l81&:fullScreen=true", + "title": "PETS ADOPTIONS JOIN", + "description": "", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + 
"inputs": [ { - "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", - "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" + "string": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)" }, { - "id": "urn:li:container:084a2e283eddfc576ce70989b395a7d8", - "urn": "urn:li:container:084a2e283eddfc576ce70989b395a7d8" + "string": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev_instance.long_tail_companions.adoption.adoptions,DEV)" } ] } }, "systemMetadata": { - "lastObserved": 1713794496194, + "lastObserved": 1732608523866, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -1909,64 +1752,134 @@ } }, "systemMetadata": { - "lastObserved": 1713794496190, + "lastObserved": 1732608523867, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,cloud_instance.49HFLTr6xytgrPly3PFsNC,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "upstreams": [ + "platform": "urn:li:dataPlatform:sigma", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" + } + }, + "systemMetadata": { + "lastObserved": 1732608523873, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,dev_instance.long_tail_companions.adoption.pets,DEV)", - "type": "COPY" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" } ] } }, "systemMetadata": { - "lastObserved": 1718348049380, + "lastObserved": 1732608523874, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "tag", - "entityUrn": "urn:li:tag:Warning", + "entityType": "chart", + "entityUrn": "urn:li:chart:(sigma,cloud_instance.tQJu5N1l81)", "changeType": "UPSERT", - "aspectName": "tagKey", + "aspectName": "browsePathsV2", "aspect": { "json": { - "name": "Warning" + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:sigma,cloud_instance)" + }, + { + "id": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "urn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d" + }, + { + "id": "Acryl Workbook" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1732608523869, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "sigma", + "instance": "cloud_instance", + "workspaceId": "3ee61405-3be2-4000-ba72-60d36757b95b" + }, + "name": "Acryl Data", + "created": { + "time": 1710232264826 + }, + "lastModified": { + "time": 1710232264826 + } + } + }, + "systemMetadata": { + "lastObserved": 1732608523872, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + 
"entityUrn": "urn:li:container:abbebb5181bf9ba2d905d2dea7d8704d", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Sigma Workspace" + ] } }, "systemMetadata": { - "lastObserved": 1713794496203, + "lastObserved": 1732608523873, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { "entityType": "tag", - "entityUrn": "urn:li:tag:Deprecated", + "entityUrn": "urn:li:tag:Warning", "changeType": "UPSERT", "aspectName": "tagKey", "aspect": { "json": { - "name": "Deprecated" + "name": "Warning" } }, "systemMetadata": { - "lastObserved": 1713794496203, + "lastObserved": 1732608523875, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } diff --git a/metadata-ingestion/tests/integration/sigma/golden_test_sigma_ingest.json b/metadata-ingestion/tests/integration/sigma/golden_test_sigma_ingest.json index f800cb19f88115..bb37e7029330b2 100644 --- a/metadata-ingestion/tests/integration/sigma/golden_test_sigma_ingest.json +++ b/metadata-ingestion/tests/integration/sigma/golden_test_sigma_ingest.json @@ -261,37 +261,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "changeType": "UPSERT", - "aspectName": "containerProperties", - "aspect": { - "json": { - "customProperties": { - "platform": "sigma", - "workbookId": "9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b", - "path": "Acryl Data", - "latestVersion": "2" - }, - "externalUrl": "https://app.sigmacomputing.com/acryldata/workbook/4JRFW1HThPI1K3YTjouXI7", - "name": "Acryl Workbook", - "created": { - "time": 1713188691477 - }, - "lastModified": { - "time": 1713189117302 - } - } - }, - "systemMetadata": { - "lastObserved": 1713795619227, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -300,7 +271,7 @@ } }, "systemMetadata": { - "lastObserved": 1713795619228, + "lastObserved": 1732513099680, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -338,32 +309,70 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,5LqGLu14qUnqh3cN6wRJBd,PROD)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "subTypes", "aspect": { "json": { - "path": [ + "typeNames": [ + "Sigma Workbook" + ] + } + }, + "systemMetadata": { + "lastObserved": 1732513099681, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", + "changeType": "UPSERT", + "aspectName": "dashboardInfo", + "aspect": { + "json": { + "customProperties": { + "path": "Acryl Data", + "latestVersion": "2" + }, + "externalUrl": "https://app.sigmacomputing.com/acryldata/workbook/4JRFW1HThPI1K3YTjouXI7", + "title": "Acryl Workbook", + "description": "", + "charts": [], + "datasets": [], + "dashboards": [ { - "id": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f", - "urn": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" + "sourceUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", + "destinationUrn": "urn:li:dashboard:(sigma,OSnGLBzL1i)" }, { - "id": "New Folder" + "sourceUrn": 
"urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", + "destinationUrn": "urn:li:dashboard:(sigma,DFSieiAcgo)" } - ] + ], + "lastModified": { + "created": { + "time": 1713188691477, + "actor": "urn:li:corpuser:datahub" + }, + "lastModified": { + "time": 1713189117302, + "actor": "urn:li:corpuser:datahub" + } + } } }, "systemMetadata": { - "lastObserved": 1713795619226, + "lastObserved": 1732535135915, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -382,41 +391,51 @@ } }, "systemMetadata": { - "lastObserved": 1713795619229, + "lastObserved": 1732513099681, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "globalTags", "aspect": { "json": { - "typeNames": [ - "Sigma Workbook" + "tags": [ + { + "tag": "urn:li:tag:Warning" + } ] } }, "systemMetadata": { - "lastObserved": 1713795619229, + "lastObserved": 1732513099682, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,5LqGLu14qUnqh3cN6wRJBd,PROD)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "browsePathsV2", "aspect": { "json": { - "platform": "urn:li:dataPlatform:sigma" + "path": [ + { + "id": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f", + "urn": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" + }, + { + "id": "New Folder" + } + ] } }, "systemMetadata": { - "lastObserved": 1713795619228, + "lastObserved": 1713795619226, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -457,79 +476,62 @@ "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(sigma,OSnGLBzL1i)", "changeType": "UPSERT", - "aspectName": "dashboardInfo", + "aspectName": "browsePathsV2", "aspect": { "json": { - "customProperties": { - "ElementsCount": "2" - }, - "title": "Page 1", - "description": "", - "charts": [ - "urn:li:chart:(sigma,kH0MeihtGs)", - "urn:li:chart:(sigma,Ml9C5ezT5W)" - ], - "datasets": [], - "lastModified": { - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "path": [ + { + "id": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f", + "urn": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + { + "id": "Acryl Workbook" } - } + ] } }, "systemMetadata": { - "lastObserved": 1713795619233, + "lastObserved": 1732545848809, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,5LqGLu14qUnqh3cN6wRJBd,PROD)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "ownership", + "aspectName": "container", "aspect": { "json": { - "owners": [ - { - "owner": "urn:li:corpuser:Shubham_Jagtap", - "type": "DATAOWNER" - } - ], - "ownerTypes": {}, - "lastModified": { - 
"time": 0, - "actor": "urn:li:corpuser:unknown" - } + "container": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" } }, "systemMetadata": { - "lastObserved": 1713795619224, + "lastObserved": 1732513099682, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "globalTags", + "aspectName": "browsePathsV2", "aspect": { "json": { - "tags": [ + "path": [ { - "tag": "urn:li:tag:Warning" + "id": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f", + "urn": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" + }, + { + "id": "Acryl Workbook" } ] } }, "systemMetadata": { - "lastObserved": 1713795619230, + "lastObserved": 1732545848807, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -538,14 +540,34 @@ "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(sigma,OSnGLBzL1i)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "dashboardInfo", "aspect": { "json": { - "container": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" + "customProperties": { + "ElementsCount": "2" + }, + "title": "Page 1", + "description": "", + "charts": [ + "urn:li:chart:(sigma,kH0MeihtGs)", + "urn:li:chart:(sigma,Ml9C5ezT5W)" + ], + "datasets": [], + "dashboards": [], + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } }, "systemMetadata": { - "lastObserved": 1713795619234, + "lastObserved": 1713795619233, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -554,59 +576,44 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,5LqGLu14qUnqh3cN6wRJBd,PROD)", "changeType": "UPSERT", - "aspectName": "globalTags", + "aspectName": "ownership", "aspect": { "json": { - "tags": [ + "owners": [ { - "tag": "urn:li:tag:Deprecated" + "owner": "urn:li:corpuser:Shubham_Jagtap", + "type": "DATAOWNER" } - ] + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } } }, "systemMetadata": { - "lastObserved": 1713795619226, + "lastObserved": 1713795619224, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,OSnGLBzL1i)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sigma,5LqGLu14qUnqh3cN6wRJBd,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "globalTags", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f", - "urn": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" - }, + "tags": [ { - "id": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "urn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" + "tag": "urn:li:tag:Deprecated" } ] } }, "systemMetadata": { - "lastObserved": 1713795619234, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" - } - }, - "systemMetadata": { - "lastObserved": 1713795619231, + "lastObserved": 1713795619226, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } 
@@ -642,51 +649,13 @@ "urn": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" }, { - "id": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "urn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1713795619375, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f", - "urn": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" + "id": "Acryl Workbook" } ] } }, "systemMetadata": { - "lastObserved": 1713795619231, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,Ml9C5ezT5W)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" - } - }, - "systemMetadata": { - "lastObserved": 1713795619373, + "lastObserved": 1732545848872, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -883,6 +852,7 @@ "urn:li:chart:(sigma,tQJu5N1l81)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 0, @@ -901,6 +871,30 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,DFSieiAcgo)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f", + "urn": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" + }, + { + "id": "Acryl Workbook" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1732545848877, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(sigma,kH0MeihtGs)", @@ -914,14 +908,13 @@ "urn": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" }, { - "id": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "urn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" + "id": "Acryl Workbook" } ] } }, "systemMetadata": { - "lastObserved": 1713795619270, + "lastObserved": 1732545848829, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -1179,54 +1172,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,DFSieiAcgo)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" - } - }, - "systemMetadata": { - "lastObserved": 1713795619382, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,kH0MeihtGs)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" - } - }, - "systemMetadata": { - "lastObserved": 1713795619267, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,tQJu5N1l81)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" - } - }, - "systemMetadata": { - "lastObserved": 1713795619449, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "chart", "entityUrn": 
"urn:li:chart:(sigma,tQJu5N1l81)", @@ -1409,31 +1354,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,DFSieiAcgo)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f", - "urn": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" - }, - { - "id": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "urn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1713795619383, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "chart", "entityUrn": "urn:li:chart:(sigma,kH0MeihtGs)", @@ -1554,14 +1474,13 @@ "urn": "urn:li:container:46c912b7a3f62c8e3269e559648c4b2f" }, { - "id": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "urn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" + "id": "Acryl Workbook" } ] } }, "systemMetadata": { - "lastObserved": 1713795619453, + "lastObserved": 1732545848921, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } diff --git a/metadata-ingestion/tests/integration/sigma/golden_test_sigma_ingest_shared_entities_mces.json b/metadata-ingestion/tests/integration/sigma/golden_test_sigma_ingest_shared_entities_mces.json index d6b702bdfd6695..1ce671f09d7765 100644 --- a/metadata-ingestion/tests/integration/sigma/golden_test_sigma_ingest_shared_entities_mces.json +++ b/metadata-ingestion/tests/integration/sigma/golden_test_sigma_ingest_shared_entities_mces.json @@ -279,37 +279,24 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "containerProperties", + "aspectName": "status", "aspect": { "json": { - "customProperties": { - "platform": "sigma", - "workbookId": "9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b", - "path": "New Acryl Data", - "latestVersion": "2" - }, - "externalUrl": "https://app.sigmacomputing.com/acryldata/workbook/4JRFW1HThPI1K3YTjouXI7", - "name": "Acryl Workbook", - "created": { - "time": 1713188691477 - }, - "lastModified": { - "time": 1713189117302 - } + "removed": false } }, "systemMetadata": { - "lastObserved": 1718004101680, + "lastObserved": 1732513100094, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,OSnGLBzL1i)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -318,48 +305,76 @@ } }, "systemMetadata": { - "lastObserved": 1718004101680, + "lastObserved": 1718004101684, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "subTypes", "aspect": { "json": { - "platform": "urn:li:dataPlatform:sigma" + "typeNames": [ + "Sigma Workbook" + ] } }, "systemMetadata": { - "lastObserved": 1718004101681, + "lastObserved": 1732513100095, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", + 
"entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "dashboardInfo", "aspect": { "json": { - "typeNames": [ - "Sigma Workbook" - ] + "customProperties": { + "path": "New Acryl Data", + "latestVersion": "2" + }, + "externalUrl": "https://app.sigmacomputing.com/acryldata/workbook/4JRFW1HThPI1K3YTjouXI7", + "title": "Acryl Workbook", + "description": "", + "charts": [], + "datasets": [], + "dashboards": [ + { + "sourceUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", + "destinationUrn": "urn:li:dashboard:(sigma,OSnGLBzL1i)" + }, + { + "sourceUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", + "destinationUrn": "urn:li:dashboard:(sigma,DFSieiAcgo)" + } + ], + "lastModified": { + "created": { + "time": 1713188691477, + "actor": "urn:li:corpuser:datahub" + }, + "lastModified": { + "time": 1713189117302, + "actor": "urn:li:corpuser:datahub" + } + } } }, "systemMetadata": { - "lastObserved": 1718004101681, + "lastObserved": 1732535136409, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -378,14 +393,14 @@ } }, "systemMetadata": { - "lastObserved": 1718004101682, + "lastObserved": 1732513100096, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", "aspectName": "globalTags", "aspect": { @@ -398,60 +413,7 @@ } }, "systemMetadata": { - "lastObserved": 1718004101683, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:b83da80a4d444484521d9f7aca958742" - } - }, - "systemMetadata": { - "lastObserved": 1718004101683, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:b83da80a4d444484521d9f7aca958742", - "urn": "urn:li:container:b83da80a4d444484521d9f7aca958742" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1718004101684, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,OSnGLBzL1i)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1718004101684, + "lastObserved": 1732513100096, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -473,6 +435,7 @@ "urn:li:chart:(sigma,Ml9C5ezT5W)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 0, @@ -491,47 +454,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,OSnGLBzL1i)", - "changeType": "UPSERT", - "aspectName": 
"container", - "aspect": { - "json": { - "container": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" - } - }, - "systemMetadata": { - "lastObserved": 1718004101686, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,OSnGLBzL1i)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:b83da80a4d444484521d9f7aca958742", - "urn": "urn:li:container:b83da80a4d444484521d9f7aca958742" - }, - { - "id": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "urn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1718004101686, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "chart", "entityUrn": "urn:li:chart:(sigma,kH0MeihtGs)", @@ -582,17 +504,65 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,kH0MeihtGs)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,OSnGLBzL1i)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b83da80a4d444484521d9f7aca958742", + "urn": "urn:li:container:b83da80a4d444484521d9f7aca958742" + }, + { + "id": "Acryl Workbook" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1732545849249, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" + "container": "urn:li:container:b83da80a4d444484521d9f7aca958742" } }, "systemMetadata": { - "lastObserved": 1718004101689, + "lastObserved": 1732513100096, + "runId": "sigma-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(sigma,9bbbe3b0-c0c8-4fac-b6f1-8dfebfe74f8b)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b83da80a4d444484521d9f7aca958742", + "urn": "urn:li:container:b83da80a4d444484521d9f7aca958742" + }, + { + "id": "Acryl Workbook" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1732545849248, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -717,14 +687,13 @@ "urn": "urn:li:container:b83da80a4d444484521d9f7aca958742" }, { - "id": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "urn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" + "id": "Acryl Workbook" } ] } }, "systemMetadata": { - "lastObserved": 1718004101692, + "lastObserved": 1732545849252, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -778,22 +747,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,Ml9C5ezT5W)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" - } - }, - "systemMetadata": { - "lastObserved": 1718004101695, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "chart", "entityUrn": "urn:li:chart:(sigma,Ml9C5ezT5W)", @@ -914,14 +867,13 @@ "urn": "urn:li:container:b83da80a4d444484521d9f7aca958742" }, { - "id": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "urn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" + "id": 
"Acryl Workbook" } ] } }, "systemMetadata": { - "lastObserved": 1718004101697, + "lastObserved": 1732545849255, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -1155,6 +1107,7 @@ "urn:li:chart:(sigma,tQJu5N1l81)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 0, @@ -1174,17 +1127,17 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(sigma,DFSieiAcgo)", + "entityType": "chart", + "entityUrn": "urn:li:chart:(sigma,tQJu5N1l81)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "status", "aspect": { "json": { - "container": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" + "removed": false } }, "systemMetadata": { - "lastObserved": 1718004101704, + "lastObserved": 1718004101706, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -1202,30 +1155,13 @@ "urn": "urn:li:container:b83da80a4d444484521d9f7aca958742" }, { - "id": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "urn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" + "id": "Acryl Workbook" } ] } }, "systemMetadata": { - "lastObserved": 1718004101704, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,tQJu5N1l81)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1718004101706, + "lastObserved": 1732545849260, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } @@ -1263,22 +1199,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "chart", - "entityUrn": "urn:li:chart:(sigma,tQJu5N1l81)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" - } - }, - "systemMetadata": { - "lastObserved": 1718004101708, - "runId": "sigma-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "chart", "entityUrn": "urn:li:chart:(sigma,tQJu5N1l81)", @@ -1474,14 +1394,13 @@ "urn": "urn:li:container:b83da80a4d444484521d9f7aca958742" }, { - "id": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02", - "urn": "urn:li:container:140db5e9decc9b6ec67fa1e8207b6f02" + "id": "Acryl Workbook" } ] } }, "systemMetadata": { - "lastObserved": 1718004101712, + "lastObserved": 1732545849264, "runId": "sigma-test", "lastRunId": "no-run-id-provided" } diff --git a/metadata-ingestion/tests/integration/sql_server/docker-compose.yml b/metadata-ingestion/tests/integration/sql_server/docker-compose.yml index 1046321e4f7205..aed70503903c03 100644 --- a/metadata-ingestion/tests/integration/sql_server/docker-compose.yml +++ b/metadata-ingestion/tests/integration/sql_server/docker-compose.yml @@ -1,7 +1,7 @@ version: "3" services: testsqlserver: - image: "mcr.microsoft.com/mssql/server:latest" + image: "mcr.microsoft.com/mssql/server:2022-latest" platform: linux/amd64 container_name: "testsqlserver" environment: diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index 4302c41140dc6c..54821347fd28b8 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -113,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": 
"01afcab8-187c-459f-828e-727196a1832d", + "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-11-21 21:01:26.550000", - "date_modified": "2024-11-21 21:01:26.690000", + "date_created": "2024-11-22 12:58:03.260000", + "date_modified": "2024-11-22 12:58:03.440000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -1496,6 +1496,138 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "age_dist", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "DemoData.Foo.age_dist", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "Age", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Count", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", @@ -2150,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", 
"parameter @ID": "{'type': 'int'}", - "date_created": "2024-11-21 21:01:26.483000", - "date_modified": "2024-11-21 21:01:26.483000" + "date_created": "2024-11-22 12:58:03.137000", + "date_modified": "2024-11-22 12:58:03.137000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2168,14 +2300,24 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", + "aspectName": "dataJobInfo", "aspect": { "json": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [] + "customProperties": { + "procedure_depends_on": "{'DemoData.Foo.age_dist': 'USER_TABLE', 'DemoData.Foo.Items': 'USER_TABLE', 'DemoData.Foo.Persons': 'USER_TABLE', 'DemoData.Foo.SalesReason': 'USER_TABLE'}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", + "input parameters": "[]", + "date_created": "2024-11-22 12:58:03.140000", + "date_modified": "2024-11-22 12:58:03.140000" + }, + "externalUrl": "", + "name": "DemoData.Foo.NewProc", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } } }, "systemMetadata": { @@ -4256,6 +4398,159 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f721da08adde46586c0f113287cb60d1" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "is_view": "True" + }, + "name": "View1", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "NewData.FooNew.View1", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "LastName", + "nullable": false, + "type": { + "type": { + 
"com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "FirstName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,NewData.FooNew.View1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + }, + { + "id": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "urn": "urn:li:container:f721da08adde46586c0f113287cb60d1" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", @@ -4611,6 +4906,55 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),firstname)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),firstname)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),lastname)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),lastname)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataFlow", "entityUrn": 
"urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", @@ -4643,6 +4987,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", @@ -4690,5 +5050,21 @@ "runId": "mssql-test", "lastRunId": "no-run-id-provided" } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index 0a50556edc6388..1d702214fedf79 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -113,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "01afcab8-187c-459f-828e-727196a1832d", + "job_id": "2a055367-5e6a-4162-b3a9-dd60f52c79a8", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-11-21 21:01:26.550000", - "date_modified": "2024-11-21 21:01:26.690000", + "date_created": "2024-11-26 07:22:19.640000", + "date_modified": "2024-11-26 07:22:19.773000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -1496,6 +1496,138 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "age_dist", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "DemoData.Foo.age_dist", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "Age", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", 
+ "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Count", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", @@ -2150,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-11-21 21:01:26.483000", - "date_modified": "2024-11-21 21:01:26.483000" + "date_created": "2024-11-26 07:22:19.510000", + "date_modified": "2024-11-26 07:22:19.510000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2166,24 +2298,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", - "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", - "aspect": { - "json": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "container", "entityUrn": "urn:li:container:250ce23f940485303fa5e5d4f5194975", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index 0a50556edc6388..3836e587ef8cf4 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -113,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "01afcab8-187c-459f-828e-727196a1832d", + "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-11-21 21:01:26.550000", - "date_modified": "2024-11-21 21:01:26.690000", + "date_created": "2024-11-22 12:58:03.260000", + "date_modified": "2024-11-22 12:58:03.440000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -1496,6 +1496,138 @@ "lastRunId": 
"no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "age_dist", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "DemoData.Foo.age_dist", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "Age", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Count", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.Items,PROD)", @@ -2150,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-11-21 21:01:26.483000", - "date_modified": "2024-11-21 21:01:26.483000" + "date_created": "2024-11-22 12:58:03.137000", + "date_modified": "2024-11-22 12:58:03.137000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2168,14 +2300,24 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", + "aspectName": "dataJobInfo", "aspect": { "json": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [] + "customProperties": { + "procedure_depends_on": "{'DemoData.Foo.age_dist': 'USER_TABLE', 'DemoData.Foo.Items': 'USER_TABLE', 'DemoData.Foo.Persons': 'USER_TABLE', 'DemoData.Foo.SalesReason': 'USER_TABLE'}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", + "input parameters": "[]", + "date_created": "2024-11-22 12:58:03.140000", + "date_modified": "2024-11-22 12:58:03.140000" + }, + "externalUrl": "", + "name": "DemoData.Foo.NewProc", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } } }, "systemMetadata": { @@ -2571,6 +2713,22 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataJob", "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index 0279a94084ce56..ebcadcc11dcbfa 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -113,11 +113,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "01afcab8-187c-459f-828e-727196a1832d", + "job_id": "4130c37d-146c-43da-a671-dd9a413a44b3", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2024-11-21 21:01:26.550000", - "date_modified": "2024-11-21 21:01:26.690000", + "date_created": "2024-11-22 12:58:03.260000", + "date_modified": "2024-11-22 12:58:03.440000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -1496,6 +1496,138 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + 
"lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "age_dist", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "demodata.foo.age_dist", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "Age", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Count", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b275b7c099ce32f3faf1817cb054b100", + "urn": "urn:li:container:b275b7c099ce32f3faf1817cb054b100" + }, + { + "id": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9", + "urn": "urn:li:container:046d11ae7c0bc9bde45993041ac011c9" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", @@ -2150,8 +2282,8 @@ "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2024-11-21 21:01:26.483000", - "date_modified": "2024-11-21 21:01:26.483000" + "date_created": "2024-11-22 12:58:03.137000", + "date_modified": "2024-11-22 12:58:03.137000" }, "externalUrl": "", "name": "DemoData.Foo.Proc.With.SpecialChar", @@ -2168,14 +2300,24 @@ }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", - "aspectName": "dataJobInputOutput", + "aspectName": "dataJobInfo", "aspect": { "json": { - "inputDatasets": [], - "outputDatasets": [], - "inputDatajobs": [] + "customProperties": { + "procedure_depends_on": "{'DemoData.Foo.age_dist': 'USER_TABLE', 'DemoData.Foo.Items': 
'USER_TABLE', 'DemoData.Foo.Persons': 'USER_TABLE', 'DemoData.Foo.SalesReason': 'USER_TABLE'}", + "depending_on_procedure": "{}", + "code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n", + "input parameters": "[]", + "date_created": "2024-11-22 12:58:03.140000", + "date_modified": "2024-11-22 12:58:03.140000" + }, + "externalUrl": "", + "name": "DemoData.Foo.NewProc", + "type": { + "string": "MSSQL_STORED_PROCEDURE" + } } }, "systemMetadata": { @@ -2515,68 +2657,19 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "containerProperties", "aspect": { "json": { - "upstreams": [ - { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", - "type": "VIEW" - } - ], - "fineGrainedLineages": [ - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),Age)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),Age)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),FirstName)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),FirstName)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),ID)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),ID)" - ], - "confidenceScore": 1.0 - }, - { - "upstreamType": "FIELD_SET", - "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),LastName)" - ], - "downstreamType": "FIELD", - "downstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),LastName)" - ], - "confidenceScore": 1.0 - } - ] + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData" + }, + "name": "NewData", + "env": "PROD" } }, "systemMetadata": { @@ -2586,8 +2679,8 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", "changeType": "UPSERT", 
"aspectName": "status", "aspect": { @@ -2602,8 +2695,2418 @@ } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Database" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_accessadmin" + }, + "name": "db_accessadmin", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:8b7691fec458d7383d5bc4e213831375", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + 
"lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_backupoperator" + }, + "name": "db_backupoperator", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:523d13eddd725607ec835a2459b05c9c", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_datareader" + }, + "name": "db_datareader", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 
1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:29bd421b2225a415df9c750e77404c66", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_datawriter" + }, + "name": "db_datawriter", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a3c02df4bcc7280a89f539b793b04197", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 
1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_ddladmin" + }, + "name": "db_ddladmin", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:c3b5d1cdc69a7d8faf0e1981e89b89d1", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_denydatareader" + }, + "name": "db_denydatareader", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + 
"systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:2b937d85ae7545dc769766008a332f42", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_denydatawriter" + }, + "name": "db_denydatawriter", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:a399d8bb765028abb9e55ae39846ca5e", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + 
}, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_owner" + }, + "name": "db_owner", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:457efe38f0aec2af9ad681cf1b43b1cb", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "db_securityadmin" + }, + "name": "db_securityadmin", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + 
"removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:1d87783ffe7e82210365dff4ca8ee7d1", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "dbo" + }, + "name": "dbo", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:269d0067d130eda0399a534fc787054c", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + 
}, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.dbo.productsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:269d0067d130eda0399a534fc787054c" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.dbo.productsnew,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "ProductsNew", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "newdata.dbo.productsnew", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ProductName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Price", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "MONEY", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.dbo.productsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.dbo.productsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + }, + { + "id": "urn:li:container:269d0067d130eda0399a534fc787054c", + "urn": "urn:li:container:269d0067d130eda0399a534fc787054c" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "FooNew" + }, + "name": "FooNew", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.itemsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f721da08adde46586c0f113287cb60d1" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.itemsnew,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "ItemsNew", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "newdata.foonew.itemsnew", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ItemName", + "nullable": true, + "type": { + "type": { + 
"com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "NVARCHAR(max) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Price", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "SMALLMONEY", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.itemsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.itemsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + }, + { + "id": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "urn": "urn:li:container:f721da08adde46586c0f113287cb60d1" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f721da08adde46586c0f113287cb60d1" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "name": "PersonsNew", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "newdata.foonew.personsnew", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "ID", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": true + }, + { + "fieldPath": "LastName", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "FirstName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "Age", + "nullable": true, + 
"type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + }, + { + "id": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "urn": "urn:li:container:f721da08adde46586c0f113287cb60d1" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:f721da08adde46586c0f113287cb60d1" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "view_definition": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "is_view": "True" + }, + "name": "View1", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "newdata.foonew.view1", + "platform": "urn:li:dataPlatform:mssql", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "LastName", + "nullable": false, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "FirstName", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255) COLLATE SQL_Latin1_General_CP1_CI_AS", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + 
"View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW FooNew.View1 AS\nSELECT LastName, FirstName\nFROM FooNew.PersonsNew\nWHERE Age > 18\n", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + }, + { + "id": "urn:li:container:f721da08adde46586c0f113287cb60d1", + "urn": "urn:li:container:f721da08adde46586c0f113287cb60d1" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "guest" + }, + "name": "guest", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:f3cb304e29e178d0615ed5ee6aa4ad58", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + 
"runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "INFORMATION_SCHEMA" + }, + "name": "INFORMATION_SCHEMA", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:752bb2abafeb2dae8f4adc7ffd547780", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "mssql", + "env": "PROD", + "database": "NewData", + "schema": "sys" + }, + "name": "sys", + "env": "PROD" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 
1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mssql" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Schema" + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:46b713e3c7754c51649899f0f284ce34", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:0a12bec9e9271b0db039923a770d75e5", + "urn": "urn:li:container:0a12bec9e9271b0db039923a770d75e5" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),Age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),Age)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),FirstName)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),FirstName)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),ID)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),ID)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),LastName)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.personsview,PROD),LastName)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),FirstName)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),FirstName)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),LastName)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.view1,PROD),LastName)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),Age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD),Age)" + ], + "confidenceScore": 0.9 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),TempID)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD),tempid)" + ], + "confidenceScore": 0.9 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),Name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD),name)" + ], + "confidenceScore": 0.9 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD),Age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),Age)" + ], + "confidenceScore": 0.35 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + 
} +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,Weekly Demo Data Backup,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/procedures/DemoData.Foo.NewProc.json b/metadata-ingestion/tests/integration/sql_server/golden_files/procedures/DemoData.Foo.NewProc.json new file mode 100644 index 00000000000000..609e3a6f429452 --- /dev/null +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/procedures/DemoData.Foo.NewProc.json @@ -0,0 +1,57 @@ +[ +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,DemoData.Foo.stored_procedures,PROD),NewProc)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD),age)" + ], + "confidenceScore": 0.2 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),tempid)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD),tempid)" + ], + "confidenceScore": 0.2 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD),name)" + ], + "confidenceScore": 0.2 + } + ] + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/procedures/demodata.foo.proc2.json b/metadata-ingestion/tests/integration/sql_server/golden_files/procedures/demodata.foo.proc2.json new file mode 100644 index 00000000000000..8ebd1c065ebf94 --- /dev/null +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/procedures/demodata.foo.proc2.json @@ -0,0 +1,57 @@ +[ +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,demodata.foo.stored_procedures,PROD),proc2)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD)", + 
"urn:li:dataset:(urn:li:dataPlatform:mssql,newdata.foonew.personsnew,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.persons,PROD),age)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.age_dist,PROD),age)" + ], + "confidenceScore": 0.2 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),tempid)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD),tempid)" + ], + "confidenceScore": 0.2 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.salesreason,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mssql,demodata.foo.items,PROD),name)" + ], + "confidenceScore": 0.2 + } + ] + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/procedures/DemoData.Foo.NewProc.sql b/metadata-ingestion/tests/integration/sql_server/procedures/DemoData.Foo.NewProc.sql new file mode 100644 index 00000000000000..52a8d1327653b2 --- /dev/null +++ b/metadata-ingestion/tests/integration/sql_server/procedures/DemoData.Foo.NewProc.sql @@ -0,0 +1,37 @@ +CREATE PROCEDURE [Foo].[NewProc] + AS + BEGIN + --insert into items table from salesreason table + insert into Foo.Items (ID, ItemName) + SELECT TempID, Name + FROM Foo.SalesReason; + + + IF OBJECT_ID('Foo.age_dist', 'U') IS NULL + + BEGIN + -- Create and populate if table doesn't exist + SELECT Age, COUNT(*) as Count + INTO Foo.age_dist + FROM Foo.Persons + GROUP BY Age + END + ELSE + BEGIN + -- Update existing table + TRUNCATE TABLE Foo.age_dist; + + INSERT INTO Foo.age_dist (Age, Count) + SELECT Age, COUNT(*) as Count + FROM Foo.Persons + GROUP BY Age + END + + SELECT * INTO #TempTable FROM NewData.FooNew.PersonsNew + + UPDATE DemoData.Foo.Persons + SET Age = t.Age + FROM DemoData.Foo.Persons p + JOIN #TempTable t ON p.ID = t.ID + + END \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/procedures/demodata.foo.proc2.sql b/metadata-ingestion/tests/integration/sql_server/procedures/demodata.foo.proc2.sql new file mode 100644 index 00000000000000..69194a8d2c5464 --- /dev/null +++ b/metadata-ingestion/tests/integration/sql_server/procedures/demodata.foo.proc2.sql @@ -0,0 +1,36 @@ +CREATE PROCEDURE [foo].[proc2] + AS + BEGIN + --insert into items table from salesreason table + insert into foo.items (id, itemame) + SELECT tempid, name + FROM foo.salesreason; + + + IF OBJECT_ID('foo.age_dist', 'U') IS NULL + + BEGIN + -- Create and populate if table doesn't exist + SELECT age, COUNT(*) as count + INTO foo.age_dist + FROM foo.persons + GROUP BY age + END + ELSE + BEGIN + -- Update existing table + TRUNCATE TABLE foo.age_dist; + + INSERT INTO foo.age_dist (age, count) + SELECT age, COUNT(*) as count + FROM foo.persons + GROUP BY age + END + + SELECT * INTO #temptable FROM newdata.foonew.personsnew + + UPDATE 
demodata.foo.persons + SET age = t.age + FROM demodata.foo.persons p + JOIN #temptable t ON p.ID = t.ID + END \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql index f495db3b91cfae..0c3c7ee2fd29e3 100644 --- a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql +++ b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql @@ -1,3 +1,4 @@ +DROP DATABASE IF EXISTS NewData; CREATE DATABASE NewData; GO USE NewData; @@ -14,7 +15,14 @@ CREATE TABLE FooNew.PersonsNew ( FirstName varchar(255), Age int ); +GO +CREATE VIEW FooNew.View1 AS +SELECT LastName, FirstName +FROM FooNew.PersonsNew +WHERE Age > 18 +GO +DROP DATABASE IF EXISTS DemoData; CREATE DATABASE DemoData; GO USE DemoData; @@ -47,11 +55,54 @@ CREATE TABLE Foo.SalesReason ) ; GO +DROP PROCEDURE IF EXISTS [Foo].[Proc.With.SpecialChar]; +GO CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT AS SELECT @ID AS ThatDB; GO +DROP PROCEDURE IF EXISTS [Foo].[NewProc]; +GO +CREATE PROCEDURE [Foo].[NewProc] + AS + BEGIN + --insert into items table from salesreason table + insert into Foo.Items (ID, ItemName) + SELECT TempID, Name + FROM Foo.SalesReason; + + + IF OBJECT_ID('Foo.age_dist', 'U') IS NULL + BEGIN + -- Create and populate if table doesn't exist + SELECT Age, COUNT(*) as Count + INTO Foo.age_dist + FROM Foo.Persons + GROUP BY Age + END + ELSE + BEGIN + -- Update existing table + TRUNCATE TABLE Foo.age_dist; + + INSERT INTO Foo.age_dist (Age, Count) + SELECT Age, COUNT(*) as Count + FROM Foo.Persons + GROUP BY Age + END + + SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew + + UPDATE DemoData.Foo.Persons + SET Age = t.Age + FROM DemoData.Foo.Persons p + JOIN #TEMPTABLE t ON p.ID = t.ID + + END +GO + +EXEC Foo.NewProc GO EXEC sys.sp_addextendedproperty @name = N'MS_Description', @@ -93,4 +144,4 @@ EXEC sp_attach_schedule GO EXEC dbo.sp_add_jobserver @job_name = N'Weekly Demo Data Backup' -GO +GO \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/source_files/mssql_no_db_with_filter.yml b/metadata-ingestion/tests/integration/sql_server/source_files/mssql_no_db_with_filter.yml index 3749499074adfe..703f60b277b870 100644 --- a/metadata-ingestion/tests/integration/sql_server/source_files/mssql_no_db_with_filter.yml +++ b/metadata-ingestion/tests/integration/sql_server/source_files/mssql_no_db_with_filter.yml @@ -9,6 +9,9 @@ source: database_pattern: deny: - NewData + procedure_pattern: + deny: + - DemoData.Foo.NewProc sink: type: file diff --git a/metadata-ingestion/tests/integration/sql_server/source_files/mssql_with_lower_case_urn.yml b/metadata-ingestion/tests/integration/sql_server/source_files/mssql_with_lower_case_urn.yml index ff1179034833f9..94128810f026b9 100644 --- a/metadata-ingestion/tests/integration/sql_server/source_files/mssql_with_lower_case_urn.yml +++ b/metadata-ingestion/tests/integration/sql_server/source_files/mssql_with_lower_case_urn.yml @@ -5,7 +5,6 @@ source: config: username: sa password: test!Password - database: DemoData host_port: localhost:21433 convert_urns_to_lowercase: true # use_odbc: True diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py index 1f418ffbd32ea9..b969f77b4c3c18 100644 --- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py +++ 
b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py @@ -1,9 +1,16 @@ import os +import pathlib import subprocess import time +from pathlib import Path import pytest +from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure +from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import ( + generate_procedure_lineage, +) +from datahub.sql_parsing.schema_resolver import SchemaResolver from tests.test_helpers import mce_helpers from tests.test_helpers.click_helpers import run_datahub_cmd from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port @@ -57,3 +64,50 @@ def test_mssql_ingest(mssql_runner, pytestconfig, tmp_path, mock_time, config_fi r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['date_modified'\]", ], ) + + +PROCEDURE_SQLS_DIR = pathlib.Path(__file__).parent / "procedures" +PROCEDURES_GOLDEN_DIR = pathlib.Path(__file__).parent / "golden_files/procedures/" +procedure_sqls = [sql_file.name for sql_file in PROCEDURE_SQLS_DIR.iterdir()] + + +@pytest.mark.parametrize("procedure_sql_file", procedure_sqls) +@pytest.mark.integration +def test_stored_procedure_lineage( + pytestconfig: pytest.Config, procedure_sql_file: str +) -> None: + sql_file_path = PROCEDURE_SQLS_DIR / procedure_sql_file + procedure_code = sql_file_path.read_text() + + # Procedure file is named as <db>.<schema>.<procedure_name> + splits = procedure_sql_file.split(".") + db = splits[0] + schema = splits[1] + name = splits[2] + + procedure = StoredProcedure( + db=db, + schema=schema, + name=name, + flow=None, # type: ignore # flow is not used in this test + code=procedure_code, + ) + data_job_urn = f"urn:li:dataJob:(urn:li:dataFlow:(mssql,{db}.{schema}.stored_procedures,PROD),{name})" + + schema_resolver = SchemaResolver(platform="mssql") + + mcps = list( + generate_procedure_lineage( + schema_resolver=schema_resolver, + procedure=procedure, + procedure_job_urn=data_job_urn, + is_temp_table=lambda name: "temp" in name.lower(), + ) + ) + mce_helpers.check_goldens_stream( + pytestconfig, + outputs=mcps, + golden_path=( + PROCEDURES_GOLDEN_DIR / Path(procedure_sql_file).with_suffix(".json") + ), + ) diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_split_statements.py b/metadata-ingestion/tests/unit/sql_parsing/test_split_statements.py new file mode 100644 index 00000000000000..06e0e84ede5547 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/test_split_statements.py @@ -0,0 +1,51 @@ +from datahub.sql_parsing.split_statements import split_statements + + +def test_split_statements_complex() -> None: + test_sql = """ + CREATE TABLE Users (Id INT); + -- Comment here + INSERT INTO Users VALUES (1); + BEGIN + UPDATE Users SET Id = 2; + /* Multi-line + comment */ + DELETE FROM /* inline DELETE comment */ Users; + END + GO + SELECT * FROM Users + """ + + statements = [statement.strip() for statement in split_statements(test_sql)] + assert statements == [ + "CREATE TABLE Users (Id INT)", + "-- Comment here", + "INSERT INTO Users VALUES (1)", + "BEGIN", + "UPDATE Users SET Id = 2", + "/* Multi-line\n comment */", + "DELETE FROM /* inline DELETE comment */ Users", + "END", + "GO", + "SELECT * FROM Users", + ] + + +def test_split_statements_cte() -> None: + # SQL example from https://stackoverflow.com/a/11562724 + test_sql = """\ +WITH T AS +( SELECT InvoiceNumber, + DocTotal, + SUM(Sale + VAT) OVER(PARTITION BY InvoiceNumber) AS NewDocTotal + FROM PEDI_InvoiceDetail +) +-- comment +/* multi-line +comment */ +UPDATE T +SET DocTotal = NewDocTotal""" + statements =
[statement.strip() for statement in split_statements(test_sql)] + assert statements == [ + test_sql, + ] diff --git a/metadata-ingestion/tests/unit/test_athena_source.py b/metadata-ingestion/tests/unit/test_athena_source.py index 875cf3800daf88..f8b6220d182735 100644 --- a/metadata-ingestion/tests/unit/test_athena_source.py +++ b/metadata-ingestion/tests/unit/test_athena_source.py @@ -93,7 +93,8 @@ def test_athena_get_table_properties(): "CreateTime": datetime.now(), "LastAccessTime": datetime.now(), "PartitionKeys": [ - {"Name": "testKey", "Type": "string", "Comment": "testComment"} + {"Name": "year", "Type": "string", "Comment": "testComment"}, + {"Name": "month", "Type": "string", "Comment": "testComment"}, ], "Parameters": { "comment": "testComment", @@ -112,8 +113,18 @@ def test_athena_get_table_properties(): response=table_metadata ) + # Mock partition query results + mock_cursor.execute.return_value.description = [ + ["year"], + ["month"], + ] + mock_cursor.execute.return_value.__iter__.return_value = [["2023", "12"]] + ctx = PipelineContext(run_id="test") source = AthenaSource(config=config, ctx=ctx) + source.cursor = mock_cursor + + # Test table properties description, custom_properties, location = source.get_table_properties( inspector=mock_inspector, table=table, schema=schema ) @@ -124,13 +135,35 @@ def test_athena_get_table_properties(): "last_access_time": "2020-04-14 07:00:00", "location": "s3://testLocation", "outputformat": "testOutputFormat", - "partition_keys": '[{"name": "testKey", "type": "string", "comment": "testComment"}]', + "partition_keys": '[{"name": "year", "type": "string", "comment": "testComment"}, {"name": "month", "type": "string", "comment": "testComment"}]', "serde.serialization.lib": "testSerde", "table_type": "testType", } - assert location == make_s3_urn("s3://testLocation", "PROD") + # Test partition functionality + partitions = source.get_partitions( + inspector=mock_inspector, schema=schema, table=table + ) + assert partitions == ["year", "month"] + + # Verify the correct SQL query was generated for partitions + expected_query = """\ +select year,month from "test_schema"."test_table$partitions" \ +where CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR) = \ +(select max(CAST(year as VARCHAR) || '-' || CAST(month as VARCHAR)) \ +from "test_schema"."test_table$partitions")""" + mock_cursor.execute.assert_called_once() + actual_query = mock_cursor.execute.call_args[0][0] + assert actual_query == expected_query + + # Verify partition cache was populated correctly + assert source.table_partition_cache[schema][table].partitions == partitions + assert source.table_partition_cache[schema][table].max_partition == { + "year": "2023", + "month": "12", + } + def test_get_column_type_simple_types(): assert isinstance( @@ -214,3 +247,9 @@ def test_column_type_complex_combination(): assert isinstance( result._STRUCT_fields[2][1].item_type._STRUCT_fields[1][1], types.String ) + + +def test_casted_partition_key(): + from datahub.ingestion.source.sql.athena import AthenaSource + + assert AthenaSource._casted_partition_key("test_col") == "CAST(test_col as VARCHAR)" diff --git a/metadata-ingestion/tests/unit/test_kafka_source.py b/metadata-ingestion/tests/unit/test_kafka_source.py index dfd32085b77055..cab0a2bce7ba8c 100644 --- a/metadata-ingestion/tests/unit/test_kafka_source.py +++ b/metadata-ingestion/tests/unit/test_kafka_source.py @@ -10,6 +10,7 @@ ) from freezegun import freeze_time +from datahub.configuration.common import ConfigurationError from 
datahub.emitter.mce_builder import ( OwnerType, make_dataplatform_instance_urn, @@ -738,3 +739,23 @@ def mock_get_latest_version(subject_name: str) -> Optional[RegisteredSchema]: assert workunits[7].metadata.aspectName == "glossaryTermKey" assert workunits[8].metadata.aspectName == "tagKey" assert workunits[9].metadata.aspectName == "tagKey" + + +def test_kafka_source_oauth_cb_configuration(): + with pytest.raises( + ConfigurationError, + match=( + "oauth_cb must be a string representing python function reference " + "in the format <python-module>:<function-name>." + ), + ): + KafkaSourceConfig.parse_obj( + { + "connection": { + "bootstrap": "foobar:9092", + "consumer_config": { + "oauth_cb": test_kafka_ignore_warnings_on_schema_type + }, + } + } + ) diff --git a/metadata-ingestion/tests/unit/test_powerbi_parser.py b/metadata-ingestion/tests/unit/test_powerbi_parser.py index 31579f0c0abd3e..a487a3a5b87f8b 100644 --- a/metadata-ingestion/tests/unit/test_powerbi_parser.py +++ b/metadata-ingestion/tests/unit/test_powerbi_parser.py @@ -8,9 +8,7 @@ from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( ResolvePlatformInstanceFromDatasetTypeMapping, ) -from datahub.ingestion.source.powerbi.m_query.resolver import ( - MSSqlDataPlatformTableCreator, -) +from datahub.ingestion.source.powerbi.m_query.pattern_handler import MSSqlLineage from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table @@ -27,7 +25,7 @@ def creator(): full_name="db.schema.test_table", ) - return MSSqlDataPlatformTableCreator( + return MSSqlLineage( ctx=PipelineContext(run_id="test-run-id"), table=table, reporter=PowerBiDashboardSourceReport(), diff --git a/metadata-ingestion/tests/unit/test_sql_types.py b/metadata-ingestion/tests/unit/test_sql_types.py new file mode 100644 index 00000000000000..ebe5ade115cdd4 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_sql_types.py @@ -0,0 +1,78 @@ +import pytest + +from datahub.ingestion.source.sql.sql_types import ( + ATHENA_SQL_TYPES_MAP, + TRINO_SQL_TYPES_MAP, + resolve_athena_modified_type, + resolve_sql_type, + resolve_trino_modified_type, +) +from datahub.metadata.schema_classes import BooleanTypeClass, StringTypeClass + + +@pytest.mark.parametrize( + "data_type, expected_data_type", + [ + ("boolean", "boolean"), + ("tinyint", "tinyint"), + ("smallint", "smallint"), + ("int", "int"), + ("integer", "integer"), + ("bigint", "bigint"), + ("real", "real"), + ("double", "double"), + ("decimal(10,0)", "decimal"), + ("varchar(20)", "varchar"), + ("char", "char"), + ("varbinary", "varbinary"), + ("json", "json"), + ("date", "date"), + ("time", "time"), + ("time(12)", "time"), + ("timestamp", "timestamp"), + ("timestamp(3)", "timestamp"), + ("row(x bigint, y double)", "row"), + ("array(row(x bigint, y double))", "array"), + ("map(varchar, varchar)", "map"), + ], +) +def test_resolve_trino_modified_type(data_type, expected_data_type): + assert ( + resolve_trino_modified_type(data_type) + == TRINO_SQL_TYPES_MAP[expected_data_type] + ) + + +@pytest.mark.parametrize( + "data_type, expected_data_type", + [ + ("boolean", "boolean"), + ("tinyint", "tinyint"), + ("smallint", "smallint"), + ("int", "int"), + ("integer", "integer"), + ("bigint", "bigint"), + ("float", "float"), + ("double", "double"), + ("decimal(10,0)", "decimal"), + ("varchar(20)", "varchar"), + ("char", "char"), + ("binary", "binary"), + ("date", "date"), + ("timestamp", "timestamp"), + ("timestamp(3)", "timestamp"), + ("struct<test:string, test2:int>", "struct"), + ("array<struct<test:string, test2:int>>", "array"), + ("map<string, int>", "map"), + ], +) +def
test_resolve_athena_modified_type(data_type, expected_data_type): + assert ( + resolve_athena_modified_type(data_type) + == ATHENA_SQL_TYPES_MAP[expected_data_type] + ) + + +def test_resolve_sql_type() -> None: + assert resolve_sql_type("boolean") == BooleanTypeClass() + assert resolve_sql_type("varchar") == StringTypeClass() diff --git a/metadata-ingestion/tests/unit/urns/test_urn.py b/metadata-ingestion/tests/unit/urns/test_urn.py index 1bf48082fec8c9..73badb3d1b4234 100644 --- a/metadata-ingestion/tests/unit/urns/test_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_urn.py @@ -1,6 +1,12 @@ import pytest -from datahub.metadata.urns import DatasetUrn, Urn +from datahub.metadata.urns import ( + CorpUserUrn, + DashboardUrn, + DataPlatformUrn, + DatasetUrn, + Urn, +) from datahub.utilities.urns.error import InvalidUrnError pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -36,20 +42,51 @@ def test_url_encode_urn() -> None: def test_invalid_urn() -> None: with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc") + Urn.from_string("urn:li:abc") with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc:") + Urn.from_string("urn:li:abc:") with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc:()") + Urn.from_string("urn:li:abc:()") with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc:(abc,)") + Urn.from_string("urn:li:abc:(abc,)") + + with pytest.raises(InvalidUrnError): + Urn.from_string("urn:li:corpuser:abc)") + + +def test_urn_colon() -> None: + # Colon characters are valid in urns, and should not mess up parsing. + + urn = Urn.from_string( + "urn:li:dashboard:(looker,dashboards.thelook::customer_lookup)" + ) + assert isinstance(urn, DashboardUrn) + + assert DataPlatformUrn.from_string("urn:li:dataPlatform:abc:def") + assert DatasetUrn.from_string( + "urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,PROD)" + ) + assert Urn.from_string("urn:li:corpuser:foo:bar@example.com") + + # I'm not sure why you'd ever want this, but technically it's a valid urn. 
+ urn = Urn.from_string("urn:li:corpuser::") + assert isinstance(urn, CorpUserUrn) + assert urn.username == ":" + assert urn == CorpUserUrn(":") + + +def test_urn_coercion() -> None: + urn = CorpUserUrn("foo␟bar") + assert urn.urn() == "urn:li:corpuser:foo%E2%90%9Fbar" + + assert urn == Urn.from_string(urn.urn()) def test_urn_type_dispatch() -> None: - urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,prod)") + urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)") assert isinstance(urn, DatasetUrn) with pytest.raises(InvalidUrnError, match="Passed an urn of type corpuser"): diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index 1bdc848d0385b1..56a486ad043305 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -19,6 +19,7 @@ jar { dependencies { api project(':entity-registry') api project(':metadata-integration:java:datahub-event') + implementation project(':metadata-integration:java:datahub-schematron:lib') implementation(externalDependency.kafkaAvroSerializer) { exclude group: "org.apache.avro" } @@ -114,7 +115,7 @@ shadowJar { relocate 'org.checkerframework', 'datahub.shaded.org.checkerframework' relocate 'com.google.errorprone', 'datahub.shaded.com.google.errorprone' // Below jars added for kafka emitter only - relocate 'org.apache.avro', 'datahub.shaded.org.apache.avro' +// relocate 'org.apache.avro', 'datahub.shaded.org.apache.avro' relocate 'com.thoughtworks.paranamer', 'datahub.shaded.com.thoughtworks.paranamer' relocate 'org.xerial.snappy', 'datahub.shaded.org.xerial.snappy' relocate 'org.apache.kafka', 'datahub.shaded.org.apache.kafka' diff --git a/metadata-integration/java/datahub-client/scripts/check_jar.sh b/metadata-integration/java/datahub-client/scripts/check_jar.sh index 10299ec714d165..e451a7dd2a009e 100755 --- a/metadata-integration/java/datahub-client/scripts/check_jar.sh +++ b/metadata-integration/java/datahub-client/scripts/check_jar.sh @@ -40,7 +40,10 @@ jar -tvf $jarFile |\ grep -v "mozilla" |\ grep -v "VersionInfo.java" |\ grep -v "mime.types" |\ - grep -v "com/ibm/.*" + grep -v "com/ibm/.*" |\ + grep -v "org/apache/avro" |\ + grep -v "org/apache" + if [ $? -ne 0 ]; then diff --git a/metadata-integration/java/datahub-protobuf/scripts/check_jar.sh b/metadata-integration/java/datahub-protobuf/scripts/check_jar.sh index bd0c28f0f86988..66c70f0b857692 100755 --- a/metadata-integration/java/datahub-protobuf/scripts/check_jar.sh +++ b/metadata-integration/java/datahub-protobuf/scripts/check_jar.sh @@ -44,7 +44,9 @@ jar -tvf $jarFile |\ grep -v "mime.types" |\ grep -v "com/ibm/.*" |\ grep -v "org/glassfish/" |\ - grep -v "LICENSE" + grep -v "LICENSE" |\ + grep -v "org/apache/avro" |\ + grep -v "org/apache" if [ $? -ne 0 ]; then echo "✅ No unexpected class paths found in ${jarFile}" diff --git a/metadata-integration/java/datahub-schematron/README.md b/metadata-integration/java/datahub-schematron/README.md new file mode 100644 index 00000000000000..0dc1c2b9c74551 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/README.md @@ -0,0 +1,73 @@ +# SchemaTron (Incubating) + +> ⚠️ This is an incubating project in draft status. APIs and functionality may change significantly between releases. + +SchemaTron is a schema translation toolkit that converts between various schema formats and DataHub's native schema representation. 
It currently provides robust support for Apache Avro schema translation, with a focus on complex schema structures including unions, arrays, maps, and nested records.
+
+## Modules
+
+### CLI Module
+
+Command-line interface for converting schemas and emitting them to DataHub.
+
+```bash
+# Execute from this directory
+../../../gradlew :metadata-integration:java:datahub-schematron:cli:run --args="-i cli/src/test/resources/FlatUser.avsc"
+```
+
+#### CLI Options
+
+- `-i, --input`: Input schema file or directory path
+- `-p, --platform`: Data platform name (default: "avro")
+- `-s, --server`: DataHub server URL (default: "http://localhost:8080")
+- `-t, --token`: DataHub access token
+- `--sink`: Output sink, either "rest" or "file" (default: "rest")
+- `--output-file`: Output file path when using the file sink (default: "metadata.json")
+
+### Library Module
+
+Core translation logic and models for schema conversion. Features include:
+
+- Support for complex Avro schema structures:
+  - Union types with multiple record options
+  - Nested records and arrays
+  - Optional fields with defaults
+  - Logical types (date, timestamp, etc.)
+  - Maps with various value types
+  - Enum types
+  - Custom metadata and documentation
+
+- Comprehensive path handling for schema fields
+- DataHub-compatible metadata generation
+- Schema fingerprinting and versioning
+
+## Example Schema Support
+
+The library handles sophisticated schema structures, including:
+
+- Customer profiles with multiple identification types (passport, driver's license, national ID)
+- Contact information with primary and alternative contact methods
+- Address validation with verification metadata
+- Subscription history tracking
+- Flexible preference and metadata storage
+- Tagged customer attributes
+
+## Development
+
+The project includes extensive test coverage through:
+
+- Unit tests for field path handling
+- Schema translation comparison tests
+- Integration tests against a Python reference implementation
+
+Test resources include example schemas demonstrating various Avro schema features and edge cases.
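+
+For embedding the translation in other JVM code, the converter that the CLI wraps can be called directly. The snippet below is an illustrative sketch, not shipped code: it assumes the same `AvroSchemaConverter.toDataHubSchema` entry point used by the CLI module, and the schema path, class name, and platform name are placeholders.
+
+```java
+import com.linkedin.common.urn.DataPlatformUrn;
+import com.linkedin.schema.SchemaMetadata;
+import io.datahubproject.schematron.converters.avro.AvroSchemaConverter;
+import java.io.File;
+import org.apache.avro.Schema;
+
+public class TranslateExample {
+  public static void main(String[] args) throws Exception {
+    // Parse an Avro schema from disk (placeholder path).
+    Schema avroSchema =
+        new Schema.Parser().parse(new File("cli/src/test/resources/FlatUser.avsc"));
+    AvroSchemaConverter converter = AvroSchemaConverter.builder().build();
+    // isKeySchema=false and isDefaultNullable=false mirror the CLI defaults.
+    SchemaMetadata schemaMetadata =
+        converter.toDataHubSchema(avroSchema, false, false, new DataPlatformUrn("avro"), null);
+    schemaMetadata.getFields().forEach(f -> System.out.println(f.getFieldPath()));
+  }
+}
+```
+
+The CLI module is a thin wrapper over this call; it adds only file discovery and emitter selection.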
+ +## Contributing + +As this is an incubating project, we welcome contributions and feedback on: + +- Additional schema format support +- Improved handling of complex schema patterns +- Enhanced metadata translation +- Documentation and examples +- Test coverage \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/cli/build.gradle b/metadata-integration/java/datahub-schematron/cli/build.gradle new file mode 100644 index 00000000000000..1711ff947c2d19 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/cli/build.gradle @@ -0,0 +1,110 @@ +plugins { + id "application" +} +apply plugin: 'java' +apply plugin: 'jacoco' + +ext { + javaMainClass = "io.datahubproject.schematron.cli.SchemaTron" +} + +application { + mainClassName = javaMainClass +} + +dependencies { + // Existing dependencies remain unchanged + implementation 'info.picocli:picocli:4.7.5' + annotationProcessor 'info.picocli:picocli-codegen:4.7.5' + implementation 'ch.qos.logback:logback-classic:1.2.11' + implementation 'ch.qos.logback:logback-core:1.2.11' + implementation project(':metadata-integration:java:datahub-client') + implementation project(':metadata-integration:java:datahub-schematron:lib') + implementation externalDependency.avro + compileOnly externalDependency.lombok + annotationProcessor externalDependency.lombok + + // Test dependencies + testImplementation externalDependency.testng + testImplementation externalDependency.mockito +} + +test { + useTestNG() + + testLogging { + events "passed", "skipped", "failed" + exceptionFormat "full" + showStandardStreams = true + } + + systemProperty 'python.venv.path', System.getProperty('python.venv.path', '../venv') +} + +task validatePythonEnv { + doFirst { + def venvPath = System.getProperty('python.venv.path', '../../../../metadata-ingestion/venv') + def isWindows = System.getProperty('os.name').toLowerCase().contains('windows') + def pythonExe = isWindows ? "${venvPath}/Scripts/python.exe" : "${venvPath}/bin/python" + + def result = exec { + commandLine pythonExe, "-c", "import sys; print(sys.executable)" + ignoreExitValue = true + standardOutput = new ByteArrayOutputStream() + errorOutput = new ByteArrayOutputStream() + } + + if (result.exitValue != 0) { + throw new GradleException("Python virtual environment not properly set up at ${venvPath}") + } + } +} + +test.dependsOn tasks.getByPath(":metadata-ingestion:installDev") + +jacocoTestReport { + dependsOn test +} + +test.finalizedBy jacocoTestReport + +task updateGoldenFiles { + dependsOn validatePythonEnv + doLast { + def venvPath = System.getProperty('python.venv.path', '../../../../metadata-ingestion/venv') + def isWindows = System.getProperty('os.name').toLowerCase().contains('windows') + def pythonExe = isWindows ? "${venvPath}/Scripts/python.exe" : "${venvPath}/bin/python" + def diffsDir = new File('src/test/resources/diffs') + + if (!diffsDir.exists()) { + throw new GradleException("Diffs directory not found at ${diffsDir.absolutePath}") + } + + // Find all json files in the diffs directory + diffsDir.listFiles().findAll { it.name.endsWith('_diff.json') }.each { diffFile -> + def baseName = diffFile.name.replace('_diff.json', '') + def pythonOutput = "build/test-outputs/${baseName}_python.json" + def javaOutput = "build/test-outputs/${baseName}_java.json" + + println "Updating golden file for ${baseName}..." 
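+
+            // mce_diff.py recomputes the diff between the Python and Java outputs and,
+            // with --update-golden-diff, rewrites the golden file in place
+            // (see scripts/mce_diff.py).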
+ + exec { + commandLine pythonExe, + 'scripts/mce_diff.py', + '--update-golden-diff', + '--golden-diff-file', + diffFile.absolutePath, + pythonOutput, + javaOutput + ignoreExitValue = true + standardOutput = new ByteArrayOutputStream() + errorOutput = new ByteArrayOutputStream() + } + } + } +} + +configurations { + provided + implementation.extendsFrom provided +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/cli/scripts/avro_schema_to_mce.py b/metadata-integration/java/datahub-schematron/cli/scripts/avro_schema_to_mce.py new file mode 100644 index 00000000000000..38a90bc3318428 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/cli/scripts/avro_schema_to_mce.py @@ -0,0 +1,94 @@ +from datahub.ingestion.extractor.schema_util import AvroToMceSchemaConverter +from avro.schema import parse as parse_avro, RecordSchema +from datahub.emitter.synchronized_file_emitter import SynchronizedFileEmitter +import datahub.metadata.schema_classes as models +import click +from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +import os +import hashlib +from datahub.ingestion.graph.client import get_default_graph + + +def get_schema_hash(schema): + # Convert schema to string if it isn't already + schema_str = str(schema) + + # Create MD5 hash + schema_hash = hashlib.md5(schema_str.encode("utf-8")).hexdigest() + + return schema_hash + + +@click.command(name="avro2datahub") +@click.option("--input-file", "-i", type=click.Path(exists=True), required=True) +@click.option("--platform", type=str, required=True) +@click.option("--output-file", "-o", type=click.Path(), default="metadata.py.json") +@click.option("--to-file", "-f", is_flag=True, default=True) +@click.option("--to-server", "-s", is_flag=True, default=False) +def generate_schema_file_from_avro_schema( + input_file: str, platform: str, output_file: str, to_file: bool, to_server: bool +): + avro_schema_file = input_file + output_file_name = output_file + platform_urn = make_data_platform_urn(platform) + converter = AvroToMceSchemaConverter(is_key_schema=False) + + # Delete the output file if it exists + if os.path.exists(output_file_name): + os.remove(output_file_name) + + with open(avro_schema_file) as f: + raw_string = f.read() + avro_schema = parse_avro(raw_string) + # Get fingerprint bytes + canonical_form = avro_schema.canonical_form + print( + f"Schema canonical form: Length ({len(canonical_form)}); {canonical_form}" + ) + md5_bytes = avro_schema.fingerprint("md5") + # Convert to hex string + avro_schema_hash = md5_bytes.hex() + assert isinstance( + avro_schema, RecordSchema + ), "This command only works for Avro records" + dataset_urn = make_dataset_urn( + platform=platform_urn, + name=( + f"{avro_schema.namespace}.{avro_schema.name}" + if avro_schema.namespace + else avro_schema.name + ), + ) + schema_fields = [ + f for f in converter.to_mce_fields(avro_schema, is_key_schema=False) + ] + schema_metadata = models.SchemaMetadataClass( + schemaName=avro_schema.name, + platform=platform_urn, + version=0, + hash=avro_schema_hash, + platformSchema=models.OtherSchemaClass(rawSchema=raw_string), + fields=schema_fields, + ) + assert schema_metadata.validate() + if to_file: + with SynchronizedFileEmitter(output_file_name) as file_emitter: + file_emitter.emit( + MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=schema_metadata + ) + ) + if to_server: + with get_default_graph() as graph: + graph.emit( + 
MetadataChangeProposalWrapper(
+                    entityUrn=dataset_urn, aspect=schema_metadata
+                )
+            )
+
+    print(f"Wrote metadata to {output_file}")
+
+
+if __name__ == "__main__":
+    generate_schema_file_from_avro_schema()
diff --git a/metadata-integration/java/datahub-schematron/cli/scripts/mce_diff.py b/metadata-integration/java/datahub-schematron/cli/scripts/mce_diff.py
new file mode 100644
index 00000000000000..37ba11138610c1
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/cli/scripts/mce_diff.py
@@ -0,0 +1,345 @@
+import json
+from typing import Dict, Any, Optional, Tuple
+
+import click
+
+
+def diff_lists(list1, list2):
+    """
+    Compare two lists element by element and return their differences.
+
+    Args:
+        list1 (list): First list to compare
+        list2 (list): Second list to compare
+
+    Returns:
+        dict: A dictionary containing the differences
+    """
+    result = {"added": {}, "removed": {}, "modified": set(), "modified_details": {}}
+
+    if len(list1) != len(list2):
+        # Let's first line up the elements that are common to both lists using
+        # the fieldPath as the key if it exists
+        if "fieldPath" in list1[0]:
+            list1_dict = {field["fieldPath"]: field for field in list1}
+            list2_dict = {field["fieldPath"]: field for field in list2}
+            common_keys = set(list1_dict.keys()) & set(list2_dict.keys())
+            list1 = [list1_dict[key] for key in common_keys]
+            list2 = [list2_dict[key] for key in common_keys]
+            list1.extend(
+                [list1_dict[key] for key in set(list1_dict.keys()) - common_keys]
+            )
+            list2.extend(
+                [list2_dict[key] for key in set(list2_dict.keys()) - common_keys]
+            )
+
+    # Handle added elements (if list2 is longer)
+    if len(list2) > len(list1):
+        for i in range(len(list1), len(list2)):
+            if "fieldPath" in list2[i]:
+                result["added"][list2[i]["fieldPath"]] = list2[i]
+            else:
+                result["added"][str(i)] = list2[i]
+
+    # Handle removed elements (if list1 is longer)
+    if len(list1) > len(list2):
+        for i in range(len(list2), len(list1)):
+            if "fieldPath" in list1[i]:
+                result["removed"][list1[i]["fieldPath"]] = list1[i]
+            else:
+                result["removed"][str(i)] = list1[i]
+
+    # Compare common indices
+    for i in range(min(len(list1), len(list2))):
+        value1 = list1[i]
+        value2 = list2[i]
+
+        if type(value1) != type(value2):
+            result["modified"].add(str(i))
+            result["modified_details"][str(i)] = {"before": value1, "after": value2}
+        elif isinstance(value1, dict) and isinstance(value2, dict):
+            nested_diff = diff_dicts(
+                value1, value2, identifier=value1.get("fieldPath", i)
+            )
+            if any(nested_diff.values()):
+                result["modified"].add(value1.get("fieldPath", i))
+                result["modified_details"][value1.get("fieldPath", i)] = nested_diff
+        elif isinstance(value1, list) and isinstance(value2, list):
+            nested_diff = diff_lists(value1, value2)
+            if any(nested_diff.values()):
+                result["modified"].add(str(i))
+                result["modified_details"][str(i)] = nested_diff
+        elif value1 != value2:
+            result["modified"].add(str(i))
+            result["modified_details"][str(i)] = {
+                "before": value1,
+                "after": value2,
+                "identifier": i,
+            }
+
+    return result
+
+
+def diff_schema_field(field1_dict, field2_dict):
+
+    from datahub.metadata.schema_classes import SchemaFieldClass
+
+    field1 = SchemaFieldClass.from_obj(field1_dict)
+    field2 = SchemaFieldClass.from_obj(field2_dict)
+
+    # Schema-field diffs use a flat {attribute: {before, after}} structure
+    result = {}
+    if field1.fieldPath != field2.fieldPath:
+        result["fieldPath"] = {"before": field1.fieldPath,
"after": field2.fieldPath} + + if field1.type != field2.type: + result["type"] = { + "before": field1.type, + "after": field2.type, + "identifier": field1.fieldPath, + } + + if field1.description != field2.description: + result["description"] = { + "before": field1.description, + "after": field2.description, + "identifier": field1.fieldPath, + } + + if field1.nullable != field2.nullable: + result["nullable"] = { + "before": field1.nullable, + "after": field2.nullable, + "identifier": field1.fieldPath, + } + + return result + + +def diff_schema_metadata(schema1_dict, schema2_dict): + + ignored_for_diff = [ + "created", + "modified", + "hash", + "platformSchema", + "lastModified", + ] # TODO: Reduce this list + + for key in ignored_for_diff: + schema1_dict.pop(key, None) + schema2_dict.pop(key, None) + + return diff_dicts(schema1_dict, schema2_dict) + + +def is_empty_diff(diff_dict) -> bool: + if diff_dict.keys() == EMPTY_DIFF().keys(): + for key in diff_dict: + if diff_dict[key]: + return False + return True + return False + + +def format_diff(diff_dict) -> Any: + if isinstance(diff_dict, set): + diff_dict = sorted(list([x for x in diff_dict])) + elif isinstance(diff_dict, dict): + for key in diff_dict: + diff_dict[key] = format_diff(diff_dict[key]) + return diff_dict + + +def EMPTY_DIFF(): + return { + "added": {}, + "removed": {}, + "modified": set(), + "modified_details": {}, + } + + +def diff_dicts(dict1, dict2, identifier=None): + """ + Compare two dictionaries recursively and return their differences. + + Args: + dict1 (dict): First dictionary to compare + dict2 (dict): Second dictionary to compare + + Returns: + dict: A dictionary containing the differences with the following structure: + { + 'added': Keys present in dict2 but not in dict1, + 'removed': Keys present in dict1 but not in dict2, + 'modified': Keys present in both but with different values, + 'modified_details': Detailed before/after values for modified keys + } + """ + if "nullable" in dict1: + # Assume this is a SchemaFieldClass + return diff_schema_field(dict1, dict2) + + if "hash" in dict1: + # Assume this is a schema metadata class + return diff_schema_metadata(dict1, dict2) + + dict1_keys = set(dict1.keys()) + dict2_keys = set(dict2.keys()) + + # Find keys that were added, removed, or modified + added_keys = dict2_keys - dict1_keys + removed_keys = dict1_keys - dict2_keys + common_keys = dict1_keys & dict2_keys + + # Initialize result structure + result = EMPTY_DIFF() + # Handle added keys + for key in added_keys: + result["added"][key] = dict2[key] + + # Handle removed keys + for key in removed_keys: + result["removed"][key] = dict1[key] + + # Check common keys for modifications + for key in common_keys: + value1 = dict1[key] + value2 = dict2[key] + + # If both values are dictionaries, recurse + if isinstance(value1, dict) and isinstance(value2, dict): + nested_diff = diff_dicts( + value1, value2, identifier=value1.get("fieldPath", key) + ) + if any(nested_diff.values()): # If there are any differences + result["modified"].add(key) + result["modified_details"][key] = nested_diff + # If both values are lists, compare them element by element + elif isinstance(value1, list) and isinstance(value2, list): + nested_diff = diff_lists(value1, value2) + if any(nested_diff.values()): + result["modified"].add(key) + result["modified_details"][key] = nested_diff + # Otherwise compare directly + elif value1 != value2: + result["modified"].add(key) + result["modified_details"][key] = { + "before": value1, + "after": value2, + 
"identifier": identifier, + } + + return result + + +def process_single_element(element) -> Tuple[str, str, Dict[str, Any]]: + if "entityUrn" in element: + entity = element["entityUrn"] + else: + raise Exception("Element does not have an entityUrn key") + if "aspectName" in element: + aspect = element["aspectName"] + else: + raise Exception("Element does not have an aspectName key") + if "aspect" in element: + if "json" in element["aspect"]: + return entity, aspect, element["aspect"]["json"] + elif "value" in element["aspect"]: + json_value = json.loads(element["aspect"]["value"]) + return entity, aspect, json_value + else: + raise Exception("Element does not have a json or value key") + else: + raise Exception("Element does not have an aspect key") + + +def process_element_with_dict(element, global_dict): + entity, aspect, data = process_single_element(element) + if entity not in global_dict: + global_dict[entity] = {} + if aspect not in global_dict[entity]: + global_dict[entity][aspect] = data + else: + # breakpoint() + raise Exception("Duplicate aspect found") + + +@click.command("compute_diff") +@click.argument("input_file_1", type=click.Path(exists=True)) +@click.argument("input_file_2", type=click.Path(exists=True)) +@click.option("--golden-diff-file", type=click.Path(), default=None) +@click.option("--update-golden-diff", is_flag=True) +def compute_diff( + input_file_1: str, + input_file_2: str, + golden_diff_file: Optional[str] = None, + update_golden_diff: bool = False, +): + + # Read the files into json objects and compare them + # If they are the same, exit 0 + # If they are different, exit 1 + file_1_mcps = {} + with open(input_file_1) as file1: + data1 = json.load(file1) + assert isinstance(data1, list) + for element in data1: + process_element_with_dict(element, file_1_mcps) + print(f"Processed {len(file_1_mcps)} elements from file {input_file_1}") + + file_2_mcps = {} + with open(input_file_2) as file2: + data2 = json.load(file2) + assert isinstance(data2, list) + for element in data2: + process_element_with_dict(element, file_2_mcps) + + print(f"Processed {len(file_2_mcps)} elements from file {input_file_2}") + + if golden_diff_file and not update_golden_diff: + with open(golden_diff_file) as golden_diff: + golden_diff_data = json.load(golden_diff) + else: + golden_diff_data = None + + computed_diff_data = {} + + assert len(file_1_mcps) == len(file_2_mcps) + for entity in file_1_mcps: + assert entity in file_2_mcps + assert len(file_1_mcps[entity]) == len(file_2_mcps[entity]) + for aspect in file_1_mcps[entity]: + assert aspect in file_2_mcps[entity] + aspect_diff = diff_dicts( + file_1_mcps[entity][aspect], file_2_mcps[entity][aspect] + ) + if golden_diff_data: + assert aspect in golden_diff_data[entity] + assert format_diff(aspect_diff) == golden_diff_data[entity][aspect], ( + f"Computed difference is {json.dumps(format_diff(aspect_diff), indent=2)}\n" + f"Expected difference is {json.dumps(golden_diff_data[entity][aspect], indent=2)}" + ) + + else: + if update_golden_diff: + if entity not in computed_diff_data: + computed_diff_data[entity] = {} + computed_diff_data[entity][aspect] = format_diff(aspect_diff) + else: + assert is_empty_diff( + aspect_diff + ), f"Difference is {json.dumps(format_diff(aspect_diff), indent=2)}" + + if update_golden_diff: + with open(golden_diff_file, "w") as golden_diff: + json.dump(computed_diff_data, golden_diff, indent=2, sort_keys=True) + + +if __name__ == "__main__": + compute_diff() diff --git 
a/metadata-integration/java/datahub-schematron/cli/src/main/java/io/datahubproject/schematron/cli/SchemaTron.java b/metadata-integration/java/datahub-schematron/cli/src/main/java/io/datahubproject/schematron/cli/SchemaTron.java
new file mode 100644
index 00000000000000..d8e4a43cfa8fba
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/cli/src/main/java/io/datahubproject/schematron/cli/SchemaTron.java
@@ -0,0 +1,147 @@
+package io.datahubproject.schematron.cli;
+
+import com.linkedin.common.FabricType;
+import com.linkedin.common.urn.DataPlatformUrn;
+import com.linkedin.common.urn.DatasetUrn;
+import com.linkedin.events.metadata.ChangeType;
+import com.linkedin.schema.SchemaField;
+import com.linkedin.schema.SchemaMetadata;
+import datahub.client.Emitter;
+import datahub.client.file.FileEmitter;
+import datahub.client.file.FileEmitterConfig;
+import datahub.client.rest.RestEmitter;
+import datahub.event.MetadataChangeProposalWrapper;
+import io.datahubproject.schematron.converters.avro.AvroSchemaConverter;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.concurrent.Callable;
+import java.util.stream.Stream;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.avro.Schema;
+import picocli.CommandLine;
+import picocli.CommandLine.Command;
+import picocli.CommandLine.Option;
+
+@Slf4j
+@Command(
+    name = "schema-translator",
+    description = "Converts schemas to DataHub format and emits them",
+    mixinStandardHelpOptions = true)
+public class SchemaTron implements Callable<Integer> {
+
+  @Option(
+      names = {"-i", "--input"},
+      description = "Input schema file or directory")
+  private String input;
+
+  @Option(
+      names = {"-s", "--server"},
+      description = "DataHub server URL",
+      required = false,
+      defaultValue = "http://localhost:8080")
+  private String server;
+
+  @Option(
+      names = {"-t", "--token"},
+      description = "DataHub access token",
+      required = false,
+      defaultValue = "")
+  private String token;
+
+  @Option(
+      names = {"-p", "--platform"},
+      description = "Data platform name",
+      defaultValue = "avro")
+  private String platform;
+
+  @Option(
+      names = {"--sink"},
+      description = "DataHub sink name",
+      defaultValue = "rest")
+  private String sink;
+
+  @Option(
+      names = {"--output-file"},
+      description = "Output file for the emitted metadata",
+      defaultValue = "metadata.json")
+  private String outputFile;
+
+  private final AvroSchemaConverter schemaConverter = AvroSchemaConverter.builder().build();
+
+  @Override
+  public Integer call() throws Exception {
+
+    Emitter emitter;
+    if (sink.equals("rest")) {
+      emitter = RestEmitter.create(b -> b.server(server).token(token));
+    } else if (sink.equals("file")) {
+      emitter = new FileEmitter(FileEmitterConfig.builder().fileName(outputFile).build());
+    } else {
+      throw new IllegalArgumentException("Unsupported sink: " + sink);
+    }
+
+    try {
+      // Process input files
+      Stream<Path> inputFiles;
+      Path inputPath = Path.of(input);
+      if (Files.isDirectory(inputPath)) {
+        inputFiles = Files.walk(inputPath).filter(p -> p.toString().endsWith(".avsc"));
+      } else {
+        inputFiles = Stream.of(inputPath);
+      }
+
+      // Process each file
+      inputFiles.forEach(
+          filePath -> {
+            try {
+              // Read and parse Avro schema
+              String schemaStr = Files.readString(filePath);
+              Schema avroSchema = new Schema.Parser().parse(schemaStr);
+
+              // Convert to DataHub schema
+              boolean isKeySchema = false;
+              boolean isDefaultNullable = false;
+              SchemaMetadata schemaMetadata =
+                  schemaConverter.toDataHubSchema(
+                      avroSchema,
+                      isKeySchema,
+                      isDefaultNullable,
+                      new DataPlatformUrn(platform),
+                      null);
+              log.info("Generated {} fields", schemaMetadata.getFields().size());
+              for (SchemaField field : schemaMetadata.getFields()) {
+                log.debug("Field path: {}", field.getFieldPath());
+              }
+
+              DatasetUrn datasetUrn =
+                  new DatasetUrn(
+                      new DataPlatformUrn(platform), avroSchema.getFullName(), FabricType.PROD);
+
+              MetadataChangeProposalWrapper wrapper =
+                  new MetadataChangeProposalWrapper(
+                      "dataset",
+                      datasetUrn.toString(),
+                      ChangeType.UPSERT,
+                      schemaMetadata,
+                      "schemaMetadata");
+
+              // Emit to DataHub
+              emitter.emit(wrapper, null).get();
+              log.info("Emitted schema for {}", datasetUrn);
+            } catch (Exception e) {
+              System.err.println("Error processing file: " + filePath);
+              e.printStackTrace();
+            }
+          });
+
+      return 0;
+    } finally {
+      emitter.close();
+    }
+  }
+
+  public static void main(String[] args) {
+    int exitCode = new CommandLine(new SchemaTron()).execute(args);
+    System.exit(exitCode);
+  }
+}
diff --git a/metadata-integration/java/datahub-schematron/cli/src/test/java/io/datahubproject/schematron/SchemaTranslatorTest.java b/metadata-integration/java/datahub-schematron/cli/src/test/java/io/datahubproject/schematron/SchemaTranslatorTest.java
new file mode 100644
index 00000000000000..bb11beb00729e7
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/cli/src/test/java/io/datahubproject/schematron/SchemaTranslatorTest.java
@@ -0,0 +1,149 @@
+package io.datahubproject.schematron;
+
+import static org.testng.Assert.assertEquals;
+
+import io.datahubproject.schematron.cli.SchemaTron;
+import java.io.File;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+import picocli.CommandLine;
+
+public class SchemaTranslatorTest {
+  private static final String TEST_RESOURCES_DIR = "src/test/resources";
+  private static final String TEMP_OUTPUT_DIR = "build/test-outputs";
+  private static final String PYTHON_SCRIPT = "scripts/avro_schema_to_mce.py";
+  private static final String DIFF_SCRIPT = "scripts/mce_diff.py";
+  private static final String VENV_PATH =
+      "../../../../metadata-ingestion/venv"; // Adjust this path to your venv location
+
+  @BeforeClass
+  public static void setup() {
+    // Create output directory if it doesn't exist
+    new File(TEMP_OUTPUT_DIR).mkdirs();
+
+    // Verify venv exists
+    if (!new File(VENV_PATH).exists()) {
+      throw new RuntimeException("Virtual environment not found at " + VENV_PATH);
+    }
+  }
+
+  @DataProvider(name = "schemaFiles")
+  public Object[][] getSchemaFiles() throws Exception {
+    List<Path> schemaFiles =
+        Files.walk(Paths.get(TEST_RESOURCES_DIR))
+            .filter(path -> path.toString().endsWith(".avsc"))
+            .collect(Collectors.toList());
+
+    Object[][] testData = new Object[schemaFiles.size()][1];
+    for (int i = 0; i < schemaFiles.size(); i++) {
+      testData[i][0] = schemaFiles.get(i);
+    }
+    return testData;
+  }
+
+  @Test(dataProvider = "schemaFiles")
+  public void testSchemaTranslations(Path schemaFile) throws Exception {
+    compareTranslations(schemaFile);
+  }
+
+  private ProcessBuilder createPythonProcessBuilder(String... args) {
+    ProcessBuilder pb;
+    String os = System.getProperty("os.name").toLowerCase();
+
+    if (os.contains("windows")) {
+      // Windows paths
+      String pythonPath = Paths.get(VENV_PATH, "Scripts", "python").toString();
+      pb =
+          new ProcessBuilder(
+              Stream.concat(Stream.of(pythonPath), Stream.of(args)).toArray(String[]::new));
+    } else {
+      // Unix-like paths
+      String pythonPath = Paths.get(VENV_PATH, "bin", "python").toString();
+      pb =
+          new ProcessBuilder(
+              Stream.concat(Stream.of(pythonPath), Stream.of(args)).toArray(String[]::new));
+    }
+
+    // Add virtual environment to PYTHONPATH
+    Map<String, String> env = pb.environment();
+    String sitePkgPath =
+        Paths.get(
+                VENV_PATH,
+                os.contains("windows") ? "Lib/site-packages" : "lib/python3.x/site-packages")
+            .toString();
+
+    String pythonPath = env.getOrDefault("PYTHONPATH", "");
+    env.put("PYTHONPATH", pythonPath + File.pathSeparator + sitePkgPath);
+
+    return pb.inheritIO();
+  }
+
+  private void compareTranslations(Path schemaFile) throws Exception {
+    String baseName = schemaFile.getFileName().toString().replace(".avsc", "");
+    String javaOutput = TEMP_OUTPUT_DIR + "/" + baseName + "_java.json";
+    String pythonOutput = TEMP_OUTPUT_DIR + "/" + baseName + "_python.json";
+    String diffFile = schemaFile.getParent().toString() + "/diffs/" + baseName + "_diff.json";
+
+    // Test if diffFile exists
+    File diff = new File(diffFile);
+    if (!diff.exists()) {
+      diffFile = null;
+    }
+
+    // Run Python translator
+    Process pythonProcess =
+        createPythonProcessBuilder(
+                PYTHON_SCRIPT,
+                "--platform",
+                "datahub",
+                "--input-file",
+                schemaFile.toString(),
+                "--output-file",
+                pythonOutput)
+            .inheritIO()
+            .start();
+
+    int pythonExitCode = pythonProcess.waitFor();
+    assertEquals(pythonExitCode, 0, "Python translation failed");
+
+    // Run Java translator directly using SchemaTron
+    SchemaTron schemaTron = new SchemaTron();
+    int javaExitCode =
+        new CommandLine(schemaTron)
+            .execute(
+                "-i",
+                schemaFile.toAbsolutePath().toString(),
+                "--sink",
+                "file",
+                "--output-file",
+                javaOutput,
+                "--platform",
+                "datahub");
+
+    assertEquals(javaExitCode, 0, "Java translation failed");
+
+    // Compare outputs
+    // if diffFile is not provided, we just compare the outputs
+    ProcessBuilder diffProcessBuilder;
+    if (diffFile == null) {
+      diffProcessBuilder = createPythonProcessBuilder(DIFF_SCRIPT, pythonOutput, javaOutput);
+    } else {
+      diffProcessBuilder =
+          createPythonProcessBuilder(
+              DIFF_SCRIPT, pythonOutput, javaOutput, "--golden-diff-file", diffFile);
+    }
+
+    Process diffProcess = diffProcessBuilder.inheritIO().start();
+
+    int diffExitCode = diffProcess.waitFor();
+    assertEquals(diffExitCode, 0, "Outputs differ for " + schemaFile.getFileName());
+  }
+}
diff --git a/metadata-integration/java/datahub-schematron/cli/src/test/resources/CustomerProfile.avsc b/metadata-integration/java/datahub-schematron/cli/src/test/resources/CustomerProfile.avsc
new file mode 100644
index 00000000000000..81f8b0e54b11e0
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/cli/src/test/resources/CustomerProfile.avsc
@@ -0,0 +1,456 @@
+{
+  "type": "record",
+  "name": "CustomerProfile",
+  "namespace": "com.example.customer",
+  "doc": "A complex customer profile schema demonstrating various union types and optional fields",
+  "fields": [
+    {
+      "name": "customerId",
+      "type": {
+        "type": "string",
+        "logicalType": "uuid"
+      },
+      "doc": "Unique identifier for the customer"
+    },
+    {
+      "name": "identificationDocument",
+      "type": [
+        "null",
+        {
+          "type": "record",
+          "name": "Passport",
+
"fields": [ + { + "name": "passportNumber", + "type": "string" + }, + { + "name": "expiryDate", + "type": { + "type": "long", + "logicalType": "date" + } + } + ] + }, + { + "type": "record", + "name": "DriversLicense", + "fields": [ + { + "name": "licenseNumber", + "type": "string" + }, + { + "name": "state", + "type": "string" + }, + { + "name": "validUntil", + "type": { + "type": "long", + "logicalType": "date" + } + } + ] + }, + { + "type": "record", + "name": "NationalID", + "fields": [ + { + "name": "idNumber", + "type": "string" + }, + { + "name": "country", + "type": "string" + } + ] + } + ], + "default": null, + "doc": "Customer's identification document - can be passport, driver's license, or national ID" + }, + { + "name": "contactInfo", + "type": { + "type": "record", + "name": "ContactInformation", + "fields": [ + { + "name": "primaryContact", + "type": [ + { + "type": "record", + "name": "EmailContact", + "fields": [ + { + "name": "emailAddress", + "type": "string" + }, + { + "name": "isVerified", + "type": "boolean", + "default": false + } + ] + }, + { + "type": "record", + "name": "PhoneContact", + "fields": [ + { + "name": "countryCode", + "type": "string" + }, + { + "name": "number", + "type": "string" + }, + { + "name": "type", + "type": { + "type": "enum", + "name": "PhoneType", + "symbols": [ + "MOBILE", + "LANDLINE" + ] + } + } + ] + } + ], + "doc": "Primary contact method - either email or phone" + }, + { + "name": "alternativeContacts", + "type": { + "type": "array", + "items": [ + "null", + "EmailContact", + "PhoneContact" + ] + }, + "default": [], + "doc": "List of alternative contact methods" + } + ] + } + }, + { + "name": "addresses", + "type": { + "type": "array", + "items": { + "type": "record", + "name": "Address", + "fields": [ + { + "name": "type", + "type": { + "type": "enum", + "name": "AddressType", + "symbols": [ + "RESIDENTIAL", + "BUSINESS", + "SHIPPING" + ] + }, + "default": "RESIDENTIAL" + }, + { + "name": "street", + "type": "string" + }, + { + "name": "city", + "type": "string" + }, + { + "name": "state", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "country", + "type": "string" + }, + { + "name": "postalCode", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "validationStatus", + "type": [ + "null", + { + "type": "record", + "name": "AddressValidation", + "fields": [ + { + "name": "isValid", + "type": "boolean" + }, + { + "name": "verificationDate", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + }, + { + "name": "verificationMethod", + "type": { + "type": "enum", + "name": "VerificationMethod", + "symbols": [ + "MANUAL", + "AUTOMATED" + ] + } + } + ] + } + ], + "default": null + } + ] + } + }, + "doc": "Customer's addresses with validation information" + }, + { + "name": "preferences", + "type": { + "type": "map", + "values": [ + "null", + "string", + "boolean", + { + "type": "record", + "name": "FrequencyPreference", + "fields": [ + { + "name": "frequency", + "type": { + "type": "enum", + "name": "Frequency", + "symbols": [ + "DAILY", + "WEEKLY", + "MONTHLY" + ] + } + }, + { + "name": "enabled", + "type": "boolean", + "default": true + }, + { + "name": "lastUpdated", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + } + ] + } + ] + }, + "doc": "Customer preferences with various possible value types" + }, + { + "name": "subscriptionHistory", + "type": [ + "null", + { + "type": "array", + "items": { + "type": "record", + "name": "Subscription", + 
"fields": [ + { + "name": "planName", + "type": "string" + }, + { + "name": "startDate", + "type": { + "type": "long", + "logicalType": "date" + } + }, + { + "name": "endDate", + "type": [ + "null", + { + "type": "long", + "logicalType": "date" + } + ], + "default": null + }, + { + "name": "status", + "type": { + "type": "enum", + "name": "SubscriptionStatus", + "symbols": [ + "ACTIVE", + "CANCELLED", + "EXPIRED", + "SUSPENDED" + ] + } + }, + { + "name": "paymentMethod", + "type": [ + "null", + { + "type": "record", + "name": "PaymentMethod", + "fields": [ + { + "name": "type", + "type": { + "type": "enum", + "name": "PaymentType", + "symbols": [ + "CREDIT_CARD", + "DEBIT_CARD", + "BANK_TRANSFER", + "DIGITAL_WALLET" + ] + } + }, + { + "name": "lastFourDigits", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "expiryDate", + "type": [ + "null", + { + "type": "long", + "logicalType": "date" + } + ], + "default": null + } + ] + } + ], + "default": null + } + ] + } + } + ], + "default": null, + "doc": "Historical record of customer subscriptions" + }, + { + "name": "metadata", + "type": { + "type": "map", + "values": [ + "null", + "string", + "long", + "boolean", + { + "type": "record", + "name": "MetadataValue", + "fields": [ + { + "name": "value", + "type": [ + "null", + "string", + "long", + "boolean" + ], + "default": null + }, + { + "name": "timestamp", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + }, + { + "name": "source", + "type": "string" + } + ] + } + ] + }, + "doc": "Flexible metadata storage with various possible value types" + }, + { + "name": "tags", + "type": [ + "null", + { + "type": "array", + "items": { + "type": "record", + "name": "Tag", + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "value", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "score", + "type": [ + "null", + "double" + ], + "default": null + }, + { + "name": "addedAt", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + } + ] + } + } + ], + "default": null, + "doc": "Optional tags associated with the customer profile" + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/cli/src/test/resources/CustomerProfile2.avsc b/metadata-integration/java/datahub-schematron/cli/src/test/resources/CustomerProfile2.avsc new file mode 100644 index 00000000000000..b8c7654ea072a2 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/cli/src/test/resources/CustomerProfile2.avsc @@ -0,0 +1,244 @@ +{ + "type": "record", + "name": "CustomerProfile2", + "namespace": "com.example.customer", + "doc": "A complex customer profile schema demonstrating various union types and optional fields", + "fields": [ + { + "name": "customerId", + "type": { + "type": "string", + "logicalType": "uuid" + }, + "doc": "Unique identifier for the customer" + }, + { + "name": "identificationDocument", + "type": [ + "null", + { + "type": "record", + "name": "Passport", + "fields": [ + { + "name": "passportNumber", + "type": "string" + }, + { + "name": "expiryDate", + "type": { + "type": "long", + "logicalType": "date" + } + } + ] + }, + { + "type": "record", + "name": "DriversLicense", + "fields": [ + { + "name": "licenseNumber", + "type": "string" + }, + { + "name": "state", + "type": "string" + }, + { + "name": "validUntil", + "type": { + "type": "long", + "logicalType": "date" + } + } + ] + }, + { + "type": "record", + "name": "NationalID", + "fields": [ + { + "name": 
"idNumber", + "type": "string" + }, + { + "name": "country", + "type": "string" + } + ] + } + ], + "default": null, + "doc": "Customer's identification document" + }, + { + "name": "contactInfo", + "type": { + "type": "record", + "name": "ContactInformation", + "fields": [ + { + "name": "primaryEmailContact", + "type": [ + "null", + { + "type": "record", + "name": "PrimaryEmailContact", + "fields": [ + { + "name": "emailAddress", + "type": "string" + }, + { + "name": "isVerified", + "type": "boolean", + "default": false + } + ] + } + ], + "default": null + }, + { + "name": "primaryPhoneContact", + "type": [ + "null", + { + "type": "record", + "name": "PrimaryPhoneContact", + "fields": [ + { + "name": "countryCode", + "type": "string" + }, + { + "name": "number", + "type": "string" + }, + { + "name": "type", + "type": { + "type": "enum", + "name": "PhoneType", + "symbols": [ + "MOBILE", + "LANDLINE" + ] + } + } + ] + } + ], + "default": null + }, + { + "name": "alternativeEmailContacts", + "type": { + "type": "array", + "items": { + "type": "record", + "name": "AlternativeEmailContact", + "fields": [ + { + "name": "emailAddress", + "type": "string" + }, + { + "name": "isVerified", + "type": "boolean", + "default": false + } + ] + } + }, + "default": [] + }, + { + "name": "alternativePhoneContacts", + "type": { + "type": "array", + "items": { + "type": "record", + "name": "AlternativePhoneContact", + "fields": [ + { + "name": "countryCode", + "type": "string" + }, + { + "name": "number", + "type": "string" + }, + { + "name": "type", + "type": "PhoneType" + } + ] + } + }, + "default": [] + } + ] + } + }, + { + "name": "preferences", + "type": { + "type": "record", + "name": "Preferences", + "fields": [ + { + "name": "simplePreferences", + "type": { + "type": "map", + "values": [ + "null", + "string", + "boolean" + ] + }, + "default": {} + }, + { + "name": "frequencyPreferences", + "type": { + "type": "map", + "values": { + "type": "record", + "name": "FrequencyPreference", + "fields": [ + { + "name": "frequency", + "type": { + "type": "enum", + "name": "Frequency", + "symbols": [ + "DAILY", + "WEEKLY", + "MONTHLY" + ] + } + }, + { + "name": "enabled", + "type": "boolean", + "default": true + }, + { + "name": "lastUpdated", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + } + ] + } + }, + "default": {} + } + ] + } + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/cli/src/test/resources/FlatUser.avsc b/metadata-integration/java/datahub-schematron/cli/src/test/resources/FlatUser.avsc new file mode 100644 index 00000000000000..c796878c32ae41 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/cli/src/test/resources/FlatUser.avsc @@ -0,0 +1,45 @@ +{ + "type": "record", + "name": "FlatUser", + "namespace": "com.example", + "fields": [ + { + "name": "id", + "type": "int", + "doc": "The unique identifier for a user", + "default": -1, + "metadata": { + "key1": "value1", + "key2": "value2" + } + }, + { + "name": "username", + "type": "string", + "doc": "The username of the user" + }, + { + "name": "email", + "type": "string", + "doc": "The email of the user" + }, + { + "name": "age", + "type": "int", + "doc": "The age of the user" + }, + { + "name": "isActive", + "type": "boolean", + "doc": "Whether the user is active or not" + }, + { + "name": "registrationDate", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + }, + "doc": "The registration date of the user" + } + ] +} \ No newline at end of file diff 
--git a/metadata-integration/java/datahub-schematron/cli/src/test/resources/diffs/CustomerProfile2_diff.json b/metadata-integration/java/datahub-schematron/cli/src/test/resources/diffs/CustomerProfile2_diff.json new file mode 100644 index 00000000000000..d4677d722a0cb2 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/cli/src/test/resources/diffs/CustomerProfile2_diff.json @@ -0,0 +1,125 @@ +{ + "urn:li:dataset:(urn:li:dataPlatform:datahub,com.example.customer.CustomerProfile2,PROD)": { + "schemaMetadata": { + "added": {}, + "modified": [ + "fields" + ], + "modified_details": { + "fields": { + "added": {}, + "modified": [ + "[version=2.0].[type=CustomerProfile2].[type=ContactInformation].contactInfo.[type=PrimaryEmailContact].primaryEmailContact.[type=boolean].isVerified", + "[version=2.0].[type=CustomerProfile2].[type=ContactInformation].contactInfo.[type=array].[type=AlternativeEmailContact].alternativeEmailContacts", + "[version=2.0].[type=CustomerProfile2].[type=ContactInformation].contactInfo.[type=array].[type=AlternativeEmailContact].alternativeEmailContacts.[type=boolean].isVerified", + "[version=2.0].[type=CustomerProfile2].[type=ContactInformation].contactInfo.[type=array].[type=AlternativePhoneContact].alternativePhoneContacts", + "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=FrequencyPreference].frequencyPreferences", + "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=FrequencyPreference].frequencyPreferences.[type=boolean].enabled", + "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=union].[type=boolean].simplePreferences", + "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=union].[type=string].simplePreferences", + "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=union].simplePreferences", + "[version=2.0].[type=CustomerProfile2].[type=union].[type=DriversLicense].identificationDocument", + "[version=2.0].[type=CustomerProfile2].[type=union].[type=NationalID].identificationDocument", + "[version=2.0].[type=CustomerProfile2].[type=union].[type=Passport].identificationDocument", + "[version=2.0].[type=CustomerProfile2].[type=union].identificationDocument" + ], + "modified_details": { + "[version=2.0].[type=CustomerProfile2].[type=ContactInformation].contactInfo.[type=PrimaryEmailContact].primaryEmailContact.[type=boolean].isVerified": { + "description": { + "after": null, + "before": "\nField default value: False", + "identifier": "[version=2.0].[type=CustomerProfile2].[type=ContactInformation].contactInfo.[type=PrimaryEmailContact].primaryEmailContact.[type=boolean].isVerified" + } + }, + "[version=2.0].[type=CustomerProfile2].[type=ContactInformation].contactInfo.[type=array].[type=AlternativeEmailContact].alternativeEmailContacts": { + "description": { + "after": null, + "before": "\nField default value: []", + "identifier": "[version=2.0].[type=CustomerProfile2].[type=ContactInformation].contactInfo.[type=array].[type=AlternativeEmailContact].alternativeEmailContacts" + } + }, + "[version=2.0].[type=CustomerProfile2].[type=ContactInformation].contactInfo.[type=array].[type=AlternativeEmailContact].alternativeEmailContacts.[type=boolean].isVerified": { + "description": { + "after": null, + "before": "\nField default value: False", + "identifier": 
"[version=2.0].[type=CustomerProfile2].[type=ContactInformation].contactInfo.[type=array].[type=AlternativeEmailContact].alternativeEmailContacts.[type=boolean].isVerified" + } + }, + "[version=2.0].[type=CustomerProfile2].[type=ContactInformation].contactInfo.[type=array].[type=AlternativePhoneContact].alternativePhoneContacts": { + "description": { + "after": null, + "before": "\nField default value: []", + "identifier": "[version=2.0].[type=CustomerProfile2].[type=ContactInformation].contactInfo.[type=array].[type=AlternativePhoneContact].alternativePhoneContacts" + } + }, + "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=FrequencyPreference].frequencyPreferences": { + "description": { + "after": null, + "before": "\nField default value: {}", + "identifier": "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=FrequencyPreference].frequencyPreferences" + } + }, + "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=FrequencyPreference].frequencyPreferences.[type=boolean].enabled": { + "description": { + "after": null, + "before": "\nField default value: True", + "identifier": "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=FrequencyPreference].frequencyPreferences.[type=boolean].enabled" + } + }, + "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=union].[type=boolean].simplePreferences": { + "description": { + "after": null, + "before": "\nField default value: {}", + "identifier": "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=union].[type=boolean].simplePreferences" + } + }, + "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=union].[type=string].simplePreferences": { + "description": { + "after": null, + "before": "\nField default value: {}", + "identifier": "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=union].[type=string].simplePreferences" + } + }, + "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=union].simplePreferences": { + "description": { + "after": null, + "before": "\nField default value: {}", + "identifier": "[version=2.0].[type=CustomerProfile2].[type=Preferences].preferences.[type=map].[type=union].simplePreferences" + } + }, + "[version=2.0].[type=CustomerProfile2].[type=union].[type=DriversLicense].identificationDocument": { + "nullable": { + "after": false, + "before": true, + "identifier": "[version=2.0].[type=CustomerProfile2].[type=union].[type=DriversLicense].identificationDocument" + } + }, + "[version=2.0].[type=CustomerProfile2].[type=union].[type=NationalID].identificationDocument": { + "nullable": { + "after": false, + "before": true, + "identifier": "[version=2.0].[type=CustomerProfile2].[type=union].[type=NationalID].identificationDocument" + } + }, + "[version=2.0].[type=CustomerProfile2].[type=union].[type=Passport].identificationDocument": { + "nullable": { + "after": false, + "before": true, + "identifier": "[version=2.0].[type=CustomerProfile2].[type=union].[type=Passport].identificationDocument" + } + }, + "[version=2.0].[type=CustomerProfile2].[type=union].identificationDocument": { + "description": { + "after": "Customer's identification document\nField default value: null", + "before": "Customer's identification document", + "identifier": "[version=2.0].[type=CustomerProfile2].[type=union].identificationDocument" 
+ } + } + }, + "removed": {} + } + }, + "removed": {} + } + } +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/cli/src/test/resources/diffs/CustomerProfile_diff.json b/metadata-integration/java/datahub-schematron/cli/src/test/resources/diffs/CustomerProfile_diff.json new file mode 100644 index 00000000000000..4bf0e1074d9a4a --- /dev/null +++ b/metadata-integration/java/datahub-schematron/cli/src/test/resources/diffs/CustomerProfile_diff.json @@ -0,0 +1,181 @@ +{ + "urn:li:dataset:(urn:li:dataPlatform:datahub,com.example.customer.CustomerProfile,PROD)": { + "schemaMetadata": { + "added": {}, + "modified": [ + "fields" + ], + "modified_details": { + "fields": { + "added": { + "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=EmailContact].alternativeContacts.[type=boolean].isVerified": { + "fieldPath": "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=EmailContact].alternativeContacts.[type=boolean].isVerified", + "isPartOfKey": false, + "nativeDataType": "boolean", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.BooleanType": {} + } + } + }, + "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=EmailContact].alternativeContacts.[type=string].emailAddress": { + "fieldPath": "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=EmailContact].alternativeContacts.[type=string].emailAddress", + "isPartOfKey": false, + "nativeDataType": "string", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + } + }, + "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=PhoneContact].alternativeContacts.[type=enum].type": { + "fieldPath": "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=PhoneContact].alternativeContacts.[type=enum].type", + "isPartOfKey": false, + "nativeDataType": "Enum", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.EnumType": {} + } + } + }, + "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=PhoneContact].alternativeContacts.[type=string].countryCode": { + "fieldPath": "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=PhoneContact].alternativeContacts.[type=string].countryCode", + "isPartOfKey": false, + "nativeDataType": "string", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + } + }, + "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=PhoneContact].alternativeContacts.[type=string].number": { + "fieldPath": "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=PhoneContact].alternativeContacts.[type=string].number", + "isPartOfKey": false, + "nativeDataType": "string", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + } + } + }, + "modified": [ + "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=EmailContact].alternativeContacts", + "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=PhoneContact].alternativeContacts", 
+ "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].alternativeContacts", + "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=union].[type=EmailContact].primaryContact.[type=boolean].isVerified", + "[version=2.0].[type=CustomerProfile].[type=array].[type=Address].addresses.[type=enum].type", + "[version=2.0].[type=CustomerProfile].[type=map].[type=union].[type=FrequencyPreference].preferences.[type=boolean].enabled", + "[version=2.0].[type=CustomerProfile].[type=map].[type=union].[type=MetadataValue].metadata.[type=union].[type=boolean].value", + "[version=2.0].[type=CustomerProfile].[type=map].[type=union].[type=MetadataValue].metadata.[type=union].[type=long].value", + "[version=2.0].[type=CustomerProfile].[type=map].[type=union].[type=MetadataValue].metadata.[type=union].[type=string].value", + "[version=2.0].[type=CustomerProfile].[type=union].[type=DriversLicense].identificationDocument", + "[version=2.0].[type=CustomerProfile].[type=union].[type=NationalID].identificationDocument", + "[version=2.0].[type=CustomerProfile].[type=union].[type=Passport].identificationDocument", + "[version=2.0].[type=CustomerProfile].[type=union].identificationDocument" + ], + "modified_details": { + "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=EmailContact].alternativeContacts": { + "description": { + "after": "List of alternative contact methods", + "before": "List of alternative contact methods\nField default value: []", + "identifier": "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=EmailContact].alternativeContacts" + } + }, + "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=PhoneContact].alternativeContacts": { + "description": { + "after": "List of alternative contact methods", + "before": "List of alternative contact methods\nField default value: []", + "identifier": "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].[type=PhoneContact].alternativeContacts" + } + }, + "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].alternativeContacts": { + "description": { + "after": "List of alternative contact methods", + "before": "List of alternative contact methods\nField default value: []", + "identifier": "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=array].[type=union].alternativeContacts" + } + }, + "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=union].[type=EmailContact].primaryContact.[type=boolean].isVerified": { + "description": { + "after": null, + "before": "\nField default value: False", + "identifier": "[version=2.0].[type=CustomerProfile].[type=ContactInformation].contactInfo.[type=union].[type=EmailContact].primaryContact.[type=boolean].isVerified" + } + }, + "[version=2.0].[type=CustomerProfile].[type=array].[type=Address].addresses.[type=enum].type": { + "description": { + "after": null, + "before": "\nField default value: RESIDENTIAL", + "identifier": "[version=2.0].[type=CustomerProfile].[type=array].[type=Address].addresses.[type=enum].type" + } + }, + "[version=2.0].[type=CustomerProfile].[type=map].[type=union].[type=FrequencyPreference].preferences.[type=boolean].enabled": { + "description": { + "after": null, + "before": "\nField default value: True", 
+ "identifier": "[version=2.0].[type=CustomerProfile].[type=map].[type=union].[type=FrequencyPreference].preferences.[type=boolean].enabled" + } + }, + "[version=2.0].[type=CustomerProfile].[type=map].[type=union].[type=MetadataValue].metadata.[type=union].[type=boolean].value": { + "nullable": { + "after": false, + "before": true, + "identifier": "[version=2.0].[type=CustomerProfile].[type=map].[type=union].[type=MetadataValue].metadata.[type=union].[type=boolean].value" + } + }, + "[version=2.0].[type=CustomerProfile].[type=map].[type=union].[type=MetadataValue].metadata.[type=union].[type=long].value": { + "nullable": { + "after": false, + "before": true, + "identifier": "[version=2.0].[type=CustomerProfile].[type=map].[type=union].[type=MetadataValue].metadata.[type=union].[type=long].value" + } + }, + "[version=2.0].[type=CustomerProfile].[type=map].[type=union].[type=MetadataValue].metadata.[type=union].[type=string].value": { + "nullable": { + "after": false, + "before": true, + "identifier": "[version=2.0].[type=CustomerProfile].[type=map].[type=union].[type=MetadataValue].metadata.[type=union].[type=string].value" + } + }, + "[version=2.0].[type=CustomerProfile].[type=union].[type=DriversLicense].identificationDocument": { + "nullable": { + "after": false, + "before": true, + "identifier": "[version=2.0].[type=CustomerProfile].[type=union].[type=DriversLicense].identificationDocument" + } + }, + "[version=2.0].[type=CustomerProfile].[type=union].[type=NationalID].identificationDocument": { + "nullable": { + "after": false, + "before": true, + "identifier": "[version=2.0].[type=CustomerProfile].[type=union].[type=NationalID].identificationDocument" + } + }, + "[version=2.0].[type=CustomerProfile].[type=union].[type=Passport].identificationDocument": { + "nullable": { + "after": false, + "before": true, + "identifier": "[version=2.0].[type=CustomerProfile].[type=union].[type=Passport].identificationDocument" + } + }, + "[version=2.0].[type=CustomerProfile].[type=union].identificationDocument": { + "description": { + "after": "Customer's identification document - can be passport, driver's license, or national ID\nField default value: null", + "before": "Customer's identification document - can be passport, driver's license, or national ID", + "identifier": "[version=2.0].[type=CustomerProfile].[type=union].identificationDocument" + } + } + }, + "removed": {} + } + }, + "removed": {} + } + } +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/build.gradle b/metadata-integration/java/datahub-schematron/lib/build.gradle new file mode 100644 index 00000000000000..3ba22ff4cb7b5d --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/build.gradle @@ -0,0 +1,126 @@ +plugins { + id("com.palantir.git-version") apply false +} +apply plugin: 'java' +apply plugin: 'jacoco' +apply plugin: 'signing' +apply plugin: 'io.codearte.nexus-staging' +apply plugin: 'maven-publish' +apply from: '../../versioning.gradle' + +dependencies { + + implementation project(':entity-registry') +// +// // Jackson dependencies - use the same versions as in the parent project +// implementation 'com.fasterxml.jackson.core:jackson-core:2.12.3' +// implementation 'com.fasterxml.jackson.core:jackson-databind:2.12.3' +// implementation 'com.fasterxml.jackson.core:jackson-annotations:2.12.3' + + // Core dependencies +// implementation externalDependency.guava +// implementation externalDependency.gson +// implementation externalDependency.commonsCli +// implementation 
externalDependency.slf4jApi
+//    implementation externalDependency.jacksonCore
+
+    // Schema format dependencies
+//    implementation externalDependency.protobuf
+    implementation externalDependency.avro
+//    implementation 'org.apache.thrift:libthrift:0.16.0'
+//    implementation 'io.swagger.parser.v3:swagger-parser:2.1.12'
+
+    // Utilities
+    compileOnly externalDependency.lombok
+    annotationProcessor externalDependency.lombok
+
+    // Testing
+    testImplementation externalDependency.testng
+    testImplementation 'org.mockito:mockito-core:5.3.1'
+}
+
+jacocoTestReport {
+    dependsOn test
+}
+
+test.finalizedBy jacocoTestReport
+
+configurations {
+    provided
+    implementation.extendsFrom provided
+}
+
+java {
+    withJavadocJar()
+    withSourcesJar()
+}
+
+publishing {
+  publications {
+    mavenJava(MavenPublication) {
+      from components.java
+
+      pom {
+        name = 'Datahub Schematron'
+        groupId = 'io.acryl'
+        artifactId = 'datahub-schematron'
+        description = 'DataHub schema translation library for converting between different schema formats using DataHub as an intermediate representation'
+        url = 'https://datahubproject.io'
+
+        scm {
+          connection = 'scm:git:git://github.com/datahub-project/datahub.git'
+          developerConnection = 'scm:git:ssh://github.com:datahub-project/datahub.git'
+          url = 'https://github.com/datahub-project/datahub.git'
+        }
+
+        licenses {
+          license {
+            name = 'The Apache License, Version 2.0'
+            url = 'http://www.apache.org/licenses/LICENSE-2.0.txt'
+          }
+        }
+
+        developers {
+          developer {
+            id = 'datahub'
+            name = 'Datahub'
+            email = 'datahub@acryl.io'
+          }
+        }
+      }
+    }
+  }
+
+  repositories {
+    maven {
+      def releasesRepoUrl = "https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/"
+      def snapshotsRepoUrl = "https://s01.oss.sonatype.org/content/repositories/snapshots/"
+      def ossrhUsername = System.getenv('RELEASE_USERNAME')
+      def ossrhPassword = System.getenv('RELEASE_PASSWORD')
+      credentials {
+        username ossrhUsername
+        password ossrhPassword
+      }
+      url = version.endsWith('SNAPSHOT') ? snapshotsRepoUrl : releasesRepoUrl
+    }
+  }
+}
+
+signing {
+  def signingKey = findProperty("signingKey")
+  def signingPassword = System.getenv("SIGNING_PASSWORD")
+  // Only require signing if we have the signing key property
+  required = signingKey != null
+
+  if (signingKey != null) {
+    useInMemoryPgpKeys(signingKey, signingPassword)
+    sign publishing.publications.mavenJava
+  }
+
+}
+
+nexusStaging {
+  serverUrl = "https://s01.oss.sonatype.org/service/local/"
+  username = System.getenv("NEXUS_USERNAME")
+  password = System.getenv("NEXUS_PASSWORD")
+}
\ No newline at end of file
diff --git a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/SchemaConverter.java b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/SchemaConverter.java
new file mode 100644
index 00000000000000..cb364f2c7a1a2d
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/SchemaConverter.java
@@ -0,0 +1,25 @@
+package io.datahubproject.schematron.converters;
+
+import com.linkedin.common.urn.DataPlatformUrn;
+import com.linkedin.schema.SchemaMetadata;
+
+/** Base interface for converting between different schema formats. */
+public interface SchemaConverter<T> {
+  /**
+   * Converts a schema into DataHub's SchemaField format.
+   *
+   * @param schema The source schema to convert
+   * @param isKeySchema Whether this represents a key schema
+   * @param defaultNullable Default nullable setting for fields
+   * @param platformUrn Data platform urn
+   * @param rawSchemaString Raw schema string (if available). When provided - it will be used to
+   *     generate the schema fingerprint
+   * @return SchemaMetadata carrying the list of SchemaFields that represent the schema in
+   *     DataHub's format
+   */
+  SchemaMetadata toDataHubSchema(
+      T schema,
+      boolean isKeySchema,
+      boolean defaultNullable,
+      DataPlatformUrn platformUrn,
+      String rawSchemaString);
+}
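For orientation, a minimal consumer of this interface might look like the sketch below, using the Avro implementation introduced next. The schema file name and the "kafka" platform urn are illustrative assumptions, not part of this patch:

    import com.linkedin.common.urn.DataPlatformUrn;
    import com.linkedin.schema.SchemaMetadata;
    import io.datahubproject.schematron.converters.avro.AvroSchemaConverter;
    import java.io.File;
    import org.apache.avro.Schema;

    public class ConvertExample {
      public static void main(String[] args) throws Exception {
        // Parse an Avro schema and convert it to DataHub's SchemaMetadata.
        Schema avro = new Schema.Parser().parse(new File("CustomerProfile.avsc"));
        SchemaMetadata metadata =
            AvroSchemaConverter.builder()
                .build()
                .toDataHubSchema(
                    avro,
                    false, // not a key schema
                    false, // fields are non-nullable unless unioned with null
                    new DataPlatformUrn("kafka"),
                    avro.toString());
        // Field paths follow the SchemaFieldPath V2 specification.
        metadata.getFields().forEach(f -> System.out.println(f.getFieldPath()));
      }
    }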
diff --git a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverter.java b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverter.java
new file mode 100644
index 00000000000000..0ddb357db76ba1
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/converters/avro/AvroSchemaConverter.java
@@ -0,0 +1,611 @@
+package io.datahubproject.schematron.converters.avro;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.linkedin.common.urn.DataPlatformUrn;
+import com.linkedin.schema.*;
+import io.datahubproject.schematron.converters.SchemaConverter;
+import io.datahubproject.schematron.models.*;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.function.Supplier;
+import lombok.Builder;
+import lombok.SneakyThrows;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.avro.JsonProperties;
+import org.apache.avro.LogicalType;
+import org.apache.avro.Schema;
+import org.apache.avro.SchemaNormalization;
+
+/** Converts Avro schemas to DataHub's schema format following SchemaFieldPath Specification V2. */
+@Slf4j
+@Builder
+public class AvroSchemaConverter implements SchemaConverter<Schema> {
+
+  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+  private static final Map<String, Supplier<SchemaFieldDataType.Type>> LOGICAL_TYPE_MAPPING;
+
+  static {
+    Map<String, Supplier<SchemaFieldDataType.Type>> logicalTypeMap = new HashMap<>();
+    logicalTypeMap.put("date", () -> SchemaFieldDataType.Type.create(new DateType()));
+    logicalTypeMap.put("time-micros", () -> SchemaFieldDataType.Type.create(new TimeType()));
+    logicalTypeMap.put("time-millis", () -> SchemaFieldDataType.Type.create(new TimeType()));
+    logicalTypeMap.put("timestamp-micros", () -> SchemaFieldDataType.Type.create(new TimeType()));
+    logicalTypeMap.put("timestamp-millis", () -> SchemaFieldDataType.Type.create(new TimeType()));
+    logicalTypeMap.put("decimal", () -> SchemaFieldDataType.Type.create(new NumberType()));
+    logicalTypeMap.put("uuid", () -> SchemaFieldDataType.Type.create(new StringType()));
+    LOGICAL_TYPE_MAPPING = Collections.unmodifiableMap(logicalTypeMap);
+  }
+
+  private SchemaFieldDataType.Type getTypeFromLogicalType(Schema schema) {
+    LogicalType logicalType = schema.getLogicalType();
+    if (logicalType != null) {
+      Supplier<SchemaFieldDataType.Type> typeSupplier =
+          LOGICAL_TYPE_MAPPING.get(logicalType.getName());
+      if (typeSupplier != null) {
+        return typeSupplier.get();
+      }
+    }
+    return getBaseType(schema);
+  }
+
+  private SchemaFieldDataType.Type getBaseType(Schema schema) {
+    switch (schema.getType()) {
+      case BOOLEAN:
+        return SchemaFieldDataType.Type.create(new BooleanType());
+      case INT:
+      case LONG:
+      case FLOAT:
+      case DOUBLE:
+        return SchemaFieldDataType.Type.create(new NumberType());
+      case STRING:
+        return SchemaFieldDataType.Type.create(new StringType());
+      case BYTES:
+        return SchemaFieldDataType.Type.create(new BytesType());
+      case FIXED:
+        return SchemaFieldDataType.Type.create(new FixedType());
+      case ENUM:
+        return SchemaFieldDataType.Type.create(new EnumType());
+      case ARRAY:
+        return SchemaFieldDataType.Type.create(new ArrayType());
+      case MAP:
+        return SchemaFieldDataType.Type.create(new MapType());
+      case RECORD:
+        return SchemaFieldDataType.Type.create(new RecordType());
+      case UNION:
+        return SchemaFieldDataType.Type.create(new UnionType());
+      default:
+        return SchemaFieldDataType.Type.create(new NullType());
+    }
+  }
+
+  private String getFieldType(Schema schema) {
+    // For the field path, we just want the base type without the logical type
+    return schema.getType().getName().toLowerCase();
+  }
+
+  private String getNativeDataType(Schema schema) {
+    // For native data type, we can include the logical type information
+    LogicalType logicalType = schema.getLogicalType();
+    if (logicalType != null) {
+      return schema.getType().getName().toLowerCase() + "(" + logicalType.getName() + ")";
+    }
+    return schema.getType().getName().toLowerCase();
+  }
+
+  @Override
+  public SchemaMetadata toDataHubSchema(
+      Schema schema,
+      boolean isKeySchema,
+      boolean defaultNullable,
+      DataPlatformUrn platformUrn,
+      String rawSchemaString) {
+
+    try {
+      byte[] fingerprintBytes = null;
+      try {
+        if (rawSchemaString != null) {
+          String canonicalForm = SchemaNormalization.toParsingForm(schema);
+          log.debug("Length of canonical form: {}", canonicalForm.length());
+          log.debug("Canonical form: {}", canonicalForm);
+          fingerprintBytes =
+              SchemaNormalization.fingerprint(
+                  "MD5", rawSchemaString.getBytes(StandardCharsets.UTF_8));
+        } else {
+          fingerprintBytes = SchemaNormalization.parsingFingerprint("MD5", schema);
+        }
+      } catch (Exception e) {
+        log.error("Failed to compute schema fingerprint", e);
+      }
+
+      String schemaHash = "";
+      if (fingerprintBytes != null) {
+        // Convert to hex string
+        StringBuilder hexString = new StringBuilder();
+        for (byte b : fingerprintBytes) {
+          hexString.append(String.format("%02x", b));
+        }
+        schemaHash = hexString.toString();
+      }
+
+      List<SchemaField> fields = new ArrayList<>();
+      FieldPath basePath = new FieldPath();
+      basePath.setKeySchema(isKeySchema);
+
+      // Add the record type to the base path
+      if (schema.getType() == Schema.Type.RECORD) {
+        basePath = basePath.expandType(schema.getName(), schema.toString());
+      }
+
+      processSchema(schema, basePath, defaultNullable, fields);
+
+      return new SchemaMetadata()
+          .setSchemaName(schema.getName())
+          .setPlatform(platformUrn)
+          .setVersion(0)
+          .setHash(schemaHash)
+          .setPlatformSchema(
+              SchemaMetadata.PlatformSchema.create(
+                  new OtherSchema().setRawSchema(schema.toString())))
+          .setFields(new SchemaFieldArray(fields));
+
+    } catch (Exception e) {
+      log.error("Failed to convert Avro schema", e);
+      throw new RuntimeException("Failed to convert Avro schema", e);
+    }
+  }
+
+  private void processSchema(
+      Schema schema, FieldPath fieldPath, boolean defaultNullable, List<SchemaField> fields) {
+    if (schema.getType() == Schema.Type.RECORD) {
+      for (Schema.Field field : schema.getFields()) {
+        processField(field, fieldPath, defaultNullable, fields);
+      }
+    }
+  }
+
+  private void processField(
+      Schema.Field field, FieldPath fieldPath, boolean defaultNullable, List<SchemaField> fields) {
+    processField(field, fieldPath, defaultNullable, fields, false, null);
+  }
+
+  private void processField(
+      Schema.Field field,
+      FieldPath fieldPath,
+      boolean defaultNullable,
+      List<SchemaField> fields,
+      boolean nullableOverride) {
+    processField(field, fieldPath, defaultNullable, fields, nullableOverride, null);
+  }
+
+  private void processField(
+      Schema.Field field,
+      FieldPath fieldPath,
+      boolean defaultNullable,
+      List<SchemaField> fields,
+      boolean nullableOverride,
+      DataHubType typeOverride) {
+    log.debug(
+        "Processing field: {}, Field path : {}, Field schema: {}",
+        field.name(),
+        fieldPath.asString(),
+        field.schema());
+    Schema fieldSchema = field.schema();
+    boolean isNullable = isNullable(fieldSchema, defaultNullable);
+    if (nullableOverride) {
+      // If a nullable override is provided, use the override value
+      isNullable = true;
+    }
+    if (typeOverride != null) {
+      // If a type override is provided, use the nullable value from the override
+      isNullable = nullableOverride;
+    }
+    log.debug(
+        "DefaultNullability: {}, Determined nullability for field name: {} at path: {} is {}",
+        defaultNullable,
+        field.name(),
+        fieldPath.asString(),
+        isNullable);
+    String discriminatedType = getDiscriminatedType(fieldSchema);
+
+    FieldElement element =
+        new FieldElement(new ArrayList<>(), new ArrayList<>(), field.name(), typeOverride);
+
+    FieldPath newPath = fieldPath.clonePlus(element);
+
+    switch (fieldSchema.getType()) {
+      case RECORD:
+        processRecordField(
+            field, newPath, discriminatedType, defaultNullable, fields, isNullable, typeOverride);
+        break;
+      case ARRAY:
+        processArrayField(field, newPath, discriminatedType, defaultNullable, fields, isNullable);
+        break;
+      case MAP:
+        processMapField(field, newPath, discriminatedType, defaultNullable, fields, isNullable);
+        break;
+      case UNION:
+        processUnionField(
+            field, newPath, discriminatedType, defaultNullable, fields, isNullable, typeOverride);
+        break;
+      case ENUM:
+        processEnumField(field, newPath, discriminatedType, defaultNullable, fields, isNullable);
+        break;
+      default:
+        processPrimitiveField(
+            field, newPath, discriminatedType, defaultNullable, fields, isNullable);
+        break;
+    }
+  }
+
+  private void processRecordField(
+      Schema.Field field,
+      FieldPath fieldPath,
+      String discriminatedType,
+      boolean defaultNullable,
+      List<SchemaField> fields,
+      boolean isNullable,
+      DataHubType typeOverride) {
+
+    log.debug("Record Field Path before expand: {}", fieldPath.asString());
+    FieldPath recordPath = fieldPath.expandType(discriminatedType, field.schema().toString());
+    log.debug("Record Field Path after expand: {}", recordPath.asString());
+
+    SchemaFieldDataType dataType =
+        typeOverride != null
+            ? typeOverride.asSchemaFieldType()
+            : new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new RecordType()));
+
+    // Add the record field itself
+    SchemaField recordField =
+        new SchemaField()
+            .setFieldPath(recordPath.asString())
+            .setType(dataType)
+            .setNativeDataType(discriminatedType)
+            .setNullable(isNullable || defaultNullable)
+            .setIsPartOfKey(fieldPath.isKeySchema());
+
+    populateCommonProperties(field, recordField);
+
+    fields.add(recordField);
+
+    // Process nested fields
+    for (Schema.Field nestedField : field.schema().getFields()) {
+      processField(nestedField, recordPath, defaultNullable, fields);
+    }
+  }
+
+  @SneakyThrows
+  private static void populateCommonProperties(Schema.Field field, SchemaField datahubField) {
+    // Create a new mutable HashMap to store combined properties
+    Map<String, Object> combinedProps = new HashMap<>();
+
+    // Add properties from field if any exist
+    Map<String, Object> fieldProps = field.getObjectProps();
+    if (fieldProps != null) {
+      combinedProps.putAll(fieldProps);
+    }
+
+    // Add properties from schema if any exist
+    Map<String, Object> schemaProps = field.schema().getObjectProps();
+    if (schemaProps != null) {
+      combinedProps.putAll(schemaProps);
+    }
+
+    // Only proceed with serialization if we have properties
+    if (!combinedProps.isEmpty()) {
+      try {
+        String jsonSerializedProps = OBJECT_MAPPER.writeValueAsString(combinedProps);
+        datahubField.setJsonProps(jsonSerializedProps);
+      } catch (Exception e) {
+        log.error(
+            "Non-fatal error. Failed to serialize schema properties for field: " + field.name(),
+            e);
+      }
+    }
+
+    // Set the description if it exists
+    if (field.doc() != null && !field.doc().isEmpty()) {
+      datahubField.setDescription(field.doc());
+      if (field.hasDefaultValue()) {
+        Object defaultValue = field.defaultVal();
+        // if the default value is the JSON NULL node, then we handle it differently
+        if (defaultValue == JsonProperties.NULL_VALUE) {
+          datahubField.setDescription(
+              datahubField.getDescription() + "\nField default value: null");
+        } else {
+          datahubField.setDescription(
+              datahubField.getDescription()
+                  + "\nField default value: "
+                  + OBJECT_MAPPER.writeValueAsString(defaultValue));
+        }
+      }
+    }
+  }
+
+  private void processArrayField(
+      Schema.Field field,
+      FieldPath fieldPath,
+      String discriminatedType,
+      boolean defaultNullable,
+      List<SchemaField> fields,
+      boolean isNullable) {
+
+    Schema arraySchema = field.schema();
+    Schema elementSchema = arraySchema.getElementType();
+    String elementType = getDiscriminatedType(elementSchema);
+
+    fieldPath = fieldPath.expandType("array", arraySchema);
+    // Set parent type for proper array handling
+    DataHubType arrayDataHubType = new DataHubType(ArrayType.class, elementType);
+
+    // Process element type if it's complex
+    if (elementSchema.getType() == Schema.Type.RECORD
+        || elementSchema.getType() == Schema.Type.ARRAY
+        || elementSchema.getType() == Schema.Type.MAP
+        || elementSchema.getType() == Schema.Type.UNION) {
+      log.debug("Array Field Path before expand: {}", fieldPath.asString());
+      fieldPath = fieldPath.popLast();
+      fieldPath =
+          fieldPath.clonePlus(
+              new FieldElement(Collections.singletonList("array"), new ArrayList<>(), null, null));
+      Schema.Field elementField =
+          new Schema.Field(
+              field.name(),
+              elementSchema,
+              elementSchema.getDoc() != null ? elementSchema.getDoc() : field.doc(),
+              null // TODO: What is the default value for an array element?
+              );
+      processField(elementField, fieldPath, defaultNullable, fields, isNullable, arrayDataHubType);
+    } else {
+
+      SchemaField arrayField =
+          new SchemaField()
+              .setFieldPath(fieldPath.asString())
+              .setType(arrayDataHubType.asSchemaFieldType())
+              .setNativeDataType("array(" + elementType + ")")
+              .setNullable(isNullable || defaultNullable)
+              .setIsPartOfKey(fieldPath.isKeySchema());
+
+      populateCommonProperties(field, arrayField);
+      log.debug("Array field path: {} with doc: {}", fieldPath.asString(), field.doc());
+      fields.add(arrayField);
+    }
+  }
+
+  private void processMapField(
+      Schema.Field field,
+      FieldPath fieldPath,
+      String discriminatedType,
+      boolean defaultNullable,
+      List<SchemaField> fields,
+      boolean isNullable) {
+
+    Schema mapSchema = field.schema();
+    Schema valueSchema = mapSchema.getValueType();
+    String valueType = getDiscriminatedType(valueSchema);
+
+    DataHubType mapDataHubType = new DataHubType(MapType.class, valueType);
+    fieldPath = fieldPath.expandType("map", mapSchema);
+
+    // Process value type if it's complex
+    if (valueSchema.getType() == Schema.Type.RECORD
+        || valueSchema.getType() == Schema.Type.ARRAY
+        || valueSchema.getType() == Schema.Type.MAP
+        || valueSchema.getType() == Schema.Type.UNION) {
+      Schema.Field valueField =
+          new Schema.Field(
+              field.name(),
+              valueSchema,
+              valueSchema.getDoc() != null ? valueSchema.getDoc() : field.doc(),
+              null // TODO: What is the default value for a map value?
+              ); // Nullability for map values follows the nullability of the map itself
+      FieldPath valueFieldPath =
+          fieldPath
+              .popLast()
+              .clonePlus(
+                  new FieldElement(
+                      Collections.singletonList("map"), new ArrayList<>(), null, null));
+      processField(valueField, valueFieldPath, defaultNullable, fields, isNullable, mapDataHubType);
+    } else {
+      SchemaField mapField =
+          new SchemaField()
+              .setFieldPath(fieldPath.asString())
+              .setType(mapDataHubType.asSchemaFieldType())
+              .setNativeDataType("map")
+              .setNullable(isNullable || defaultNullable)
+              .setIsPartOfKey(fieldPath.isKeySchema());
+
+      populateCommonProperties(field, mapField);
+      fields.add(mapField);
+    }
+  }
+
+  private void processUnionField(
+      Schema.Field field,
+      FieldPath fieldPath,
+      String discriminatedType,
+      boolean defaultNullable,
+      List<SchemaField> fields,
+      boolean isNullable,
+      DataHubType typeOverride) {
+
+    List<Schema> unionTypes = field.schema().getTypes();
+
+    // If this is just a nullable type (union with null), process the non-null type
+    // directly
+    if (unionTypes.size() == 2 && isNullable) {
+      Schema nonNullSchema =
+          unionTypes.stream()
+              .filter(s -> s.getType() != Schema.Type.NULL)
+              .findFirst()
+              .orElseThrow(NoSuchElementException::new);
+
+      processField(
+          new Schema.Field(field.name(), nonNullSchema, field.doc()),
+          fieldPath.popLast(),
+          defaultNullable,
+          fields,
+          true);
+      return;
+    }
+
+    log.debug("Union Field Path before expand: {}", fieldPath.asString());
+
+    // Otherwise, process as a true union type
+    DataHubType unionDataHubType = new DataHubType(UnionType.class, discriminatedType);
+    FieldPath unionFieldPath = fieldPath.expandType("union", field.schema().toString());
+    log.debug("Union Field Path after expand: {}", unionFieldPath.asString());
+
+    SchemaField unionField =
+        new SchemaField()
+            .setFieldPath(unionFieldPath.asString())
+            .setType(
+                typeOverride == null
+                    ? unionDataHubType.asSchemaFieldType()
+                    : typeOverride.asSchemaFieldType())
+            .setNativeDataType("union")
+            .setNullable(isNullable || defaultNullable)
+            .setIsPartOfKey(fieldPath.isKeySchema());
+
+    populateCommonProperties(field, unionField);
+    fields.add(unionField);
+
+    String unionDescription = field.doc() != null ? field.doc() : field.schema().getDoc();
+
+    // Process each union type
+    int typeIndex = 0;
+    for (Schema unionSchema : unionTypes) {
+      if (unionSchema.getType() != Schema.Type.NULL) {
+        log.debug("TypeIndex: {}, Field path : {}", typeIndex, fieldPath.asString());
+        FieldPath indexedFieldPath = fieldPath.popLast();
+        indexedFieldPath =
+            indexedFieldPath.clonePlus(
+                new FieldElement(
+                    Collections.singletonList("union"), new ArrayList<>(), null, null));
+        log.debug("TypeIndex: {}, Indexed Field path : {}", typeIndex, indexedFieldPath.asString());
+        // FieldPath unionFieldPath =
+        //     fieldPath.expandType(getDiscriminatedType(unionSchema),
+        // unionSchema.toString());
+        log.debug("TypeIndex: {}, Union Field path : {}", typeIndex, unionFieldPath.asString());
+        String unionFieldName = field.name();
+        Schema.Field unionFieldInner =
+            new Schema.Field(
+                unionFieldName,
+                unionSchema,
+                unionSchema.getDoc() != null ? unionSchema.getDoc() : unionDescription,
+                null);
+        log.debug(
+            "TypeIndex: {}, Union Field path : {}, Doc: {}",
+            typeIndex,
+            unionFieldPath.asString(),
+            unionFieldInner.doc());
+        processField(unionFieldInner, indexedFieldPath, defaultNullable, fields);
+      }
+      typeIndex++;
+    }
+  }
+
+  private void processEnumField(
+      Schema.Field field,
+      FieldPath fieldPath,
+      String discriminatedType,
+      boolean defaultNullable,
+      List<SchemaField> fields,
+      boolean isNullable) {
+
+    fieldPath = fieldPath.expandType("enum", field.schema().toString());
+
+    String enumDescription = field.doc() != null ? field.doc() : "";
+    enumDescription +=
+        " Allowed symbols are: " + String.join(", ", field.schema().getEnumSymbols());
+
+    SchemaField enumField =
+        new SchemaField()
+            .setFieldPath(fieldPath.asString())
+            .setType(
+                new SchemaFieldDataType().setType(SchemaFieldDataType.Type.create(new EnumType())))
+            .setNativeDataType("Enum")
+            .setNullable(isNullable || defaultNullable)
+            .setIsPartOfKey(fieldPath.isKeySchema());
+
+    populateCommonProperties(field, enumField);
+
+    if (field.doc() != null && !field.doc().isEmpty()) {
+      enumField.setDescription(enumDescription);
+    }
+
+    fields.add(enumField);
+  }
+
+  @SneakyThrows
+  private void processPrimitiveField(
+      Schema.Field field,
+      FieldPath fieldPath,
+      String discriminatedType,
+      boolean defaultNullable,
+      List<SchemaField> fields,
+      boolean isNullable) {
+
+    fieldPath = fieldPath.expandType(discriminatedType, field.schema().toString());
+    SchemaField primitiveField =
+        new SchemaField()
+            .setFieldPath(fieldPath.asString())
+            .setType(new SchemaFieldDataType().setType(getTypeFromLogicalType(field.schema())))
+            .setNativeDataType(getNativeDataType(field.schema()))
+            .setNullable(isNullable || defaultNullable)
+            .setIsPartOfKey(fieldPath.isKeySchema());
+
+    populateCommonProperties(field, primitiveField);
+
+    fields.add(primitiveField);
+  }
+
+  private boolean isNullable(Schema schema, boolean defaultNullable) {
+    if (schema.getType() == Schema.Type.UNION) {
+      return schema.getTypes().stream().anyMatch(type -> type.getType() == Schema.Type.NULL);
+    }
+    return defaultNullable;
+  }
+
+  /**
+   * for record type we want to include the fully qualified name stripped of the namespace
+   *
+   * @param schema
+   * @return
+   */
+  private String getDiscriminatedType(Schema schema) {
+
+    if (schema.getType() == Schema.Type.RECORD) {
+      if (schema.getNamespace() != null) {
+        return schema.getFullName().substring(schema.getNamespace().length() + 1);
+      } else {
+        return schema.getFullName();
+      }
+    }
+    return schema.getType().getName().toLowerCase();
+  }
+
+  private SchemaFieldDataType getPrimitiveFieldType(Schema schema) {
+
+    SchemaFieldDataType fieldType = new SchemaFieldDataType();
+    switch (schema.getType()) {
+      case BOOLEAN:
+        fieldType.setType(SchemaFieldDataType.Type.create(new BooleanType()));
+        break;
+      case INT:
+      case LONG:
+      case FLOAT:
+      case DOUBLE:
+        fieldType.setType(SchemaFieldDataType.Type.create(new NumberType()));
+        break;
+      case STRING:
+        fieldType.setType(SchemaFieldDataType.Type.create(new StringType()));
+        break;
+      case BYTES:
+        fieldType.setType(SchemaFieldDataType.Type.create(new BytesType()));
+        break;
+      default:
+        fieldType.setType(SchemaFieldDataType.Type.create(new NullType()));
+    }
+    return fieldType;
+  }
+}
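The union handling above matches the golden-file paths earlier in this diff: a two-branch union with "null" collapses into the member type with nullable=true, while a genuine multi-member union emits a field for the union itself plus one per non-null member. A hedged sketch of the expectation for the simple case (the inline schema string is illustrative):

    // A ["null","string"] field collapses to the member type:
    Schema record =
        new Schema.Parser()
            .parse(
                "{\"type\":\"record\",\"name\":\"Doc\",\"fields\":"
                    + "[{\"name\":\"id\",\"type\":[\"null\",\"string\"],\"default\":null}]}");
    SchemaMetadata md =
        AvroSchemaConverter.builder()
            .build()
            .toDataHubSchema(record, false, false, new DataPlatformUrn("kafka"), null);
    // Expected single field path, with nullable=true:
    //   [version=2.0].[type=Doc].[type=string].id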
diff --git a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/DataHubType.java b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/DataHubType.java
new file mode 100644
index 00000000000000..ec6e8ce5a35547
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/DataHubType.java
@@ -0,0 +1,40 @@
+package io.datahubproject.schematron.models;
+
+import com.linkedin.data.template.StringArray;
+import com.linkedin.schema.*;
+import lombok.Data;
+
+@Data
+public class DataHubType {
+  private Class type;
+  private String nestedType;
+
+  public DataHubType(Class type, String nestedType) {
+    this.type = type;
+    this.nestedType = nestedType;
+  }
+
+  public SchemaFieldDataType asSchemaFieldType() {
+    if (type == UnionType.class) {
+      return new SchemaFieldDataType()
+          .setType(
+              SchemaFieldDataType.Type.create(
+                  new UnionType()
+                      .setNestedTypes(nestedType != null ? new StringArray(nestedType) : null)));
+    } else if (type == ArrayType.class) {
+      return new SchemaFieldDataType()
+          .setType(
+              SchemaFieldDataType.Type.create(
+                  new ArrayType()
+                      .setNestedType(nestedType != null ? new StringArray(nestedType) : null)));
+    } else if (type == MapType.class) {
+      return new SchemaFieldDataType()
+          .setType(
+              SchemaFieldDataType.Type.create(
+                  new MapType()
+                      .setKeyType("string")
+                      .setValueType(nestedType != null ? nestedType : null)));
+    }
+    throw new IllegalArgumentException("Unexpected type " + type);
+  }
+}
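DataHubType is a small carrier that renders container types into SchemaFieldDataType values, keeping the nested element/value type alongside the container class. A quick sketch of what it produces (fragment, assuming the imports used by the class above):

    // An array whose elements are strings:
    DataHubType arrayOfString = new DataHubType(ArrayType.class, "string");
    SchemaFieldDataType fieldType = arrayOfString.asSchemaFieldType();
    // fieldType.getType().isArrayType() == true; nested type list is ["string"]

    // A map with string keys and long values:
    SchemaFieldDataType mapType = new DataHubType(MapType.class, "long").asSchemaFieldType();
    // mapType.getType().getMapType().getKeyType() == "string"; value type is "long"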
name : ""; + } + } +} diff --git a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java new file mode 100644 index 00000000000000..b4b72fcc031a59 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java @@ -0,0 +1,174 @@ +package io.datahubproject.schematron.models; + +import com.linkedin.schema.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; +import lombok.Data; +import lombok.NonNull; + +@Data +public class FieldPath { + public static final String EMPTY_FIELD_NAME = " "; + @NonNull private List path; + private boolean isKeySchema; + private boolean useV2PathsAlways; + + public FieldPath() { + this.path = new ArrayList<>(); + this.isKeySchema = false; + this.useV2PathsAlways = true; + } + + public void setPath(List path) { + if (path == null) { + throw new IllegalArgumentException("Path cannot be null"); + } + // Ensure that no element in the path is null + if (path.stream().anyMatch(Objects::isNull)) { + throw new IllegalArgumentException("Path cannot contain null elements"); + } + this.path = path; + } + + private boolean needsV2Path() { + if (useV2PathsAlways) { + return true; + } + if (isKeySchema) { + return true; + } + return path.stream() + .flatMap(element -> element.getType().stream()) + .anyMatch(t -> t.equals("union") || t.equals("array")); + } + + private void setParentTypeIfNotExists(DataHubType parentType) { + if (!path.isEmpty() && path.get(path.size() - 1).getParentType() == null) { + path.get(path.size() - 1).setParentType(parentType); + } + } + + private SchemaFieldDataType getTypeOverride() { + if (!path.isEmpty() && path.get(path.size() - 1).getParentType() != null) { + return path.get(path.size() - 1).getParentType().asSchemaFieldType(); + } + return null; + } + + private String getNativeTypeOverride() { + SchemaFieldDataType typeOverride = getTypeOverride(); + if (typeOverride != null) { + if (typeOverride.getType().isArrayType()) { + ArrayType arrayType = typeOverride.getType().getArrayType(); + return String.format( + "array(%s)", + arrayType.getNestedType() != null ? String.join(",", arrayType.getNestedType()) : ""); + } else if (typeOverride.getType().isMapType()) { + MapType mapType = typeOverride.getType().getMapType(); + return String.format("map(str,%s)", mapType.getValueType()); + } + } + return null; + } + + public String getRecursive(Map schema) { + String schemaStr = schema.toString(); + for (FieldElement p : path) { + for (int i = 0; i < p.getSchemaTypes().size(); i++) { + if (p.getSchemaTypes().get(i).equals(schemaStr)) { + return p.getType().get(i); + } + } + } + return null; + } + + public FieldPath popLast() { + FieldPath fpath = new FieldPath(); + fpath.setKeySchema(isKeySchema); + fpath.setPath(new ArrayList<>(path)); + fpath.getPath().remove(fpath.getPath().size() - 1); + return fpath; + } + + public FieldPath clonePlus(FieldElement element) { + FieldPath fpath = new FieldPath(); + fpath.setKeySchema(isKeySchema); + fpath.setPath(new ArrayList<>(path)); + fpath.getPath().add(element); + return fpath; + } + + // TODO: Why is typeSchema an Object? 
diff --git a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java
new file mode 100644
index 00000000000000..b4b72fcc031a59
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/models/FieldPath.java
@@ -0,0 +1,174 @@
+package io.datahubproject.schematron.models;
+
+import com.linkedin.schema.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.stream.Collectors;
+import lombok.Data;
+import lombok.NonNull;
+
+@Data
+public class FieldPath {
+  public static final String EMPTY_FIELD_NAME = " ";
+  @NonNull private List<FieldElement> path;
+  private boolean isKeySchema;
+  private boolean useV2PathsAlways;
+
+  public FieldPath() {
+    this.path = new ArrayList<>();
+    this.isKeySchema = false;
+    this.useV2PathsAlways = true;
+  }
+
+  public void setPath(List<FieldElement> path) {
+    if (path == null) {
+      throw new IllegalArgumentException("Path cannot be null");
+    }
+    // Ensure that no element in the path is null
+    if (path.stream().anyMatch(Objects::isNull)) {
+      throw new IllegalArgumentException("Path cannot contain null elements");
+    }
+    this.path = path;
+  }
+
+  private boolean needsV2Path() {
+    if (useV2PathsAlways) {
+      return true;
+    }
+    if (isKeySchema) {
+      return true;
+    }
+    return path.stream()
+        .flatMap(element -> element.getType().stream())
+        .anyMatch(t -> t.equals("union") || t.equals("array"));
+  }
+
+  private void setParentTypeIfNotExists(DataHubType parentType) {
+    if (!path.isEmpty() && path.get(path.size() - 1).getParentType() == null) {
+      path.get(path.size() - 1).setParentType(parentType);
+    }
+  }
+
+  private SchemaFieldDataType getTypeOverride() {
+    if (!path.isEmpty() && path.get(path.size() - 1).getParentType() != null) {
+      return path.get(path.size() - 1).getParentType().asSchemaFieldType();
+    }
+    return null;
+  }
+
+  private String getNativeTypeOverride() {
+    SchemaFieldDataType typeOverride = getTypeOverride();
+    if (typeOverride != null) {
+      if (typeOverride.getType().isArrayType()) {
+        ArrayType arrayType = typeOverride.getType().getArrayType();
+        return String.format(
+            "array(%s)",
+            arrayType.getNestedType() != null ? String.join(",", arrayType.getNestedType()) : "");
+      } else if (typeOverride.getType().isMapType()) {
+        MapType mapType = typeOverride.getType().getMapType();
+        return String.format("map(str,%s)", mapType.getValueType());
+      }
+    }
+    return null;
+  }
+
+  public String getRecursive(Map<String, Object> schema) {
+    String schemaStr = schema.toString();
+    for (FieldElement p : path) {
+      for (int i = 0; i < p.getSchemaTypes().size(); i++) {
+        if (p.getSchemaTypes().get(i).equals(schemaStr)) {
+          return p.getType().get(i);
+        }
+      }
+    }
+    return null;
+  }
+
+  public FieldPath popLast() {
+    FieldPath fpath = new FieldPath();
+    fpath.setKeySchema(isKeySchema);
+    fpath.setPath(new ArrayList<>(path));
+    fpath.getPath().remove(fpath.getPath().size() - 1);
+    return fpath;
+  }
+
+  public FieldPath clonePlus(FieldElement element) {
+    FieldPath fpath = new FieldPath();
+    fpath.setKeySchema(isKeySchema);
+    fpath.setPath(new ArrayList<>(path));
+    fpath.getPath().add(element);
+    return fpath;
+  }
+
+  // TODO: Why is typeSchema an Object?
+  public FieldPath expandType(String type, Object typeSchema) {
+    FieldPath fpath = new FieldPath();
+    fpath.setKeySchema(isKeySchema);
+    fpath.setPath(path.stream().map(FieldElement::clone).collect(Collectors.toList()));
+
+    if (!fpath.getPath().isEmpty()) {
+      FieldElement lastElement = fpath.getPath().get(fpath.getPath().size() - 1);
+      lastElement.getType().add(type);
+      lastElement.getSchemaTypes().add(typeSchema.toString());
+    } else {
+      fpath
+          .getPath()
+          .add(
+              new FieldElement(
+                  new ArrayList<>(Collections.singletonList(type)),
+                  new ArrayList<>(Collections.singletonList(typeSchema.toString())),
+                  null,
+                  null));
+    }
+    return fpath;
+  }
+
+  public boolean hasFieldName() {
+    return path.stream().anyMatch(f -> f.getName() != null);
+  }
+
+  public boolean ensureFieldName() {
+    if (!hasFieldName()) {
+      if (path.isEmpty()) {
+        path.add(new FieldElement(new ArrayList<>(), new ArrayList<>(), null, null));
+      }
+      path.get(path.size() - 1).setName(EMPTY_FIELD_NAME);
+    }
+    return true;
+  }
+
+  public String asString() {
+    boolean v2Format = needsV2Path();
+    List<String> prefix = new ArrayList<>();
+
+    if (v2Format) {
+      prefix.add("[version=2.0]");
+      if (isKeySchema) {
+        prefix.add("[key=True]");
+      }
+    }
+
+    if (!path.isEmpty()) {
+      return String.join(".", prefix)
+          + "."
+          + path.stream().map(f -> f.asString(v2Format)).collect(Collectors.joining("."));
+    } else {
+      return String.join(".", prefix);
+    }
+  }
+
+  public String dump() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("FieldPath: ");
+    sb.append(this.asString());
+    for (FieldElement f : path) {
+      sb.append(f.getName());
+      sb.append(" ");
+      sb.append(f.getSchemaTypes().toString());
+    }
+    return sb.toString();
+  }
+}
diff --git a/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/utils/Constants.java b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/utils/Constants.java
new file mode 100644
index 00000000000000..b41d2d88c9dc0e
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/main/java/io/datahubproject/schematron/utils/Constants.java
@@ -0,0 +1,12 @@
+package io.datahubproject.schematron.utils;
+
+/** Constants used throughout the schema conversion process.
*/ +public final class Constants { + private Constants() {} + + public static final String ADD_TAG_OPERATION = "ADD_TAG"; + public static final String ADD_TERM_OPERATION = "ADD_TERM"; + + public static final String TAG_URN_PREFIX = "urn:li:tag:"; + public static final String TERM_URN_PREFIX = "urn:li:glossaryTerm:"; +} diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/java/io/datahubproject/schematron/models/FieldPathTest.java b/metadata-integration/java/datahub-schematron/lib/src/test/java/io/datahubproject/schematron/models/FieldPathTest.java new file mode 100644 index 00000000000000..d823a2c8ed51b7 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/java/io/datahubproject/schematron/models/FieldPathTest.java @@ -0,0 +1,246 @@ +package io.datahubproject.schematron.models; + +import static org.testng.Assert.*; + +import com.linkedin.schema.ArrayType; +import com.linkedin.schema.MapType; +import com.linkedin.schema.UnionType; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.testng.annotations.*; + +@Test(groups = "unit") +public class FieldPathTest { + + @Test(groups = "basic") + public void testEmptyFieldPath() { + FieldPath path = new FieldPath(); + assertEquals(path.asString(), "[version=2.0]"); + } + + @Test(groups = "basic") + public void testKeySchemaPath() { + FieldPath path = new FieldPath(); + path.setKeySchema(true); + assertEquals(path.asString(), "[version=2.0].[key=True]"); + } + + @Test(groups = "basic") + public void testSimplePath() { + FieldPath path = new FieldPath(); + FieldElement element = + new FieldElement( + Collections.singletonList("string"), Collections.singletonList("schema"), "name", null); + path.setPath(Collections.singletonList(element)); + assertEquals(path.asString(), "[version=2.0].[type=string].name"); + } + + @Test(groups = "nested") + public void testNestedPath() { + FieldPath path = new FieldPath(); + FieldElement record = + new FieldElement( + Collections.singletonList("record"), + Collections.singletonList("record-schema"), + "user", + null); + FieldElement field = + new FieldElement( + Collections.singletonList("string"), + Collections.singletonList("string-schema"), + "name", + null); + path.setPath(Arrays.asList(record, field)); + assertEquals(path.asString(), "[version=2.0].[type=record].user.[type=string].name"); + } + + @Test(groups = "complex") + public void testUnionPath() { + FieldPath path = new FieldPath(); + + // Add union type + FieldElement union = + new FieldElement( + Collections.singletonList("union"), + Collections.singletonList("union-schema"), + "document", + null); + + // Add specific union member (record type) + FieldElement passport = + new FieldElement( + Collections.singletonList("Passport"), + Collections.singletonList("passport-schema"), + "document", + new DataHubType(UnionType.class, "Passport")); + + // Add field within the record + FieldElement number = + new FieldElement( + Collections.singletonList("string"), + Collections.singletonList("string-schema"), + "number", + null); + + path.setPath(Arrays.asList(union, passport, number)); + assertEquals( + path.asString(), + "[version=2.0].[type=union].document.[type=Passport].document.[type=string].number"); + } + + @Test(groups = "operations") + public void testClonePlus() { + FieldPath original = new FieldPath(); + FieldElement element1 = + new FieldElement( + Collections.singletonList("record"), + Collections.singletonList("schema1"), + "user", + 
null); + original.setPath(Collections.singletonList(element1)); + + FieldElement element2 = + new FieldElement( + Collections.singletonList("string"), + Collections.singletonList("schema2"), + "name", + null); + + FieldPath newPath = original.clonePlus(element2); + + // Verify original path remains unchanged + assertEquals(original.asString(), "[version=2.0].[type=record].user"); + + // Verify new path has both elements + assertEquals(newPath.asString(), "[version=2.0].[type=record].user.[type=string].name"); + } + + @Test(groups = "operations") + public void testExpandType() { + FieldPath path = new FieldPath(); + FieldElement element = new FieldElement(new ArrayList<>(), new ArrayList<>(), "field", null); + path.setPath(Collections.singletonList(element)); + + FieldPath expanded = path.expandType("string", "schema"); + + assertEquals(expanded.asString(), "[version=2.0].[type=string].field"); + assertEquals(expanded.getPath().get(0).getType().size(), 1); + assertEquals(expanded.getPath().get(0).getType().get(0), "string"); + assertEquals(expanded.getPath().get(0).getSchemaTypes().get(0), "schema"); + } + + @Test(groups = "operations") + public void testHasFieldName() { + FieldPath path = new FieldPath(); + assertFalse(path.hasFieldName()); + + FieldElement element = + new FieldElement( + Collections.singletonList("string"), Collections.singletonList("schema"), "name", null); + path.setPath(Collections.singletonList(element)); + assertTrue(path.hasFieldName()); + } + + @Test(groups = "operations") + public void testEnsureFieldName() { + FieldPath path = new FieldPath(); + assertFalse(path.hasFieldName()); + + path.ensureFieldName(); + assertTrue(path.hasFieldName()); + assertEquals(path.getPath().get(0).getName(), FieldPath.EMPTY_FIELD_NAME); + } + + @Test(groups = "complex") + public void testArrayPath() { + FieldPath path = new FieldPath(); + FieldElement array = + new FieldElement( + Collections.singletonList("array"), + Collections.singletonList("array-schema"), + "items", + new DataHubType(ArrayType.class, "string")); + + path.setPath(Collections.singletonList(array)); + assertEquals(path.asString(), "[version=2.0].[type=array].items"); + } + + @Test(groups = "complex") + public void testMapPath() { + FieldPath path = new FieldPath(); + FieldElement map = + new FieldElement( + Collections.singletonList("map"), + Collections.singletonList("map-schema"), + "properties", + new DataHubType(MapType.class, "string")); + + path.setPath(Collections.singletonList(map)); + assertEquals(path.asString(), "[version=2.0].[type=map].properties"); + } + + @Test(groups = "complex") + public void testMultipleTypesInPath() { + FieldPath path = new FieldPath(); + FieldElement element = + new FieldElement( + Arrays.asList("union", "string"), + Arrays.asList("union-schema", "string-schema"), + "field", + null); + path.setPath(Collections.singletonList(element)); + assertEquals(path.asString(), "[version=2.0].[type=union].[type=string].field"); + } + + @Test(groups = "complex") + public void testParentTypeHandling() { + FieldPath path = new FieldPath(); + DataHubType parentType = new DataHubType(ArrayType.class, "string"); + FieldElement element = + new FieldElement( + Collections.singletonList("array"), + Collections.singletonList("array-schema"), + "items", + parentType); + path.setPath(Collections.singletonList(element)); + + assertNotNull(path.getPath().get(0).getParentType()); + assertEquals(path.getPath().get(0).getParentType().getType(), ArrayType.class); + 
assertEquals(path.getPath().get(0).getParentType().getNestedType(), "string");
+  }
+
+  @Test(groups = "edge-cases")
+  public void testNoParentPath() {
+    FieldPath path = new FieldPath();
+    assertEquals(path.asString(), "[version=2.0]");
+  }
+
+  @Test(groups = "edge-cases")
+  public void testEmptyElementList() {
+    FieldPath path = new FieldPath();
+    path.setPath(new ArrayList<>());
+    assertEquals(path.asString(), "[version=2.0]");
+  }
+
+  @DataProvider(name = "invalidPaths")
+  public Object[][] getInvalidPaths() {
+    return new Object[][] {
+      {null, "Expected IllegalArgumentException for null element"},
+      {
+        Arrays.asList((FieldElement) null),
+        "Expected IllegalArgumentException for null element in list"
+      }
+    };
+  }
+
+  @Test(
+      groups = "edge-cases",
+      dataProvider = "invalidPaths",
+      expectedExceptions = IllegalArgumentException.class)
+  public void testInvalidPaths(List<FieldElement> elements, String message) {
+    FieldPath path = new FieldPath();
+    path.setPath(elements);
+  }
+}
diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile.avsc
new file mode 100644
index 00000000000000..81f8b0e54b11e0
--- /dev/null
+++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile.avsc
@@ -0,0 +1,456 @@
+{
+  "type": "record",
+  "name": "CustomerProfile",
+  "namespace": "com.example.customer",
+  "doc": "A complex customer profile schema demonstrating various union types and optional fields",
+  "fields": [
+    {
+      "name": "customerId",
+      "type": {
+        "type": "string",
+        "logicalType": "uuid"
+      },
+      "doc": "Unique identifier for the customer"
+    },
+    {
+      "name": "identificationDocument",
+      "type": [
+        "null",
+        {
+          "type": "record",
+          "name": "Passport",
+          "fields": [
+            {
+              "name": "passportNumber",
+              "type": "string"
+            },
+            {
+              "name": "expiryDate",
+              "type": {
+                "type": "long",
+                "logicalType": "date"
+              }
+            }
+          ]
+        },
+        {
+          "type": "record",
+          "name": "DriversLicense",
+          "fields": [
+            {
+              "name": "licenseNumber",
+              "type": "string"
+            },
+            {
+              "name": "state",
+              "type": "string"
+            },
+            {
+              "name": "validUntil",
+              "type": {
+                "type": "long",
+                "logicalType": "date"
+              }
+            }
+          ]
+        },
+        {
+          "type": "record",
+          "name": "NationalID",
+          "fields": [
+            {
+              "name": "idNumber",
+              "type": "string"
+            },
+            {
+              "name": "country",
+              "type": "string"
+            }
+          ]
+        }
+      ],
+      "default": null,
+      "doc": "Customer's identification document - can be passport, driver's license, or national ID"
+    },
+    {
+      "name": "contactInfo",
+      "type": {
+        "type": "record",
+        "name": "ContactInformation",
+        "fields": [
+          {
+            "name": "primaryContact",
+            "type": [
+              {
+                "type": "record",
+                "name": "EmailContact",
+                "fields": [
+                  {
+                    "name": "emailAddress",
+                    "type": "string"
+                  },
+                  {
+                    "name": "isVerified",
+                    "type": "boolean",
+                    "default": false
+                  }
+                ]
+              },
+              {
+                "type": "record",
+                "name": "PhoneContact",
+                "fields": [
+                  {
+                    "name": "countryCode",
+                    "type": "string"
+                  },
+                  {
+                    "name": "number",
+                    "type": "string"
+                  },
+                  {
+                    "name": "type",
+                    "type": {
+                      "type": "enum",
+                      "name": "PhoneType",
+                      "symbols": [
+                        "MOBILE",
+                        "LANDLINE"
+                      ]
+                    }
+                  }
+                ]
+              }
+            ],
+            "doc": "Primary contact method - either email or phone"
+          },
+          {
+            "name": "alternativeContacts",
+            "type": {
+              "type": "array",
+              "items": [
+                "null",
+                "EmailContact",
+                "PhoneContact"
+              ]
+            },
+            "default": [],
+            "doc": "List of alternative contact methods"
+          }
+        ]
+      }
+    },
+    {
+      "name": "addresses",
+      "type": {
+        "type": "array",
+        "items": {
+
"type": "record", + "name": "Address", + "fields": [ + { + "name": "type", + "type": { + "type": "enum", + "name": "AddressType", + "symbols": [ + "RESIDENTIAL", + "BUSINESS", + "SHIPPING" + ] + }, + "default": "RESIDENTIAL" + }, + { + "name": "street", + "type": "string" + }, + { + "name": "city", + "type": "string" + }, + { + "name": "state", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "country", + "type": "string" + }, + { + "name": "postalCode", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "validationStatus", + "type": [ + "null", + { + "type": "record", + "name": "AddressValidation", + "fields": [ + { + "name": "isValid", + "type": "boolean" + }, + { + "name": "verificationDate", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + }, + { + "name": "verificationMethod", + "type": { + "type": "enum", + "name": "VerificationMethod", + "symbols": [ + "MANUAL", + "AUTOMATED" + ] + } + } + ] + } + ], + "default": null + } + ] + } + }, + "doc": "Customer's addresses with validation information" + }, + { + "name": "preferences", + "type": { + "type": "map", + "values": [ + "null", + "string", + "boolean", + { + "type": "record", + "name": "FrequencyPreference", + "fields": [ + { + "name": "frequency", + "type": { + "type": "enum", + "name": "Frequency", + "symbols": [ + "DAILY", + "WEEKLY", + "MONTHLY" + ] + } + }, + { + "name": "enabled", + "type": "boolean", + "default": true + }, + { + "name": "lastUpdated", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + } + ] + } + ] + }, + "doc": "Customer preferences with various possible value types" + }, + { + "name": "subscriptionHistory", + "type": [ + "null", + { + "type": "array", + "items": { + "type": "record", + "name": "Subscription", + "fields": [ + { + "name": "planName", + "type": "string" + }, + { + "name": "startDate", + "type": { + "type": "long", + "logicalType": "date" + } + }, + { + "name": "endDate", + "type": [ + "null", + { + "type": "long", + "logicalType": "date" + } + ], + "default": null + }, + { + "name": "status", + "type": { + "type": "enum", + "name": "SubscriptionStatus", + "symbols": [ + "ACTIVE", + "CANCELLED", + "EXPIRED", + "SUSPENDED" + ] + } + }, + { + "name": "paymentMethod", + "type": [ + "null", + { + "type": "record", + "name": "PaymentMethod", + "fields": [ + { + "name": "type", + "type": { + "type": "enum", + "name": "PaymentType", + "symbols": [ + "CREDIT_CARD", + "DEBIT_CARD", + "BANK_TRANSFER", + "DIGITAL_WALLET" + ] + } + }, + { + "name": "lastFourDigits", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "expiryDate", + "type": [ + "null", + { + "type": "long", + "logicalType": "date" + } + ], + "default": null + } + ] + } + ], + "default": null + } + ] + } + } + ], + "default": null, + "doc": "Historical record of customer subscriptions" + }, + { + "name": "metadata", + "type": { + "type": "map", + "values": [ + "null", + "string", + "long", + "boolean", + { + "type": "record", + "name": "MetadataValue", + "fields": [ + { + "name": "value", + "type": [ + "null", + "string", + "long", + "boolean" + ], + "default": null + }, + { + "name": "timestamp", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + }, + { + "name": "source", + "type": "string" + } + ] + } + ] + }, + "doc": "Flexible metadata storage with various possible value types" + }, + { + "name": "tags", + "type": [ + "null", + { + "type": "array", + "items": { + "type": "record", + "name": "Tag", + 
"fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "value", + "type": [ + "null", + "string" + ], + "default": null + }, + { + "name": "score", + "type": [ + "null", + "double" + ], + "default": null + }, + { + "name": "addedAt", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + } + ] + } + } + ], + "default": null, + "doc": "Optional tags associated with the customer profile" + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile2.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile2.avsc new file mode 100644 index 00000000000000..b8c7654ea072a2 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/CustomerProfile2.avsc @@ -0,0 +1,244 @@ +{ + "type": "record", + "name": "CustomerProfile2", + "namespace": "com.example.customer", + "doc": "A complex customer profile schema demonstrating various union types and optional fields", + "fields": [ + { + "name": "customerId", + "type": { + "type": "string", + "logicalType": "uuid" + }, + "doc": "Unique identifier for the customer" + }, + { + "name": "identificationDocument", + "type": [ + "null", + { + "type": "record", + "name": "Passport", + "fields": [ + { + "name": "passportNumber", + "type": "string" + }, + { + "name": "expiryDate", + "type": { + "type": "long", + "logicalType": "date" + } + } + ] + }, + { + "type": "record", + "name": "DriversLicense", + "fields": [ + { + "name": "licenseNumber", + "type": "string" + }, + { + "name": "state", + "type": "string" + }, + { + "name": "validUntil", + "type": { + "type": "long", + "logicalType": "date" + } + } + ] + }, + { + "type": "record", + "name": "NationalID", + "fields": [ + { + "name": "idNumber", + "type": "string" + }, + { + "name": "country", + "type": "string" + } + ] + } + ], + "default": null, + "doc": "Customer's identification document" + }, + { + "name": "contactInfo", + "type": { + "type": "record", + "name": "ContactInformation", + "fields": [ + { + "name": "primaryEmailContact", + "type": [ + "null", + { + "type": "record", + "name": "PrimaryEmailContact", + "fields": [ + { + "name": "emailAddress", + "type": "string" + }, + { + "name": "isVerified", + "type": "boolean", + "default": false + } + ] + } + ], + "default": null + }, + { + "name": "primaryPhoneContact", + "type": [ + "null", + { + "type": "record", + "name": "PrimaryPhoneContact", + "fields": [ + { + "name": "countryCode", + "type": "string" + }, + { + "name": "number", + "type": "string" + }, + { + "name": "type", + "type": { + "type": "enum", + "name": "PhoneType", + "symbols": [ + "MOBILE", + "LANDLINE" + ] + } + } + ] + } + ], + "default": null + }, + { + "name": "alternativeEmailContacts", + "type": { + "type": "array", + "items": { + "type": "record", + "name": "AlternativeEmailContact", + "fields": [ + { + "name": "emailAddress", + "type": "string" + }, + { + "name": "isVerified", + "type": "boolean", + "default": false + } + ] + } + }, + "default": [] + }, + { + "name": "alternativePhoneContacts", + "type": { + "type": "array", + "items": { + "type": "record", + "name": "AlternativePhoneContact", + "fields": [ + { + "name": "countryCode", + "type": "string" + }, + { + "name": "number", + "type": "string" + }, + { + "name": "type", + "type": "PhoneType" + } + ] + } + }, + "default": [] + } + ] + } + }, + { + "name": "preferences", + "type": { + "type": "record", + "name": "Preferences", + "fields": [ + { + "name": 
"simplePreferences", + "type": { + "type": "map", + "values": [ + "null", + "string", + "boolean" + ] + }, + "default": {} + }, + { + "name": "frequencyPreferences", + "type": { + "type": "map", + "values": { + "type": "record", + "name": "FrequencyPreference", + "fields": [ + { + "name": "frequency", + "type": { + "type": "enum", + "name": "Frequency", + "symbols": [ + "DAILY", + "WEEKLY", + "MONTHLY" + ] + } + }, + { + "name": "enabled", + "type": "boolean", + "default": true + }, + { + "name": "lastUpdated", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + } + } + ] + } + }, + "default": {} + } + ] + } + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/datahub-schematron/lib/src/test/resources/FlatUser.avsc b/metadata-integration/java/datahub-schematron/lib/src/test/resources/FlatUser.avsc new file mode 100644 index 00000000000000..c796878c32ae41 --- /dev/null +++ b/metadata-integration/java/datahub-schematron/lib/src/test/resources/FlatUser.avsc @@ -0,0 +1,45 @@ +{ + "type": "record", + "name": "FlatUser", + "namespace": "com.example", + "fields": [ + { + "name": "id", + "type": "int", + "doc": "The unique identifier for a user", + "default": -1, + "metadata": { + "key1": "value1", + "key2": "value2" + } + }, + { + "name": "username", + "type": "string", + "doc": "The username of the user" + }, + { + "name": "email", + "type": "string", + "doc": "The email of the user" + }, + { + "name": "age", + "type": "int", + "doc": "The age of the user" + }, + { + "name": "isActive", + "type": "boolean", + "doc": "Whether the user is active or not" + }, + { + "name": "registrationDate", + "type": { + "type": "long", + "logicalType": "timestamp-millis" + }, + "doc": "The registration date of the user" + } + ] +} \ No newline at end of file diff --git a/metadata-integration/java/spark-lineage-legacy/scripts/check_jar.sh b/metadata-integration/java/spark-lineage-legacy/scripts/check_jar.sh index 854c4227d08d93..d4108421216489 100755 --- a/metadata-integration/java/spark-lineage-legacy/scripts/check_jar.sh +++ b/metadata-integration/java/spark-lineage-legacy/scripts/check_jar.sh @@ -41,7 +41,9 @@ jar -tvf $jarFile |\ grep -v "VersionInfo.java" |\ grep -v "mime.types" |\ grep -v "com/ibm/.*" |\ - grep -v "google/" + grep -v "google/" |\ + grep -v "org/apache/avro" |\ + grep -v "org/apache" if [ $? 
-ne 0 ]; then
diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle
index 09a41d100199d4..41294fab7b24a9 100644
--- a/metadata-io/build.gradle
+++ b/metadata-io/build.gradle
@@ -140,6 +140,8 @@ test {
   }
   testLogging.showStandardStreams = true
   testLogging.exceptionFormat = 'full'
+
+  environment 'STRICT_URN_VALIDATION_ENABLED', 'true'
 }

 ebean {
diff --git a/metadata-io/metadata-io-api/build.gradle b/metadata-io/metadata-io-api/build.gradle
index b8028fad07bb65..5273177b752819 100644
--- a/metadata-io/metadata-io-api/build.gradle
+++ b/metadata-io/metadata-io-api/build.gradle
@@ -16,3 +16,7 @@ dependencies {
   testImplementation externalDependency.lombok
   testAnnotationProcessor externalDependency.lombok
 }
+
+test {
+  environment 'STRICT_URN_VALIDATION_ENABLED', 'true'
+}
\ No newline at end of file
diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java
index 7f56abe64f9a77..c0d65640df2378 100644
--- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java
+++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java
@@ -47,7 +47,8 @@ public class AspectsBatchImpl implements AspectsBatch {
    */
   @Override
   public Pair<Map<String, Set<String>>, List<ChangeMCP>> toUpsertBatchItems(
-      final Map<String, Map<String, SystemAspect>> latestAspects) {
+      Map<String, Map<String, SystemAspect>> latestAspects,
+      Map<String, Map<String, Long>> nextVersions) {

     // Process proposals to change items
     Stream<? extends BatchItem> mutatedProposalsStream =
         proposedItemsToChangeItemStream(
             items.stream()
                 .filter(item -> item instanceof ProposedItem)
                 .map(item -> (MCPItem) item)
                 .collect(Collectors.toList()));
+
     // Regular change items
     Stream<? extends BatchItem> changeMCPStream =
         items.stream().filter(item -> !(item instanceof ProposedItem));
@@ -83,10 +85,8 @@ public Pair<Map<String, Set<String>>, List<ChangeMCP>> toUpsertBatchItems(
                         currentValue, retrieverContext.getAspectRetriever());
                   }

-                  // Populate old aspect for write hooks
-                  upsertItem.setPreviousSystemAspect(latest);
-
-                  return upsertItem;
+                  return AspectsBatch.incrementBatchVersion(
+                      upsertItem, latestAspects, nextVersions);
                 })
             .collect(Collectors.toCollection(LinkedList::new));
@@ -96,6 +96,7 @@ public Pair<Map<String, Set<String>>, List<ChangeMCP>> toUpsertBatchItems(
     LinkedList<ChangeMCP> newItems =
         applyMCPSideEffects(upsertBatchItems).collect(Collectors.toCollection(LinkedList::new));
     upsertBatchItems.addAll(newItems);
+
     Map<String, Set<String>> newUrnAspectNames =
         getNewUrnAspectsMap(getUrnAspectsMap(), upsertBatchItems);
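A hedged sketch of the new call shape for toUpsertBatchItems, assuming the reconstructed map signatures above (outer keys are URN strings, inner keys aspect names) and a previously built AspectsBatchImpl named batch:

    Map<String, Map<String, SystemAspect>> latestAspects = new HashMap<>();
    Map<String, Map<String, Long>> nextVersions = new HashMap<>();
    Pair<Map<String, Set<String>>, List<ChangeMCP>> result =
        batch.toUpsertBatchItems(latestAspects, nextVersions);
    List<ChangeMCP> upsertItems = result.getSecond(); // items with batch versions applied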
diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java index f7e639ecf36038..5e1f09fcc64393 100644 --- a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/validation/ValidationApiUtils.java @@ -1,6 +1,7 @@ package com.linkedin.metadata.entity.validation; import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.schema.validation.ValidationResult; import com.linkedin.data.template.RecordTemplate; import com.linkedin.metadata.Constants; @@ -10,16 +11,27 @@ import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.models.registry.EntityRegistry; import java.net.URISyntaxException; +import java.net.URLDecoder; import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Set; import java.util.function.Consumer; +import java.util.stream.Collectors; +import java.util.stream.Stream; import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.extern.slf4j.Slf4j; @Slf4j public class ValidationApiUtils { + public static final String STRICT_URN_VALIDATION_ENABLED = "STRICT_URN_VALIDATION_ENABLED"; public static final int URN_NUM_BYTES_LIMIT = 512; + // Related to BrowsePathsV2 public static final String URN_DELIMITER_SEPARATOR = "␟"; + // https://datahubproject.io/docs/what/urn/#restrictions + public static final Set<String> ILLEGAL_URN_COMPONENT_CHARACTERS = Set.of("(", ")"); + public static final Set<String> ILLEGAL_URN_TUPLE_CHARACTERS = Set.of(","); /** * Validates a {@link RecordTemplate} and throws {@link ValidationException} if validation fails. @@ -38,6 +50,16 @@ public static void validateOrThrow(RecordTemplate record) { } public static void validateUrn(@Nonnull EntityRegistry entityRegistry, @Nonnull final Urn urn) { + validateUrn( + entityRegistry, + urn, + Boolean.TRUE.equals( + Boolean.parseBoolean( + System.getenv().getOrDefault(STRICT_URN_VALIDATION_ENABLED, "false")))); + } + + public static void validateUrn( + @Nonnull EntityRegistry entityRegistry, @Nonnull final Urn urn, boolean strict) { EntityRegistryUrnValidator validator = new EntityRegistryUrnValidator(entityRegistry); validator.setCurrentEntitySpec(entityRegistry.getEntitySpec(urn.getEntityType())); RecordTemplateValidator.validate( @@ -59,10 +81,31 @@ public static void validateUrn(@Nonnull EntityRegistry entityRegistry, @Nonnull + Integer.toString(URN_NUM_BYTES_LIMIT) + " bytes (when URL encoded)"); } + if (urn.toString().contains(URN_DELIMITER_SEPARATOR)) { throw new IllegalArgumentException( "Error: URN cannot contain " + URN_DELIMITER_SEPARATOR + " character"); } + + int totalParts = urn.getEntityKey().getParts().size(); + List<String> illegalComponents = + urn.getEntityKey().getParts().stream() + .flatMap(part -> processUrnPartRecursively(part, totalParts)) + .collect(Collectors.toList()); + + if (!illegalComponents.isEmpty()) { + String message = + String.format( + "Illegal `%s` characters detected in URN %s component(s): %s", + ILLEGAL_URN_COMPONENT_CHARACTERS, urn, illegalComponents); + + if (strict) { + throw new IllegalArgumentException(message); + } else { + log.error(message); + } + } + try { Urn.createFromString(urn.toString()); } catch (URISyntaxException e) { @@ -70,6 +113,28 @@ public static void validateUrn(@Nonnull EntityRegistry entityRegistry, @Nonnull } } + /** Recursively processes URN parts (URL-decoding each part and descending into nested URNs), returning any parts that contain illegal characters. */ + private static Stream<String> processUrnPartRecursively(String urnPart, int totalParts) { + String decodedPart = + URLDecoder.decode(URLEncodingFixer.fixURLEncoding(urnPart), StandardCharsets.UTF_8); + if (decodedPart.startsWith("urn:li:")) { + // Recursively process nested URN after decoding + int nestedParts = UrnUtils.getUrn(decodedPart).getEntityKey().getParts().size(); + return UrnUtils.getUrn(decodedPart).getEntityKey().getParts().stream() + .flatMap(part -> processUrnPartRecursively(part, nestedParts)); + } + if (totalParts > 1) { + if (ILLEGAL_URN_TUPLE_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) { + return Stream.of(urnPart); + } + } + if (ILLEGAL_URN_COMPONENT_CHARACTERS.stream().anyMatch(c -> urnPart.contains(c))) { + return Stream.of(urnPart); + } + + return Stream.empty(); + } + /** * Validates a {@link RecordTemplate} and logs a warning if validation fails.
* @@ -123,4 +188,53 @@ public static void validateRecordTemplate( RecordTemplateValidator.validate(aspect, resultFunction, validator); } } + + /** + * Fixes malformed URL encoding by escaping unescaped % characters while preserving valid + * percent-encoded sequences. + */ + private static class URLEncodingFixer { + /** + * @param input The potentially malformed URL-encoded string + * @return A string with proper URL encoding that can be safely decoded + */ + public static String fixURLEncoding(String input) { + if (input == null) { + return null; + } + + StringBuilder result = new StringBuilder(input.length() * 2); + int i = 0; + + while (i < input.length()) { + char currentChar = input.charAt(i); + + if (currentChar == '%') { + if (i + 2 < input.length()) { + // Check if the next two characters form a valid hex pair + String hexPair = input.substring(i + 1, i + 3); + if (isValidHexPair(hexPair)) { + // This is a valid percent-encoded sequence, keep it as is + result.append(currentChar); + } else { + // Invalid sequence, escape the % character + result.append("%25"); + } + } else { + // % at the end of string, escape it + result.append("%25"); + } + } else { + result.append(currentChar); + } + i++; + } + + return result.toString(); + } + + private static boolean isValidHexPair(String pair) { + return pair.matches("[0-9A-Fa-f]{2}"); + } + } } diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java index 31dd868b4cb4a3..96f535f2295aa4 100644 --- a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java +++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java @@ -41,6 +41,7 @@ import io.datahubproject.metadata.context.RetrieverContext; import java.nio.charset.StandardCharsets; import java.util.Collection; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Stream; @@ -120,7 +121,7 @@ public void toUpsertBatchItemsChangeItemTest() { AspectsBatchImpl.builder().items(testItems).retrieverContext(retrieverContext).build(); assertEquals( - testBatch.toUpsertBatchItems(Map.of()), + testBatch.toUpsertBatchItems(new HashMap<>(), new HashMap<>()), Pair.of(Map.of(), testItems), "Expected noop, pass through with no additional MCPs or changes"); } @@ -176,7 +177,7 @@ public void toUpsertBatchItemsPatchItemTest() { AspectsBatchImpl.builder().items(testItems).retrieverContext(retrieverContext).build(); assertEquals( - testBatch.toUpsertBatchItems(Map.of()), + testBatch.toUpsertBatchItems(new HashMap<>(), new HashMap<>()), Pair.of( Map.of(), List.of( @@ -195,7 +196,7 @@ public void toUpsertBatchItemsPatchItemTest() { .recordTemplate( new StructuredProperties() .setProperties(new StructuredPropertyValueAssignmentArray())) - .systemMetadata(testItems.get(0).getSystemMetadata()) + .systemMetadata(testItems.get(0).getSystemMetadata().setVersion("1")) .build(mockAspectRetriever), ChangeItemImpl.builder() .urn( @@ -212,7 +213,7 @@ public void toUpsertBatchItemsPatchItemTest() { .recordTemplate( new StructuredProperties() .setProperties(new StructuredPropertyValueAssignmentArray())) - .systemMetadata(testItems.get(1).getSystemMetadata()) + .systemMetadata(testItems.get(1).getSystemMetadata().setVersion("1")) .build(mockAspectRetriever))), "Expected patch items converted to upsert change 
items"); } @@ -264,7 +265,7 @@ public void toUpsertBatchItemsProposedItemTest() { AspectsBatchImpl.builder().items(testItems).retrieverContext(retrieverContext).build(); assertEquals( - testBatch.toUpsertBatchItems(Map.of()), + testBatch.toUpsertBatchItems(new HashMap<>(), new HashMap<>()), Pair.of( Map.of(), List.of( @@ -280,7 +281,7 @@ public void toUpsertBatchItemsProposedItemTest() { .getEntitySpec(DATASET_ENTITY_NAME) .getAspectSpec(STATUS_ASPECT_NAME)) .auditStamp(AuditStampUtils.createDefaultAuditStamp()) - .systemMetadata(testItems.get(0).getSystemMetadata()) + .systemMetadata(testItems.get(0).getSystemMetadata().setVersion("1")) .recordTemplate(new Status().setRemoved(false)) .build(mockAspectRetriever), ChangeItemImpl.builder() @@ -295,7 +296,7 @@ public void toUpsertBatchItemsProposedItemTest() { .getEntitySpec(DATASET_ENTITY_NAME) .getAspectSpec(STATUS_ASPECT_NAME)) .auditStamp(AuditStampUtils.createDefaultAuditStamp()) - .systemMetadata(testItems.get(1).getSystemMetadata()) + .systemMetadata(testItems.get(1).getSystemMetadata().setVersion("1")) .recordTemplate(new Status().setRemoved(false)) .build(mockAspectRetriever))), "Mutation to status aspect"); @@ -328,7 +329,7 @@ public void singleInvalidDoesntBreakBatch() { .build(); assertEquals( - testBatch.toUpsertBatchItems(Map.of()).getSecond().size(), + testBatch.toUpsertBatchItems(new HashMap<>(), new HashMap<>()).getSecond().size(), 1, "Expected 1 valid mcp to be passed through."); } diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java new file mode 100644 index 00000000000000..a2c9a15d92f90a --- /dev/null +++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/validation/ValidationApiUtilsTest.java @@ -0,0 +1,154 @@ +package com.linkedin.metadata.entity.validation; + +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.metadata.models.registry.EntityRegistry; +import io.datahubproject.test.metadata.context.TestOperationContexts; +import java.net.URISyntaxException; +import org.testng.annotations.Test; + +public class ValidationApiUtilsTest { + private static final EntityRegistry entityRegistry = + TestOperationContexts.defaultEntityRegistry(); + + @Test + public void testValidateDatasetUrn() { + Urn validUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,PROD)"); + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); + // If no exception is thrown, test passes + } + + @Test + public void testSimpleUrnColon() { + ValidationApiUtils.validateUrn( + entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar"), true); + ValidationApiUtils.validateUrn( + entityRegistry, UrnUtils.getUrn("urn:li:dataPlatform:abc:def"), true); + ValidationApiUtils.validateUrn( + entityRegistry, UrnUtils.getUrn("urn:li:corpuser:foo:bar@example.com"), true); + // If no exception is thrown, test passes + } + + @Test + public void testSimpleUrnComma() { + ValidationApiUtils.validateUrn(entityRegistry, UrnUtils.getUrn("urn:li:corpuser:,"), true); + // If no exception is thrown, test passes + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testTupleUrnComma() { + ValidationApiUtils.validateUrn( + entityRegistry, UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards,thelook)"), true); + } + + @Test(expectedExceptions = 
IllegalArgumentException.class) + public void testFabricTypeCasing() { + // prod != PROD + ValidationApiUtils.validateUrn( + entityRegistry, + UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,prod)"), + true); + } + + @Test + public void testComplexUrnColon() throws URISyntaxException { + Urn validUrn = + Urn.createFromString( + "urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts/prog_maintenance%2CPROD%29,PROD)"); + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); + // If no exception is thrown, test passes + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testFabricTypeParen() { + Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,())"); + ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testUrnWithTrailingWhitespace() { + Urn invalidUrn = + UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,/path/to/data,PROD) "); + ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testUrnWithIllegalDelimiter() { + Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs␟path,PROD)"); + ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testComplexUrnWithParens() { + Urn invalidUrn = UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:hdfs,(illegal),PROD)"); + ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testSimpleUrnWithParens() { + Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:(foo)123"); + ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testExcessiveLength() { + StringBuilder longPath = new StringBuilder("urn:li:dataset:(urn:li:dataPlatform:hdfs,"); + // Create a path that will exceed 512 bytes when URL encoded + for (int i = 0; i < 500; i++) { + longPath.append("very/long/path/"); + } + longPath.append(",PROD)"); + Urn invalidUrn = UrnUtils.getUrn(longPath.toString()); + + ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true); + } + + @Test + public void testValidComplexUrn() { + Urn validUrn = + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.dataset.table,PROD)"); + + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); + // If no exception is thrown, test passes + } + + @Test(expectedExceptions = NullPointerException.class) + public void testUrnNull() { + ValidationApiUtils.validateUrn(entityRegistry, null, true); + } + + @Test + public void testValidPartialUrlEncode() { + Urn validUrn = UrnUtils.getUrn("urn:li:assertion:123=-%28__% weekly__%29"); + + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); + // If no exception is thrown, test passes + } + + @Test + public void testValidPartialUrlEncode2() { + Urn validUrn = + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:s3,urn:li:dataset:%28urn:li:dataPlatform:s3%2Ctest-datalake-concepts%prog_maintenance%2CPROD%29,PROD)"); + + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); + // If no exception is thrown, test passes + } + + @Test + public void testValidColon() { + Urn validUrn = + 
UrnUtils.getUrn("urn:li:dashboard:(looker,dashboards.thelook::cohort_data_tool)"); + + ValidationApiUtils.validateUrn(entityRegistry, validUrn, true); + // If no exception is thrown, test passes + } + + @Test + public void testNoTupleComma() { + Urn invalidUrn = UrnUtils.getUrn("urn:li:corpuser:,"); + ValidationApiUtils.validateUrn(entityRegistry, invalidUrn, true); + // If no exception is thrown, test passes + } +} diff --git a/metadata-io/src/main/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffect.java b/metadata-io/src/main/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffect.java index 544040d14f8b7c..dae1a8ff51a2cf 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffect.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffect.java @@ -9,6 +9,7 @@ import com.linkedin.dataproduct.DataProductAssociation; import com.linkedin.dataproduct.DataProductAssociationArray; import com.linkedin.dataproduct.DataProductProperties; +import com.linkedin.events.metadata.ChangeType; import com.linkedin.metadata.aspect.RetrieverContext; import com.linkedin.metadata.aspect.batch.ChangeMCP; import com.linkedin.metadata.aspect.batch.MCLItem; @@ -27,9 +28,11 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.stream.Collectors; import java.util.stream.Stream; import javax.annotation.Nonnull; import lombok.Getter; @@ -64,71 +67,108 @@ private static Stream generatePatchRemove( MCLItem mclItem, @Nonnull RetrieverContext retrieverContext) { if (DATA_PRODUCT_PROPERTIES_ASPECT_NAME.equals(mclItem.getAspectName())) { - List mcpItems = new ArrayList<>(); + DataProductProperties dataProductProperties = mclItem.getAspect(DataProductProperties.class); if (dataProductProperties == null) { log.error("Unable to process data product properties for urn: {}", mclItem.getUrn()); return Stream.empty(); } - for (DataProductAssociation dataProductAssociation : + DataProductAssociationArray newDataProductAssociationArray = Optional.ofNullable(dataProductProperties.getAssets()) - .orElse(new DataProductAssociationArray())) { - RelatedEntitiesScrollResult result = - retrieverContext - .getGraphRetriever() - .scrollRelatedEntities( - null, - QueryUtils.newFilter( - "urn", dataProductAssociation.getDestinationUrn().toString()), - null, - EMPTY_FILTER, - ImmutableList.of("DataProductContains"), - QueryUtils.newRelationshipFilter(EMPTY_FILTER, RelationshipDirection.INCOMING), - Collections.emptyList(), - null, - 10, // Should only ever be one, if ever greater than ten will decrease over time - // to become consistent - null, - null); - if (!result.getEntities().isEmpty()) { - for (RelatedEntities entity : result.getEntities()) { - if (!mclItem.getUrn().equals(UrnUtils.getUrn(entity.getSourceUrn()))) { - EntitySpec entitySpec = - retrieverContext - .getAspectRetriever() - .getEntityRegistry() - .getEntitySpec(DATA_PRODUCT_ENTITY_NAME); - GenericJsonPatch.PatchOp patchOp = new GenericJsonPatch.PatchOp(); - patchOp.setOp(PatchOperationType.REMOVE.getValue()); - patchOp.setPath(String.format("/assets/%s", entity.getDestinationUrn())); - mcpItems.add( - PatchItemImpl.builder() - .urn(UrnUtils.getUrn(entity.getSourceUrn())) - .entitySpec( - retrieverContext - .getAspectRetriever() - .getEntityRegistry() - 
.getEntitySpec(DATA_PRODUCT_ENTITY_NAME)) - .aspectName(DATA_PRODUCT_PROPERTIES_ASPECT_NAME) - .aspectSpec(entitySpec.getAspectSpec(DATA_PRODUCT_PROPERTIES_ASPECT_NAME)) - .patch( - GenericJsonPatch.builder() - .arrayPrimaryKeys( - Map.of( - DataProductPropertiesTemplate.ASSETS_FIELD_NAME, - List.of(DataProductPropertiesTemplate.KEY_FIELD_NAME))) - .patch(List.of(patchOp)) - .build() - .getJsonPatch()) - .auditStamp(mclItem.getAuditStamp()) - .systemMetadata(mclItem.getSystemMetadata()) - .build(retrieverContext.getAspectRetriever().getEntityRegistry())); - } + .orElse(new DataProductAssociationArray()); + + DataProductProperties previousDataProductProperties = + mclItem.getPreviousAspect(DataProductProperties.class); + + if (!ChangeType.UPSERT.equals(mclItem.getChangeType()) + || previousDataProductProperties == null) { + // CREATE/CREATE_ENTITY/RESTATE + return generateUnsetMCPs(mclItem, newDataProductAssociationArray, retrieverContext); + } else { + // UPSERT with previous + DataProductAssociationArray oldDataProductAssociationArray = + Optional.ofNullable(previousDataProductProperties.getAssets()) + .orElse(new DataProductAssociationArray()); + + DataProductAssociationArray additions = + newDataProductAssociationArray.stream() + .filter(association -> !oldDataProductAssociationArray.contains(association)) + .collect(Collectors.toCollection(DataProductAssociationArray::new)); + + return generateUnsetMCPs(mclItem, additions, retrieverContext); + } + } + return Stream.empty(); + } + + private static Stream generateUnsetMCPs( + @Nonnull MCLItem dataProductItem, + @Nonnull DataProductAssociationArray dataProductAssociations, + @Nonnull RetrieverContext retrieverContext) { + List mcpItems = new ArrayList<>(); + Map> patchOpMap = new HashMap<>(); + + for (DataProductAssociation dataProductAssociation : dataProductAssociations) { + RelatedEntitiesScrollResult result = + retrieverContext + .getGraphRetriever() + .scrollRelatedEntities( + null, + QueryUtils.newFilter( + "urn", dataProductAssociation.getDestinationUrn().toString()), + null, + EMPTY_FILTER, + ImmutableList.of("DataProductContains"), + QueryUtils.newRelationshipFilter(EMPTY_FILTER, RelationshipDirection.INCOMING), + Collections.emptyList(), + null, + 10, // Should only ever be one, if ever greater than ten will decrease over time + // to become consistent + null, + null); + if (!result.getEntities().isEmpty()) { + for (RelatedEntities entity : result.getEntities()) { + if (!dataProductItem.getUrn().equals(UrnUtils.getUrn(entity.getSourceUrn()))) { + GenericJsonPatch.PatchOp patchOp = new GenericJsonPatch.PatchOp(); + patchOp.setOp(PatchOperationType.REMOVE.getValue()); + patchOp.setPath(String.format("/assets/%s", entity.getDestinationUrn())); + patchOpMap + .computeIfAbsent(entity.getSourceUrn(), urn -> new ArrayList<>()) + .add(patchOp); } } } - return mcpItems.stream(); } - return Stream.empty(); + for (String urn : patchOpMap.keySet()) { + EntitySpec entitySpec = + retrieverContext + .getAspectRetriever() + .getEntityRegistry() + .getEntitySpec(DATA_PRODUCT_ENTITY_NAME); + mcpItems.add( + PatchItemImpl.builder() + .urn(UrnUtils.getUrn(urn)) + .entitySpec( + retrieverContext + .getAspectRetriever() + .getEntityRegistry() + .getEntitySpec(DATA_PRODUCT_ENTITY_NAME)) + .aspectName(DATA_PRODUCT_PROPERTIES_ASPECT_NAME) + .aspectSpec(entitySpec.getAspectSpec(DATA_PRODUCT_PROPERTIES_ASPECT_NAME)) + .patch( + GenericJsonPatch.builder() + .arrayPrimaryKeys( + Map.of( + DataProductPropertiesTemplate.ASSETS_FIELD_NAME, + 
List.of(DataProductPropertiesTemplate.KEY_FIELD_NAME))) + .patch(patchOpMap.get(urn)) + .build() + .getJsonPatch()) + .auditStamp(dataProductItem.getAuditStamp()) + .systemMetadata(dataProductItem.getSystemMetadata()) + .build(retrieverContext.getAspectRetriever().getEntityRegistry())); + } + + return mcpItems.stream(); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java index 9337ea3c2b6f77..a0a55cf505cf35 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/entity/EntityServiceImpl.java @@ -868,71 +868,64 @@ private List ingestAspectsToLocalDB( // Read before write is unfortunate, however batch it final Map> urnAspects = batchWithDefaults.getUrnAspectsMap(); // read #1 - final Map> latestAspects = + Map> databaseAspects = + aspectDao.getLatestAspects(urnAspects, true); + + final Map> batchAspects = EntityUtils.toSystemAspects( - opContext.getRetrieverContext().get(), - aspectDao.getLatestAspects(urnAspects, true)); + opContext.getRetrieverContext().get(), databaseAspects); + // read #2 (potentially) final Map> nextVersions = - EntityUtils.calculateNextVersions( - txContext, aspectDao, latestAspects, urnAspects); + EntityUtils.calculateNextVersions(txContext, aspectDao, batchAspects, urnAspects); // 1. Convert patches to full upserts // 2. Run any entity/aspect level hooks Pair>, List> updatedItems = - batchWithDefaults.toUpsertBatchItems(latestAspects); + batchWithDefaults.toUpsertBatchItems(batchAspects, nextVersions); // Fetch additional information if needed - final Map> updatedLatestAspects; - final Map> updatedNextVersions; + final List changeMCPs; + if (!updatedItems.getFirst().isEmpty()) { + // These items are new items from side effects + Map> sideEffects = updatedItems.getFirst(); + + final Map> updatedLatestAspects; + final Map> updatedNextVersions; + Map> newLatestAspects = EntityUtils.toSystemAspects( opContext.getRetrieverContext().get(), aspectDao.getLatestAspects(updatedItems.getFirst(), true)); // merge - updatedLatestAspects = AspectsBatch.merge(latestAspects, newLatestAspects); + updatedLatestAspects = AspectsBatch.merge(batchAspects, newLatestAspects); Map> newNextVersions = EntityUtils.calculateNextVersions( txContext, aspectDao, updatedLatestAspects, updatedItems.getFirst()); // merge updatedNextVersions = AspectsBatch.merge(nextVersions, newNextVersions); + + changeMCPs = + updatedItems.getSecond().stream() + .peek( + changeMCP -> { + // Add previous version to each side-effect + if (sideEffects + .getOrDefault( + changeMCP.getUrn().toString(), Collections.emptySet()) + .contains(changeMCP.getAspectName())) { + + AspectsBatch.incrementBatchVersion( + changeMCP, updatedLatestAspects, updatedNextVersions); + } + }) + .collect(Collectors.toList()); } else { - updatedLatestAspects = latestAspects; - updatedNextVersions = nextVersions; + changeMCPs = updatedItems.getSecond(); } - // Add previous version to each upsert - List changeMCPs = - updatedItems.getSecond().stream() - .peek( - changeMCP -> { - String urnStr = changeMCP.getUrn().toString(); - long nextVersion = - updatedNextVersions - .getOrDefault(urnStr, Map.of()) - .getOrDefault(changeMCP.getAspectName(), 0L); - - changeMCP.setPreviousSystemAspect( - updatedLatestAspects - .getOrDefault(urnStr, Map.of()) - .getOrDefault(changeMCP.getAspectName(), null)); - - 
changeMCP.setNextAspectVersion(nextVersion); - - // support inner-batch upserts - updatedLatestAspects - .computeIfAbsent(urnStr, key -> new HashMap<>()) - .put( - changeMCP.getAspectName(), - changeMCP.getSystemAspect(nextVersion)); - updatedNextVersions - .computeIfAbsent(urnStr, key -> new HashMap<>()) - .put(changeMCP.getAspectName(), nextVersion + 1); - }) - .collect(Collectors.toList()); - // No changes, return if (changeMCPs.isEmpty()) { return Collections.emptyList(); @@ -954,40 +947,50 @@ private List ingestAspectsToLocalDB( List upsertResults = changeMCPs.stream() .map( - item -> { - final EntityAspect.EntitySystemAspect latest = - (EntityAspect.EntitySystemAspect) item.getPreviousSystemAspect(); + writeItem -> { + + /* + database*Aspect - should be used for comparisons of before batch operation information + */ + final EntityAspect databaseAspect = + databaseAspects + .getOrDefault(writeItem.getUrn().toString(), Map.of()) + .get(writeItem.getAspectName()); + final EntityAspect.EntitySystemAspect databaseSystemAspect = + databaseAspect == null + ? null + : EntityAspect.EntitySystemAspect.builder() + .build( + writeItem.getEntitySpec(), + writeItem.getAspectSpec(), + databaseAspect); final UpdateAspectResult result; - if (overwrite || latest == null) { + /* + This condition is specifically for an older conditional write ingestAspectIfNotPresent() + overwrite is always true otherwise + */ + if (overwrite || databaseAspect == null) { result = - ingestAspectToLocalDB( - txContext, - item.getUrn(), - item.getAspectName(), - item.getRecordTemplate(), - item.getAuditStamp(), - item.getSystemMetadata(), - latest == null ? null : latest, - item.getNextAspectVersion()) + ingestAspectToLocalDB(txContext, writeItem, databaseSystemAspect) .toBuilder() - .request(item) + .request(writeItem) .build(); } else { - RecordTemplate oldValue = latest.getRecordTemplate(); - SystemMetadata oldMetadata = latest.getSystemMetadata(); + RecordTemplate oldValue = databaseSystemAspect.getRecordTemplate(); + SystemMetadata oldMetadata = databaseSystemAspect.getSystemMetadata(); result = UpdateAspectResult.builder() - .urn(item.getUrn()) - .request(item) + .urn(writeItem.getUrn()) + .request(writeItem) .oldValue(oldValue) .newValue(oldValue) .oldSystemMetadata(oldMetadata) .newSystemMetadata(oldMetadata) .operation(MetadataAuditOperation.UPDATE) - .auditStamp(item.getAuditStamp()) - .maxVersion(latest.getVersion()) + .auditStamp(writeItem.getAuditStamp()) + .maxVersion(databaseAspect.getVersion()) .build(); } @@ -1011,8 +1014,8 @@ private List ingestAspectsToLocalDB( // Only consider retention when there was a previous version .filter( result -> - latestAspects.containsKey(result.getUrn().toString()) - && latestAspects + batchAspects.containsKey(result.getUrn().toString()) + && batchAspects .get(result.getUrn().toString()) .containsKey(result.getRequest().getAspectName())) .filter( @@ -1102,9 +1105,11 @@ private List emitMCL( * @param auditStamp an {@link AuditStamp} containing metadata about the writer & current time * @param systemMetadata * @return the {@link RecordTemplate} representation of the written aspect object + * @deprecated See Conditional Write ChangeType CREATE */ @Nullable @Override + @Deprecated public RecordTemplate ingestAspectIfNotPresent( @Nonnull OperationContext opContext, @Nonnull Urn urn, @@ -2495,87 +2500,107 @@ private Map getEnvelopedAspects( ((EntityAspect.EntitySystemAspect) systemAspect).toEnvelopedAspects())); } + /** + * @param txContext Transaction context, keeps track of 
retries, exceptions etc. + * @param writeItem The aspect being written + * @param databaseAspect The aspect as it exists in the database. + * @return result object + */ @Nonnull private UpdateAspectResult ingestAspectToLocalDB( @Nullable TransactionContext txContext, - @Nonnull final Urn urn, - @Nonnull final String aspectName, - @Nonnull final RecordTemplate newValue, - @Nonnull final AuditStamp auditStamp, - @Nonnull final SystemMetadata providedSystemMetadata, - @Nullable final EntityAspect.EntitySystemAspect latest, - @Nonnull final Long nextVersion) { + @Nonnull final ChangeMCP writeItem, + @Nullable final EntityAspect.EntitySystemAspect databaseAspect) { // Set the "last run id" to be the run id provided with the new system metadata. This will be // stored in index // for all aspects that have a run id, regardless of whether they change. - providedSystemMetadata.setLastRunId( - providedSystemMetadata.getRunId(GetMode.NULL), SetMode.IGNORE_NULL); + writeItem + .getSystemMetadata() + .setLastRunId(writeItem.getSystemMetadata().getRunId(GetMode.NULL), SetMode.IGNORE_NULL); // 2. Compare the latest existing and new. - final RecordTemplate oldValue = latest == null ? null : latest.getRecordTemplate(); + final EntityAspect.EntitySystemAspect previousBatchAspect = + (EntityAspect.EntitySystemAspect) writeItem.getPreviousSystemAspect(); + final RecordTemplate previousValue = + previousBatchAspect == null ? null : previousBatchAspect.getRecordTemplate(); // 3. If there is no difference between existing and new, we just update // the lastObserved in system metadata. RunId should stay as the original runId - if (oldValue != null && DataTemplateUtil.areEqual(oldValue, newValue)) { - SystemMetadata latestSystemMetadata = latest.getSystemMetadata(); - latestSystemMetadata.setLastObserved(providedSystemMetadata.getLastObserved()); + if (previousValue != null + && DataTemplateUtil.areEqual(previousValue, writeItem.getRecordTemplate())) { + + SystemMetadata latestSystemMetadata = previousBatchAspect.getSystemMetadata(); + latestSystemMetadata.setLastObserved(writeItem.getSystemMetadata().getLastObserved()); latestSystemMetadata.setLastRunId( - providedSystemMetadata.getLastRunId(GetMode.NULL), SetMode.IGNORE_NULL); + writeItem.getSystemMetadata().getLastRunId(GetMode.NULL), SetMode.IGNORE_NULL); - latest.getEntityAspect().setSystemMetadata(RecordUtils.toJsonString(latestSystemMetadata)); + previousBatchAspect + .getEntityAspect() + .setSystemMetadata(RecordUtils.toJsonString(latestSystemMetadata)); - log.info("Ingesting aspect with name {}, urn {}", aspectName, urn); - aspectDao.saveAspect(txContext, latest.getEntityAspect(), false); + log.info( + "Ingesting aspect with name {}, urn {}", + previousBatchAspect.getAspectName(), + previousBatchAspect.getUrn()); + aspectDao.saveAspect(txContext, previousBatchAspect.getEntityAspect(), false); // metrics aspectDao.incrementWriteMetrics( - aspectName, 1, latest.getMetadataRaw().getBytes(StandardCharsets.UTF_8).length); + previousBatchAspect.getAspectName(), + 1, + previousBatchAspect.getMetadataRaw().getBytes(StandardCharsets.UTF_8).length); return UpdateAspectResult.builder() - .urn(urn) - .oldValue(oldValue) - .newValue(oldValue) - .oldSystemMetadata(latest.getSystemMetadata()) + .urn(writeItem.getUrn()) + .oldValue(previousValue) + .newValue(previousValue) + .oldSystemMetadata(previousBatchAspect.getSystemMetadata()) .newSystemMetadata(latestSystemMetadata) .operation(MetadataAuditOperation.UPDATE) - .auditStamp(auditStamp) + 
.auditStamp(writeItem.getAuditStamp()) .maxVersion(0) .build(); } // 4. Save the newValue as the latest version - log.debug("Ingesting aspect with name {}, urn {}", aspectName, urn); - String newValueStr = EntityApiUtils.toJsonAspect(newValue); + log.debug( + "Ingesting aspect with name {}, urn {}", writeItem.getAspectName(), writeItem.getUrn()); + String newValueStr = EntityApiUtils.toJsonAspect(writeItem.getRecordTemplate()); long versionOfOld = aspectDao.saveLatestAspect( txContext, - urn.toString(), - aspectName, - latest == null ? null : EntityApiUtils.toJsonAspect(oldValue), - latest == null ? null : latest.getCreatedBy(), - latest == null ? null : latest.getEntityAspect().getCreatedFor(), - latest == null ? null : latest.getCreatedOn(), - latest == null ? null : latest.getSystemMetadataRaw(), + writeItem.getUrn().toString(), + writeItem.getAspectName(), + previousBatchAspect == null ? null : EntityApiUtils.toJsonAspect(previousValue), + previousBatchAspect == null ? null : previousBatchAspect.getCreatedBy(), + previousBatchAspect == null + ? null + : previousBatchAspect.getEntityAspect().getCreatedFor(), + previousBatchAspect == null ? null : previousBatchAspect.getCreatedOn(), + previousBatchAspect == null ? null : previousBatchAspect.getSystemMetadataRaw(), newValueStr, - auditStamp.getActor().toString(), - auditStamp.hasImpersonator() ? auditStamp.getImpersonator().toString() : null, - new Timestamp(auditStamp.getTime()), - EntityApiUtils.toJsonAspect(providedSystemMetadata), - nextVersion); + writeItem.getAuditStamp().getActor().toString(), + writeItem.getAuditStamp().hasImpersonator() + ? writeItem.getAuditStamp().getImpersonator().toString() + : null, + new Timestamp(writeItem.getAuditStamp().getTime()), + EntityApiUtils.toJsonAspect(writeItem.getSystemMetadata()), + writeItem.getNextAspectVersion()); // metrics aspectDao.incrementWriteMetrics( - aspectName, 1, newValueStr.getBytes(StandardCharsets.UTF_8).length); + writeItem.getAspectName(), 1, newValueStr.getBytes(StandardCharsets.UTF_8).length); return UpdateAspectResult.builder() - .urn(urn) - .oldValue(oldValue) - .newValue(newValue) - .oldSystemMetadata(latest == null ? null : latest.getSystemMetadata()) - .newSystemMetadata(providedSystemMetadata) + .urn(writeItem.getUrn()) + .oldValue(previousValue) + .newValue(writeItem.getRecordTemplate()) + .oldSystemMetadata( + previousBatchAspect == null ? 
null : previousBatchAspect.getSystemMetadata()) + .newSystemMetadata(writeItem.getSystemMetadata()) .operation(MetadataAuditOperation.UPDATE) - .auditStamp(auditStamp) + .auditStamp(writeItem.getAuditStamp()) .maxVersion(versionOfOld) .build(); } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java index 61bba11098fae2..35f133cc794f2a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESBrowseDAO.java @@ -36,6 +36,7 @@ import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; @@ -425,7 +426,10 @@ public List getBrowsePaths( if (!sourceMap.containsKey(BROWSE_PATH)) { return Collections.emptyList(); } - return (List) sourceMap.get(BROWSE_PATH); + List browsePaths = + ((List) sourceMap.get(BROWSE_PATH)) + .stream().filter(Objects::nonNull).collect(Collectors.toList()); + return browsePaths; } public BrowseResultV2 browseV2( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/structuredproperties/validation/PropertyDefinitionValidator.java b/metadata-io/src/main/java/com/linkedin/metadata/structuredproperties/validation/PropertyDefinitionValidator.java index ae5472af622ad5..6e047c12da9a9f 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/structuredproperties/validation/PropertyDefinitionValidator.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/structuredproperties/validation/PropertyDefinitionValidator.java @@ -89,6 +89,9 @@ public static Stream validateDefinitionUpserts( item.getAspect(StructuredPropertyDefinition.class); versionFormatCheck(item, newDefinition.getVersion()).ifPresent(exceptions::addException); + urnIdCheck(item).ifPresent(exceptions::addException); + qualifiedNameCheck(item, newDefinition.getQualifiedName()) + .ifPresent(exceptions::addException); if (item.getPreviousSystemAspect() != null) { @@ -192,4 +195,20 @@ private static Optional versionFormatCheck( } return Optional.empty(); } + + private static Optional urnIdCheck(MCPItem item) { + if (item.getUrn().getId().contains(" ")) { + return Optional.of(AspectValidationException.forItem(item, "Urn ID cannot have spaces")); + } + return Optional.empty(); + } + + private static Optional qualifiedNameCheck( + MCPItem item, @Nonnull String qualifiedName) { + if (qualifiedName.contains(" ")) { + return Optional.of( + AspectValidationException.forItem(item, "Qualified names cannot have spaces")); + } + return Optional.empty(); + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java index 1151014bf1162f..976b165fea53df 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/dataproducts/sideeffects/DataProductUnsetSideEffectTest.java @@ -17,6 +17,7 @@ import com.linkedin.events.metadata.ChangeType; import com.linkedin.metadata.aspect.AspectRetriever; import com.linkedin.metadata.aspect.GraphRetriever; +import com.linkedin.metadata.aspect.SystemAspect; import com.linkedin.metadata.aspect.batch.MCPItem; 
import com.linkedin.metadata.aspect.models.graph.RelatedEntities; import com.linkedin.metadata.aspect.models.graph.RelatedEntitiesScrollResult; @@ -34,6 +35,8 @@ import com.linkedin.metadata.utils.AuditStampUtils; import com.linkedin.test.metadata.aspect.TestEntityRegistry; import io.datahubproject.metadata.context.RetrieverContext; +import jakarta.json.JsonArray; +import jakarta.json.JsonObject; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -45,13 +48,7 @@ public class DataProductUnsetSideEffectTest { private static final EntityRegistry TEST_REGISTRY = new TestEntityRegistry(); private static final List SUPPORTED_CHANGE_TYPES = - List.of( - ChangeType.CREATE, - ChangeType.PATCH, - ChangeType.CREATE_ENTITY, - ChangeType.UPSERT, - ChangeType.DELETE, - ChangeType.RESTATE); + List.of(ChangeType.CREATE, ChangeType.CREATE_ENTITY, ChangeType.UPSERT, ChangeType.RESTATE); private static final Urn TEST_PRODUCT_URN = UrnUtils.getUrn("urn:li:dataProduct:someDataProductId"); @@ -251,6 +248,214 @@ public void testDPRemoveOld() { .build(mockAspectRetriever.getEntityRegistry()))); } + @Test + public void testBulkAssetMove() { + DataProductUnsetSideEffect test = new DataProductUnsetSideEffect(); + test.setConfig(TEST_PLUGIN_CONFIG); + + // Create 100 dataset URNs and set up their existing relationships + List datasetUrns = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + Urn datasetUrn = + UrnUtils.getUrn( + String.format("urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_%d,PROD)", i)); + datasetUrns.add(datasetUrn); + + // Mock the existing relationship for each dataset with the old data product + RelatedEntities relatedEntities = + new RelatedEntities( + "DataProductContains", + TEST_PRODUCT_URN_2.toString(), // Old data product + datasetUrn.toString(), + RelationshipDirection.INCOMING, + null); + + List relatedEntitiesList = new ArrayList<>(); + relatedEntitiesList.add(relatedEntities); + RelatedEntitiesScrollResult relatedEntitiesScrollResult = + new RelatedEntitiesScrollResult(1, 10, null, relatedEntitiesList); + + when(retrieverContext + .getGraphRetriever() + .scrollRelatedEntities( + eq(null), + eq(QueryUtils.newFilter("urn", datasetUrn.toString())), + eq(null), + eq(EMPTY_FILTER), + eq(ImmutableList.of("DataProductContains")), + eq( + QueryUtils.newRelationshipFilter( + EMPTY_FILTER, RelationshipDirection.INCOMING)), + eq(Collections.emptyList()), + eq(null), + eq(10), + eq(null), + eq(null))) + .thenReturn(relatedEntitiesScrollResult); + } + + // Create data product properties with all 100 assets + DataProductProperties dataProductProperties = new DataProductProperties(); + DataProductAssociationArray dataProductAssociations = new DataProductAssociationArray(); + for (Urn datasetUrn : datasetUrns) { + DataProductAssociation association = new DataProductAssociation(); + association.setDestinationUrn(datasetUrn); + dataProductAssociations.add(association); + } + dataProductProperties.setAssets(dataProductAssociations); + + // Run test + ChangeItemImpl dataProductPropertiesChangeItem = + ChangeItemImpl.builder() + .urn(TEST_PRODUCT_URN) // New data product + .aspectName(DATA_PRODUCT_PROPERTIES_ASPECT_NAME) + .changeType(ChangeType.UPSERT) + .entitySpec(TEST_REGISTRY.getEntitySpec(DATA_PRODUCT_ENTITY_NAME)) + .aspectSpec( + TEST_REGISTRY + .getEntitySpec(DATA_PRODUCT_ENTITY_NAME) + .getAspectSpec(DATA_PRODUCT_PROPERTIES_ASPECT_NAME)) + .recordTemplate(dataProductProperties) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + 
.build(mockAspectRetriever); + + List testOutput = + test.postMCPSideEffect( + List.of( + MCLItemImpl.builder() + .build( + dataProductPropertiesChangeItem, + null, + null, + retrieverContext.getAspectRetriever())), + retrieverContext) + .toList(); + + // Verify test + assertEquals(testOutput.size(), 1, "Expected one patch to remove assets from old data product"); + + MCPItem patchItem = testOutput.get(0); + assertEquals( + patchItem.getUrn(), TEST_PRODUCT_URN_2, "Patch should target the old data product"); + assertEquals(patchItem.getAspectName(), DATA_PRODUCT_PROPERTIES_ASPECT_NAME); + + // Verify the patch contains remove operations for all 100 assets + JsonArray patchArray = ((PatchItemImpl) patchItem).getPatch().toJsonArray(); + assertEquals(patchArray.size(), 100, "Should have 100 remove operations"); + + // Verify each remove operation + for (int i = 0; i < 100; i++) { + JsonObject op = patchArray.getJsonObject(i); + assertEquals(op.getString("op"), PatchOperationType.REMOVE.getValue()); + assertEquals( + op.getString("path"), + String.format("/assets/urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_%d,PROD)", i)); + } + } + + @Test + public void testUpsertWithPreviousAspect() { + DataProductUnsetSideEffect test = new DataProductUnsetSideEffect(); + test.setConfig(TEST_PLUGIN_CONFIG); + + // Case 1: UPSERT with new additions + DataProductProperties previousProperties = new DataProductProperties(); + DataProductAssociationArray previousAssociations = new DataProductAssociationArray(); + DataProductAssociation previousAssociation = new DataProductAssociation(); + previousAssociation.setDestinationUrn(DATASET_URN_1); + previousAssociations.add(previousAssociation); + previousProperties.setAssets(previousAssociations); + + // New properties include both old and new datasets + DataProductProperties newProperties = new DataProductProperties(); + DataProductAssociationArray newAssociations = new DataProductAssociationArray(); + DataProductAssociation association1 = new DataProductAssociation(); + association1.setDestinationUrn(DATASET_URN_1); + DataProductAssociation association2 = new DataProductAssociation(); + association2.setDestinationUrn(DATASET_URN_2); + newAssociations.add(association1); + newAssociations.add(association2); + newProperties.setAssets(newAssociations); + + // Create change item with previous aspect + SystemAspect prevData = mock(SystemAspect.class); + when(prevData.getRecordTemplate()).thenReturn(previousProperties); + + ChangeItemImpl dataProductPropertiesChangeItem = + ChangeItemImpl.builder() + .urn(TEST_PRODUCT_URN) + .aspectName(DATA_PRODUCT_PROPERTIES_ASPECT_NAME) + .changeType(ChangeType.UPSERT) + .entitySpec(TEST_REGISTRY.getEntitySpec(DATA_PRODUCT_ENTITY_NAME)) + .aspectSpec( + TEST_REGISTRY + .getEntitySpec(DATA_PRODUCT_ENTITY_NAME) + .getAspectSpec(DATA_PRODUCT_PROPERTIES_ASPECT_NAME)) + .recordTemplate(newProperties) + .previousSystemAspect(prevData) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build(mockAspectRetriever); + + List testOutput = + test.postMCPSideEffect( + List.of( + MCLItemImpl.builder() + .build( + dataProductPropertiesChangeItem, + null, + null, + retrieverContext.getAspectRetriever())), + retrieverContext) + .toList(); + + // Verify that only one patch is generated for the new dataset + assertEquals( + testOutput.size(), 1, "Expected removal of previous data product for new dataset only"); + MCPItem patchItem = testOutput.get(0); + assertEquals( + patchItem.getUrn(), TEST_PRODUCT_URN_2, "Patch should target the old data 
product"); + GenericJsonPatch.PatchOp expectedPatchOp = new GenericJsonPatch.PatchOp(); + expectedPatchOp.setOp(PatchOperationType.REMOVE.getValue()); + expectedPatchOp.setPath(String.format("/assets/%s", DATASET_URN_2)); + + // Case 2: UPSERT with no new additions + DataProductProperties sameProperties = new DataProductProperties(); + DataProductAssociationArray sameAssociations = new DataProductAssociationArray(); + DataProductAssociation sameAssociation = new DataProductAssociation(); + sameAssociation.setDestinationUrn(DATASET_URN_1); + sameAssociations.add(sameAssociation); + sameProperties.setAssets(sameAssociations); + + SystemAspect prevSameData = mock(SystemAspect.class); + when(prevData.getRecordTemplate()).thenReturn(sameProperties); + + ChangeItemImpl noChangeItem = + ChangeItemImpl.builder() + .urn(TEST_PRODUCT_URN) + .aspectName(DATA_PRODUCT_PROPERTIES_ASPECT_NAME) + .changeType(ChangeType.UPSERT) + .entitySpec(TEST_REGISTRY.getEntitySpec(DATA_PRODUCT_ENTITY_NAME)) + .aspectSpec( + TEST_REGISTRY + .getEntitySpec(DATA_PRODUCT_ENTITY_NAME) + .getAspectSpec(DATA_PRODUCT_PROPERTIES_ASPECT_NAME)) + .recordTemplate(sameProperties) + .previousSystemAspect(prevSameData) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build(mockAspectRetriever); + + List noChangeOutput = + test.postMCPSideEffect( + List.of( + MCLItemImpl.builder() + .build(noChangeItem, null, null, retrieverContext.getAspectRetriever())), + retrieverContext) + .toList(); + + // Verify no patches are generated when there are no new additions + assertEquals(noChangeOutput.size(), 0, "Expected no changes when assets are the same"); + } + private static DataProductProperties getTestDataProductProperties(Urn destinationUrn) { DataProductProperties dataProductProperties = new DataProductProperties(); DataProductAssociationArray dataProductAssociations = new DataProductAssociationArray(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java index 04c9297b1ed7aa..f2ed2fddba7654 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EbeanEntityServiceTest.java @@ -1,6 +1,8 @@ package com.linkedin.metadata.entity; import static com.linkedin.metadata.Constants.CORP_USER_ENTITY_NAME; +import static com.linkedin.metadata.Constants.DATASET_ENTITY_NAME; +import static com.linkedin.metadata.Constants.GLOBAL_TAGS_ASPECT_NAME; import static com.linkedin.metadata.Constants.STATUS_ASPECT_NAME; import static org.mockito.Mockito.mock; import static org.testng.Assert.assertEquals; @@ -8,7 +10,11 @@ import static org.testng.Assert.assertTrue; import com.linkedin.common.AuditStamp; +import com.linkedin.common.GlobalTags; import com.linkedin.common.Status; +import com.linkedin.common.TagAssociation; +import com.linkedin.common.TagAssociationArray; +import com.linkedin.common.urn.TagUrn; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.template.DataTemplateUtil; @@ -18,17 +24,21 @@ import com.linkedin.metadata.AspectGenerationUtils; import com.linkedin.metadata.Constants; import com.linkedin.metadata.EbeanTestUtils; +import com.linkedin.metadata.aspect.patch.GenericJsonPatch; +import com.linkedin.metadata.aspect.patch.PatchOperationType; import com.linkedin.metadata.config.EbeanConfiguration; import com.linkedin.metadata.config.PreProcessHooks; import 
com.linkedin.metadata.entity.ebean.EbeanAspectDao; import com.linkedin.metadata.entity.ebean.EbeanRetentionService; import com.linkedin.metadata.entity.ebean.batch.AspectsBatchImpl; import com.linkedin.metadata.entity.ebean.batch.ChangeItemImpl; +import com.linkedin.metadata.entity.ebean.batch.PatchItemImpl; import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.key.CorpUserKey; import com.linkedin.metadata.models.registry.EntityRegistryException; import com.linkedin.metadata.query.ListUrnsResult; import com.linkedin.metadata.service.UpdateIndicesService; +import com.linkedin.metadata.utils.AuditStampUtils; import com.linkedin.metadata.utils.PegasusUtils; import com.linkedin.mxe.MetadataChangeProposal; import com.linkedin.mxe.SystemMetadata; @@ -433,6 +443,220 @@ public void testBatchDuplicate() throws Exception { "Expected 2nd item to be the latest"); } + @Test + public void testBatchPatchWithTrailingNoOp() throws Exception { + Urn entityUrn = + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:snowflake,testBatchPatchWithTrailingNoOp,PROD)"); + TagUrn tag1 = TagUrn.createFromString("urn:li:tag:tag1"); + Urn tag2 = UrnUtils.getUrn("urn:li:tag:tag2"); + Urn tagOther = UrnUtils.getUrn("urn:li:tag:other"); + + SystemMetadata systemMetadata = AspectGenerationUtils.createSystemMetadata(); + + ChangeItemImpl initialAspectTag1 = + ChangeItemImpl.builder() + .urn(entityUrn) + .aspectName(GLOBAL_TAGS_ASPECT_NAME) + .recordTemplate( + new GlobalTags() + .setTags(new TagAssociationArray(new TagAssociation().setTag(tag1)))) + .systemMetadata(systemMetadata.copy()) + .auditStamp(TEST_AUDIT_STAMP) + .build(TestOperationContexts.emptyAspectRetriever(null)); + + PatchItemImpl patchAdd2 = + PatchItemImpl.builder() + .urn(entityUrn) + .entitySpec(_testEntityRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectName(GLOBAL_TAGS_ASPECT_NAME) + .aspectSpec( + _testEntityRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(GLOBAL_TAGS_ASPECT_NAME)) + .patch( + GenericJsonPatch.builder() + .arrayPrimaryKeys(Map.of("properties", List.of("tag"))) + .patch(List.of(tagPatchOp(PatchOperationType.ADD, tag2))) + .build() + .getJsonPatch()) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build(_testEntityRegistry); + + PatchItemImpl patchRemoveNonExistent = + PatchItemImpl.builder() + .urn(entityUrn) + .entitySpec(_testEntityRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectName(GLOBAL_TAGS_ASPECT_NAME) + .aspectSpec( + _testEntityRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(GLOBAL_TAGS_ASPECT_NAME)) + .patch( + GenericJsonPatch.builder() + .arrayPrimaryKeys(Map.of("properties", List.of("tag"))) + .patch(List.of(tagPatchOp(PatchOperationType.REMOVE, tagOther))) + .build() + .getJsonPatch()) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build(_testEntityRegistry); + + // establish base entity + _entityServiceImpl.ingestAspects( + opContext, + AspectsBatchImpl.builder() + .retrieverContext(opContext.getRetrieverContext().get()) + .items(List.of(initialAspectTag1)) + .build(), + false, + true); + + _entityServiceImpl.ingestAspects( + opContext, + AspectsBatchImpl.builder() + .retrieverContext(opContext.getRetrieverContext().get()) + .items(List.of(patchAdd2, patchRemoveNonExistent)) + .build(), + false, + true); + + // List aspects urns + ListUrnsResult batch = _entityServiceImpl.listUrns(opContext, entityUrn.getEntityType(), 0, 1); + + assertEquals(batch.getStart().intValue(), 0); + assertEquals(batch.getCount().intValue(), 1); + 
assertEquals(batch.getTotal().intValue(), 1); + assertEquals(batch.getEntities().size(), 1); + assertEquals(entityUrn.toString(), batch.getEntities().get(0).toString()); + + EnvelopedAspect envelopedAspect = + _entityServiceImpl.getLatestEnvelopedAspect( + opContext, DATASET_ENTITY_NAME, entityUrn, GLOBAL_TAGS_ASPECT_NAME); + assertEquals( + envelopedAspect.getSystemMetadata().getVersion(), + "2", + "Expected version 2. 1 - Initial, + 1 batch operation (1 add, 1 remove)"); + assertEquals( + new GlobalTags(envelopedAspect.getValue().data()) + .getTags().stream().map(TagAssociation::getTag).collect(Collectors.toSet()), + Set.of(tag1, tag2), + "Expected both tags"); + } + + @Test + public void testBatchPatchAdd() throws Exception { + Urn entityUrn = + UrnUtils.getUrn("urn:li:dataset:(urn:li:dataPlatform:snowflake,testBatchPatchAdd,PROD)"); + TagUrn tag1 = TagUrn.createFromString("urn:li:tag:tag1"); + TagUrn tag2 = TagUrn.createFromString("urn:li:tag:tag2"); + TagUrn tag3 = TagUrn.createFromString("urn:li:tag:tag3"); + + SystemMetadata systemMetadata = AspectGenerationUtils.createSystemMetadata(); + + ChangeItemImpl initialAspectTag1 = + ChangeItemImpl.builder() + .urn(entityUrn) + .aspectName(GLOBAL_TAGS_ASPECT_NAME) + .recordTemplate( + new GlobalTags() + .setTags(new TagAssociationArray(new TagAssociation().setTag(tag1)))) + .systemMetadata(systemMetadata.copy()) + .auditStamp(TEST_AUDIT_STAMP) + .build(TestOperationContexts.emptyAspectRetriever(null)); + + PatchItemImpl patchAdd3 = + PatchItemImpl.builder() + .urn(entityUrn) + .entitySpec(_testEntityRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectName(GLOBAL_TAGS_ASPECT_NAME) + .aspectSpec( + _testEntityRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(GLOBAL_TAGS_ASPECT_NAME)) + .patch( + GenericJsonPatch.builder() + .arrayPrimaryKeys(Map.of("properties", List.of("tag"))) + .patch(List.of(tagPatchOp(PatchOperationType.ADD, tag3))) + .build() + .getJsonPatch()) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build(_testEntityRegistry); + + PatchItemImpl patchAdd2 = + PatchItemImpl.builder() + .urn(entityUrn) + .entitySpec(_testEntityRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectName(GLOBAL_TAGS_ASPECT_NAME) + .aspectSpec( + _testEntityRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(GLOBAL_TAGS_ASPECT_NAME)) + .patch( + GenericJsonPatch.builder() + .arrayPrimaryKeys(Map.of("properties", List.of("tag"))) + .patch(List.of(tagPatchOp(PatchOperationType.ADD, tag2))) + .build() + .getJsonPatch()) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build(_testEntityRegistry); + + PatchItemImpl patchAdd1 = + PatchItemImpl.builder() + .urn(entityUrn) + .entitySpec(_testEntityRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectName(GLOBAL_TAGS_ASPECT_NAME) + .aspectSpec( + _testEntityRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(GLOBAL_TAGS_ASPECT_NAME)) + .patch( + GenericJsonPatch.builder() + .arrayPrimaryKeys(Map.of("properties", List.of("tag"))) + .patch(List.of(tagPatchOp(PatchOperationType.ADD, tag1))) + .build() + .getJsonPatch()) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build(_testEntityRegistry); + + // establish base entity + _entityServiceImpl.ingestAspects( + opContext, + AspectsBatchImpl.builder() + .retrieverContext(opContext.getRetrieverContext().get()) + .items(List.of(initialAspectTag1)) + .build(), + false, + true); + + _entityServiceImpl.ingestAspects( + opContext, + AspectsBatchImpl.builder() + 
.retrieverContext(opContext.getRetrieverContext().get()) + .items(List.of(patchAdd3, patchAdd2, patchAdd1)) + .build(), + false, + true); + + // List aspects urns + ListUrnsResult batch = _entityServiceImpl.listUrns(opContext, entityUrn.getEntityType(), 0, 1); + + assertEquals(batch.getStart().intValue(), 0); + assertEquals(batch.getCount().intValue(), 1); + assertEquals(batch.getTotal().intValue(), 1); + assertEquals(batch.getEntities().size(), 1); + assertEquals(entityUrn.toString(), batch.getEntities().get(0).toString()); + + EnvelopedAspect envelopedAspect = + _entityServiceImpl.getLatestEnvelopedAspect( + opContext, DATASET_ENTITY_NAME, entityUrn, GLOBAL_TAGS_ASPECT_NAME); + assertEquals(envelopedAspect.getSystemMetadata().getVersion(), "3", "Expected version 3"); + assertEquals( + new GlobalTags(envelopedAspect.getValue().data()) + .getTags().stream().map(TagAssociation::getTag).collect(Collectors.toSet()), + Set.of(tag1, tag2, tag3), + "Expected all tags"); + } + @Test public void dataGeneratorThreadingTest() { DataGenerator dataGenerator = new DataGenerator(opContext, _entityServiceImpl); @@ -659,4 +883,14 @@ public void run() { } } } + + private static GenericJsonPatch.PatchOp tagPatchOp(PatchOperationType op, Urn tagUrn) { + GenericJsonPatch.PatchOp patchOp = new GenericJsonPatch.PatchOp(); + patchOp.setOp(op.getValue()); + patchOp.setPath(String.format("/tags/%s", tagUrn)); + if (PatchOperationType.ADD.equals(op)) { + patchOp.setValue(Map.of("tag", tagUrn.toString())); + } + return patchOp; + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java index 53f5ebfe59728e..654c448fdec946 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/entity/EntityServiceTest.java @@ -50,10 +50,8 @@ import com.linkedin.metadata.event.EventProducer; import com.linkedin.metadata.key.CorpUserKey; import com.linkedin.metadata.models.AspectSpec; -import com.linkedin.metadata.models.registry.ConfigEntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistryException; -import com.linkedin.metadata.models.registry.MergedEntityRegistry; import com.linkedin.metadata.run.AspectRowSummary; import com.linkedin.metadata.service.UpdateIndicesService; import com.linkedin.metadata.snapshot.CorpUserSnapshot; @@ -75,6 +73,7 @@ import com.linkedin.structured.StructuredPropertyValueAssignmentArray; import com.linkedin.util.Pair; import io.datahubproject.metadata.context.OperationContext; +import io.datahubproject.test.metadata.context.TestOperationContexts; import jakarta.annotation.Nonnull; import java.util.ArrayList; import java.util.Arrays; @@ -113,18 +112,13 @@ public abstract class EntityServiceTest browsePaths = browseDAO.getBrowsePaths(opContext, "dataset", dummyUrn); assertEquals(browsePaths.size(), 1); assertEquals(browsePaths.get(0), "foo"); + + // Test the case of null browsePaths field + sourceMap.put("browsePaths", Collections.singletonList(null)); + when(mockSearchHit.getSourceAsMap()).thenReturn(sourceMap); + when(mockSearchHits.getHits()).thenReturn(new SearchHit[] {mockSearchHit}); + when(mockSearchResponse.getHits()).thenReturn(mockSearchHits); + when(mockClient.search(any(), eq(RequestOptions.DEFAULT))).thenReturn(mockSearchResponse); + List nullBrowsePaths = browseDAO.getBrowsePaths(opContext, "dataset", 
dummyUrn); + assertEquals(nullBrowsePaths.size(), 0); } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/PropertyDefinitionValidatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/PropertyDefinitionValidatorTest.java index 2af731a51145e3..18949f0566dd19 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/PropertyDefinitionValidatorTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/structuredproperties/validators/PropertyDefinitionValidatorTest.java @@ -397,4 +397,40 @@ public void testCanChangeAllowedValueDescriptions() .count(), 0); } + + @Test + public void testUrnIdWithSpace() + throws URISyntaxException, CloneNotSupportedException, AspectValidationException { + Urn propertyUrn = UrnUtils.getUrn("urn:li:structuredProperty:test me out.foo.bar"); + StructuredPropertyDefinition newProperty = new StructuredPropertyDefinition(); + newProperty.setEntityTypes(new UrnArray(Urn.createFromString("urn:li:logicalEntity:dataset"))); + newProperty.setDisplayName("oldProp"); + newProperty.setQualifiedName("foo.bar"); + newProperty.setCardinality(PropertyCardinality.MULTIPLE); + newProperty.setValueType(Urn.createFromString("urn:li:logicalType:STRING")); + assertEquals( + PropertyDefinitionValidator.validateDefinitionUpserts( + TestMCP.ofOneMCP(propertyUrn, null, newProperty, entityRegistry), + mockRetrieverContext) + .count(), + 1); + } + + @Test + public void testQualifiedNameWithSpace() + throws URISyntaxException, CloneNotSupportedException, AspectValidationException { + Urn propertyUrn = UrnUtils.getUrn("urn:li:structuredProperty:foo.bar"); + StructuredPropertyDefinition newProperty = new StructuredPropertyDefinition(); + newProperty.setEntityTypes(new UrnArray(Urn.createFromString("urn:li:logicalEntity:dataset"))); + newProperty.setDisplayName("oldProp"); + newProperty.setQualifiedName("foo.bar with spaces"); + newProperty.setCardinality(PropertyCardinality.MULTIPLE); + newProperty.setValueType(Urn.createFromString("urn:li:logicalType:STRING")); + assertEquals( + PropertyDefinitionValidator.validateDefinitionUpserts( + TestMCP.ofOneMCP(propertyUrn, null, newProperty, entityRegistry), + mockRetrieverContext) + .count(), + 1); + } } diff --git a/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl b/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl index 0b72d376b0be49..61731e8d37fd69 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaField.pdl @@ -16,7 +16,7 @@ record SchemaField { @Searchable = { "fieldName": "fieldPaths", "fieldType": "TEXT", - "boostScore": 5.0, + "boostScore": 1.0, "queryByDefault": "true" } fieldPath: SchemaFieldPath diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml b/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml index f9497258c384fc..0e283dfdfc93ca 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps.yaml @@ -38,7 +38,7 @@ bootstrap: # Ingestion Recipes - name: ingestion-datahub-gc - version: v4 + version: v5 optional: false mcps_location: "bootstrap_mcps/ingestion-datahub-gc.yaml" values_env: "DATAHUB_GC_BOOTSTRAP_VALUES" diff --git a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml 
b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml index 395eb5db534245..c0c5be85b16b1d 100644 --- a/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml +++ b/metadata-service/configuration/src/main/resources/bootstrap_mcps/ingestion-datahub-gc.yaml @@ -19,6 +19,7 @@ config: cleanup_expired_tokens: {{cleanup_expired_tokens}}{{^cleanup_expired_tokens}}false{{/cleanup_expired_tokens}} truncate_indices: {{truncate_indices}}{{^truncate_indices}}true{{/truncate_indices}} + truncate_index_older_than_days: {{truncate_indices_retention_days}}{{^truncate_indices_retention_days}}30{{/truncate_indices_retention_days}} dataprocess_cleanup: retention_days: {{dataprocess_cleanup.retention_days}}{{^dataprocess_cleanup.retention_days}}10{{/dataprocess_cleanup.retention_days}} delete_empty_data_jobs: {{dataprocess_cleanup.delete_empty_data_jobs}}{{^dataprocess_cleanup.delete_empty_data_jobs}}true{{/dataprocess_cleanup.delete_empty_data_jobs}} diff --git a/metadata-service/configuration/src/main/resources/search_config.yaml b/metadata-service/configuration/src/main/resources/search_config.yaml index e93f8af8b1d6c4..47494c8cb1ca43 100644 --- a/metadata-service/configuration/src/main/resources/search_config.yaml +++ b/metadata-service/configuration/src/main/resources/search_config.yaml @@ -65,9 +65,9 @@ queryConfigurations: boost_mode: replace # Criteria for exact-match only - # Contains quotes, is a single term with `_`, `.`, or `-` (normally consider for tokenization) then use exact match query + # Contains quotes, then use exact match query - queryRegex: >- - ^["'].+["']$|^[a-zA-Z0-9]\S+[_.-]\S+[a-zA-Z0-9]$ + ^["'].+["']$ simpleQuery: false prefixMatchQuery: true exactMatchQuery: true diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/plugins/SpringStandardPluginConfiguration.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/plugins/SpringStandardPluginConfiguration.java index b2db0857a6a5c8..26e0da8e6fb990 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/plugins/SpringStandardPluginConfiguration.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/plugins/SpringStandardPluginConfiguration.java @@ -98,8 +98,7 @@ public MCPSideEffect dataProductUnsetSideEffect() { AspectPluginConfig.builder() .enabled(true) .className(DataProductUnsetSideEffect.class.getName()) - .supportedOperations( - List.of("CREATE", "CREATE_ENTITY", "UPSERT", "RESTATE", "DELETE", "PATCH")) + .supportedOperations(List.of("CREATE", "CREATE_ENTITY", "UPSERT", "RESTATE")) .supportedEntityAspectNames( List.of( AspectPluginConfig.EntityAspectName.builder() diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/analytics/Analytics.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/analytics/Analytics.java index 9bbe1bb35fc654..94da6308eda1f2 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/analytics/Analytics.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/analytics/Analytics.java @@ -5,7 +5,6 @@ import com.datahub.authorization.AuthUtil; import com.datahub.plugins.auth.authorization.Authorizer; import com.linkedin.analytics.GetTimeseriesAggregatedStatsResponse; -import com.linkedin.metadata.authorization.PoliciesConfig; import com.linkedin.metadata.query.filter.Filter; import 
com.linkedin.metadata.resources.restli.RestliUtils; import com.linkedin.metadata.timeseries.TimeseriesAspectService; @@ -14,7 +13,6 @@ import com.linkedin.restli.server.RestLiServiceException; import com.linkedin.restli.server.annotations.Action; import com.linkedin.restli.server.annotations.ActionParam; -import com.linkedin.restli.server.annotations.Context; import com.linkedin.restli.server.annotations.Optional; import com.linkedin.restli.server.annotations.RestLiSimpleResource; import com.linkedin.restli.server.resources.SimpleResourceTemplate; @@ -24,12 +22,10 @@ import com.linkedin.timeseries.GroupingBucket; import com.linkedin.timeseries.GroupingBucketArray; import java.util.Arrays; -import java.util.List; import javax.annotation.Nonnull; import javax.annotation.Nullable; import javax.inject.Inject; import javax.inject.Named; -import javax.servlet.http.HttpServletRequest; import io.datahubproject.metadata.context.OperationContext; import io.datahubproject.metadata.context.RequestContext; @@ -38,6 +34,7 @@ import static com.datahub.authorization.AuthUtil.isAPIAuthorized; import static com.linkedin.metadata.authorization.ApiGroup.TIMESERIES; import static com.linkedin.metadata.authorization.ApiOperation.READ; +import static com.linkedin.metadata.utils.CriterionUtils.validateAndConvert; /** Rest.li entry point: /analytics */ @Slf4j @@ -90,8 +87,9 @@ public Task getTimeseriesStats( resp.setEntityName(entityName); resp.setAspectName(aspectName); resp.setAggregationSpecs(new AggregationSpecArray(Arrays.asList(aggregationSpecs))); - if (filter != null) { - resp.setFilter(filter); + final Filter finalFilter = validateAndConvert(filter); + if (finalFilter != null) { + resp.setFilter(finalFilter); } if (groupingBuckets != null) { resp.setGroupingBuckets(new GroupingBucketArray(Arrays.asList(groupingBuckets))); @@ -99,7 +97,7 @@ public Task getTimeseriesStats( GenericTable aggregatedStatsTable = timeseriesAspectService.getAggregatedStats(opContext, - entityName, aspectName, aggregationSpecs, filter, groupingBuckets); + entityName, aspectName, aggregationSpecs, finalFilter, groupingBuckets); resp.setTable(aggregatedStatsTable); return resp; }); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java index 37dca1cecd817c..a8b9c34ab66ae6 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java @@ -11,6 +11,7 @@ import static com.linkedin.metadata.authorization.ApiOperation.READ; import static com.linkedin.metadata.resources.operations.OperationsResource.*; import static com.linkedin.metadata.resources.restli.RestliConstants.*; +import static com.linkedin.metadata.utils.CriterionUtils.validateAndConvert; import com.codahale.metrics.MetricRegistry; import com.datahub.authentication.Authentication; @@ -22,14 +23,12 @@ import com.linkedin.common.urn.Urn; import com.linkedin.metadata.aspect.EnvelopedAspectArray; import com.linkedin.metadata.aspect.VersionedAspect; -import com.linkedin.metadata.aspect.batch.BatchItem; import com.linkedin.metadata.authorization.PoliciesConfig; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.IngestResult; import com.linkedin.metadata.entity.ebean.batch.AspectsBatchImpl; import 
com.linkedin.metadata.aspect.batch.AspectsBatch; import com.linkedin.metadata.entity.validation.ValidationException; -import com.linkedin.metadata.models.registry.EntityRegistry; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.SortCriterion; import com.linkedin.metadata.resources.operations.Utils; @@ -38,7 +37,6 @@ import com.linkedin.metadata.timeseries.TimeseriesAspectService; import com.linkedin.mxe.GenericAspect; import com.linkedin.mxe.MetadataChangeProposal; -import com.linkedin.mxe.SystemMetadata; import com.linkedin.parseq.Task; import com.linkedin.restli.common.HttpStatus; import com.linkedin.restli.internal.server.methods.AnyRecord; @@ -59,8 +57,6 @@ import java.time.Clock; import java.util.Arrays; import java.util.List; -import java.util.Map; -import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; import javax.annotation.Nonnull; @@ -239,7 +235,7 @@ public Task getTimeseriesAspectValues( startTimeMillis, endTimeMillis, limit, - filter, + validateAndConvert(filter), sort))); return response; }, diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java index 30aa3ffa578c17..6c5576f2e5d9f4 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java @@ -12,6 +12,7 @@ import static com.linkedin.metadata.entity.validation.ValidationUtils.*; import static com.linkedin.metadata.resources.restli.RestliConstants.*; import static com.linkedin.metadata.search.utils.SearchUtils.*; +import static com.linkedin.metadata.utils.CriterionUtils.validateAndConvert; import static com.linkedin.metadata.utils.PegasusUtils.*; import static com.linkedin.metadata.utils.SystemMetadataUtils.generateSystemMetadataIfEmpty; @@ -401,7 +402,7 @@ public Task search( // This API is not used by the frontend for search bars so we default to structured result = entitySearchService.search(opContext, - List.of(entityName), input, filter, sortCriterionList, start, count); + List.of(entityName), input, validateAndConvert(filter), sortCriterionList, start, count); if (!isAPIAuthorizedResult( opContext, result)) { @@ -448,7 +449,7 @@ public Task searchAcrossEntities( log.info("GET SEARCH RESULTS ACROSS ENTITIES for {} with query {}", entityList, input); return RestliUtils.toTask( () -> { - SearchResult result = searchService.searchAcrossEntities(opContext, entityList, input, filter, sortCriterionList, start, count); + SearchResult result = searchService.searchAcrossEntities(opContext, entityList, input, validateAndConvert(filter), sortCriterionList, start, count); if (!isAPIAuthorizedResult( opContext, result)) { @@ -514,7 +515,7 @@ public Task scrollAcrossEntities( opContext, entityList, input, - filter, + validateAndConvert(filter), sortCriterionList, scrollId, keepAlive, @@ -583,7 +584,7 @@ public Task searchAcrossLineage( entityList, input, maxHops, - filter, + validateAndConvert(filter), sortCriterionList, start, count), @@ -648,7 +649,7 @@ public Task scrollAcrossLineage( entityList, input, maxHops, - filter, + validateAndConvert(filter), sortCriterionList, scrollId, keepAlive, @@ -683,10 +684,11 @@ public Task list( List<SortCriterion> sortCriterionList = getSortCriteria(sortCriteria, sortCriterion); - log.info("GET LIST 
RESULTS for {} with filter {}", entityName, filter); + final Filter finalFilter = validateAndConvert(filter); + log.info("GET LIST RESULTS for {} with filter {}", entityName, finalFilter); return RestliUtils.toTask( () -> { - SearchResult result = entitySearchService.filter(opContext, entityName, filter, sortCriterionList, start, count); + SearchResult result = entitySearchService.filter(opContext, entityName, finalFilter, sortCriterionList, start, count); if (!AuthUtil.isAPIAuthorizedResult( opContext, result)) { diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/EntityService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/EntityService.java index beb8bd3d090a5f..445724f0144e64 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/entity/EntityService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/entity/EntityService.java @@ -363,7 +363,9 @@ List ingestAspects( * @param auditStamp an {@link AuditStamp} containing metadata about the writer & current time * @param systemMetadata * @return the {@link RecordTemplate} representation of the written aspect object + * @deprecated Use a conditional write with ChangeType CREATE instead */ + @Deprecated RecordTemplate ingestAspectIfNotPresent( @Nonnull OperationContext opContext, @Nonnull Urn urn, diff --git a/metadata-utils/src/main/java/com/linkedin/metadata/utils/CriterionUtils.java b/metadata-utils/src/main/java/com/linkedin/metadata/utils/CriterionUtils.java index e40c4af1e0ae73..f8e138487fc168 100644 --- a/metadata-utils/src/main/java/com/linkedin/metadata/utils/CriterionUtils.java +++ b/metadata-utils/src/main/java/com/linkedin/metadata/utils/CriterionUtils.java @@ -1,17 +1,81 @@ package com.linkedin.metadata.utils; +import static com.linkedin.metadata.Constants.URN_LI_PREFIX; + import com.linkedin.data.template.StringArray; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.Criterion; +import com.linkedin.metadata.query.filter.Filter; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.List; import java.util.stream.Collectors; import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import lombok.extern.slf4j.Slf4j; +@Slf4j public class CriterionUtils { private CriterionUtils() {} + /** + * Validates and corrects Filter input for rest.li endpoints. 
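+ * A deprecated single `value` is copied into the `values` array and then cleared; an ambiguous comma-separated `value`, or a `value` that conflicts with `values`[0], is rejected with an IllegalArgumentException.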
+ * + * @param inputFilter the rest.li filter parameter + * @return validated and corrected filter + */ + @Nullable + public static Filter validateAndConvert(@Nullable Filter inputFilter) { + if (inputFilter != null) { + List<Criterion> invalidCriterion = new ArrayList<>(); + if (inputFilter.hasCriteria()) { + invalidCriterion.addAll( + inputFilter.getCriteria().stream() + .filter( + criterion -> + (criterion.hasValue() && !criterion.getValue().isEmpty()) + || !criterion.hasValue()) + .collect(Collectors.toList())); + } + if (inputFilter.hasOr()) { + invalidCriterion.addAll( + inputFilter.getOr().stream() + .flatMap(c -> c.getAnd().stream()) + .filter( + criterion -> + (criterion.hasValue() && !criterion.getValue().isEmpty()) + || !criterion.hasValue()) + .collect(Collectors.toList())); + } + + for (Criterion criterion : invalidCriterion) { + if (criterion.hasValue()) { + if ((criterion.getValue().contains(",") + && !criterion.getValue().startsWith(URN_LI_PREFIX)) + || criterion.getValue().contains(")," + URN_LI_PREFIX)) { + throw new IllegalArgumentException( + "Criterion `value` is deprecated and contains an ambiguous comma. Please use `values`."); + } + if (criterion.hasValues() && !criterion.getValue().equals(criterion.getValues().get(0))) { + throw new IllegalArgumentException( + "Criterion `value` is deprecated and `values`[0] is populated with a conflicting value."); + } + // auto-convert + if (!criterion.hasValues()) { + log.error( + "Deprecated use of a filter using Criterion's `value` has been detected and corrected. Please migrate to `values` instead."); + criterion.setValues(new StringArray(criterion.getValue())); + } + } + // `value` is a required field, so always reset it to an empty string + criterion.setValue(""); + } + } + return inputFilter; + } + public static Criterion buildExistsCriterion(@Nonnull String field) { return buildCriterion(field, Condition.EXISTS, false, Collections.emptyList()); } diff --git a/metadata-utils/src/test/java/com/linkedin/metadata/utils/CriterionUtilsTest.java b/metadata-utils/src/test/java/com/linkedin/metadata/utils/CriterionUtilsTest.java new file mode 100644 index 00000000000000..e2f22dd665c7c7 --- /dev/null +++ b/metadata-utils/src/test/java/com/linkedin/metadata/utils/CriterionUtilsTest.java @@ -0,0 +1,274 @@ +package com.linkedin.metadata.utils; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertTrue; + +import com.linkedin.data.template.StringArray; +import com.linkedin.metadata.query.filter.ConjunctiveCriterion; +import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray; +import com.linkedin.metadata.query.filter.Criterion; +import com.linkedin.metadata.query.filter.CriterionArray; +import com.linkedin.metadata.query.filter.Filter; +import org.testng.annotations.Test; + +public class CriterionUtilsTest { + @Test + public void testNullFilter() { + Filter result = CriterionUtils.validateAndConvert(null); + assertNull(result); + } + + @Test + public void testEmptyFilter() { + Filter input = new Filter(); + Filter result = CriterionUtils.validateAndConvert(input); + assertNotNull(result); + assertFalse(result.hasCriteria()); + assertFalse(result.hasOr()); + } + + @Test + public void testSimpleCriterionConversion() { + Filter input = new Filter(); + Criterion criterion = new Criterion(); + criterion.setValue("testValue"); + input.setCriteria(new CriterionArray(criterion)); + + Filter result = 
CriterionUtils.validateAndConvert(input); + + Criterion convertedCriterion = result.getCriteria().get(0); + assertEquals(convertedCriterion.getValue(), ""); + assertTrue(convertedCriterion.hasValues()); + assertEquals("testValue", convertedCriterion.getValues().get(0)); + } + + @Test + public void testOrClauseCriterionConversion() { + Filter input = new Filter(); + + // Create OR clause with AND criteria + Criterion criterion = new Criterion(); + criterion.setValue("orValue"); + + ConjunctiveCriterion conjunctive = new ConjunctiveCriterion(); + conjunctive.setAnd(new CriterionArray(criterion)); + + input.setOr(new ConjunctiveCriterionArray(conjunctive)); + + Filter result = CriterionUtils.validateAndConvert(input); + + Criterion convertedCriterion = result.getOr().get(0).getAnd().get(0); + assertEquals(convertedCriterion.getValue(), ""); + assertTrue(convertedCriterion.hasValues()); + assertEquals("orValue", convertedCriterion.getValues().get(0)); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testCommaInValueThrowsException() { + Filter input = new Filter(); + Criterion criterion = new Criterion(); + criterion.setValue("value1,value2"); + input.setCriteria(new CriterionArray(criterion)); + + CriterionUtils.validateAndConvert(input); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testConflictingValuesThrowsException() { + Filter input = new Filter(); + Criterion criterion = new Criterion(); + criterion.setValue("value1"); + criterion.setValues(new StringArray("differentValue")); + input.setCriteria(new CriterionArray(criterion)); + + CriterionUtils.validateAndConvert(input); + } + + @Test + public void testExistingValuesNotModified() { + Filter input = new Filter(); + Criterion criterion = new Criterion(); + criterion.setValue("value1"); + criterion.setValues(new StringArray("value1")); // Same value, should not throw exception + input.setCriteria(new CriterionArray(criterion)); + + Filter result = CriterionUtils.validateAndConvert(input); + + Criterion convertedCriterion = result.getCriteria().get(0); + assertEquals(convertedCriterion.getValue(), ""); + assertTrue(convertedCriterion.hasValues()); + assertEquals("value1", convertedCriterion.getValues().get(0)); + } + + @Test + public void testMultipleCriteriaConversion() { + Filter input = new Filter(); + + Criterion criterion1 = new Criterion(); + criterion1.setValue("value1"); + + Criterion criterion2 = new Criterion(); + criterion2.setValue("value2"); + + input.setCriteria(new CriterionArray(criterion1, criterion2)); + + Filter result = CriterionUtils.validateAndConvert(input); + + assertEquals(2, result.getCriteria().size()); + + for (Criterion c : result.getCriteria()) { + assertEquals(c.getValue(), ""); + assertTrue(c.hasValues()); + assertTrue(c.getValues().get(0).equals("value1") || c.getValues().get(0).equals("value2")); + } + } + + @Test + public void testMixedCriteriaAndOrClause() { + Filter input = new Filter(); + + // Add direct criteria + Criterion criterion1 = new Criterion(); + criterion1.setValue("directValue"); + input.setCriteria(new CriterionArray(criterion1)); + + // Add OR clause with AND criteria + Criterion criterion2 = new Criterion(); + criterion2.setValue("orValue"); + ConjunctiveCriterion conjunctive = new ConjunctiveCriterion(); + conjunctive.setAnd(new CriterionArray(criterion2)); + input.setOr(new ConjunctiveCriterionArray(conjunctive)); + + Filter result = CriterionUtils.validateAndConvert(input); + + // Check direct criterion + Criterion 
convertedDirect = result.getCriteria().get(0); + assertEquals(convertedDirect.getValue(), ""); + assertTrue(convertedDirect.hasValues()); + assertEquals("directValue", convertedDirect.getValues().get(0)); + + // Check OR clause criterion + Criterion convertedOr = result.getOr().get(0).getAnd().get(0); + assertEquals(convertedOr.getValue(), ""); + assertTrue(convertedOr.hasValues()); + assertEquals("orValue", convertedOr.getValues().get(0)); + } + + @Test + public void testEmptyStringValueNotConverted() { + Filter input = new Filter(); + Criterion criterion = new Criterion(); + criterion.setValue(""); // Empty string value + input.setCriteria(new CriterionArray(criterion)); + + Filter result = CriterionUtils.validateAndConvert(input); + + Criterion convertedCriterion = result.getCriteria().get(0); + assertEquals(convertedCriterion.getValue(), ""); + assertFalse(convertedCriterion.hasValues()); // Should not be converted since value was empty + } + + @Test + public void testMixedEmptyAndNonEmptyValues() { + Filter input = new Filter(); + + Criterion emptyCriterion = new Criterion(); + emptyCriterion.setValue(""); + + Criterion nonEmptyCriterion = new Criterion(); + nonEmptyCriterion.setValue("value1"); + + input.setCriteria(new CriterionArray(emptyCriterion, nonEmptyCriterion)); + + Filter result = CriterionUtils.validateAndConvert(input); + + assertEquals(2, result.getCriteria().size()); + + // Check empty criterion + Criterion convertedEmpty = result.getCriteria().get(0); + assertEquals(convertedEmpty.getValue(), ""); + assertFalse(convertedEmpty.hasValues()); + + // Check non-empty criterion + Criterion convertedNonEmpty = result.getCriteria().get(1); + assertEquals(convertedNonEmpty.getValue(), ""); + assertTrue(convertedNonEmpty.hasValues()); + assertEquals(convertedNonEmpty.getValues().get(0), "value1"); + } + + @Test + public void testOrClauseWithEmptyValues() { + Filter input = new Filter(); + + // Create OR clause with mixed empty and non-empty criteria + Criterion emptyCriterion = new Criterion(); + emptyCriterion.setValue(""); + + Criterion nonEmptyCriterion = new Criterion(); + nonEmptyCriterion.setValue("orValue"); + + ConjunctiveCriterion conjunctive = new ConjunctiveCriterion(); + conjunctive.setAnd(new CriterionArray(emptyCriterion, nonEmptyCriterion)); + + input.setOr(new ConjunctiveCriterionArray(conjunctive)); + + Filter result = CriterionUtils.validateAndConvert(input); + + // Check empty criterion + Criterion convertedEmpty = result.getOr().get(0).getAnd().get(0); + assertEquals(convertedEmpty.getValue(), ""); + assertFalse(convertedEmpty.hasValues()); + + // Check non-empty criterion + Criterion convertedNonEmpty = result.getOr().get(0).getAnd().get(1); + assertEquals(convertedNonEmpty.getValue(), ""); + assertTrue(convertedNonEmpty.hasValues()); + assertEquals(convertedNonEmpty.getValues().get(0), "orValue"); + } + + @Test + public void testCriterionWithOnlyValues() { + Filter input = new Filter(); + Criterion criterion = new Criterion(); + criterion.setValues(new StringArray("value1")); // Only has values, no value field set + input.setCriteria(new CriterionArray(criterion)); + + Filter result = CriterionUtils.validateAndConvert(input); + + Criterion convertedCriterion = result.getCriteria().get(0); + assertEquals(convertedCriterion.getValue(), ""); + assertTrue(convertedCriterion.hasValues()); + assertEquals(convertedCriterion.getValues().get(0), "value1"); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testMultiUrnThrowsException() 
{ + Filter input = new Filter(); + Criterion criterion = new Criterion(); + criterion.setValue( + "urn:li:dataset:(urn:li:dataPlatform:postgres,foo,PROD),urn:li:dataset:(urn:li:dataPlatform:postgres,foo,PROD)"); + input.setCriteria(new CriterionArray(criterion)); + + CriterionUtils.validateAndConvert(input); + } + + @Test + public void testUrnConversion() { + Filter input = new Filter(); + Criterion criterion = new Criterion(); + criterion.setValue("urn:li:dataset:(urn:li:dataPlatform:postgres,foo,PROD)"); + input.setCriteria(new CriterionArray(criterion)); + + Filter result = CriterionUtils.validateAndConvert(input); + + Criterion convertedCriterion = result.getCriteria().get(0); + assertEquals(convertedCriterion.getValue(), ""); + assertTrue(convertedCriterion.hasValues()); + assertEquals( + "urn:li:dataset:(urn:li:dataPlatform:postgres,foo,PROD)", + convertedCriterion.getValues().get(0)); + } +} diff --git a/settings.gradle b/settings.gradle index fa1fdb9f1a67ce..8756df31c1ac6f 100644 --- a/settings.gradle +++ b/settings.gradle @@ -75,3 +75,5 @@ include 'metadata-service:configuration' include ':metadata-jobs:common' include ':metadata-operation-context' include ':metadata-service:openapi-servlet:models' +include ':metadata-integration:java:datahub-schematron:lib' +include ':metadata-integration:java:datahub-schematron:cli' diff --git a/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js b/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js index fb772bd7af1e74..57617d7721e594 100644 --- a/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js +++ b/smoke-test/tests/cypress/cypress/e2e/siblings/siblings.js @@ -98,7 +98,7 @@ describe("siblings", () => { it("will combine results in search", () => { cy.login(); - cy.visit("/search?page=1&query=raw_orders"); + cy.visit("/search?page=1&query=%22raw_orders%22"); cy.contains("Showing 1 - 2 of ");
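+ // %22 encodes the quotes: with the search_config.yaml change above, exact-match now applies only to quoted queries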