From a66390c6b11e50c1283d88e1e6474d8dd827c992 Mon Sep 17 00:00:00 2001 From: mohdsiddique Date: Thu, 8 Dec 2022 04:39:50 +0530 Subject: [PATCH] feat(ingest): support knowledge links in business glossary (#6375) Co-authored-by: Shirshanka Das Co-authored-by: MohdSiddique Bagwan Co-authored-by: Harshal Sheth --- .../bootstrap_data/business_glossary.yml | 6 + .../source/metadata/business_glossary.py | 105 +++- .../src/datahub/utilities/urn_encoder.py | 7 +- .../business-glossary/business_glossary.yml | 72 +++ .../glossary_events_golden.json | 554 ++++++++++++++++++ .../business-glossary/glossary_to_file.yml | 10 + .../test_business_glossary.py | 43 ++ .../src/main/resources/entity-registry.yml | 1 + 8 files changed, 775 insertions(+), 23 deletions(-) create mode 100644 metadata-ingestion/tests/integration/business-glossary/business_glossary.yml create mode 100644 metadata-ingestion/tests/integration/business-glossary/glossary_events_golden.json create mode 100644 metadata-ingestion/tests/integration/business-glossary/glossary_to_file.yml create mode 100644 metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py diff --git a/metadata-ingestion/examples/bootstrap_data/business_glossary.yml b/metadata-ingestion/examples/bootstrap_data/business_glossary.yml index 70ea3a2ce3404..a80f2fec84924 100644 --- a/metadata-ingestion/examples/bootstrap_data/business_glossary.yml +++ b/metadata-ingestion/examples/bootstrap_data/business_glossary.yml @@ -7,6 +7,9 @@ url: "https://github.com/datahub-project/datahub/" nodes: - name: Classification description: A set of terms related to Data Classification + knowledge_links: + - label: Wiki link for classification + url: "https://en.wikipedia.org/wiki/Classification" terms: - name: Sensitive description: Sensitive Data @@ -110,3 +113,6 @@ nodes: source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" related_terms: - House.Kitchen + knowledge_links: + - url: "https://en.wikipedia.org/wiki/Spoon" + label: Wiki link diff --git a/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py b/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py index 9cd73057f1d31..068e107c76279 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metadata/business_glossary.py @@ -1,7 +1,10 @@ import logging +import pathlib +import time from dataclasses import dataclass, field from typing import Any, Dict, Iterable, List, Optional, Union +import pydantic from pydantic import validator from pydantic.fields import Field @@ -14,6 +17,7 @@ make_group_urn, make_user_urn, ) +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.decorators import ( # SourceCapability,; capability, SupportStatus, config_class, @@ -22,6 +26,7 @@ ) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit, UsageStatsWorkUnit +from datahub.utilities.urn_encoder import UrnEncoder logger = logging.getLogger(__name__) @@ -40,6 +45,11 @@ class Owners(ConfigModel): groups: Optional[List[str]] +class KnowledgeCard(ConfigModel): + url: Optional[str] + label: Optional[str] + + class GlossaryTermConfig(ConfigModel): id: Optional[str] name: str @@ -53,6 +63,7 @@ class GlossaryTermConfig(ConfigModel): values: Optional[List[str]] related_terms: Optional[List[str]] custom_properties: Optional[Dict[str, str]] + knowledge_links: Optional[List[KnowledgeCard]] class GlossaryNodeConfig(ConfigModel): @@ -62,6 +73,7 @@ class GlossaryNodeConfig(ConfigModel): owners: Optional[Owners] terms: Optional[List[GlossaryTermConfig]] nodes: Optional[List["GlossaryNodeConfig"]] + knowledge_links: Optional[List[KnowledgeCard]] GlossaryNodeConfig.update_forward_refs() @@ -77,7 +89,9 @@ class DefaultConfig(ConfigModel): class BusinessGlossarySourceConfig(ConfigModel): - file: str = Field(description="Path to business glossary file to ingest.") + file: pydantic.FilePath = Field( + description="Path to business glossary file to ingest." + ) enable_auto_id: bool = Field( description="Generate id field from GlossaryNode and GlossaryTerm's name field", default=False, @@ -101,6 +115,10 @@ def create_id(path: List[str], default_id: Optional[str], enable_auto_id: bool) return default_id # No need to create id from path as default_id is provided id_: str = ".".join(path) + + if UrnEncoder.contains_reserved_char(id_): + enable_auto_id = True + if enable_auto_id: id_ = datahub_guid({"path": id_}) return id_ @@ -153,14 +171,13 @@ def get_owners(owners: Owners) -> models.OwnershipClass: def get_mces( glossary: BusinessGlossaryConfig, ingestion_config: BusinessGlossarySourceConfig -) -> List[models.MetadataChangeEventClass]: - events: List[models.MetadataChangeEventClass] = [] +) -> Iterable[Union[MetadataChangeProposalWrapper, models.MetadataChangeEventClass]]: path: List[str] = [] root_owners = get_owners(glossary.owners) if glossary.nodes: for node in glossary.nodes: - events += get_mces_from_node( + yield from get_mces_from_node( node, path + [node.name], parentNode=None, @@ -171,7 +188,7 @@ def get_mces( if glossary.terms: for term in glossary.terms: - events += get_mces_from_term( + yield from get_mces_from_term( term, path + [term.name], parentNode=None, @@ -180,13 +197,39 @@ def get_mces( ingestion_config=ingestion_config, ) - return events - def get_mce_from_snapshot(snapshot: Any) -> models.MetadataChangeEventClass: return models.MetadataChangeEventClass(proposedSnapshot=snapshot) +def make_institutional_memory_mcp( + urn: str, knowledge_cards: List[KnowledgeCard] +) -> Optional[MetadataChangeProposalWrapper]: + elements: List[models.InstitutionalMemoryMetadataClass] = [] + + for knowledge_card in knowledge_cards: + if knowledge_card.label and knowledge_card.url: + elements.append( + models.InstitutionalMemoryMetadataClass( + url=knowledge_card.url, + description=knowledge_card.label, + createStamp=models.AuditStampClass( + time=int(time.time() * 1000.0), + actor="urn:li:corpuser:datahub", + message="ingestion bot", + ), + ) + ) + + if elements: + return MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=models.InstitutionalMemoryClass(elements=elements), + ) + + return None + + def get_mces_from_node( glossaryNode: GlossaryNodeConfig, path: List[str], @@ -194,7 +237,7 @@ def get_mces_from_node( parentOwners: models.OwnershipClass, defaults: DefaultConfig, ingestion_config: BusinessGlossarySourceConfig, -) -> List[models.MetadataChangeEventClass]: +) -> Iterable[Union[MetadataChangeProposalWrapper, models.MetadataChangeEventClass]]: node_urn = make_glossary_node_urn( path, glossaryNode.id, ingestion_config.enable_auto_id ) @@ -212,10 +255,18 @@ def get_mces_from_node( urn=node_urn, aspects=[node_info, node_owners, valid_status], ) - mces = [get_mce_from_snapshot(node_snapshot)] + yield get_mce_from_snapshot(node_snapshot) + + if glossaryNode.knowledge_links is not None: + mcp: Optional[MetadataChangeProposalWrapper] = make_institutional_memory_mcp( + node_urn, glossaryNode.knowledge_links + ) + if mcp is not None: + yield mcp + if glossaryNode.nodes: for node in glossaryNode.nodes: - mces += get_mces_from_node( + yield from get_mces_from_node( node, path + [node.name], parentNode=node_urn, @@ -226,7 +277,7 @@ def get_mces_from_node( if glossaryNode.terms: for term in glossaryNode.terms: - mces += get_mces_from_term( + yield from get_mces_from_term( glossaryTerm=term, path=path + [term.name], parentNode=node_urn, @@ -234,7 +285,6 @@ def get_mces_from_node( defaults=defaults, ingestion_config=ingestion_config, ) - return mces def get_mces_from_term( @@ -244,7 +294,7 @@ def get_mces_from_term( parentOwnership: models.OwnershipClass, defaults: DefaultConfig, ingestion_config: BusinessGlossarySourceConfig, -) -> List[models.MetadataChangeEventClass]: +) -> Iterable[Union[models.MetadataChangeEventClass, MetadataChangeProposalWrapper]]: term_urn = make_glossary_term_urn( path, glossaryTerm.id, ingestion_config.enable_auto_id ) @@ -338,14 +388,18 @@ def get_mces_from_term( ownership = get_owners(glossaryTerm.owners) aspects.append(ownership) - term_browse = models.BrowsePathsClass(paths=["/" + "/".join(path)]) - aspects.append(term_browse) - term_snapshot: models.GlossaryTermSnapshotClass = models.GlossaryTermSnapshotClass( urn=term_urn, aspects=aspects, ) - return [get_mce_from_snapshot(term_snapshot)] + yield get_mce_from_snapshot(term_snapshot) + + if glossaryTerm.knowledge_links: + mcp: Optional[MetadataChangeProposalWrapper] = make_institutional_memory_mcp( + term_urn, glossaryTerm.knowledge_links + ) + if mcp is not None: + yield mcp def populate_path_vs_id(glossary: BusinessGlossaryConfig) -> None: @@ -388,7 +442,7 @@ def create(cls, config_dict, ctx): config = BusinessGlossarySourceConfig.parse_obj(config_dict) return cls(ctx, config) - def load_glossary_config(self, file_name: str) -> BusinessGlossaryConfig: + def load_glossary_config(self, file_name: pathlib.Path) -> BusinessGlossaryConfig: config = load_config_file(file_name) glossary_cfg = BusinessGlossaryConfig.parse_obj(config) return glossary_cfg @@ -396,10 +450,17 @@ def load_glossary_config(self, file_name: str) -> BusinessGlossaryConfig: def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, UsageStatsWorkUnit]]: glossary_config = self.load_glossary_config(self.config.file) populate_path_vs_id(glossary_config) - for mce in get_mces(glossary_config, ingestion_config=self.config): - wu = MetadataWorkUnit(f"{mce.proposedSnapshot.urn}", mce=mce) - self.report.report_workunit(wu) - yield wu + for event in get_mces(glossary_config, ingestion_config=self.config): + if isinstance(event, models.MetadataChangeEventClass): + wu = MetadataWorkUnit(f"{event.proposedSnapshot.urn}", mce=event) + self.report.report_workunit(wu) + yield wu + elif isinstance(event, MetadataChangeProposalWrapper): + wu = MetadataWorkUnit( + id=f"{event.entityType}-{event.aspectName}-{event.entityUrn}", + mcp=event, + ) + yield wu def get_report(self): return self.report diff --git a/metadata-ingestion/src/datahub/utilities/urn_encoder.py b/metadata-ingestion/src/datahub/utilities/urn_encoder.py index 68212784da33c..706d50d942055 100644 --- a/metadata-ingestion/src/datahub/utilities/urn_encoder.py +++ b/metadata-ingestion/src/datahub/utilities/urn_encoder.py @@ -3,7 +3,8 @@ # NOTE: Frontend relies on encoding these three characters. Specifically, we decode and encode schema fields for column level lineage. # If this changes, make appropriate changes to datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts -RESERVED_CHARS = [",", "(", ")"] +RESERVED_CHARS = {",", "(", ")"} +RESERVED_CHARS_EXTENDED = RESERVED_CHARS.union({"%"}) class UrnEncoder: @@ -19,3 +20,7 @@ def encode_string(s: str) -> str: def encode_char(c: str) -> str: assert len(c) == 1, "Invalid input, Expected single character" return urllib.parse.quote(c) if c in RESERVED_CHARS else c + + @staticmethod + def contains_reserved_char(value: str) -> bool: + return bool(set(value).intersection(RESERVED_CHARS_EXTENDED)) diff --git a/metadata-ingestion/tests/integration/business-glossary/business_glossary.yml b/metadata-ingestion/tests/integration/business-glossary/business_glossary.yml new file mode 100644 index 0000000000000..9550960282872 --- /dev/null +++ b/metadata-ingestion/tests/integration/business-glossary/business_glossary.yml @@ -0,0 +1,72 @@ +version: 1 +source: DataHub +owners: + users: + - mjames +url: "https://github.com/datahub-project/datahub/" +nodes: + - name: Classification + description: A set of terms related to Data Classification + knowledge_links: + - label: Wiki link for classification + url: "https://en.wikipedia.org/wiki/Classification" + terms: + - name: Sensitive + description: Sensitive Data + custom_properties: + is_confidential: false + knowledge_links: + - label: Google Link + url: "https://www.google.com" + - name: Confidential + description: Confidential Data + custom_properties: + is_confidential: true + - name: Highly Confidential + description: Highly Confidential Data + custom_properties: + is_confidential: true + - name: Personal Information + description: All terms related to personal information + owners: + users: + - mjames + terms: + - name: Email + description: An individual's email address + inherits: + - Classification.Confidential + owners: + groups: + - Trust and Safety + - name: Address + description: A physical address + - name: Gender + description: The gender identity of the individual + inherits: + - Classification.Sensitive + - name: Clients And Accounts + description: Provides basic concepts such as account, account holder, account provider, relationship manager that are commonly used by financial services providers to describe customers and to determine counterparty identities + owners: + groups: + - finance + terms: + - name: Account + description: Container for records associated with a business arrangement for regular transactions and services + term_source: "EXTERNAL" + source_ref: FIBO + source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" + inherits: + - Classification.Highly Confidential + contains: + - Clients And Accounts.Balance + - name: Balance + description: Amount of money available or owed + term_source: "EXTERNAL" + source_ref: FIBO + source_url: "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Balance" + - name: KPIs + description: Common Business KPIs + terms: + - name: CSAT % + description: Customer Satisfaction Score diff --git a/metadata-ingestion/tests/integration/business-glossary/glossary_events_golden.json b/metadata-ingestion/tests/integration/business-glossary/glossary_events_golden.json new file mode 100644 index 0000000000000..8bd977993dca7 --- /dev/null +++ b/metadata-ingestion/tests/integration/business-glossary/glossary_events_golden.json @@ -0,0 +1,554 @@ +[ +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { + "urn": "urn:li:glossaryNode:Classification", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { + "definition": "A set of terms related to Data Classification", + "name": "Classification" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "entityType": "glossaryNode", + "entityUrn": "urn:li:glossaryNode:Classification", + "changeType": "UPSERT", + "aspectName": "institutionalMemory", + "aspect": { + "value": "{\"elements\": [{\"url\": \"https://en.wikipedia.org/wiki/Classification\", \"description\": \"Wiki link for classification\", \"createStamp\": {\"time\": 1586847600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"ingestion bot\"}}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:Classification.Sensitive", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": { + "is_confidential": "False" + }, + "name": "Sensitive", + "definition": "Sensitive Data", + "parentNode": "urn:li:glossaryNode:Classification", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "entityType": "glossaryTerm", + "entityUrn": "urn:li:glossaryTerm:Classification.Sensitive", + "changeType": "UPSERT", + "aspectName": "institutionalMemory", + "aspect": { + "value": "{\"elements\": [{\"url\": \"https://www.google.com\", \"description\": \"Google Link\", \"createStamp\": {\"time\": 1586847600000, \"actor\": \"urn:li:corpuser:datahub\", \"message\": \"ingestion bot\"}}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:Classification.Confidential", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": { + "is_confidential": "True" + }, + "name": "Confidential", + "definition": "Confidential Data", + "parentNode": "urn:li:glossaryNode:Classification", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:Classification.Highly Confidential", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": { + "is_confidential": "True" + }, + "name": "Highly Confidential", + "definition": "Highly Confidential Data", + "parentNode": "urn:li:glossaryNode:Classification", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { + "urn": "urn:li:glossaryNode:Personal Information", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { + "definition": "All terms related to personal information", + "name": "Personal Information" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:Personal Information.Email", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "Email", + "definition": "An individual's email address", + "parentNode": "urn:li:glossaryNode:Personal Information", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.glossary.GlossaryRelatedTerms": { + "isRelatedTerms": [ + "urn:li:glossaryTerm:Classification.Confidential" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpGroup:Trust and Safety", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:Personal Information.Address", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "Address", + "definition": "A physical address", + "parentNode": "urn:li:glossaryNode:Personal Information", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:Personal Information.Gender", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "Gender", + "definition": "The gender identity of the individual", + "parentNode": "urn:li:glossaryNode:Personal Information", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.glossary.GlossaryRelatedTerms": { + "isRelatedTerms": [ + "urn:li:glossaryTerm:Classification.Sensitive" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { + "urn": "urn:li:glossaryNode:Clients And Accounts", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { + "definition": "Provides basic concepts such as account, account holder, account provider, relationship manager that are commonly used by financial services providers to describe customers and to determine counterparty identities", + "name": "Clients And Accounts" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpGroup:finance", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:Clients And Accounts.Account", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "Account", + "definition": "Container for records associated with a business arrangement for regular transactions and services", + "parentNode": "urn:li:glossaryNode:Clients And Accounts", + "termSource": "EXTERNAL", + "sourceRef": "FIBO", + "sourceUrl": "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Account" + } + }, + { + "com.linkedin.pegasus2avro.glossary.GlossaryRelatedTerms": { + "isRelatedTerms": [ + "urn:li:glossaryTerm:Classification.Highly Confidential" + ], + "hasRelatedTerms": [ + "urn:li:glossaryTerm:Clients And Accounts.Balance" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpGroup:finance", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:Clients And Accounts.Balance", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "Balance", + "definition": "Amount of money available or owed", + "parentNode": "urn:li:glossaryNode:Clients And Accounts", + "termSource": "EXTERNAL", + "sourceRef": "FIBO", + "sourceUrl": "https://spec.edmcouncil.org/fibo/ontology/FBC/ProductsAndServices/ClientsAndAccounts/Balance" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpGroup:finance", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryNodeSnapshot": { + "urn": "urn:li:glossaryNode:KPIs", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryNodeInfo": { + "definition": "Common Business KPIs", + "name": "KPIs" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.GlossaryTermSnapshot": { + "urn": "urn:li:glossaryTerm:4faf1eed790370f65942f2998a7993d6", + "aspects": [ + { + "com.linkedin.pegasus2avro.glossary.GlossaryTermInfo": { + "customProperties": {}, + "name": "CSAT %", + "definition": "Customer Satisfaction Score", + "parentNode": "urn:li:glossaryNode:KPIs", + "termSource": "INTERNAL", + "sourceRef": "DataHub", + "sourceUrl": "https://github.com/datahub-project/datahub/" + } + }, + { + "com.linkedin.pegasus2avro.common.Ownership": { + "owners": [ + { + "owner": "urn:li:corpuser:mjames", + "type": "DEVELOPER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "datahub-business-glossary-2020_04_14-07_00_00" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/business-glossary/glossary_to_file.yml b/metadata-ingestion/tests/integration/business-glossary/glossary_to_file.yml new file mode 100644 index 0000000000000..4e7c4977f073f --- /dev/null +++ b/metadata-ingestion/tests/integration/business-glossary/glossary_to_file.yml @@ -0,0 +1,10 @@ +source: + type: datahub-business-glossary + config: + # Coordinates + file: ./business_glossary.yml + +sink: + type: file + config: + filename: glossary_events.json diff --git a/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py b/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py new file mode 100644 index 0000000000000..2ac9cca972bad --- /dev/null +++ b/metadata-ingestion/tests/integration/business-glossary/test_business_glossary.py @@ -0,0 +1,43 @@ +import shutil +from typing import List + +import pytest +from freezegun import freeze_time + +from datahub.ingestion.source.metadata import business_glossary +from tests.test_helpers import mce_helpers +from tests.test_helpers.click_helpers import run_datahub_cmd + +FROZEN_TIME = "2020-04-14 07:00:00" + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_glossary_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time): + test_resources_dir = pytestconfig.rootpath / "tests/integration/business-glossary" + + # Run the metadata ingestion pipeline. + config_file = (test_resources_dir / "glossary_to_file.yml").resolve() + shutil.copy(test_resources_dir / "business_glossary.yml", tmp_path) + run_datahub_cmd( + ["ingest", "--strict-warnings", "-c", f"{config_file}"], tmp_path=tmp_path + ) + # These paths change from one instance run of the clickhouse docker to the other, and the FROZEN_TIME does not apply to these. + ignore_paths: List[str] = [ + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['metadata_modification_time'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['data_paths'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['metadata_path'\]", + ] + # Verify the output. + mce_helpers.check_golden_file( + pytestconfig, + ignore_paths=ignore_paths, + output_path=tmp_path / "glossary_events.json", + golden_path=test_resources_dir / "glossary_events_golden.json", + ) + + +@freeze_time(FROZEN_TIME) +def test_auto_id_creation_on_reserved_char(): + id_: str = business_glossary.create_id(["pii", "secure % password"], None, False) + assert id_ == "24baf9389cc05c162c7148c96314d733" diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 781476873ba0a..06b4c647b7aa0 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -151,6 +151,7 @@ entities: keyAspect: glossaryNodeKey aspects: - glossaryNodeInfo + - institutionalMemory - ownership - status - name: dataHubIngestionSource