From 7b087851fc5e7ba07f8a3f88dc0afc7f10fee420 Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Wed, 18 Jan 2023 21:27:53 -0800 Subject: [PATCH] feat(datahub-lite): adding tab completion, small serialization fixes --- docs/datahub_lite.md | 108 +++++++++++++++++- .../src/datahub/cli/lite_cli.py | 39 ++++++- .../src/datahub/lite/duckdb_lite.py | 15 ++- 3 files changed, 147 insertions(+), 15 deletions(-) diff --git a/docs/datahub_lite.md b/docs/datahub_lite.md index cefe6c3afac52..b52da68257d6f 100644 --- a/docs/datahub_lite.md +++ b/docs/datahub_lite.md @@ -1,3 +1,6 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # DataHub Lite (Experimental) ## What is it? @@ -106,12 +109,18 @@ As a convenient short-cut, you can import metadata from any standard DataHub met ## Exploring Metadata -The `datahub lite` group of commands provides a set of capabilities for you to explore the metadata you just ingested. +The `datahub lite` group of commands provides a set of capabilities for you to explore the metadata you just ingested. ### List (ls) Listing functions like a directory structure that is customized based on the kind of system being explored. DataHub's metadata is automatically organized into databases, tables, views, dashboards, charts, etc. +:::note + +Using the `ls` command below is much more pleasant when you have tab completion enabled on your shell. Check out the [Setting up Tab Completion](#tab-completion) section at the bottom of the guide. + +::: + ```shell > datahub lite ls / databases @@ -133,6 +142,9 @@ metadata_index metadata_aspect_v2 ``` + + + ### Read (get) Once you have located a path of interest, you can read metadata at that entity, by issuing a **get**. You can additionally filter the metadata retrieved from an entity by the aspect type of the metadata (e.g. to request the schema, filter by the **schemaMetadata** aspect). 
@@ -157,7 +169,7 @@ Get metadata for an entity by path ```json -> datahub lite get /databases/mysql/instances/default/databases/datahub/tables/metadata_aspect_v2 +> datahub lite get --path /databases/mysql/instances/default/databases/datahub/tables/metadata_aspect_v2 { "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datahub.metadata_aspect_v2,PROD)", "container": { @@ -313,7 +325,7 @@ Get metadata for an entity by path #### Get metadata for an entity filtered by specific aspect ```json -> datahub lite get /databases/mysql/instances/default/databases/datahub/tables/metadata_aspect_v2 --aspect status +> datahub lite get --path /databases/mysql/instances/default/databases/datahub/tables/metadata_aspect_v2 --aspect status { "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datahub.metadata_aspect_v2,PROD)", "status": { @@ -324,10 +336,17 @@ Get metadata for an entity by path } ``` +:::note + +Using the `get` command by path is much more pleasant when you have tab completion enabled on your shell. Check out the [Setting up Tab Completion](#tab-completion) section at the bottom of the guide. 
+ +::: + + #### Get metadata using the urn of the entity ```json -> datahub lite get "urn:li:dataset:(urn:li:dataPlatform:mysql,datahub.metadata_aspect_v2,PROD)" --aspect status +> datahub lite get --urn "urn:li:dataset:(urn:li:dataPlatform:mysql,datahub.metadata_aspect_v2,PROD)" --aspect status { "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datahub.metadata_aspect_v2,PROD)", "status": { @@ -344,7 +363,7 @@ Get metadata with additional details (systemMetadata) ```json -> datahub lite get /databases/mysql/instances/default/databases/datahub/tables/metadata_aspect_v2 --aspect status --verbose +> datahub lite get --path /databases/mysql/instances/default/databases/datahub/tables/metadata_aspect_v2 --aspect status --verbose { "urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datahub.metadata_aspect_v2,PROD)", "status": { @@ -518,4 +537,81 @@ DataHub Lite maintains a few derived tables to make access possible via both the ## Caveat Emptor! -DataHub Lite is a very new project. Do not use it for production use-cases. The API-s and storage formats are subject to change and we get feedback from early adopters. That said, we are really interested in accepting PR-s and suggestions for improvements to this fledgling project. \ No newline at end of file +DataHub Lite is a very new project. Do not use it for production use-cases. The APIs and storage formats are subject to change as we get feedback from early adopters. That said, we are really interested in accepting PRs and suggestions for improvements to this fledgling project. + + +## Advanced Options + +### Tab Completion + +Using the datahub lite commands like `ls` or `get` is much more pleasant when you have tab completion enabled on your shell. Tab completion is supported on the command line through the [Click Shell completion](https://click.palletsprojects.com/en/8.1.x/shell-completion/) module. 
+To set up shell completion for your shell, follow the instructions below: + +#### Option 1 (inline eval) + + + +Add this to ~/.zshrc: + +```shell +eval "$(_DATAHUB_COMPLETE=zsh_source datahub)" +``` + + + + +Add this to ~/.bashrc: + +```shell +eval "$(_DATAHUB_COMPLETE=bash_source datahub)" +``` + + + + + +#### Option 2 (external completion script) + +Using eval means that the command is invoked and evaluated every time a shell is started, which can delay shell responsiveness. To speed it up, write the generated script to a file, then source that. + + + + +Save the script somewhere. + +```shell +_DATAHUB_COMPLETE=zsh_source datahub > ~/.datahub-complete.zsh +``` + +Source the file in ~/.zshrc. + +```shell +. ~/.datahub-complete.zsh +``` + + + + +```shell +_DATAHUB_COMPLETE=bash_source datahub > ~/.datahub-complete.bash +``` + +Source the file in ~/.bashrc. + +```shell +. ~/.datahub-complete.bash +``` + + + + + +Save the script to ~/.config/fish/completions/datahub.fish: + +```shell +_DATAHUB_COMPLETE=fish_source datahub > ~/.config/fish/completions/datahub.fish +``` + + + + diff --git a/metadata-ingestion/src/datahub/cli/lite_cli.py b/metadata-ingestion/src/datahub/cli/lite_cli.py index 38447889fa58d..0532110ea2f23 100644 --- a/metadata-ingestion/src/datahub/cli/lite_cli.py +++ b/metadata-ingestion/src/datahub/cli/lite_cli.py @@ -6,6 +6,7 @@ from typing import List, Optional import click +from click.shell_completion import CompletionItem from click_default_group import DefaultGroup from datahub.cli.cli_utils import ( @@ -73,9 +74,36 @@ def list_urns() -> None: click.echo(result) +class CompleteablePath(click.ParamType): + name = "path" + + def shell_complete(self, ctx, param, incomplete): + path = incomplete or "/" + lite = _get_datahub_lite(read_only=True) + try: + completions = lite.ls(path) + return [ + CompletionItem(browseable.auto_complete.suggested_path, type="plain") + if browseable.auto_complete + else CompletionItem( + 
f"{incomplete}/{browseable.name}".replace("//", "/") + ) + for browseable in completions + if not browseable.leaf + ] + except Exception as e: + logger.debug(f"failed with {e}") + return [] + + @lite.command(context_settings=dict(allow_extra_args=True)) @click.option("--urn", required=False, type=str, help="Get metadata rooted at an urn") -@click.option("--path", required=False, type=str, help="Get metadata rooted at a path") +@click.option( + "--path", + required=False, + type=CompleteablePath(), + help="Get metadata rooted at a path", +) @click.option("-a", "--aspect", required=False, multiple=True, type=str) @click.option("--asof", required=False, type=click.DateTime(formats=["%Y-%m-%d"])) @click.option("--verbose", required=False, is_flag=True, default=False) @@ -90,7 +118,7 @@ def get( verbose: bool, ) -> None: """Get one or more metadata elements""" - + start_time = time.time() if urn is None and path is None: if not ctx.args: raise click.UsageError( @@ -142,6 +170,8 @@ def get( indent=2, ) ) + end_time = time.time() + logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis") @lite.command() @@ -182,16 +212,19 @@ def serve(port: int) -> None: @lite.command(context_settings=dict(allow_extra_args=True)) -@click.argument("path", required=False) +@click.argument("path", required=False, type=CompleteablePath()) @click.pass_context @telemetry.with_telemetry def ls(ctx: click.Context, path: Optional[str]) -> None: """List at a path""" + start_time = time.time() path = path or "/" lite = _get_datahub_lite(read_only=True) try: browseables = lite.ls(path) + end_time = time.time() + logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis") auto_complete: List[AutoComplete] = [ b.auto_complete for b in browseables if b.auto_complete is not None ] diff --git a/metadata-ingestion/src/datahub/lite/duckdb_lite.py b/metadata-ingestion/src/datahub/lite/duckdb_lite.py index a834559dbf9a4..62decfd1dc565 100644 --- 
a/metadata-ingestion/src/datahub/lite/duckdb_lite.py +++ b/metadata-ingestion/src/datahub/lite/duckdb_lite.py @@ -10,6 +10,7 @@ from datahub.emitter.aspect import ASPECT_MAP from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import mcps_from_mce +from datahub.emitter.serialization_helper import post_json_transform from datahub.lite.lite_local import ( AutoComplete, Browseable, @@ -260,7 +261,7 @@ def get( aspect: Union[dict, _Aspect] = json.loads(r[2]) if typed: assert isinstance(aspect, dict) - aspect = ASPECT_MAP[aspect_name].from_obj(aspect) + aspect = ASPECT_MAP[aspect_name].from_obj(post_json_transform(aspect)) result_map[aspect_name] = {"value": aspect} if details: @@ -496,7 +497,9 @@ def get_all_entities( aspect_name in ASPECT_MAP ), f"Missing aspect name {aspect_name} in the registry" try: - aspect_payload = ASPECT_MAP[aspect_name].from_obj(aspect_payload) + aspect_payload = ASPECT_MAP[aspect_name].from_obj( + post_json_transform(aspect_payload) + ) except Exception as e: logger.exception( f"Failed to process urn: {urn}, aspect_name: {aspect_name}, metadata: {aspect_payload}", @@ -524,7 +527,7 @@ def get_all_aspects(self) -> Iterable[MetadataChangeProposalWrapper]: for r in results.fetchall(): urn = r[0] aspect_name = r[1] - aspect_metadata = ASPECT_MAP[aspect_name].from_obj(json.loads(r[2])) # type: ignore + aspect_metadata = ASPECT_MAP[aspect_name].from_obj(post_json_transform(json.loads(r[2]))) # type: ignore system_metadata = SystemMetadataClass.from_obj(json.loads(r[3])) mcp = MetadataChangeProposalWrapper( entityUrn=urn, @@ -560,17 +563,17 @@ def get_category_from_platform(self, data_platform_urn: DataPlatformUrn) -> Urn: "iceberg", "trino", ], - "streaming_systems": ["kafka"], + "streaming": ["kafka"], "orchestrators": ["airflow", "spark"], "data_movers": ["kafka-connect", "nifi"], "transformation_tools": ["dbt"], - "data_quality_tools": ["great-expectations"], + "data_quality": ["great-expectations"], } 
for k, v in category_to_platform_map.items(): if data_platform_urn.get_entity_id_as_string() in v: return Urn(entity_type="systemNode", entity_id=[k]) - logger.warning( + logger.debug( f"Failed to find category for platform {data_platform_urn}, mapping to generic data_platform" ) return Urn(entity_type="systemNode", entity_id=["data_platforms"])