diff --git a/docs/datahub_lite.md b/docs/datahub_lite.md
index cefe6c3afac52..b52da68257d6f 100644
--- a/docs/datahub_lite.md
+++ b/docs/datahub_lite.md
@@ -1,3 +1,6 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
# DataHub Lite (Experimental)
## What is it?
@@ -106,12 +109,18 @@ As a convenient short-cut, you can import metadata from any standard DataHub met
## Exploring Metadata
-The `datahub lite` group of commands provides a set of capabilities for you to explore the metadata you just ingested.
+The `datahub lite` group of commands lets you explore the metadata you just ingested.
### List (ls)
Listing functions like a directory structure that is customized based on the kind of system being explored. DataHub's metadata is automatically organized into databases, tables, views, dashboards, charts, etc.
+:::note
+
+Using the `ls` command below is much more pleasant when you have tab completion enabled in your shell. Check out the [Tab Completion](#tab-completion) section at the bottom of this guide.
+
+:::
+
```shell
> datahub lite ls /
databases
@@ -133,6 +142,9 @@ metadata_index
metadata_aspect_v2
```
+
+
+
### Read (get)
Once you have located a path of interest, you can read metadata at that entity, by issuing a **get**. You can additionally filter the metadata retrieved from an entity by the aspect type of the metadata (e.g. to request the schema, filter by the **schemaMetadata** aspect).
@@ -157,7 +169,7 @@ Get metadata for an entity by path
```json
-> datahub lite get /databases/mysql/instances/default/databases/datahub/tables/metadata_aspect_v2
+> datahub lite get --path /databases/mysql/instances/default/databases/datahub/tables/metadata_aspect_v2
{
"urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datahub.metadata_aspect_v2,PROD)",
"container": {
@@ -313,7 +325,7 @@ Get metadata for an entity by path
#### Get metadata for an entity filtered by specific aspect
```json
-> datahub lite get /databases/mysql/instances/default/databases/datahub/tables/metadata_aspect_v2 --aspect status
+> datahub lite get --path /databases/mysql/instances/default/databases/datahub/tables/metadata_aspect_v2 --aspect status
{
"urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datahub.metadata_aspect_v2,PROD)",
"status": {
@@ -324,10 +336,17 @@ Get metadata for an entity by path
}
```
+:::note
+
+Using the `get` command with a path is much more pleasant when you have tab completion enabled in your shell. Check out the [Tab Completion](#tab-completion) section at the bottom of this guide.
+
+:::
+
+
#### Get metadata using the urn of the entity
```json
-> datahub lite get "urn:li:dataset:(urn:li:dataPlatform:mysql,datahub.metadata_aspect_v2,PROD)" --aspect status
+> datahub lite get --urn "urn:li:dataset:(urn:li:dataPlatform:mysql,datahub.metadata_aspect_v2,PROD)" --aspect status
{
"urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datahub.metadata_aspect_v2,PROD)",
"status": {
@@ -344,7 +363,7 @@ Get metadata with additional details (systemMetadata)
```json
-> datahub lite get /databases/mysql/instances/default/databases/datahub/tables/metadata_aspect_v2 --aspect status --verbose
+> datahub lite get --path /databases/mysql/instances/default/databases/datahub/tables/metadata_aspect_v2 --aspect status --verbose
{
"urn": "urn:li:dataset:(urn:li:dataPlatform:mysql,datahub.metadata_aspect_v2,PROD)",
"status": {
@@ -518,4 +537,81 @@ DataHub Lite maintains a few derived tables to make access possible via both the
## Caveat Emptor!
-DataHub Lite is a very new project. Do not use it for production use-cases. The API-s and storage formats are subject to change and we get feedback from early adopters. That said, we are really interested in accepting PR-s and suggestions for improvements to this fledgling project.
\ No newline at end of file
+DataHub Lite is a very new project. Do not use it for production use cases. The APIs and storage formats are subject to change as we gather feedback from early adopters. That said, we are very interested in accepting PRs and suggestions for improvements to this fledgling project.
+
+
+## Advanced Options
+
+### Tab Completion
+
+Using `datahub lite` commands like `ls` or `get` is much more pleasant when you have tab completion enabled in your shell. Tab completion is supported on the command line through the [Click shell completion](https://click.palletsprojects.com/en/8.1.x/shell-completion/) module.
+To set it up for your shell, follow the instructions below:
+
+#### Option 1 (inline eval)
+
+<Tabs>
+<TabItem value="zsh" label="Zsh">
+
+Add this to ~/.zshrc:
+
+```shell
+eval "$(_DATAHUB_COMPLETE=zsh_source datahub)"
+```
+
+</TabItem>
+<TabItem value="bash" label="Bash">
+
+Add this to ~/.bashrc:
+
+```shell
+eval "$(_DATAHUB_COMPLETE=bash_source datahub)"
+```
+
+</TabItem>
+</Tabs>
+
+#### Option 2 (external completion script)
+
+Using eval means that the command is invoked and evaluated every time a shell is started, which can delay shell responsiveness. To speed it up, write the generated script to a file, then source that.
+
+<Tabs>
+<TabItem value="zsh" label="Zsh">
+
+Save the script somewhere.
+
+```shell
+_DATAHUB_COMPLETE=zsh_source datahub > ~/.datahub-complete.zsh
+```
+
+Source the file in ~/.zshrc.
+
+```shell
+. ~/.datahub-complete.zsh
+```
+
+</TabItem>
+<TabItem value="bash" label="Bash">
+
+```shell
+_DATAHUB_COMPLETE=bash_source datahub > ~/.datahub-complete.bash
+```
+
+Source the file in ~/.bashrc.
+
+```shell
+. ~/.datahub-complete.bash
+```
+
+</TabItem>
+<TabItem value="fish" label="Fish">
+
+Save the script to ~/.config/fish/completions/datahub.fish:
+
+```shell
+_DATAHUB_COMPLETE=fish_source datahub > ~/.config/fish/completions/datahub.fish
+```
+
+</TabItem>
+</Tabs>
+
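+For example, once completion is active (typically after starting a new shell session), a partially typed path can be expanded with TAB. Following the MySQL example used earlier in this guide:
+
+```shell
+> datahub lite ls /databases/my<TAB>
+```
+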
diff --git a/metadata-ingestion/src/datahub/cli/lite_cli.py b/metadata-ingestion/src/datahub/cli/lite_cli.py
index 38447889fa58d..0532110ea2f23 100644
--- a/metadata-ingestion/src/datahub/cli/lite_cli.py
+++ b/metadata-ingestion/src/datahub/cli/lite_cli.py
@@ -6,6 +6,7 @@
from typing import List, Optional
import click
+from click.shell_completion import CompletionItem
from click_default_group import DefaultGroup
from datahub.cli.cli_utils import (
@@ -73,9 +74,36 @@ def list_urns() -> None:
click.echo(result)
+class CompleteablePath(click.ParamType):
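+    """A click parameter type whose shell completions are produced by browsing DataHub Lite paths."""
+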
+ name = "path"
+
+ def shell_complete(self, ctx, param, incomplete):
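+        # Treat the partially typed value as a browse path and ask DataHub Lite
+        # for its children; only non-leaf entries are offered as completions.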
+ path = incomplete or "/"
+ lite = _get_datahub_lite(read_only=True)
+ try:
+ completions = lite.ls(path)
+ return [
+ CompletionItem(browseable.auto_complete.suggested_path, type="plain")
+ if browseable.auto_complete
+ else CompletionItem(
+ f"{incomplete}/{browseable.name}".replace("//", "/")
+ )
+ for browseable in completions
+ if not browseable.leaf
+ ]
+ except Exception as e:
+            logger.debug(f"Failed to generate completions for path {path}: {e}")
+ return []
+
+
@lite.command(context_settings=dict(allow_extra_args=True))
@click.option("--urn", required=False, type=str, help="Get metadata rooted at an urn")
-@click.option("--path", required=False, type=str, help="Get metadata rooted at a path")
+@click.option(
+ "--path",
+ required=False,
+ type=CompleteablePath(),
+ help="Get metadata rooted at a path",
+)
@click.option("-a", "--aspect", required=False, multiple=True, type=str)
@click.option("--asof", required=False, type=click.DateTime(formats=["%Y-%m-%d"]))
@click.option("--verbose", required=False, is_flag=True, default=False)
@@ -90,7 +118,7 @@ def get(
verbose: bool,
) -> None:
"""Get one or more metadata elements"""
-
+ start_time = time.time()
if urn is None and path is None:
if not ctx.args:
raise click.UsageError(
@@ -142,6 +170,8 @@ def get(
indent=2,
)
)
+ end_time = time.time()
+ logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis")
@lite.command()
@@ -182,16 +212,19 @@ def serve(port: int) -> None:
@lite.command(context_settings=dict(allow_extra_args=True))
-@click.argument("path", required=False)
+@click.argument("path", required=False, type=CompleteablePath())
@click.pass_context
@telemetry.with_telemetry
def ls(ctx: click.Context, path: Optional[str]) -> None:
"""List at a path"""
+ start_time = time.time()
path = path or "/"
lite = _get_datahub_lite(read_only=True)
try:
browseables = lite.ls(path)
+ end_time = time.time()
+ logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis")
auto_complete: List[AutoComplete] = [
b.auto_complete for b in browseables if b.auto_complete is not None
]
diff --git a/metadata-ingestion/src/datahub/lite/duckdb_lite.py b/metadata-ingestion/src/datahub/lite/duckdb_lite.py
index a834559dbf9a4..62decfd1dc565 100644
--- a/metadata-ingestion/src/datahub/lite/duckdb_lite.py
+++ b/metadata-ingestion/src/datahub/lite/duckdb_lite.py
@@ -10,6 +10,7 @@
from datahub.emitter.aspect import ASPECT_MAP
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.mcp_builder import mcps_from_mce
+from datahub.emitter.serialization_helper import post_json_transform
from datahub.lite.lite_local import (
AutoComplete,
Browseable,
@@ -260,7 +261,7 @@ def get(
aspect: Union[dict, _Aspect] = json.loads(r[2])
if typed:
assert isinstance(aspect, dict)
- aspect = ASPECT_MAP[aspect_name].from_obj(aspect)
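+                # Run the stored JSON through post_json_transform so that from_obj
+                # can hydrate it into the typed aspect class.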
+ aspect = ASPECT_MAP[aspect_name].from_obj(post_json_transform(aspect))
result_map[aspect_name] = {"value": aspect}
if details:
@@ -496,7 +497,9 @@ def get_all_entities(
aspect_name in ASPECT_MAP
), f"Missing aspect name {aspect_name} in the registry"
try:
- aspect_payload = ASPECT_MAP[aspect_name].from_obj(aspect_payload)
+ aspect_payload = ASPECT_MAP[aspect_name].from_obj(
+ post_json_transform(aspect_payload)
+ )
except Exception as e:
logger.exception(
f"Failed to process urn: {urn}, aspect_name: {aspect_name}, metadata: {aspect_payload}",
@@ -524,7 +527,7 @@ def get_all_aspects(self) -> Iterable[MetadataChangeProposalWrapper]:
for r in results.fetchall():
urn = r[0]
aspect_name = r[1]
- aspect_metadata = ASPECT_MAP[aspect_name].from_obj(json.loads(r[2])) # type: ignore
+ aspect_metadata = ASPECT_MAP[aspect_name].from_obj(post_json_transform(json.loads(r[2]))) # type: ignore
system_metadata = SystemMetadataClass.from_obj(json.loads(r[3]))
mcp = MetadataChangeProposalWrapper(
entityUrn=urn,
@@ -560,17 +563,17 @@ def get_category_from_platform(self, data_platform_urn: DataPlatformUrn) -> Urn:
"iceberg",
"trino",
],
- "streaming_systems": ["kafka"],
+ "streaming": ["kafka"],
"orchestrators": ["airflow", "spark"],
"data_movers": ["kafka-connect", "nifi"],
"transformation_tools": ["dbt"],
- "data_quality_tools": ["great-expectations"],
+ "data_quality": ["great-expectations"],
}
for k, v in category_to_platform_map.items():
if data_platform_urn.get_entity_id_as_string() in v:
return Urn(entity_type="systemNode", entity_id=[k])
- logger.warning(
+ logger.debug(
f"Failed to find category for platform {data_platform_urn}, mapping to generic data_platform"
)
return Urn(entity_type="systemNode", entity_id=["data_platforms"])