
feat: remove jq requirement + tweak modeldocgen args (#6904)
Co-authored-by: Tamas Nemeth <[email protected]>
hsheth2 and treff7es authored Dec 30, 2022
1 parent b796db1 commit 62a2aa9
Showing 7 changed files with 43 additions and 66 deletions.
1 change: 0 additions & 1 deletion docker/datahub-ingestion/base.Dockerfile
@@ -17,7 +17,6 @@ RUN apt-get update && apt-get install -y \
&& apt-get install -y -qq \
# gcc \
make \
jq \
python3-ldap \
libldap2-dev \
libsasl2-dev \
4 changes: 2 additions & 2 deletions docs/cli.md
@@ -230,8 +230,8 @@ datahub delete --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset
The `get` command allows you to easily retrieve metadata from DataHub via the REST API. It works for both versioned aspects and timeseries aspects; for timeseries aspects, it fetches the latest value.
For example, the following command gets the ownership aspect of the dataset `urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)`:

```console
datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" --aspect ownership | jq
```shell-session
$ datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" --aspect ownership
{
"value": {
"com.linkedin.metadata.snapshot.DatasetSnapshot": {
```
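Since the point of this commit is dropping the `jq` dependency, the example above now shows the raw `datahub get` output instead of piping through jq. For anyone who still wants to post-process the JSON without installing jq, Python's standard-library `json.tool` is a drop-in alternative (an illustrative sketch, not part of this commit):

```sh
# pretty-print the aspect without jq, using only the Python standard library
datahub get --urn "urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)" --aspect ownership | python3 -m json.tool
```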
3 changes: 1 addition & 2 deletions docs/get-started-with-datahub.md
@@ -19,8 +19,7 @@ Before you go further, ensure you have the following installed:

* [Python >=3.7.0](https://www.python.org/downloads/)
* [Docker](https://docs.docker.com/get-docker/)
* [jq](https://stedolan.github.io/jq/download/)
* [Docker Compose](https://github.com/docker/compose/blob/master/INSTALL.md) - if using Linux
* [Docker Compose v2](https://docs.docker.com/compose/install/) - may be bundled with docker

:::note

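The new bullet folds the jq and Linux-only Compose prerequisites into a single Docker Compose v2 requirement. Whether v2 is already bundled with your Docker install can be checked like this (a sketch; `docker compose` is the v2 plugin syntax, `docker-compose` the legacy standalone binary):

```sh
# prints the Compose v2 plugin version if bundled; falls back to the v1 binary
docker compose version || docker-compose --version
```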
13 changes: 5 additions & 8 deletions docs/quickstart.md
@@ -16,19 +16,16 @@ Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space.

:::

2. Install [jq](https://stedolan.github.io/jq/download/)

3. Launch the Docker Engine from command line or the desktop app.
2. Launch the Docker Engine from command line or the desktop app.

3. Install the DataHub CLI

a. Ensure you have Python 3.7+ installed & configured. (Check using `python3 --version`).

b. Run the following commands in your terminal

```
```sh
python3 -m pip install --upgrade pip wheel setuptools
python3 -m pip uninstall datahub acryl-datahub || true # sanity check - ok if it fails
python3 -m pip install --upgrade acryl-datahub
datahub version
```
@@ -88,7 +85,7 @@ Tested & confirmed config: 2 CPUs, 8GB RAM, 2GB Swap area, and 10GB disk space.

5. To ingest the sample metadata, run the following CLI command from your terminal

```
```bash
datahub docker ingest-sample-data
```

@@ -110,13 +107,13 @@ Command not found: datahub
If running the datahub cli produces "command not found" errors inside your terminal, your system may be defaulting to an
older version of Python. Try prefixing your `datahub` commands with `python3 -m`:

```
```bash
python3 -m datahub docker quickstart
```

Another possibility is that your system PATH does not include pip's `$HOME/.local/bin` directory. On Linux, you can add this to your `~/.bashrc`:

```
```bash
if [ -d "$HOME/.local/bin" ] ; then
PATH="$HOME/.local/bin:$PATH"
fi
```
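One step the snippet leaves implicit: after editing `~/.bashrc`, reload it so the PATH change takes effect in the current shell (standard shell behavior, not specific to this diff):

```sh
source ~/.bashrc
datahub version  # should now resolve without the `python3 -m` prefix
```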
2 changes: 1 addition & 1 deletion metadata-ingestion/scripts/datahub_preflight.sh
@@ -98,7 +98,7 @@ EOF
fi

printf "✨ Setting up prerequisities\n"
brew install "jq"
# none for now, since jq was removed

printf "\e[38;2;0;255;0m✅ Done\e[38;2;255;255;255m\n"
}
61 changes: 30 additions & 31 deletions metadata-ingestion/scripts/modeldocgen.py
@@ -1,7 +1,9 @@
import glob
import json
import logging
import os
import re
import shutil
import unittest.mock
from dataclasses import Field, dataclass, field
from enum import auto
@@ -135,12 +137,7 @@ def load_schema_file(schema_file: str) -> None:
# probably an aspect schema
record_schema: avro.schema.RecordSchema = avro_schema
aspect_def = record_schema.get_prop("Aspect")
try:
aspect_definition = AspectDefinition(**aspect_def)
except Exception as e:
import pdb

breakpoint()
aspect_definition = AspectDefinition(**aspect_def)

aspect_definition.schema = record_schema
aspect_registry[aspect_definition.name] = aspect_definition
@@ -255,8 +252,9 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
timeseries_aspects_section = ""

for aspect in entity_def.aspects or []:
aspect_definition: AspectDefinition = aspect_registry.get(aspect)
aspect_definition: AspectDefinition = aspect_registry[aspect]
assert aspect_definition
assert aspect_definition.schema
deprecated_message = (
" (Deprecated)"
if aspect_definition.schema.get_prop("Deprecated")
@@ -270,7 +268,7 @@
f"\n### {aspect}{deprecated_message}{timeseries_qualifier}\n"
)
this_aspect_doc += f"{aspect_definition.schema.get_prop('doc')}\n"
this_aspect_doc += f"<details>\n<summary>Schema</summary>\n\n"
this_aspect_doc += "<details>\n<summary>Schema</summary>\n\n"
# breakpoint()
this_aspect_doc += f"```javascript\n{json.dumps(aspect_definition.schema.to_json(), indent=2)}\n```\n</details>\n"

@@ -287,20 +285,20 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
relationships_section = "\n## Relationships\n"
adjacency = graph.get_adjacency(entity_def.display_name)
if adjacency.self_loop:
relationships_section += f"\n### Self\nThese are the relationships to itself, stored in this entity's aspects"
relationships_section += "\n### Self\nThese are the relationships to itself, stored in this entity's aspects"
for relnship in adjacency.self_loop:
relationships_section += (
f"\n- {relnship.name} ({relnship.doc[1:] if relnship.doc else ''})"
)

if adjacency.outgoing:
relationships_section += f"\n### Outgoing\nThese are the relationships stored in this entity's aspects"
relationships_section += "\n### Outgoing\nThese are the relationships stored in this entity's aspects"
relationships_section += make_relnship_docs(
adjacency.outgoing, direction="outgoing"
)

if adjacency.incoming:
relationships_section += f"\n### Incoming\nThese are the relationships stored in other entity's aspects"
relationships_section += "\n### Incoming\nThese are the relationships stored in other entity's aspects"
relationships_section += make_relnship_docs(
adjacency.incoming, direction="incoming"
)
@@ -405,9 +403,6 @@ def strip_types(field_path: str) -> str:
f_field.globalTags.tags.append(
TagAssociationClass(tag="urn:li:tag:Temporal")
)
import pdb

# breakpoint()
if "Searchable" in json_dict:
f_field.globalTags = f_field.globalTags or GlobalTagsClass(
tags=[]
@@ -533,7 +528,7 @@ def get_sorted_entity_names(
(x, y) for (x, y) in entity_names if y.category == EntityCategory.CORE
]
priority_bearing_core_entities = [(x, y) for (x, y) in core_entities if y.priority]
priority_bearing_core_entities.sort(key=lambda x: x[1].priority)
priority_bearing_core_entities.sort(key=lambda t: t[1].priority)
priority_bearing_core_entities = [x for (x, y) in priority_bearing_core_entities]

non_priority_core_entities = [x for (x, y) in core_entities if not y.priority]
@@ -570,6 +565,7 @@ def preprocess_markdown(markdown_contents: str) -> str:
content_swap_register = {}
while inline_pattern.search(markdown_contents, pos=pos):
match = inline_pattern.search(markdown_contents, pos=pos)
assert match
file_name = match.group(1)
with open(file_name, "r") as fp:
inline_content = fp.read()
@@ -587,7 +583,9 @@ def preprocess_markdown(markdown_contents: str) -> str:


@click.command()
@click.argument("schema_files", type=click.Path(exists=True), nargs=-1, required=True)
@click.argument("schemas_root", type=click.Path(exists=True), required=True)
@click.option("--registry", type=click.Path(exists=True), required=True)
@click.option("--generated-docs-dir", type=click.Path(exists=True), required=True)
@click.option("--server", type=str, required=False)
@click.option("--file", type=str, required=False)
@click.option(
Expand All @@ -596,7 +594,9 @@ def preprocess_markdown(markdown_contents: str) -> str:
@click.option("--png", type=str, required=False)
@click.option("--extra-docs", type=str, required=False)
def generate(
schema_files: List[str],
schemas_root: str,
registry: str,
generated_docs_dir: str,
server: Optional[str],
file: Optional[str],
dot: Optional[str],
@@ -619,40 +619,39 @@ def generate(
final_markdown = preprocess_markdown(file_contents)
entity_extra_docs[entity_name] = final_markdown

for schema_file in schema_files:
if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
# registry file
load_registry_file(schema_file)
else:
# schema file
load_schema_file(schema_file)
# registry file
load_registry_file(registry)

# schema files
for schema_file in Path(schemas_root).glob("**/*.avsc"):
if (
schema_file.name in {"MetadataChangeEvent.avsc"}
or json.loads(schema_file.read_text()).get("Aspect") is not None
):
load_schema_file(str(schema_file))

if entity_extra_docs:
for entity_name in entity_extra_docs:

entity_registry.get(entity_name).doc_file_contents = entity_extra_docs[
entity_registry[entity_name].doc_file_contents = entity_extra_docs[
entity_name
]

relationship_graph = RelationshipGraph()
events = generate_stitched_record(relationship_graph)

generated_docs_dir = "../docs/generated/metamodel"
import shutil

shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
entity_names = [(x, entity_registry.get(x)) for x in generated_documentation]
entity_names = [(x, entity_registry[x]) for x in generated_documentation]

sorted_entity_names = get_sorted_entity_names(entity_names)

index = 0
for category, sorted_entities in sorted_entity_names:
for entity_name in sorted_entities:
entity_def = entity_registry.get(entity_name)
entity_def = entity_registry[entity_name]

entity_category = entity_def.category
entity_dir = f"{generated_docs_dir}/entities/"
import os

os.makedirs(entity_dir, exist_ok=True)

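Taken together, `generate()` now takes the schema root as a positional argument plus required `--registry` and `--generated-docs-dir` options, instead of a flat list of schema files. A minimal direct invocation, mirroring the wrapper script in the next file and assuming it is run from `metadata-ingestion/`, would look like:

```sh
python scripts/modeldocgen.py ../metadata-events/mxe-schemas/src/mainGeneratedAvroSchema/avro/ \
  --registry ../metadata-models/src/main/resources/entity-registry.yml \
  --generated-docs-dir ../docs/generated/metamodel \
  --file generated/docs/metadata_model_mces.json
```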
25 changes: 4 additions & 21 deletions metadata-ingestion/scripts/modeldocgen.sh
@@ -2,32 +2,15 @@
set -euo pipefail

OUTDIR=./generated/docs
DOCS_OUTDIR=../docs/generated/metamodel

# Note: this assumes that datahub has already been built with `./gradlew build`.
DATAHUB_ROOT=..
REGISTRY_ROOT="$DATAHUB_ROOT/metadata-models/src/main/resources"
SCHEMAS_ROOT="$DATAHUB_ROOT/metadata-events/mxe-schemas/src/mainGeneratedAvroSchema/avro/"
FILES="$REGISTRY_ROOT/entity-registry.yml $SCHEMAS_ROOT/com/linkedin/mxe/MetadataChangeEvent.avsc"
ENTITY_REGISTRY="$DATAHUB_ROOT/metadata-models/src/main/resources/entity-registry.yml"
METADATA_MODEL_DOCS_ROOT="$DATAHUB_ROOT/metadata-models/docs"
# Since we depend on jq, check if jq is installed
if ! which jq > /dev/null; then
echo "jq is not installed. Please install jq and rerun (https://stedolan.github.io/jq/)"
exit 1
fi

find $SCHEMAS_ROOT -name "*.avsc" | sort | while read file
do
# Add all other files that are aspects but not included in the above
if (jq '.Aspect' -e $file > /dev/null)
then
FILES="${FILES} ${file}"
fi
echo $FILES > /tmp/docgen_files.txt
done

FILES=$(cat /tmp/docgen_files.txt)

rm -r $OUTDIR || true
python scripts/modeldocgen.py $FILES --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} $@
python scripts/modeldocgen.py $SCHEMAS_ROOT --registry $ENTITY_REGISTRY --generated-docs-dir $DOCS_OUTDIR --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} $@
## Full version of this command that generates dot files and png files (requires pydot and graphviz)
# python scripts/modeldocgen.py $FILES --dot generated/docs/metadata_graph.dot --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} --png generated/docs/metadata_graph.png $@
# python scripts/modeldocgen.py $SCHEMAS_ROOT --registry $ENTITY_REGISTRY --generated-docs-dir $DOCS_OUTDIR --dot generated/docs/metadata_graph.dot --file generated/docs/metadata_model_mces.json --extra-docs ${METADATA_MODEL_DOCS_ROOT} --png generated/docs/metadata_graph.png $@
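Per the script's own note, it assumes the Avro schemas have already been generated by a Gradle build, and its relative paths assume it is invoked from the `metadata-ingestion` directory; for example:

```sh
# from the repository root (a sketch; assumes a prior ./gradlew build)
cd metadata-ingestion
bash scripts/modeldocgen.sh
```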
