Start DBT module for running normalization transformations

ChristopheDuong committed Nov 4, 2020
1 parent 9c6c731 commit bad20b2

Showing 18 changed files with 8,242 additions and 0 deletions.
@@ -0,0 +1,15 @@
## Installing DBT

1. Activate your venv and run `pip3 install dbt`
1. Copy `airbyte-normalization/sample_files/profiles.yml` over to `~/.dbt/profiles.yml`
1. Edit it to configure your profile accordingly; a sketch of a BigQuery profile is shown below
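
A minimal sketch of what the `dev` profile in `~/.dbt/profiles.yml` could look like, assuming a BigQuery target (the project id and dataset below are placeholders to adjust):

```yaml
dev:
  target: dev
  outputs:
    dev:
      type: bigquery
      method: oauth
      project: my-gcp-project # placeholder: your GCP project id
      dataset: data # placeholder: dataset dbt writes to
      threads: 1
```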

## Running DBT

1. `cd airbyte-normalization`
1. You can now run DBT commands; to check that the setup is fine: `dbt debug`
1. To build the DBT tables in your warehouse: `dbt run`

Note that in order to work with the models currently being tested, you should have:
- `recipes` and `recipes_json` tables
- in a `data` dataset in your BigQuery project (referenced in your `profiles.yml`...
@@ -0,0 +1,33 @@
# Name your package! Package names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: 'airbyte'
version: '1.0'
config-version: 2

# This setting configures which "profile" dbt uses for this project. Profiles contain
# database connection information, and should be configured in the ~/.dbt/profiles.yml file
profile: 'dev'

# These configurations specify where dbt should look for different types of files.
# The `source-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won't need to change these!
source-paths: ["models"]
docs-paths: ["docs"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
data-paths: ["data"]
macro-paths: ["macros"]

target-path: "build" # directory which will store compiled SQL files
clean-targets: # directories to be removed by `dbt clean`
  - "build"
  - "dbt_modules"

# You can define configurations for models in the `source-paths` directory here.
# Using these configurations, you can enable or disable models, change how they
# are materialized, and more!
models:
  airbyte:
    +schema: normalization
    +materialized: view
@@ -0,0 +1,7 @@
SELECT
    *
FROM
    {{ source(
        'data',
        'recipes_json'
    ) }}
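-- Note: dbt resolves source('data', 'recipes_json') through the source definitions
-- declared below; on BigQuery this compiles to `<project>`.`data`.`recipes_json`,
-- where the project comes from the profile in ~/.dbt/profiles.yml.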
@@ -0,0 +1,7 @@
version: 2

sources:
  - name: data
    tables:
      - name: recipes
      - name: recipes_json

Large diffs are not rendered by default.

@@ -0,0 +1,264 @@
"""
MIT License

Copyright (c) 2020 Airbyte

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import json


# +
def load_catalog(file):
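    """Load an Airbyte catalog JSON file, pretty-print it, and return the parsed dict."""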
    with open(file) as f:
        catalog = json.load(f)
    print(f"From catalog {file}:")
    print("--------------------")
    print(json.dumps(catalog, sort_keys=True, indent=4))
    print("--------------------")
    return catalog


catalog = load_catalog("../sample_files/catalog.json")
github = load_catalog("../sample_files/catalog_github.json")
stripe = load_catalog("../sample_files/catalog_stripe.json")


# +
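# A JSON schema "type" can be a single string or a list such as ["null", "string"];
# the predicates below accept both forms.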
def is_string(property_type) -> bool:
    return property_type == "string" or "string" in property_type


def is_integer(property_type) -> bool:
    return property_type == "integer" or "integer" in property_type


def is_boolean(property_type) -> bool:
    return property_type == "boolean" or "boolean" in property_type


def is_array(property_type) -> bool:
    return property_type == "array" or "array" in property_type


def is_object(property_type) -> bool:
    return property_type == "object" or "object" in property_type


def find_combining_schema(properties: dict):
    return set(properties).intersection({"anyOf", "oneOf", "allOf"})


def json_extract_base_property(path: str, json_col: str, name: str, definition: dict) -> str:
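    """Return a BigQuery json_extract_scalar(...) expression casting a scalar JSON
    property to string/int64/boolean, or None for non-scalar types."""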
    current = ".".join([path, name])
    if "type" not in definition:
        return None
    elif is_string(definition["type"]):
        return f"cast(json_extract_scalar({json_col}, '{current}') as string) as {name}"
    elif is_integer(definition["type"]):
        return f"cast(json_extract_scalar({json_col}, '{current}') as int64) as {name}"
    elif is_boolean(definition["type"]):
        return f"cast(json_extract_scalar({json_col}, '{current}') as boolean) as {name}"
    else:
        return None


def json_extract_nested_property(path: str, json_col: str, name: str, definition: dict) -> str:
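    """Return a (column expression, join clause) pair: arrays are extracted with
    json_extract_array plus a cross join unnest clause, objects with json_extract."""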
    current = ".".join([path, name])
    if definition is None or "type" not in definition:
        return (None, None)
    elif is_array(definition["type"]):
        return (f"json_extract_array({json_col}, '{current}') as {name}", f"cross join unnest({name}) as {name}")
    elif is_object(definition["type"]):
        return (f"json_extract({json_col}, '{current}') as {name}", "")
    else:
        return (None, None)


# +
def select_table(table: str, columns="*"):
    return f"\nselect {columns} from {table}"


def extract_node_properties(path: str, json_col: str, properties: dict) -> dict:
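    """Map each scalar property name of a node to its SQL extraction expression."""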
    result = {}
    if properties:
        for field in properties.keys():
            sql_field = json_extract_base_property(path=path, json_col=json_col, name=field, definition=properties[field])
            if sql_field:
                result[field] = sql_field
    return result


def find_properties_object(path: str, field: str, properties) -> dict:
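    """Recursively search a schema fragment for the closest "properties" object,
    returning {field: properties} ({field: None} for a basic type), or None."""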
    if isinstance(properties, (str, int)):
        return None
    else:
        if "items" in properties:
            return find_properties_object(path, field, properties["items"])
        elif "properties" in properties:
            # we found a properties object
            return {field: properties["properties"]}
        elif "type" in properties and json_extract_base_property(path=path, json_col="", name="", definition=properties):
            # we found a basic type
            return {field: None}
        elif isinstance(properties, dict):
            for key in properties.keys():
                if not json_extract_base_property(path, "", key, properties[key]):
                    child = find_properties_object(path, key, properties[key])
                    if child:
                        return child
        elif isinstance(properties, list):
            for item in properties:
                child = find_properties_object(path=path, field=field, properties=item)
                if child:
                    return child
    return None


def extract_nested_properties(path: str, json_col: str, field: str, properties: dict) -> dict:
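    """Collect the nested (array/object) properties of a node, mapping each child
    column name to its own properties dict (None for basic types)."""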
    result = {}
    if properties:
        for key in properties.keys():
            combining = find_combining_schema(properties[key])
            if combining:
                # flatten combining schemas (anyOf/oneOf/allOf) by searching each branch
                for combo in combining:
                    found = find_properties_object(path=f"{path}.{field}.{key}", field=key, properties=properties[key][combo])
                    if found:
                        result.update(found)
            elif "type" not in properties[key]:
                pass
            elif is_array(properties[key]["type"]):
                combining = find_combining_schema(properties[key]["items"])
                if combining:
                    # flatten combining schemas (anyOf/oneOf/allOf) by searching each branch
                    for combo in combining:
                        found = find_properties_object(path=f"{path}.{key}", field=key, properties=properties[key]["items"][combo])
                        if found:
                            result.update(found)
                else:
                    found = find_properties_object(path=f"{path}.{key}", field=key, properties=properties[key]["items"])
                    if found:
                        result.update(found)
            elif is_object(properties[key]["type"]):
                found = find_properties_object(path=f"{path}.{key}", field=key, properties=properties[key])
                if found:
                    result.update(found)
    return result


def process_node(path: str, json_col: str, name: str, properties: dict, from_table: str = "", previous="with ", inject_cols="") -> dict:
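    """Recursively build the SQL for one node: a <name>_node CTE extracting its scalar
    columns, a <name>_with_id CTE adding an md5-based _<name>_hashid, then one extra
    model per nested child joined back to the parent via _<name>_foreign_hashid."""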
    result = {}
    if previous == "with ":
        prefix = previous
    else:
        prefix = previous + ","
    node_properties = extract_node_properties(path=path, json_col=json_col, properties=properties)
    node_columns = ",\n    ".join(node_properties.values())
    # FIXME: use DBT macros to be cross_db compatible instead
    hash_node_columns = (
        "coalesce(cast("
        + ' as string), ""),\n      coalesce(cast('.join(node_properties.keys())
        + ' as string), "")'
    )
    node_sql = f"""{prefix}
{name}_node as (
  select
    {inject_cols}
    {node_columns}
  from {from_table}
),
{name}_with_id as (
  select
    *,
    to_hex(md5(concat(
      {hash_node_columns}
    ))) as _{name}_hashid
  from {name}_node
)"""
    # SQL query for the current node's basic properties
    result[name] = node_sql + select_table(f"{name}_with_id")

    children_columns = extract_nested_properties(path=path, json_col=json_col, field=name, properties=properties)
    if children_columns:
        for col in children_columns.keys():
            child_col, join_child_table = json_extract_nested_property(path=path, json_col=json_col, name=col, definition=properties[col])
            child_sql = f"""{prefix}
{name}_node as (
  select
    {child_col},
    {node_columns}
  from {from_table}
),
{name}_with_id as (
  select
    to_hex(md5(concat(
      {hash_node_columns}
    ))) as _{name}_hashid,
    {col}
  from {name}_node
  {join_child_table}
)"""
            if children_columns[col]:
                children = process_node(
                    path="$",
                    json_col=col,
                    name=f"{name}_{col}",
                    properties=children_columns[col],
                    from_table=f"{name}_with_id",
                    previous=child_sql,
                    inject_cols=f"_{name}_hashid as _{name}_foreign_hashid,",
                )
                result.update(children)
            else:
                # SQL query selecting only this child column plus the parent's foreign hashid
                result[f"{name}_{col}"] = child_sql + select_table(
                    f"{name}_with_id",
                    columns=f"""
    _{name}_hashid as _{name}_foreign_hashid,
    {col}
""",
                )
    return result


def generate_dbt_model(catalog: dict, json_col: str, from_table: str) -> dict:
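    """Produce one SQL model per stream (and per nested child node) in the catalog, keyed by model name."""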
    result = {}
    for obj in catalog["streams"]:
        name = obj["name"]
        if "json_schema" in obj:
            properties = obj["json_schema"]["properties"]
        elif "schema" in obj:
            properties = obj["schema"]["properties"]
        else:
            # skip streams that declare no schema
            continue
        result.update(process_node(path="$", json_col=json_col, name=name, properties=properties, from_table=from_table))
    return result


def print_result(result):
    for name in result.keys():
        print(f"In File {name}.sql:")
        print("--------------------")
        print(result[name])
        print("--------------------")


print_result(generate_dbt_model(catalog=catalog, json_col="json_blob", from_table="`airbytesandbox.data.one_recipe_json`"))
# -

print_result(generate_dbt_model(catalog=stripe, json_col="json_blob", from_table="`airbytesandbox.data.stripe_json`"))

print_result(generate_dbt_model(catalog=github, json_col="json_blob", from_table="`airbytesandbox.data.github_json`"))