Commit
Start DBT module for running normalization transformations
1 parent 9c6c731 · commit bad20b2
Showing 18 changed files with 8,242 additions and 0 deletions.
15 changes: 15 additions & 0 deletions
...yte-integrations/bases/base-normalization/normalization/dbt-transform/README.md
@@ -0,0 +1,15 @@
## Installing DBT

1. Activate your venv and run `pip3 install dbt`
1. Copy `airbyte-normalization/sample_files/profiles.yml` over to `~/.dbt/profiles.yml`
1. Edit it to configure your profiles accordingly (a minimal example is sketched at the end of this README)

## Running DBT

1. `cd airbyte-normalization`
1. You can now run DBT commands; to check that the setup is fine, run `dbt debug`
1. To build the DBT tables in your warehouse, run `dbt run`

Note that in order to work with the current models that I am testing, you should have:
- `recipes` and `recipes_json` tables
- in a `data` dataset in your BigQuery project (referenced in your `profiles.yml`...
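
For reference, a minimal BigQuery target in `~/.dbt/profiles.yml` could look like the sketch below. The profile name `dev` matches the `profile` setting in `dbt_project.yml`; the project and dataset values are placeholders, not the contents of the actual sample file:

```yaml
dev:
  target: dev
  outputs:
    dev:
      type: bigquery
      method: oauth
      project: your-gcp-project  # placeholder GCP project id
      dataset: data              # dataset holding the raw tables
      threads: 1
```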
33 changes: 33 additions & 0 deletions
airbyte-integrations/bases/base-normalization/normalization/dbt-transform/dbt_project.yml
@@ -0,0 +1,33 @@
# Name your package! Package names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: 'airbyte'
version: '1.0'
config-version: 2

# This setting configures which "profile" dbt uses for this project. Profiles contain
# database connection information, and should be configured in the ~/.dbt/profiles.yml file
profile: 'dev'

# These configurations specify where dbt should look for different types of files.
# The `source-paths` config, for example, states that source models can be found
# in the "models/" directory. You probably won't need to change these!
source-paths: ["models"]
docs-paths: ["docs"]
analysis-paths: ["analysis"]
test-paths: ["tests"]
data-paths: ["data"]
macro-paths: ["macros"]

target-path: "build"  # directory which will store compiled SQL files
clean-targets:        # directories to be removed by `dbt clean`
  - "build"
  - "dbt_modules"

# You can define configurations for models in the `source-paths` directory here.
# Using these configurations, you can enable or disable models, change how they
# are materialized, and more!
models:
  airbyte:
    +schema: normalization
    +materialized: view
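
As a purely hypothetical illustration of these model configurations (this commit contains no `staging` subfolder), a nested directory could override the project-wide view materialization like so:

```yaml
models:
  airbyte:
    +schema: normalization
    +materialized: view
    staging:                # hypothetical subdirectory of models/
      +materialized: table  # override for just this folder
```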
7 changes: 7 additions & 0 deletions
...integrations/bases/base-normalization/normalization/dbt-transform/models/recipes_test.sql
@@ -0,0 +1,7 @@
SELECT
    *
FROM
    {{ source('data', 'recipes_json') }}
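
At compile time dbt resolves the `source()` call against `models/sources.yml` (shown next). Assuming the `data` dataset lives in the GCP project configured in `profiles.yml`, the compiled SQL would be roughly:

```sql
SELECT
    *
FROM
    `your-gcp-project`.data.recipes_json  -- project id is a placeholder
```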
7 changes: 7 additions & 0 deletions
airbyte-integrations/bases/base-normalization/normalization/dbt-transform/models/sources.yml
@@ -0,0 +1,7 @@
version: 2

sources:
  - name: data
    tables:
      - name: recipes
      - name: recipes_json
5,984 changes: 5,984 additions & 0 deletions
...ns/bases/base-normalization/normalization/dbt-transform/normalization/normalization.ipynb
Large diffs are not rendered by default.
264 changes: 264 additions & 0 deletions
...ions/bases/base-normalization/normalization/dbt-transform/normalization/normalization.txt
@@ -0,0 +1,264 @@
""" | ||
MIT License | ||
|
||
Copyright (c) 2020 Airbyte | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. | ||
""" | ||

import json


# +
def load_catalog(file):
    # load a connector catalog (JSON) and pretty-print it for inspection
    with open(file) as f:
        catalog = json.load(f)
    print(f"From catalog {file}:")
    print("--------------------")
    print(json.dumps(catalog, sort_keys=True, indent=4))
    print("--------------------")
    return catalog


catalog = load_catalog("../sample_files/catalog.json")
github = load_catalog("../sample_files/catalog_github.json")
stripe = load_catalog("../sample_files/catalog_stripe.json")

# +
def is_string(property_type) -> bool:
    return property_type == "string" or "string" in property_type


def is_integer(property_type) -> bool:
    return property_type == "integer" or "integer" in property_type


def is_boolean(property_type) -> bool:
    return property_type == "boolean" or "boolean" in property_type


def is_array(property_type) -> bool:
    return property_type == "array" or "array" in property_type


def is_object(property_type) -> bool:
    return property_type == "object" or "object" in property_type


def find_combining_schema(properties: dict):
    return set(properties).intersection({"anyOf", "oneOf", "allOf"})


def json_extract_base_property(path: str, json_col: str, name: str, definition: dict) -> str:
    current = ".".join([path, name])
    if "type" not in definition:
        return None
    elif is_string(definition["type"]):
        return f"cast(json_extract_scalar({json_col}, '{current}') as string) as {name}"
    elif is_integer(definition["type"]):
        return f"cast(json_extract_scalar({json_col}, '{current}') as int64) as {name}"
    elif is_boolean(definition["type"]):
        # BigQuery's boolean type is named `bool`, not `boolean`
        return f"cast(json_extract_scalar({json_col}, '{current}') as bool) as {name}"
    else:
        return None


def json_extract_nested_property(path: str, json_col: str, name: str, definition: dict) -> tuple:
    current = ".".join([path, name])
    if definition is None or "type" not in definition:
        return (None, None)
    elif is_array(definition["type"]):
        return (f"json_extract_array({json_col}, '{current}') as {name}", f"cross join unnest({name}) as {name}")
    elif is_object(definition["type"]):
        return (f"json_extract({json_col}, '{current}') as {name}", "")
    else:
        return (None, None)

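# Illustration (hypothetical fields, not taken from the sample catalogs):
# a string property "title" at path "$" with json_col="json_blob" compiles to
#     cast(json_extract_scalar(json_blob, '$.title') as string) as title
# while an array property "ingredients" compiles to the pair
#     json_extract_array(json_blob, '$.ingredients') as ingredients
#     cross join unnest(ingredients) as ingredients
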
# +
def select_table(table: str, columns="*"):
    return f"\nselect {columns} from {table}"


def extract_node_properties(path: str, json_col: str, properties: dict) -> dict:
    result = {}
    if properties:
        for field in properties.keys():
            sql_field = json_extract_base_property(path=path, json_col=json_col, name=field, definition=properties[field])
            if sql_field:
                result[field] = sql_field
    return result


def find_properties_object(path: str, field: str, properties) -> dict:
    if isinstance(properties, (str, int)):
        return None
    else:
        if "items" in properties:
            return find_properties_object(path, field, properties["items"])
        elif "properties" in properties:
            # we found a properties object
            return {field: properties["properties"]}
        elif "type" in properties and json_extract_base_property(path=path, json_col="", name="", definition=properties):
            # we found a basic type
            return {field: None}
        elif isinstance(properties, dict):
            for key in properties.keys():
                if not json_extract_base_property(path, "", key, properties[key]):
                    child = find_properties_object(path, key, properties[key])
                    if child:
                        return child
        elif isinstance(properties, list):
            for item in properties:
                child = find_properties_object(path=path, field=field, properties=item)
                if child:
                    return child
    return None


def extract_nested_properties(path: str, json_col: str, field: str, properties: dict) -> dict:
    result = {}
    if properties:
        for key in properties.keys():
            combining = find_combining_schema(properties[key])
            if combining:
                # descend into combining schemas (anyOf/oneOf/allOf)
                for combo in combining:
                    found = find_properties_object(path=f"{path}.{field}.{key}", field=key, properties=properties[key][combo])
                    if found:  # guard: find_properties_object may return None
                        result.update(found)
            elif "type" not in properties[key]:
                pass
            elif is_array(properties[key]["type"]):
                combining = find_combining_schema(properties[key]["items"])
                if combining:
                    # descend into combining schemas (anyOf/oneOf/allOf)
                    for combo in combining:
                        found = find_properties_object(path=f"{path}.{key}", field=key, properties=properties[key]["items"][combo])
                        if found:
                            result.update(found)
                else:
                    found = find_properties_object(path=f"{path}.{key}", field=key, properties=properties[key]["items"])
                    if found:
                        result.update(found)
            elif is_object(properties[key]["type"]):
                found = find_properties_object(path=f"{path}.{key}", field=key, properties=properties[key])
                if found:
                    result.update(found)
    return result

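# Illustration (hypothetical schema fragment): given
#     {"type": "array", "items": {"properties": {"name": {"type": "string"}}}}
# find_properties_object recurses through "items" and returns
#     {field: {"name": {"type": "string"}}}
# while a plain {"type": "string"} leaf yields {field: None}.
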
def process_node(path: str, json_col: str, name: str, properties: dict, from_table: str = "", previous="with ", inject_cols="") -> dict:
    result = {}
    if previous == "with ":
        prefix = previous
    else:
        prefix = previous + ","
    node_properties = extract_node_properties(path=path, json_col=json_col, properties=properties)
    node_columns = ",\n    ".join([sql for sql in node_properties.values()])
    # FIXME: use DBT macros to be cross_db compatible instead
    hash_node_columns = (
        "coalesce(cast("
        + ' as string), ""),\n      coalesce(cast('.join([column for column in node_properties.keys()])
        + ' as string), "")'
    )
    node_sql = f"""{prefix}
{name}_node as (
  select
    {inject_cols}
    {node_columns}
  from {from_table}
),
{name}_with_id as (
  select
    *,
    to_hex(md5(concat(
      {hash_node_columns}
    ))) as _{name}_hashid
  from {name}_node
)"""
    # SQL Query for current node's basic properties
    result[name] = node_sql + select_table(f"{name}_with_id")

    children_columns = extract_nested_properties(path=path, json_col=json_col, field=name, properties=properties)
    if children_columns:
        for col in children_columns.keys():
            child_col, join_child_table = json_extract_nested_property(path=path, json_col=json_col, name=col, definition=properties[col])
            child_sql = f"""{prefix}
{name}_node as (
  select
    {child_col},
    {node_columns}
  from {from_table}
),
{name}_with_id as (
  select
    to_hex(md5(concat(
      {hash_node_columns}
    ))) as _{name}_hashid,
    {col}
  from {name}_node
  {join_child_table}
)"""
            if children_columns[col]:
                children = process_node(
                    path="$",
                    json_col=col,
                    name=f"{name}_{col}",
                    properties=children_columns[col],
                    from_table=f"{name}_with_id",
                    previous=child_sql,
                    inject_cols=f"_{name}_hashid as _{name}_foreign_hashid,",
                )
                result.update(children)
            else:
                # SQL Query for current node's basic properties
                result[f"{name}_{col}"] = child_sql + select_table(
                    f"{name}_with_id",
                    columns=f"""
    _{name}_hashid as _{name}_foreign_hashid,
    {col}
""",
                )
    return result

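# Shape of the SQL emitted for a node named "recipes" (column lists elided):
#     with
#     recipes_node as (
#       select ... from <from_table>
#     ),
#     recipes_with_id as (
#       select *, to_hex(md5(concat(...))) as _recipes_hashid
#       from recipes_node
#     )
#     select * from recipes_with_id
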
def generate_dbt_model(catalog: dict, json_col: str, from_table: str) -> dict:
    result = {}
    for obj in catalog["streams"]:
        name = obj["name"]
        if "json_schema" in obj:
            properties = obj["json_schema"]["properties"]
        elif "schema" in obj:
            properties = obj["schema"]["properties"]
        else:
            # guard against streams without a schema (otherwise `properties` is unbound)
            continue
        result.update(process_node(path="$", json_col=json_col, name=name, properties=properties, from_table=from_table))
    return result


def print_result(result):
    for name in result.keys():
        print(f"In File {name}.sql:")
        print("--------------------")
        print(result[name])
        print("--------------------")


print_result(generate_dbt_model(catalog=catalog, json_col="json_blob", from_table="`airbytesandbox.data.one_recipe_json`"))
# -

print_result(generate_dbt_model(catalog=stripe, json_col="json_blob", from_table="`airbytesandbox.data.stripe_json`"))

print_result(generate_dbt_model(catalog=github, json_col="json_blob", from_table="`airbytesandbox.data.github_json`"))
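
# For reference, generate_dbt_model expects catalogs shaped like the following
# (hypothetical stream, not one of the sample files):
#     {"streams": [{"name": "recipes",
#                   "json_schema": {"properties": {"title": {"type": "string"},
#                                                  "servings": {"type": "integer"}}}}]}
# which yields one model per stream, plus one per nested array/object column.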