From cee33c0e1f288ce259cfbe9b70da531d351a85f6 Mon Sep 17 00:00:00 2001 From: Schum <68906108+Schum-io@users.noreply.github.com> Date: Thu, 25 May 2023 15:22:00 +0600 Subject: [PATCH 01/78] fix: SYSTEM SYNC REPLICA for on_cluster_clause (#156) * fix SYSTEM SYNC REPLICA * add schema --- dbt/include/clickhouse/macros/adapters.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/include/clickhouse/macros/adapters.sql b/dbt/include/clickhouse/macros/adapters.sql index 70295b8a..8ef23409 100644 --- a/dbt/include/clickhouse/macros/adapters.sql +++ b/dbt/include/clickhouse/macros/adapters.sql @@ -107,7 +107,7 @@ {% macro exchange_tables_atomic(old_relation, target_relation, obj_types='TABLES') %} {%- if adapter.get_clickhouse_cluster_name() is not none and obj_types == 'TABLES' %} - {% do run_query("SYSTEM SYNC REPLICA "+ target_relation.identifier + on_cluster_clause()) %} + {% do run_query("SYSTEM SYNC REPLICA " + on_cluster_clause() + target_relation.schema + '.' + target_relation.identifier) %} {%- endif %} {%- call statement('exchange_tables_atomic') -%} From 7d3eb2ecc7cc555613b2a1036cd26190aa467177 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Thu, 25 May 2023 03:34:38 -0600 Subject: [PATCH 02/78] Update version and pypi job --- .github/workflows/pypi.yml | 3 ++- dbt/adapters/clickhouse/__version__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 755a4b58..1698d186 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -1,10 +1,11 @@ --- name: "PyPI Release" -on: # yamllint disable-line rule:truthy +on: push: tags: - 'v*' + workflow_dispatch: jobs: publish: diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index 825ecca3..8a16224f 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.4.1' +version = '1.4.2' From a5ce19559243ea03f7960427defc207870d77f68 Mon Sep 17 00:00:00 2001 From: Sergey Reshetnikov Date: Tue, 13 Jun 2023 05:42:06 +0700 Subject: [PATCH 03/78] Fix incompatible return type (#162) --- dbt/adapters/clickhouse/connections.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbt/adapters/clickhouse/connections.py b/dbt/adapters/clickhouse/connections.py index c61766b3..5c1705f8 100644 --- a/dbt/adapters/clickhouse/connections.py +++ b/dbt/adapters/clickhouse/connections.py @@ -6,7 +6,7 @@ import agate import dbt.exceptions from dbt.adapters.sql import SQLConnectionManager -from dbt.contracts.connection import Connection +from dbt.contracts.connection import AdapterResponse, Connection from dbt.adapters.clickhouse.dbclient import ChRetryableException, get_db_client from dbt.adapters.clickhouse.logger import logger @@ -74,7 +74,7 @@ def get_table_from_response(cls, response, column_names) -> agate.Table: def execute( self, sql: str, auto_begin: bool = False, fetch: bool = False - ) -> Tuple[str, agate.Table]: + ) -> Tuple[AdapterResponse, agate.Table]: # Don't try to fetch result of clustered DDL responses, we don't know what to do with them if fetch and ddl_re.match(sql): fetch = False @@ -98,7 +98,7 @@ def execute( ) else: table = dbt.clients.agate_helper.empty_table() - return status, table + return AdapterResponse(_message=status), table def add_query( self, From 95a4b21a3477b93ebcfb60e6561ffdef8a152981 Mon Sep 17 00:00:00 2001 From: gladkikhtutu <88535677+gladkikhtutu@users.noreply.github.com> Date: Mon, 26 Jun 2023 
19:23:02 +0300 Subject: [PATCH 04/78] Distributed table materialization (#163) * distributed table materialization * fix rebase * PR fixes --- dbt/adapters/clickhouse/credentials.py | 1 + dbt/adapters/clickhouse/impl.py | 6 ++ .../materializations/distributed_table.sql | 100 ++++++++++++++++++ 3 files changed, 107 insertions(+) create mode 100644 dbt/include/clickhouse/macros/materializations/distributed_table.sql diff --git a/dbt/adapters/clickhouse/credentials.py b/dbt/adapters/clickhouse/credentials.py index 178625f1..4ccf2d63 100644 --- a/dbt/adapters/clickhouse/credentials.py +++ b/dbt/adapters/clickhouse/credentials.py @@ -32,6 +32,7 @@ class ClickHouseCredentials(Credentials): check_exchange: bool = True custom_settings: Optional[Dict[str, Any]] = None use_lw_deletes: bool = False + local_suffix: str = 'local' @property def type(self): diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index 29d26277..f482ddb4 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -74,6 +74,12 @@ def get_clickhouse_cluster_name(self): if conn.credentials.cluster: return f'"{conn.credentials.cluster}"' + @available.parse(lambda *a, **k: {}) + def get_clickhouse_local_suffix(self): + conn = self.connections.get_if_exists() + if conn.credentials.local_suffix: + return f'{conn.credentials.local_suffix}' + @available def clickhouse_db_engine_clause(self): conn = self.connections.get_if_exists() diff --git a/dbt/include/clickhouse/macros/materializations/distributed_table.sql b/dbt/include/clickhouse/macros/materializations/distributed_table.sql new file mode 100644 index 00000000..44eeb740 --- /dev/null +++ b/dbt/include/clickhouse/macros/materializations/distributed_table.sql @@ -0,0 +1,100 @@ +{% materialization distributed_table, adapter='clickhouse' %} + {%- set local_suffix = adapter.get_clickhouse_local_suffix() -%} + + {%- set existing_relation = load_cached_relation(this) -%} + {%- set target_relation = this.incorporate(type='table') -%} + + {% set existing_relation_local = existing_relation.incorporate(path={"identifier": model['name'] + local_suffix}) if existing_relation is not none else none %} + {% set target_relation_local = target_relation.incorporate(path={"identifier": model['name'] + local_suffix}) if target_relation is not none else none %} + + {%- set backup_relation = none -%} + {%- set preexisting_backup_relation = none -%} + {%- set preexisting_intermediate_relation = none -%} + + {% if existing_relation_local is not none %} + {%- set backup_relation_type = existing_relation_local.type -%} + {%- set backup_relation = make_backup_relation(target_relation_local, backup_relation_type) -%} + {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%} + {% if not existing_relation.can_exchange %} + {%- set intermediate_relation = make_intermediate_relation(target_relation_local) -%} + {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) -%} + {% endif %} + {% endif %} + {% set view_relation = default__make_temp_relation(target_relation, '__dbt_tmp') %} + -- drop the temp relations if they exist already in the database + {{ drop_relation_if_exists(preexisting_intermediate_relation) }} + {{ drop_relation_if_exists(preexisting_backup_relation) }} + {{ drop_relation_if_exists(view_relation) }} + + {% set grant_config = config.get('grants') %} + + {{ run_hooks(pre_hooks, inside_transaction=False) }} + + {% call statement('main') %} + {{ create_view_as(view_relation, sql) }} + {% 
endcall %} + + {{ run_hooks(pre_hooks, inside_transaction=True) }} + + {% if backup_relation is none %} + {% do run_query(create_empty_table_from_relation(target_relation_local, view_relation)) or '' %} + {% do run_query(create_distributed_table(target_relation, target_relation_local)) or '' %} + {% elif existing_relation.can_exchange %} + -- We can do an atomic exchange, so no need for an intermediate + {% call statement('main') -%} + {% do run_query(create_empty_table_from_relation(backup_relation, view_relation)) or '' %} + {%- endcall %} + {% do exchange_tables_atomic(backup_relation, existing_relation) %} + {% else %} + {% do run_query(create_empty_table_from_relation(intermediate_relation, view_relation)) or '' %} + {{ adapter.rename_relation(existing_relation_local, backup_relation) }} + {{ adapter.rename_relation(intermediate_relation, target_relation_local) }} + {% endif %} + {% do run_query(clickhouse__insert_into(target_relation, sql)) or '' %} + {{ drop_relation_if_exists(view_relation) }} + -- cleanup + {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %} + {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} + + {% do persist_docs(target_relation, model) %} + {{ run_hooks(post_hooks, inside_transaction=True) }} + {{ adapter.commit() }} + {{ drop_relation_if_exists(backup_relation) }} + {{ run_hooks(post_hooks, inside_transaction=False) }} + {{ return({'relations': [target_relation]}) }} + +{% endmaterialization %} + +{% macro create_distributed_table(relation, local_relation) %} + {%- set cluster = adapter.get_clickhouse_cluster_name()[1:-1] -%} + {%- set sharding = config.get('sharding_key') -%} + + CREATE TABLE {{ relation }} {{ on_cluster_clause() }} AS {{ local_relation }} + ENGINE = Distributed('{{ cluster}}', '{{ relation.schema }}', '{{ local_relation.name }}' + {% if sharding is not none %} + , {{ sharding }} + {% endif %} + ) + {% endmacro %} + +{% macro create_empty_table_from_relation(relation, source_relation) -%} + {%- set sql_header = config.get('sql_header', none) -%} + {%- set columns = adapter.get_columns_in_relation(source_relation) | list -%} + + {%- set col_list = [] -%} + {% for col in columns %} + {{col_list.append(col.name + ' ' + col.data_type) or '' }} + {% endfor %} + {{ sql_header if sql_header is not none }} + + create table {{ relation.include(database=False) }} + {{ on_cluster_clause() }} ( + {{col_list | join(', ')}} + ) + + {{ engine_clause() }} + {{ order_cols(label="order by") }} + {{ primary_key_clause(label="primary key") }} + {{ partition_cols(label="partition by") }} + {{ adapter.get_model_settings(model) }} +{%- endmacro %} From 005cd8f80c616797f090e78790c7c60364eaf044 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Tue, 27 Jun 2023 07:47:29 -0600 Subject: [PATCH 05/78] Bump version --- .github/workflows/pypi.yml | 2 +- dbt/adapters/clickhouse/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 1698d186..72fefdc2 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -19,7 +19,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.9 + python-version: 3.10 - name: Upgrade Setuptools run: pip install --upgrade setuptools wheel diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index 8a16224f..50e4bb93 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ 
b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.4.2' +version = '1.4.3' From b1def6efc3c9cb2bf601a4975bddbd848bff3d70 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Tue, 27 Jun 2023 07:52:49 -0600 Subject: [PATCH 06/78] Tweak PyPI build Python release --- .github/workflows/pypi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 72fefdc2..6094f06f 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -19,7 +19,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: 3.10 + python-version: "3.10" - name: Upgrade Setuptools run: pip install --upgrade setuptools wheel From 67cf9db5285b9d1c41ce321bd902ece577b6cc34 Mon Sep 17 00:00:00 2001 From: Damir Basic Knezevic Date: Mon, 17 Jul 2023 17:34:18 +0200 Subject: [PATCH 07/78] Add space to exchange_tables_atomic macro (#168) * Add space to exchange_tables_atomic macro This changes the SYSTEM SYNC REPLICA query to have a space between the ON CLUSTER clause and the table name. * Move whitespace to on_cluster_clause --- dbt/include/clickhouse/macros/materializations/table.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 6a7bdce2..18362a2a 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -124,7 +124,8 @@ {% macro on_cluster_clause(label) %} {% set active_cluster = adapter.get_clickhouse_cluster_name() %} {%- if active_cluster is not none %} - ON CLUSTER {{ active_cluster }} + {# Add trailing whitespace to avoid problems when this clause is not last #} + ON CLUSTER {{ active_cluster + ' ' }} {%- endif %} {%- endmacro -%} From c3cad11d870d583a416fd450df0a95c8f54aafb7 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Wed, 19 Jul 2023 17:38:01 -0600 Subject: [PATCH 08/78] Fix bad logging/error handling (#170) --- .github/workflows/test_cloud.yml | 1 + .github/workflows/test_matrix.yml | 4 ++-- CHANGELOG.md | 24 ++++++++++++++++++++++++ dbt/adapters/clickhouse/__version__.py | 2 +- dbt/adapters/clickhouse/connections.py | 2 +- dbt/adapters/clickhouse/dbclient.py | 4 ++-- 6 files changed, 31 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test_cloud.yml b/.github/workflows/test_cloud.yml index cbf0f3e7..d11a2d11 100644 --- a/.github/workflows/test_cloud.yml +++ b/.github/workflows/test_cloud.yml @@ -5,6 +5,7 @@ on: # yamllint disable-line rule:truthy push: branches: - '*_cloud' + workflow_dispatch: jobs: cloud_tests: diff --git a/.github/workflows/test_matrix.yml b/.github/workflows/test_matrix.yml index c0d9bac3..d204477c 100644 --- a/.github/workflows/test_matrix.yml +++ b/.github/workflows/test_matrix.yml @@ -29,9 +29,9 @@ jobs: - '3.11' clickhouse-version: - '22.8' - - '23.2' - '23.3' - - '23.4' + - '23.5' + - '23.6' - latest steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index 989ec917..e66caf96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,27 @@ +### Release [1.4.4], 2023-07-19 +#### Bug Fixes +- Fixed two logging/exception handling issues that would cause exception on startup or when handling some exceptions +from the ClickHouse server. Partially addresses https://github.com/ClickHouse/dbt-clickhouse/issues/169. 
+- Fixed issue with the `on_cluster` macro that would break the exchange tables step of incremental materializations +with an active cluster. Thanks to [Andrew Davis](https://github.com/Savid) for the PR. Closes +https://github.com/ClickHouse/dbt-clickhouse/issues/167 + +### Release [1.4.3], 2023-06-27 +#### Bug Fix +- Use correct return value for `execute`. This would cause an exception when running hooks. Thanks to +[Sergey Reshetnikov](https://github.com/PrVrSs) for the PR. Closed https://github.com/ClickHouse/dbt-clickhouse/issues/161 + +#### Improvement +- Added macros for creating distributed tables. See the `distributed_table.sql` include file. Thanks to +[gladkikhtutu](https://github.com/gladkikhtutu) for the contribution. + +### Release [1.4.2], 2023-05-14 +#### Bug fixes +- Create initial dbt database (if not found) on the defined cluster on first run, instead of just the execution node. +Thanks to [Jens Hoevenaars](https://github.com/codenation-nl) for the PR +- Fix the SYSTEM SYNC REPLICA statement when exchanging tables ON CLUSTER for incremental materializations. Thanks to +[Schum](https://github.com/Schum-io) for PR. Closed https://github.com/ClickHouse/dbt-clickhouse/issues/157. + ### Release [1.4.1], 2023-05-11 #### Improvements - Reduce the number of SQL calls for Modify Comment operations. Thanks to [Konstantin Ilchenko](https://github.com/simpl1g). diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index 50e4bb93..f91302bf 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.4.3' +version = '1.4.4' diff --git a/dbt/adapters/clickhouse/connections.py b/dbt/adapters/clickhouse/connections.py index 5c1705f8..85c141be 100644 --- a/dbt/adapters/clickhouse/connections.py +++ b/dbt/adapters/clickhouse/connections.py @@ -30,7 +30,7 @@ def exception_handler(self, sql): logger.debug('Error running SQL: {}', sql) if isinstance(exp, dbt.exceptions.DbtRuntimeError): raise - raise dbt.exceptions.DbtRuntimeError from exp + raise dbt.exceptions.DbtRuntimeError('ClickHouse exception: ' + str(exp)) from exp @classmethod def open(cls, connection): diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index a779aecc..2051d635 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -180,7 +180,7 @@ def _check_atomic_exchange(self) -> bool: for table in swap_tables: self.command(f'DROP TABLE IF EXISTS {table}') except DbtDatabaseError: - logger.info('Unexpected server exception dropping table', exc_info=True) + logger.info('Unexpected server exception dropping table') except DbtDatabaseError: - logger.warning('Failed to run exchange test', exc_info=True) + logger.warning('Failed to run exchange test') return False From 8a2731fea7762b9d818b054a653922b7ad4f905e Mon Sep 17 00:00:00 2001 From: gladkikhtutu <88535677+gladkikhtutu@users.noreply.github.com> Date: Thu, 27 Jul 2023 14:04:58 +0300 Subject: [PATCH 09/78] Distributed incremental materialization (#172) * distributed table materialization * fix rebase * PR fixes * distributed incremental materialization * fix * fix * add insert_distributed_sync to README.md * add checks on insert_distributed_sync * add checks on insert_distributed_sync * review fixes --- README.md | 101 +++++++++++++ .../distributed_incremental.sql | 140 ++++++++++++++++++ .../materializations/distributed_table.sql | 25 +++- 
 .../macros/materializations/incremental.sql   |  63 ++++++--
 4 files changed, 310 insertions(+), 19 deletions(-)
 create mode 100644 dbt/include/clickhouse/macros/materializations/distributed_incremental.sql

diff --git a/README.md b/README.md
index 030df42e..96dd9f71 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,8 @@ pip install dbt-clickhouse
 - [x] Snapshots
 - [x] Most dbt-utils macros (now included in dbt-core)
 - [x] Ephemeral materialization
+- [x] Distributed table materialization (experimental)
+- [x] Distributed incremental materialization (experimental)
 
 # Usage Notes
 
@@ -65,6 +67,7 @@ your_profile_name:
     cluster_mode: [False] # Use specific settings designed to improve operation on Replicated databases (recommended for ClickHouse Cloud)
     use_lw_deletes: [False] Use the strategy `delete+insert` as the default incremental strategy.
     check_exchange: [True] # Validate that clickhouse support the atomic EXCHANGE TABLES command. (Not needed for most ClickHouse versions)
+    local_suffix [local] # Table suffix of local tables on shards for distributed materializations
     custom_settings: [{}] # A dicitonary/mapping of custom ClickHouse settings for the connection - default is empty.
 
     # Native (clickhouse-driver) connection settings
     sync_request_timeout: [5] Timeout for server ping
@@ -153,6 +156,104 @@ keys used to populate the parameters of the S3 table function:
 
 See the [S3 test file](https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/test_s3.py) for examples of how to use this macro.
 
+# Distributed materializations
+
+WARNING:
+
+To use distributed materializations correctly you should set **insert_distributed_sync** = 1 (or use as prehook) in order to have correct data while SELECT queries. Otherwise, downstream calculation could be wrong if the distributed insert is not finished in time.
+
+## Distributed table materialization
+
+A distributed table is created with the following steps:
+1. Create a temp view with the SQL query to get the right structure.
+2. Create empty local tables on the shards based on the view.
+3. Create the distributed table based on the local tables.
+4. Insert the data into the distributed table, so it is spread across the shards without duplication.
+
+### Distributed table model example
+```sql
+{{
+    config(
+        materialized='distributed_table',
+        order_by='id, created_at',
+        sharding_key='cityHash64(id)',
+        engine='ReplacingMergeTree'
+    )
+}}
+
+select id, created_at, item from {{ source('db', 'table') }}
+```
+
+### Generated migrations
+
+```sql
+CREATE TABLE db.table_local on cluster cluster
+(
+    `id` UInt64,
+    `created_at` DateTime,
+    `item` String
+)
+ENGINE = ReplacingMergeTree
+ORDER BY (id, created_at)
+SETTINGS index_granularity = 8192;
+
+
+CREATE TABLE db.table on cluster cluster
+(
+    `id` UInt64,
+    `created_at` DateTime,
+    `item` String
+)
+ENGINE = Distributed('cluster', 'db', 'table_local', cityHash64(id));
+```
+
+## Distributed incremental materialization
+
+The incremental model is based on the same idea as the distributed table; the main difficulty is handling each incremental strategy correctly:
+
+1. _The Append Strategy_ just inserts data into the distributed table.
+2. _The Delete+Insert_ Strategy creates a distributed temp table to work with all of the data on every shard (see the sketch after this list).
+3. _The Default (Legacy) Strategy_ creates distributed temp and intermediate tables for the same reason.
+
+Only the shard tables are replaced, because the distributed table itself does not hold data.
+The distributed table is reloaded only when full_refresh mode is enabled or the table structure may have changed.
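+
+As a rough sketch, the delete+insert strategy on a distributed setup boils down to statements like the
+following (the table and key names here are illustrative, not the exact identifiers the adapter generates):
+
+```sql
+-- new rows are first materialized in a temporary "new data" table that every shard can read
+DELETE FROM db.table WHERE (id, created_at) IN (SELECT id, created_at FROM db.table__dbt_new_data);
+INSERT INTO db.table SELECT * FROM db.table__dbt_new_data;
+```
+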
+ +### Distributed incremental model example +```sql +{{ + config( + materialized='distributed_incremental', + engine='MergeTree', + incremental_strategy='append', + unique_key='id,created_at' + ) +}} + +select id, created_at, item from {{ source('db', 'table') }} +``` + +### Generated migrations + +```sql +CREATE TABLE db.table_local on cluster cluster +( + `id` UInt64, + `created_at` DateTime, + `item` String +) +ENGINE = MergeTree +SETTINGS index_granularity = 8192; + + +CREATE TABLE db.table on cluster cluster +( + `id` UInt64, + `created_at` DateTime, + `item` String +) +ENGINE = Distributed('cluster', 'db', 'table_local', cityHash64(id)); +``` + # Running Tests This adapter passes all of dbt basic tests as presented in dbt's official docs: https://docs.getdbt.com/docs/contributing/testing-a-new-adapter#testing-your-adapter. diff --git a/dbt/include/clickhouse/macros/materializations/distributed_incremental.sql b/dbt/include/clickhouse/macros/materializations/distributed_incremental.sql new file mode 100644 index 00000000..cdc53151 --- /dev/null +++ b/dbt/include/clickhouse/macros/materializations/distributed_incremental.sql @@ -0,0 +1,140 @@ +{% materialization distributed_incremental, adapter='clickhouse' %} + {% set insert_distributed_sync = run_query("SELECT value FROM system.settings WHERE name = 'insert_distributed_sync'")[0][0] %} + {% if insert_distributed_sync != '1' %} + {% do exceptions.raise_compiler_error('To use distributed materialization setting insert_distributed_sync should be set to 1') %} + {% endif %} + + {%- set local_suffix = adapter.get_clickhouse_local_suffix() -%} + + {%- set existing_relation = load_cached_relation(this) -%} + {%- set target_relation = this.incorporate(type='table') -%} + + {% set existing_relation_local = existing_relation.incorporate(path={"identifier": model['name'] + local_suffix}) if existing_relation is not none else none %} + {% set target_relation_local = target_relation.incorporate(path={"identifier": model['name'] + local_suffix}) if target_relation is not none else none %} + + {%- set unique_key = config.get('unique_key') -%} + {% if unique_key is not none and unique_key|length == 0 %} + {% set unique_key = none %} + {% endif %} + {% if unique_key is iterable and (unique_key is not string and unique_key is not mapping) %} + {% set unique_key = unique_key|join(', ') %} + {% endif %} + {%- set inserts_only = config.get('inserts_only') -%} + {%- set grant_config = config.get('grants') -%} + {%- set full_refresh_mode = (should_full_refresh() or existing_relation.is_view) -%} + {%- set on_schema_change = incremental_validate_on_schema_change(config.get('on_schema_change'), default='ignore') -%} + + {%- set intermediate_relation = make_intermediate_relation(target_relation_local)-%} + {%- set distributed_intermediate_relation = make_intermediate_relation(target_relation)-%} + {%- set backup_relation_type = 'table' if existing_relation is none else existing_relation.type -%} + {%- set backup_relation = make_backup_relation(target_relation_local, backup_relation_type) -%} + {%- set distributed_backup_relation = make_backup_relation(target_relation, backup_relation_type) -%} + {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation)-%} + {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%} + {%- set view_relation = default__make_temp_relation(target_relation, '__dbt_view_tmp') -%} + + {{ drop_relation_if_exists(preexisting_intermediate_relation) }} + {{ 
drop_relation_if_exists(preexisting_backup_relation) }} + {{ drop_relation_if_exists(view_relation) }} + {{ drop_relation_if_exists(distributed_intermediate_relation) }} + + {{ run_hooks(pre_hooks, inside_transaction=False) }} + {{ run_hooks(pre_hooks, inside_transaction=True) }} + {% set to_drop = [] %} + {% set schema_changes = none %} + + {% call statement('main') %} + {{ create_view_as(view_relation, sql) }} + {% endcall %} + + {% if existing_relation is none %} + -- No existing table, simply create a new one + {{ create_distributed_local_table(target_relation, target_relation_local, view_relation, sql) }} + + {% elif full_refresh_mode %} + -- Completely replacing the old table, so create a temporary table and then swap it + {{ create_distributed_local_table(distributed_intermediate_relation, intermediate_relation, view_relation, sql) }} + {% do adapter.drop_relation(distributed_intermediate_relation) or '' %} + {% set need_swap = true %} + + {% elif inserts_only or unique_key is none -%} + -- There are no updates/deletes or duplicate keys are allowed. Simply add all of the new rows to the existing + -- table. It is the user's responsibility to avoid duplicates. Note that "inserts_only" is a ClickHouse adapter + -- specific configurable that is used to avoid creating an expensive intermediate table. + {% call statement('main') %} + {{ clickhouse__insert_into(target_relation, sql) }} + {% endcall %} + + {% else %} + {% set incremental_strategy = adapter.calculate_incremental_strategy(config.get('incremental_strategy')) %} + {% set incremental_predicates = config.get('predicates', none) or config.get('incremental_predicates', none) %} + {% if on_schema_change != 'ignore' %} + {%- set schema_changes = check_for_schema_changes(existing_relation, target_relation) -%} + {% if schema_changes['schema_changed'] and incremental_strategy in ('append', 'delete_insert') %} + {% set incremental_strategy = 'legacy' %} + {% do log('Schema changes detected, switching to legacy incremental strategy') %} + {% endif %} + {% endif %} + {% if incremental_strategy != 'delete_insert' and incremental_predicates %} + {% do exceptions.raise_compiler_error('Cannot apply incremental predicates with ' + incremental_strategy + ' strategy.') %} + {% endif %} + {% if incremental_strategy == 'legacy' %} + {% do clickhouse__incremental_legacy(existing_relation, intermediate_relation, schema_changes, unique_key, True) %} + {% set need_swap = true %} + {% elif incremental_strategy == 'delete_insert' %} + {% do clickhouse__incremental_delete_insert(existing_relation, unique_key, incremental_predicates, True) %} + {% elif incremental_strategy == 'append' %} + {% call statement('main') %} + {{ clickhouse__insert_into(target_relation, sql) }} + {% endcall %} + {% endif %} + {% endif %} + + {% if need_swap %} + {% if False %} + {% do adapter.rename_relation(intermediate_relation, backup_relation) %} + {% do exchange_tables_atomic(backup_relation, target_relation_local) %} + {% else %} + {% do adapter.rename_relation(target_relation_local, backup_relation) %} + {% do adapter.rename_relation(intermediate_relation, target_relation_local) %} + {% endif %} + + -- Structure could have changed, need to update distributed table from replaced local table + {% set target_relation_new = target_relation.incorporate(path={"identifier": model['name'] + '_temp'}) %} + {{ drop_relation_if_exists(target_relation_new) }} + {% do run_query(create_distributed_table(target_relation_new, target_relation_local)) %} + + {% if False %} + {% do 
adapter.rename_relation(target_relation_new, distributed_backup_relation) %} + {% do exchange_tables_atomic(distributed_backup_relation, target_relation) %} + {% else %} + {% do adapter.rename_relation(target_relation, distributed_backup_relation) %} + {% do adapter.rename_relation(target_relation_new, target_relation) %} + {% endif %} + + {% do to_drop.append(backup_relation) %} + {% do to_drop.append(distributed_backup_relation) %} + {% endif %} + + {% set should_revoke = should_revoke(existing_relation, full_refresh_mode) %} + {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} + + {% do persist_docs(target_relation, model) %} + + {% if existing_relation is none or existing_relation.is_view or should_full_refresh() %} + {% do create_indexes(target_relation) %} + {% endif %} + + {{ run_hooks(post_hooks, inside_transaction=True) }} + + {% do adapter.commit() %} + + {% for rel in to_drop %} + {% do adapter.drop_relation(rel) %} + {% endfor %} + + {{ run_hooks(post_hooks, inside_transaction=False) }} + + {{ return({'relations': [target_relation]}) }} + +{%- endmaterialization %} \ No newline at end of file diff --git a/dbt/include/clickhouse/macros/materializations/distributed_table.sql b/dbt/include/clickhouse/macros/materializations/distributed_table.sql index 44eeb740..4713db71 100644 --- a/dbt/include/clickhouse/macros/materializations/distributed_table.sql +++ b/dbt/include/clickhouse/macros/materializations/distributed_table.sql @@ -1,4 +1,9 @@ {% materialization distributed_table, adapter='clickhouse' %} + {% set insert_distributed_sync = run_query("SELECT value FROM system.settings WHERE name = 'insert_distributed_sync'")[0][0] %} + {% if insert_distributed_sync != '1' %} + {% do exceptions.raise_compiler_error('To use distributed materialization setting insert_distributed_sync should be set to 1') %} + {% endif %} + {%- set local_suffix = adapter.get_clickhouse_local_suffix() -%} {%- set existing_relation = load_cached_relation(this) -%} @@ -37,8 +42,7 @@ {{ run_hooks(pre_hooks, inside_transaction=True) }} {% if backup_relation is none %} - {% do run_query(create_empty_table_from_relation(target_relation_local, view_relation)) or '' %} - {% do run_query(create_distributed_table(target_relation, target_relation_local)) or '' %} + {{ create_distributed_local_table(target_relation, target_relation_local, view_relation) }} {% elif existing_relation.can_exchange %} -- We can do an atomic exchange, so no need for an intermediate {% call statement('main') -%} @@ -66,7 +70,12 @@ {% endmaterialization %} {% macro create_distributed_table(relation, local_relation) %} - {%- set cluster = adapter.get_clickhouse_cluster_name()[1:-1] -%} + {%- set cluster = adapter.get_clickhouse_cluster_name() -%} + {% if cluster is none %} + {% do exceptions.raise_compiler_error('Cluster name should be defined for using distributed materializations, current is None') %} + {% endif %} + + {%- set cluster = cluster[1:-1] -%} {%- set sharding = config.get('sharding_key') -%} CREATE TABLE {{ relation }} {{ on_cluster_clause() }} AS {{ local_relation }} @@ -98,3 +107,13 @@ {{ partition_cols(label="partition by") }} {{ adapter.get_model_settings(model) }} {%- endmacro %} + +{% macro create_distributed_local_table(distributed_relation, shard_relation, structure_relation, sql_query=none) -%} + {{ drop_relation_if_exists(shard_relation) }} + {{ drop_relation_if_exists(distributed_relation) }} + {% do run_query(create_empty_table_from_relation(shard_relation, structure_relation)) or '' %} + 
{% do run_query(create_distributed_table(distributed_relation, shard_relation)) or '' %} + {% if sql_query is not none %} + {% do run_query(clickhouse__insert_into(distributed_relation, sql_query)) or '' %} + {% endif %} +{%- endmacro %} diff --git a/dbt/include/clickhouse/macros/materializations/incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental.sql index 76116a14..491dff3e 100644 --- a/dbt/include/clickhouse/macros/materializations/incremental.sql +++ b/dbt/include/clickhouse/macros/materializations/incremental.sql @@ -134,59 +134,89 @@ {% endmacro %} -{% macro clickhouse__incremental_legacy(existing_relation, intermediate_relation, on_schema_change, unique_key) %} - -- First create a temporary table for all of the new data +{% macro clickhouse__incremental_legacy(existing_relation, intermediate_relation, on_schema_change, unique_key, is_distributed=False) %} {% set new_data_relation = existing_relation.incorporate(path={"identifier": model['name'] + '__dbt_new_data'}) %} {{ drop_relation_if_exists(new_data_relation) }} - {% call statement('create_new_data_temp') %} + {%- set distributed_new_data_relation = existing_relation.incorporate(path={"identifier": model['name'] + '__dbt_distributed_new_data'}) -%} + + {%- set inserted_relation = intermediate_relation -%} + {%- set inserting_relation = new_data_relation -%} + + -- First create a temporary table for all of the new data + {% if is_distributed %} + -- Need to use distributed table to have data on all shards + {%- set inserting_relation = distributed_new_data_relation -%} + {{ create_distributed_local_table(distributed_new_data_relation, new_data_relation, existing_relation, sql) }} + {% else %} + {% call statement('create_new_data_temp') %} {{ get_create_table_as_sql(False, new_data_relation, sql) }} - {% endcall %} + {% endcall %} + {% endif %} -- Next create another temporary table that will eventually be used to replace the existing table. We can't -- use the table just created in the previous step because we don't want to override any updated rows with -- old rows when we insert the old data - {% call statement('main') %} - create table {{ intermediate_relation }} as {{ new_data_relation }} - {% endcall %} + {% if is_distributed %} + {%- set distributed_intermediate_relation = make_intermediate_relation(existing_relation) -%} + {%- set inserted_relation = distributed_intermediate_relation -%} + {{ create_distributed_local_table(distributed_intermediate_relation, intermediate_relation, existing_relation) }} + {% else %} + {% call statement('main') %} + create table {{ intermediate_relation }} as {{ new_data_relation }} {{ on_cluster_clause() }} + {% endcall %} + {% endif %} -- Insert all the existing rows into the new temporary table, ignoring any rows that have keys in the "new data" -- table. 
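   -- Note that the anti-join below keeps only those existing rows whose unique key does not appear in
   -- the new data, so the incoming version of each duplicated row wins.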
{%- set dest_columns = adapter.get_columns_in_relation(existing_relation) -%} {%- set dest_cols_csv = dest_columns | map(attribute='quoted') | join(', ') -%} {% call statement('insert_existing_data') %} - insert into {{ intermediate_relation }} ({{ dest_cols_csv }}) + insert into {{ inserted_relation }} ({{ dest_cols_csv }}) select {{ dest_cols_csv }} from {{ existing_relation }} where ({{ unique_key }}) not in ( select {{ unique_key }} - from {{ new_data_relation }} + from {{ inserting_relation }} ) {{ adapter.get_model_settings(model) }} {% endcall %} -- Insert all of the new data into the temporary table {% call statement('insert_new_data') %} - insert into {{ intermediate_relation }} ({{ dest_cols_csv }}) + insert into {{ inserted_relation }} ({{ dest_cols_csv }}) select {{ dest_cols_csv }} - from {{ new_data_relation }} + from {{ inserting_relation }} {{ adapter.get_model_settings(model) }} {% endcall %} {% do adapter.drop_relation(new_data_relation) %} + {{ drop_relation_if_exists(distributed_new_data_relation) }} + {{ drop_relation_if_exists(distributed_intermediate_relation) }} {% endmacro %} -{% macro clickhouse__incremental_delete_insert(existing_relation, unique_key, incremental_predicates) %} +{% macro clickhouse__incremental_delete_insert(existing_relation, unique_key, incremental_predicates, is_distributed=False) %} {% set new_data_relation = existing_relation.incorporate(path={"identifier": model['name'] + '__dbt_new_data_' + invocation_id.replace('-', '_')}) %} {{ drop_relation_if_exists(new_data_relation) }} - {% call statement('main') %} + {%- set distributed_new_data_relation = existing_relation.incorporate(path={"identifier": model['name'] + '__dbt_distributed_new_data'}) -%} + + {%- set inserting_relation = new_data_relation -%} + + {% if is_distributed %} + -- Need to use distributed table to have data on all shards + {%- set inserting_relation = distributed_new_data_relation -%} + {{ create_distributed_local_table(distributed_new_data_relation, new_data_relation, existing_relation, sql) }} + {% else %} + {% call statement('main') %} {{ get_create_table_as_sql(False, new_data_relation, sql) }} - {% endcall %} + {% endcall %} + {% endif %} + {% call statement('delete_existing_data') %} delete from {{ existing_relation }} where ({{ unique_key }}) in (select {{ unique_key }} - from {{ new_data_relation }}) + from {{ inserting_relation }}) {%- if incremental_predicates %} {% for predicate in incremental_predicates %} and {{ predicate }} @@ -197,7 +227,8 @@ {%- set dest_columns = adapter.get_columns_in_relation(existing_relation) -%} {%- set dest_cols_csv = dest_columns | map(attribute='quoted') | join(', ') -%} {% call statement('insert_new_data') %} - insert into {{ existing_relation}} select {{ dest_cols_csv}} from {{ new_data_relation }} + insert into {{ existing_relation }} select {{ dest_cols_csv }} from {{ inserting_relation }} {% endcall %} {% do adapter.drop_relation(new_data_relation) %} + {{ drop_relation_if_exists(distributed_new_data_relation) }} {% endmacro %} From 3a28a6603c61dc0d3255215b172441c5c494409f Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Thu, 27 Jul 2023 05:28:45 -0600 Subject: [PATCH 10/78] Update version and tweak docs --- CHANGELOG.md | 5 +++++ README.md | 5 ++++- dbt/adapters/clickhouse/__version__.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e66caf96..4dc317d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +### Release [1.4.5], 2023-07-27 +#### Improvement +- Adds 
additional experimental support for Distributed table engine models and incremental materialization. See the README for +details. Thanks to [gladkikhtutu](https://github.com/gladkikhtutu) for the contribution! + ### Release [1.4.4], 2023-07-19 #### Bug Fixes - Fixed two logging/exception handling issues that would cause exception on startup or when handling some exceptions diff --git a/README.md b/README.md index 96dd9f71..5204f20f 100644 --- a/README.md +++ b/README.md @@ -158,9 +158,12 @@ See the [S3 test file](https://github.com/ClickHouse/dbt-clickhouse/blob/main/te # Distributed materializations +Note: Distributed materializations experimental and are not currently included in the automated test suite. + WARNING: -To use distributed materializations correctly you should set **insert_distributed_sync** = 1 (or use as prehook) in order to have correct data while SELECT queries. Otherwise, downstream calculation could be wrong if the distributed insert is not finished in time. +To use distributed materializations correctly you should set **insert_distributed_sync** = 1 (or use as prehook) in order to have correct data while SELECT queries. Otherwise, downstream operations could produce invalid results +if the distributed insert has not completed before additional updates are executed. ## Distributed table materialization diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index f91302bf..f018559b 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.4.4' +version = '1.4.5' From 1a0649eff771c30e1eba410d42ad436d3e6432d5 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Thu, 27 Jul 2023 10:51:19 -0600 Subject: [PATCH 11/78] Lw delete set fix (#174) * Move lightweight delete settings to per query for HTTP stickiness fix * Minor cleanup and doc updates --- .github/workflows/pypi.yml | 2 + .github/workflows/test_cloud.yml | 1 + CHANGELOG.md | 6 +++ README.md | 19 ++++++--- dbt/adapters/clickhouse/__version__.py | 2 +- dbt/adapters/clickhouse/dbclient.py | 42 ++++++++++++------- .../adapter/persist_docs/test_persist_docs.py | 6 ++- tests/integration/adapter/test_comments.py | 2 +- 8 files changed, 55 insertions(+), 25 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 6094f06f..6e4fcc6d 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -1,12 +1,14 @@ --- name: "PyPI Release" +# yamllint disable-line rule:truthy on: push: tags: - 'v*' workflow_dispatch: + jobs: publish: name: PyPI Release diff --git a/.github/workflows/test_cloud.yml b/.github/workflows/test_cloud.yml index d11a2d11..5d98542f 100644 --- a/.github/workflows/test_cloud.yml +++ b/.github/workflows/test_cloud.yml @@ -17,6 +17,7 @@ jobs: DBT_CH_TEST_HOST: ${{ secrets.INTEGRATIONS_TEAM_TESTS_CLOUD_HOST }} DBT_CH_TEST_PASSWORD: ${{ secrets.INTEGRATIONS_TEAM_TESTS_CLOUD_PASSWORD }} DBT_CH_TEST_CLUSTER_MODE: true + DBT_CH_TEST_CLOUD: true steps: - name: Checkout diff --git a/CHANGELOG.md b/CHANGELOG.md index 4dc317d7..afc247d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +### Release [1.4.6], 2023-07-27 +#### Bug fix +- Lightweight deletes could fail in environments where the HTTP session was not preserved (such as clusters behind a non-sticky +load balancer). This has been fixed by sending the required settings with every request instead of relying on a SET statement. 
+A similar approach has been used to persist the 'insert_distributed_sync' setting for Distributed table materializations. + ### Release [1.4.5], 2023-07-27 #### Improvement - Adds additional experimental support for Distributed table engine models and incremental materialization. See the README for diff --git a/README.md b/README.md index 5204f20f..0b8b3477 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,14 @@ pip install dbt-clickhouse # Usage Notes +## SET Statement Warning +In many environments, using the SET statement to persist a ClickHouse setting across all DBT queries is not reliable +and can cause unexpected failures. This is particularly true when using HTTP connections through a load balancer that +distributes queries across multiple nodes (such as ClickHouse cloud), although in some circumstances this can also +happen with native ClickHouse connections. Accordingly, we recommend configuring any required ClickHouse settings in the +"custom_settings" property of the DBT profile as a best practice, instead of relying on a prehook "SET" statement as +has been occasionally suggested. + ## Database The dbt model relation identifier `database.schema.table` is not compatible with Clickhouse because Clickhouse does not support a `schema`. @@ -68,7 +76,7 @@ your_profile_name: use_lw_deletes: [False] Use the strategy `delete+insert` as the default incremental strategy. check_exchange: [True] # Validate that clickhouse support the atomic EXCHANGE TABLES command. (Not needed for most ClickHouse versions) local_suffix [local] # Table suffix of local tables on shards for distributed materializations - custom_settings: [{}] # A dicitonary/mapping of custom ClickHouse settings for the connection - default is empty. + custom_settings: [{}] # A dictionary/mapping of custom ClickHouse settings for the connection - default is empty. # Native (clickhouse-driver) connection settings sync_request_timeout: [5] Timeout for server ping @@ -158,12 +166,11 @@ See the [S3 test file](https://github.com/ClickHouse/dbt-clickhouse/blob/main/te # Distributed materializations -Note: Distributed materializations experimental and are not currently included in the automated test suite. - -WARNING: +Notes: -To use distributed materializations correctly you should set **insert_distributed_sync** = 1 (or use as prehook) in order to have correct data while SELECT queries. Otherwise, downstream operations could produce invalid results -if the distributed insert has not completed before additional updates are executed. +- Distributed materializations are experimental and are not currently included in the automated test suite. +- dbt-clickhouse queries now automatically include the setting `insert_distributed_sync = 1` in order to ensure that downstream incremental +materialization operations execute correctly. This could cause some distributed table inserts to run more slowly than expected. 
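+
+If you need to persist any other setting for every query, prefer the `custom_settings` profile property
+described in the SET Statement Warning section above. A minimal sketch (the profile name and the chosen
+setting are illustrative only):
+
+```yaml
+your_profile_name:
+  outputs:
+    dev:
+      type: clickhouse
+      # sent with every request, so it survives non-sticky load balancers
+      custom_settings:
+        allow_nondeterministic_mutations: 1
+```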
## Distributed table materialization diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index f018559b..da429ebf 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.4.5' +version = '1.4.6' diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index 2051d635..2ad197ba 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -6,6 +6,9 @@ from dbt.adapters.clickhouse.credentials import ClickHouseCredentials from dbt.adapters.clickhouse.logger import logger +LW_DELETE_SETTING = 'allow_experimental_lightweight_delete' +ND_MUTATION_SETTING = 'allow_nondeterministic_mutations' + def get_db_client(credentials: ClickHouseCredentials): driver = credentials.driver @@ -63,6 +66,7 @@ def __init__(self, credentials: ClickHouseCredentials): self._conn_settings['database_replicated_enforce_synchronous_settings'] = '1' self._conn_settings['insert_quorum'] = 'auto' self._conn_settings['mutations_sync'] = '2' + self._conn_settings['insert_distributed_sync'] = '1' self._client = self._create_client(credentials) check_exchange = credentials.check_exchange and not credentials.cluster_mode try: @@ -108,27 +112,35 @@ def _server_version(self): pass def _check_lightweight_deletes(self, requested: bool): - lw_deletes = self.get_ch_setting('allow_experimental_lightweight_delete') - if lw_deletes is None: + lw_deletes = self.get_ch_setting(LW_DELETE_SETTING) + nd_mutations = self.get_ch_setting(ND_MUTATION_SETTING) + if lw_deletes is None or nd_mutations is None: if requested: logger.warning( 'use_lw_deletes requested but are not available on this ClickHouse server' ) return False, False - lw_deletes = int(lw_deletes) - if lw_deletes == 1: + lw_deletes = int(lw_deletes) > 0 + if not lw_deletes: + try: + self.command(f'SET {LW_DELETE_SETTING} = 1') + self._conn_settings[LW_DELETE_SETTING] = '1' + lw_deletes = True + except DbtDatabaseError: + pass + nd_mutations = int(nd_mutations) > 0 + if lw_deletes and not nd_mutations: + try: + self.command(f'SET {ND_MUTATION_SETTING} = 1') + self._conn_settings[ND_MUTATION_SETTING] = '1' + nd_mutations = True + except DbtDatabaseError: + pass + if lw_deletes and nd_mutations: return True, requested - if not requested: - return False, False - try: - self.command('SET allow_experimental_lightweight_delete = 1') - self.command('SET allow_nondeterministic_mutations = 1') - return True, True - except DbtDatabaseError as ex: - logger.warning( - 'use_lw_deletes requested but cannot enable on this ClickHouse server %s', str(ex) - ) - return False, False + if requested: + logger.warning('use_lw_deletes requested but cannot enable on this ClickHouse server') + return False, False def _ensure_database(self, database_engine, cluster_name) -> None: if not self.database: diff --git a/tests/integration/adapter/persist_docs/test_persist_docs.py b/tests/integration/adapter/persist_docs/test_persist_docs.py index 710ce611..a9129a3b 100644 --- a/tests/integration/adapter/persist_docs/test_persist_docs.py +++ b/tests/integration/adapter/persist_docs/test_persist_docs.py @@ -101,14 +101,16 @@ def project_config_update(self): } } - def test_has_comments_pglike(self, project): + def test_has_comments_pg_like(self): + if os.environ.get('DBT_CH_TEST_CLOUD', '').lower() in ('1', 'true', 'yes'): + pytest.skip('Not running comment test for cloud') run_dbt(["docs", "generate"]) with open("target/catalog.json") as fp: 
catalog_data = json.load(fp) assert "nodes" in catalog_data assert len(catalog_data["nodes"]) == 4 table_node = catalog_data["nodes"]["model.test.table_model"] - view_node = self._assert_has_table_comments(table_node) + self._assert_has_table_comments(table_node) view_node = catalog_data["nodes"]["model.test.view_model"] self._assert_has_view_comments(view_node) diff --git a/tests/integration/adapter/test_comments.py b/tests/integration/adapter/test_comments.py index 2e310c0c..5179954a 100644 --- a/tests/integration/adapter/test_comments.py +++ b/tests/integration/adapter/test_comments.py @@ -67,7 +67,7 @@ def models(self): ['table_comment', 'view_comment'], ) def test_comment(self, project, model_name): - if '_cloud' in os.environ.get('GITHUB_REF', ''): + if os.environ.get('DBT_CH_TEST_CLOUD', '').lower() in ('1', 'true', 'yes'): pytest.skip('Not running comment test for cloud') run_dbt(["run"]) run_dbt(["docs", "generate"]) From 4b8a2025cb4ff7bd1b1dc5d8566d660d64c6e41c Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Wed, 9 Aug 2023 06:53:54 -0600 Subject: [PATCH 12/78] Fix legacy incremental materialization (#178) --- CHANGELOG.md | 4 ++++ dbt/adapters/clickhouse/__version__.py | 2 +- .../clickhouse/macros/materializations/incremental.sql | 8 +++++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index afc247d2..53c4b921 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +### Release [1.4.7], 2023-08-09 +#### Bug Fix +- Fixed an exception in "legacy" incremental materializations that are not distributed + ### Release [1.4.6], 2023-07-27 #### Bug fix - Lightweight deletes could fail in environments where the HTTP session was not preserved (such as clusters behind a non-sticky diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index da429ebf..392ee67d 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.4.6' +version = '1.4.7' diff --git a/dbt/include/clickhouse/macros/materializations/incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental.sql index 491dff3e..1eb35b0e 100644 --- a/dbt/include/clickhouse/macros/materializations/incremental.sql +++ b/dbt/include/clickhouse/macros/materializations/incremental.sql @@ -137,7 +137,6 @@ {% macro clickhouse__incremental_legacy(existing_relation, intermediate_relation, on_schema_change, unique_key, is_distributed=False) %} {% set new_data_relation = existing_relation.incorporate(path={"identifier": model['name'] + '__dbt_new_data'}) %} {{ drop_relation_if_exists(new_data_relation) }} - {%- set distributed_new_data_relation = existing_relation.incorporate(path={"identifier": model['name'] + '__dbt_distributed_new_data'}) -%} {%- set inserted_relation = intermediate_relation -%} {%- set inserting_relation = new_data_relation -%} @@ -145,6 +144,7 @@ -- First create a temporary table for all of the new data {% if is_distributed %} -- Need to use distributed table to have data on all shards + {%- set distributed_new_data_relation = existing_relation.incorporate(path={"identifier": model['name'] + '__dbt_distributed_new_data'}) -%} {%- set inserting_relation = distributed_new_data_relation -%} {{ create_distributed_local_table(distributed_new_data_relation, new_data_relation, existing_relation, sql) }} {% else %} @@ -190,8 +190,10 @@ {% endcall %} {% do adapter.drop_relation(new_data_relation) %} - {{ drop_relation_if_exists(distributed_new_data_relation) }} - {{ 
drop_relation_if_exists(distributed_intermediate_relation) }}
+  {% if is_distributed %}
+    {{ drop_relation_if_exists(distributed_new_data_relation) }}
+    {{ drop_relation_if_exists(distributed_intermediate_relation) }}
+  {% endif %}
 {% endmacro %}

From 9c8139f1e2d39d0c487df5e5387e58eabee388db Mon Sep 17 00:00:00 2001
From: Zhenbang <122523068+zli06160@users.noreply.github.com>
Date: Tue, 22 Aug 2023 15:36:51 +0200
Subject: [PATCH 13/78] fix: distributed_table materialization issue (#184)

---
 dbt/include/clickhouse/macros/adapters.sql                   | 2 +-
 .../clickhouse/macros/materializations/distributed_table.sql | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/dbt/include/clickhouse/macros/adapters.sql b/dbt/include/clickhouse/macros/adapters.sql
index 8ef23409..2d5301b6 100644
--- a/dbt/include/clickhouse/macros/adapters.sql
+++ b/dbt/include/clickhouse/macros/adapters.sql
@@ -106,7 +106,7 @@
 
 {% macro exchange_tables_atomic(old_relation, target_relation, obj_types='TABLES') %}
-  {%- if adapter.get_clickhouse_cluster_name() is not none and obj_types == 'TABLES' %}
+  {%- if adapter.get_clickhouse_cluster_name() is not none and obj_types == 'TABLES' and 'Replicated' in engine_clause() %}
     {% do run_query("SYSTEM SYNC REPLICA " + on_cluster_clause() + target_relation.schema + '.' + target_relation.identifier) %}
   {%- endif %}
 
diff --git a/dbt/include/clickhouse/macros/materializations/distributed_table.sql b/dbt/include/clickhouse/macros/materializations/distributed_table.sql
index 4713db71..dfccc0d7 100644
--- a/dbt/include/clickhouse/macros/materializations/distributed_table.sql
+++ b/dbt/include/clickhouse/macros/materializations/distributed_table.sql
@@ -46,13 +46,14 @@
   {% elif existing_relation.can_exchange %}
     -- We can do an atomic exchange, so no need for an intermediate
     {% call statement('main') -%}
-      {% do run_query(create_empty_table_from_relation(backup_relation, view_relation)) or '' %}
+      {{ create_empty_table_from_relation(backup_relation, view_relation) }}
     {%- endcall %}
-    {% do exchange_tables_atomic(backup_relation, existing_relation) %}
+    {% do exchange_tables_atomic(backup_relation, existing_relation_local) %}
   {% else %}
     {% do run_query(create_empty_table_from_relation(intermediate_relation, view_relation)) or '' %}
     {{ adapter.rename_relation(existing_relation_local, backup_relation) }}
     {{ adapter.rename_relation(intermediate_relation, target_relation_local) }}
+    {{ create_distributed_table(target_relation, target_relation_local) }}
   {% endif %}
   {% do run_query(clickhouse__insert_into(target_relation, sql)) or '' %}
   {{ drop_relation_if_exists(view_relation) }}

From 80cba25b17475c8264a84f076a8f34e3aa545e70 Mon Sep 17 00:00:00 2001
From: Geoff Genz
Date: Tue, 22 Aug 2023 16:26:56 -0600
Subject: [PATCH 14/78] Bump version and changelog (#185)

---
 CHANGELOG.md                           | 5 +++++
 dbt/adapters/clickhouse/__version__.py | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 53c4b921..7fafb9a5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+### Release [1.4.8], 2023-08-22
+#### Bug Fix
+- Fixed issues with experimental Distributed table materializations. Closes https://github.com/ClickHouse/dbt-clickhouse/issues/179.
+Thanks to [Zhenbang](https://github.com/zli06160) for the report and for contributing to the fix with [gfunc](https://github.com/gfunc).
+ ### Release [1.4.7], 2023-08-09 #### Bug Fix - Fixed an exception in "legacy" incremental materializations that are not distributed diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index 392ee67d..0e921568 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.4.7' +version = '1.4.8' From b79669ae555f5b060ef42dc14a3d6372cd418b33 Mon Sep 17 00:00:00 2001 From: Andy Date: Sun, 22 Oct 2023 00:11:27 +0100 Subject: [PATCH 15/78] cluster names containing dash characters (#198) (#200) Co-authored-by: the4thamigo-uk --- dbt/adapters/clickhouse/dbclient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index 2ad197ba..320ea566 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -150,7 +150,7 @@ def _ensure_database(self, database_engine, cluster_name) -> None: db_exists = self.command(check_db) if not db_exists: engine_clause = f' ENGINE {database_engine} ' if database_engine else '' - cluster_clause = f' ON CLUSTER {cluster_name} ' if cluster_name is not None else '' + cluster_clause = f' ON CLUSTER "{cluster_name}" ' if cluster_name is not None else '' self.command(f'CREATE DATABASE {self.database}{cluster_clause}{engine_clause}') db_exists = self.command(check_db) if not db_exists: From d63285a1af318f94395ce3052ba739538a9a7308 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Thu, 26 Oct 2023 16:44:47 -0600 Subject: [PATCH 16/78] Add basic error test, fix minor merge conflict (#202) --- dbt/adapters/clickhouse/dbclient.py | 6 ++++- tests/integration/adapter/test_errors.py | 30 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 tests/integration/adapter/test_errors.py diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index 320ea566..d029c16e 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -150,7 +150,11 @@ def _ensure_database(self, database_engine, cluster_name) -> None: db_exists = self.command(check_db) if not db_exists: engine_clause = f' ENGINE {database_engine} ' if database_engine else '' - cluster_clause = f' ON CLUSTER "{cluster_name}" ' if cluster_name is not None else '' + cluster_clause = ( + f' ON CLUSTER {cluster_name} ' + if cluster_name is not None and cluster_name.strip() != '' + else '' + ) self.command(f'CREATE DATABASE {self.database}{cluster_clause}{engine_clause}') db_exists = self.command(check_db) if not db_exists: diff --git a/tests/integration/adapter/test_errors.py b/tests/integration/adapter/test_errors.py new file mode 100644 index 00000000..cd841e83 --- /dev/null +++ b/tests/integration/adapter/test_errors.py @@ -0,0 +1,30 @@ +import pytest +from dbt.tests.util import run_dbt + +oom_table_sql = """ +SELECT a FROM system.numbers_mt GROUP BY repeat(toString(number), 100000) as a +""" + +schema_yaml = """ +version: 2 + +models: + - name: oom_table + description: Table that generates OOM + config: + materialized: table + order_by: a +""" + + +class TestOOMError: + @pytest.fixture(scope="class") + def models(self): + return { + "schema.yml": schema_yaml, + "oom_table.sql": oom_table_sql, + } + + def test_oom(self, project): + res = run_dbt(["run"], expect_pass=False) + assert 'exceeded' in res.results[0].message From 96474f10950ac398ea7e9ea32230d77a96ec9a35 Mon Sep 17 00:00:00 2001 From: gfunc Date: Thu, 26 
Oct 2023 17:46:21 -0500
Subject: [PATCH 17/78] Cluster setting and Distributed Table tests (#186)

* added can_on_cluster var in ClickhouseRelation
* add tests for cluster
* fix lint issue
* debug set cluster env variable
* debug test
* debug and add tests
* skip distributed table grant test
* debug workflow
* debug workflow
* debug test
* add tests for distributed_incremental
* fix zk path error
* fix wrong alias for distributed materializations; update aliases test
* update based on review

---
 .github/workflows/test_matrix.yml           |  16 +-
 README.md                                   |  31 ++-
 dbt/adapters/clickhouse/impl.py             |  19 +-
 dbt/adapters/clickhouse/relation.py         |  52 +++-
 dbt/include/clickhouse/macros/adapters.sql  |  40 ++-
 .../macros/adapters/apply_grants.sql        |   6 +-
 .../clickhouse/macros/adapters/relation.sql |   4 +-
 .../materializations/distributed_table.sql  |  20 +-
 .../distributed_incremental.sql             |  12 +-
 .../{ => incremental}/incremental.sql       |  17 +-
 .../incremental/is_incremental.sql          |  13 +
 .../macros/materializations/seed.sql        |   2 +-
 .../macros/materializations/table.sql       |   6 +-
 .../clickhouse/macros/persist_docs.sql      |   6 +-
 .../test_distributed_incremental.py         | 205 ++++++++++++++
 tests/integration/adapter/test_aliases.py   | 101 +++++++
 tests/integration/adapter/test_basic.py     | 253 +++++++++++++++++-
 tests/integration/adapter/test_grants.py    |  49 ++++
 tests/integration/conftest.py               |  10 +-
 tests/integration/docker-compose.yml        |  48 +++-
 tests/integration/test_config.xml           |  89 ++++++
 21 files changed, 929 insertions(+), 70 deletions(-)
 rename dbt/include/clickhouse/macros/materializations/{ => incremental}/distributed_incremental.sql (92%)
 rename dbt/include/clickhouse/macros/materializations/{ => incremental}/incremental.sql (93%)
 create mode 100644 dbt/include/clickhouse/macros/materializations/incremental/is_incremental.sql
 create mode 100644 tests/integration/adapter/incremental/test_distributed_incremental.py
 create mode 100644 tests/integration/adapter/test_aliases.py
 create mode 100644 tests/integration/test_config.xml

diff --git a/.github/workflows/test_matrix.yml b/.github/workflows/test_matrix.yml
index d204477c..1efb9485 100644
--- a/.github/workflows/test_matrix.yml
+++ b/.github/workflows/test_matrix.yml
@@ -44,16 +44,10 @@ jobs:
           echo "TEST_SETTINGS_FILE=22_3" >> $GITHUB_ENV
           echo "DBT_CH_TEST_CH_VERSION=22.3" >> $GITHUB_ENV

-      - name: Run ClickHouse Container
-        run: docker run
-          -d
-          -p 8123:8123
-          -p 9000:9000
-          --name clickhouse
-          -v /var/lib/clickhouse
-          -v ${{ github.workspace }}/tests/integration/test_settings_$TEST_SETTINGS_FILE.xml:/etc/clickhouse-server/users.d/test_settings.xml
-          --ulimit nofile=262144:262144
-          clickhouse/clickhouse-server:${{ matrix.clickhouse-version }}
+      - name: Run ClickHouse Cluster Containers
+        env:
+          PROJECT_ROOT: ${{ github.workspace }}/tests/integration
+        run: REPLICA_NUM=1 docker-compose -f ${{ github.workspace }}/tests/integration/docker-compose.yml up -d

       - name: Setup Python ${{ matrix.python-version }}
         uses: actions/setup-python@v4
@@ -64,6 +58,8 @@ jobs:
         run: pip3 install -r dev_requirements.txt

       - name: Run HTTP tests
+        env:
+          DBT_CH_TEST_CLUSTER: test_shard
         run: |
           PYTHONPATH="${PYTHONPATH}:dbt" pytest tests

diff --git a/README.md b/README.md
index 0b8b3477..083d38ac 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ your_profile_name:
   port: [8123] # If not set, defaults to 8123, 8443, 9000, 9440 depending on the secure and driver settings
   user: [default] # User for all database operations
   password: [] # Password for the user
-  cluster: [] If set, DDL/table operations will be
executed with the `ON CLUSTER` clause using this cluster
+  cluster: [] # If set, certain DDL/table operations will be executed with the `ON CLUSTER` clause using this cluster. Distributed materializations require this setting to work. See the following ClickHouse Cluster section for more details.
   verify: [True] # Validate TLS certificate if using TLS/SSL
   secure: [False] # Use TLS (native protocol) or HTTPS (http protocol)
   retries: [1] # Number of times to retry a "retriable" database exception (such as a 503 'Service Unavailable' error)
@@ -75,7 +75,7 @@ your_profile_name:
   cluster_mode: [False] # Use specific settings designed to improve operation on Replicated databases (recommended for ClickHouse Cloud)
   use_lw_deletes: [False] # Use the strategy `delete+insert` as the default incremental strategy.
   check_exchange: [True] # Validate that ClickHouse supports the atomic EXCHANGE TABLES command. (Not needed for most ClickHouse versions)
-  local_suffix [local] # Table suffix of local tables on shards for distributed materializations
+  local_suffix: [_local] # Table suffix of local tables on shards for distributed materializations.
   custom_settings: [{}] # A dictionary/mapping of custom ClickHouse settings for the connection - default is empty.

   # Native (clickhouse-driver) connection settings
@@ -91,14 +91,35 @@ your_profile_name:
 | engine | The table engine (type of table) to use when creating tables | Optional (default: `MergeTree()`) |
 | order_by | A tuple of column names or arbitrary expressions. This allows you to create a small sparse index that helps find data faster. | Optional (default: `tuple()`) |
 | partition_by | A partition is a logical combination of records in a table by a specified criterion. The partition key can be any expression from the table columns. | Optional |
+| sharding_key | The sharding key determines the destination server when inserting into a distributed engine table. The sharding key can be random or the output of a hash function. | Optional (default: `rand()`) |
 | primary_key | Like order_by, a ClickHouse primary key expression. If not specified, ClickHouse will use the order by expression as the primary key |
 | unique_key | A tuple of column names that uniquely identify rows. Used with incremental models for updates. | Optional |
 | inserts_only | If set to True for an incremental model, incremental updates will be inserted directly to the target table without creating an intermediate table. It has been deprecated in favor of the `append` incremental `strategy`, which operates in the same way. | Optional |
 | incremental_strategy | Incremental model update strategy of `delete+insert` or `append`. See the following Incremental Model Strategies | Optional (default: `default`) |
 | incremental_predicates | Additional conditions to be applied to the incremental materialization (only applied to the `delete+insert` strategy) |
+## ClickHouse Cluster
+
+The `cluster` setting in the profile enables dbt-clickhouse to run against a ClickHouse cluster.
+
+### Effective Scope
+
+If `cluster` is set in the profile, `on_cluster_clause` will return the cluster clause for:
+- Database creation
+- View materialization
+- Distributed materializations
+- Models with Replicated engines
+
+Table and incremental materializations with a non-replicated engine are not affected by the `cluster` setting (the model is created on the connected node only); a minimal example follows below.
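To make the scope above concrete, here is a minimal sketch of a model that uses the distributed materialization added in this patch. The model name, columns, and sharding expression are illustrative, not part of the patch, and it assumes `cluster` is set in the profile:

```sql
-- models/events_distributed.sql (hypothetical example)
-- With `cluster` set, dbt-clickhouse creates a MergeTree table with the
-- configured local suffix on each shard, then a Distributed table in front
-- of it that routes inserts using the sharding_key below.
{{ config(
    materialized='distributed_table',
    engine='MergeTree()',
    order_by='(event_date, event_id)',
    sharding_key='cityHash64(event_id)'
) }}
select
    toDate(now()) - number as event_date,
    number as event_id,
    concat('value', toString(number)) as value1
from numbers(10)
```

Running this model without `cluster` set in the profile raises a compilation error, which is the behavior the distributed materialization macros later in this patch enforce.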
+### Compatibility
+
+If a model has been created without a `cluster` setting, dbt-clickhouse will detect the situation and run all DDL/DML without the `on cluster` clause for this model.
+
 ## Known Limitations
-* Replicated tables (combined with the `cluster` profile setting) are available using the `on_cluster_clause` macro but are not included in the test suite and not formally tested.
 * Ephemeral models/CTEs don't work if placed before the "INSERT INTO" in a ClickHouse insert statement, see https://github.com/ClickHouse/ClickHouse/issues/30323. This should not affect most models, but care should be taken where an ephemeral model is placed in model definitions and other SQL statements.
@@ -143,7 +164,7 @@ The following macros are included to facilitate creating ClickHouse specific tab
 - `partition_cols` -- Uses the `partition_by` model configuration property to assign a ClickHouse partition key. No partition key is assigned by default.
 - `order_cols` -- Uses the `order_by` model configuration to assign a ClickHouse order by/sorting key. If not specified, ClickHouse will use an empty tuple() and the table will be unsorted.
 - `primary_key_clause` -- Uses the `primary_key` model configuration property to assign a ClickHouse primary key. By default, the primary key is set and ClickHouse will use the order by clause as the primary key.
-- `on_cluster_clause` -- Uses the `cluster` profile property to add an `ON CLUSTER` clause to all dbt-operations
+- `on_cluster_clause` -- Uses the `cluster` profile property to add an `ON CLUSTER` clause to certain dbt operations: distributed materializations, view creation, and database creation.

 ### s3Source Helper Macro

@@ -168,7 +189,6 @@ See the [S3 test file](https://github.com/ClickHouse/dbt-clickhouse/blob/main/te
 Notes:

-- Distributed materializations are experimental and are not currently included in the automated test suite.
 - dbt-clickhouse queries now automatically include the setting `insert_distributed_sync = 1` in order to ensure that downstream incremental materialization operations execute correctly. This could cause some distributed table inserts to run more slowly than expected.

@@ -281,6 +301,7 @@ configuration file (this file should not be checked into git). The following en
 8. DBT_CH_TEST_CH_VERSION - ClickHouse docker image to use. Defaults to `latest`
 9. DBT_CH_TEST_INCLUDE_S3 - Include S3 tests. Default=False since these are currently dependent on a specific ClickHouse S3 bucket/test dataset
 10. DBT_CH_TEST_CLUSTER_MODE - Use the profile value
+11.
DBT_CH_TEST_CLUSTER - ClickHouse cluster name, if DBT_CH_TEST_USE_DOCKER set to true, only `test_replica` and `test_shard` is valid (see tests/test_config.xml for cluster settings) ## Original Author diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index f482ddb4..c59e0497 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -30,6 +30,7 @@ class ClickHouseConfig(AdapterConfig): engine: str = 'MergeTree()' order_by: Optional[Union[List[str], str]] = 'tuple()' partition_by: Optional[Union[List[str], str]] = None + sharding_key: Optional[Union[List[str], str]] = 'rand()' class ClickHouseAdapter(SQLAdapter): @@ -77,8 +78,11 @@ def get_clickhouse_cluster_name(self): @available.parse(lambda *a, **k: {}) def get_clickhouse_local_suffix(self): conn = self.connections.get_if_exists() - if conn.credentials.local_suffix: - return f'{conn.credentials.local_suffix}' + suffix = conn.credentials.local_suffix + if suffix: + if suffix.startswith('_'): + return f'{suffix}' + return f'_{suffix}' @available def clickhouse_db_engine_clause(self): @@ -107,6 +111,13 @@ def can_exchange(self, schema: str, rel_type: str) -> bool: ch_db = self.get_ch_database(schema) return ch_db and ch_db.engine in ('Atomic', 'Replicated') + @available.parse_none + def should_on_cluster(self, materialized: str = '', engine: str = '') -> bool: + conn = self.connections.get_if_exists() + if conn and conn.credentials.cluster: + return ClickHouseRelation.get_on_cluster(conn.credentials.cluster, materialized, engine) + return ClickHouseRelation.get_on_cluster('', materialized, engine) + @available.parse_none def calculate_incremental_strategy(self, strategy: str) -> str: conn = self.connections.get_if_exists() @@ -198,19 +209,21 @@ def list_relations_without_caching( relations = [] for row in results: - name, schema, type_info, db_engine = row + name, schema, type_info, db_engine, on_cluster = row rel_type = RelationType.View if 'view' in type_info else RelationType.Table can_exchange = ( conn_supports_exchange and rel_type == RelationType.Table and db_engine in ('Atomic', 'Replicated') ) + relation = self.Relation.create( database=None, schema=schema, identifier=name, type=rel_type, can_exchange=can_exchange, + can_on_cluster=(on_cluster >= 1), ) relations.append(relation) diff --git a/dbt/adapters/clickhouse/relation.py b/dbt/adapters/clickhouse/relation.py index f7a044e5..3fc91437 100644 --- a/dbt/adapters/clickhouse/relation.py +++ b/dbt/adapters/clickhouse/relation.py @@ -1,10 +1,11 @@ from dataclasses import dataclass, field -from typing import Any, Optional, Type +from typing import Any, Dict, Optional, Type from dbt.adapters.base.relation import BaseRelation, Policy, Self -from dbt.contracts.graph.nodes import SourceDefinition +from dbt.contracts.graph.nodes import ManifestNode, SourceDefinition +from dbt.contracts.relation import HasQuoting from dbt.exceptions import DbtRuntimeError -from dbt.utils import deep_merge +from dbt.utils import deep_merge, merge @dataclass @@ -27,6 +28,7 @@ class ClickHouseRelation(BaseRelation): include_policy: Policy = field(default_factory=lambda: ClickHouseIncludePolicy()) quote_character: str = '' can_exchange: bool = False + can_on_cluster: bool = False def __post_init__(self): if self.database != self.schema and self.database: @@ -50,6 +52,23 @@ def matches( raise DbtRuntimeError(f'Passed unexpected schema value {schema} to Relation.matches') return self.database == database and self.identifier == identifier + @property + def 
should_on_cluster(self) -> bool: + if self.include_policy.identifier: + return self.can_on_cluster + else: + # create database/schema on cluster by default + return True + + @classmethod + def get_on_cluster( + cls: Type[Self], cluster: str = '', materialized: str = '', engine: str = '' + ) -> bool: + if cluster.strip(): + return 'view' == materialized or 'distributed' in materialized or 'Replicated' in engine + else: + return False + @classmethod def create_from_source(cls: Type[Self], source: SourceDefinition, **kwargs: Any) -> Self: source_quoting = source.quoting.to_dict(omit_none=True) @@ -73,3 +92,30 @@ def create_from_source(cls: Type[Self], source: SourceDefinition, **kwargs: Any) quote_policy=quote_policy, **kwargs, ) + + @classmethod + def create_from_node( + cls: Type[Self], + config: HasQuoting, + node: ManifestNode, + quote_policy: Optional[Dict[str, bool]] = None, + **kwargs: Any, + ) -> Self: + if quote_policy is None: + quote_policy = {} + + quote_policy = merge(config.quoting, quote_policy) + + cluster = config.credentials.cluster if config.credentials.cluster else '' + materialized = node.get_materialization() if node.get_materialization() else '' + engine = node.config.get('engine') if node.config.get('engine') else '' + can_on_cluster = cls.get_on_cluster(cluster, materialized, engine) + + return cls.create( + database=node.database, + schema=node.schema, + identifier=node.alias, + quote_policy=quote_policy, + can_on_cluster=can_on_cluster, + **kwargs, + ) diff --git a/dbt/include/clickhouse/macros/adapters.sql b/dbt/include/clickhouse/macros/adapters.sql index 2d5301b6..8d52a2c1 100644 --- a/dbt/include/clickhouse/macros/adapters.sql +++ b/dbt/include/clickhouse/macros/adapters.sql @@ -3,7 +3,7 @@ {{ sql_header if sql_header is not none }} - create view {{ relation.include(database=False) }} {{ on_cluster_clause()}} + create view {{ relation.include(database=False) }} {{ on_cluster_clause(relation)}} as ( {{ sql }} ) @@ -19,14 +19,14 @@ {% macro clickhouse__create_schema(relation) -%} {%- call statement('create_schema') -%} create database if not exists {{ relation.without_identifier().include(database=False) }} - {{ on_cluster_clause()}} + {{ on_cluster_clause(relation)}} {{ adapter.clickhouse_db_engine_clause() }} {% endcall %} {% endmacro %} {% macro clickhouse__drop_schema(relation) -%} {%- call statement('drop_schema') -%} - drop database if exists {{ relation.without_identifier().include(database=False) }} {{ on_cluster_clause()}} + drop database if exists {{ relation.without_identifier().include(database=False) }} {{ on_cluster_clause(relation)}} {%- endcall -%} {% endmacro %} @@ -36,9 +36,19 @@ t.name as name, t.database as schema, if(engine not in ('MaterializedView', 'View'), 'table', 'view') as type, - db.engine as db_engine - from system.tables as t JOIN system.databases as db on t.database = db.name - where schema = '{{ schema_relation.schema }}' + db.engine as db_engine, + {%- if adapter.get_clickhouse_cluster_name() -%} + count(distinct _shard_num) > 1 as is_on_cluster + from clusterAllReplicas({{ adapter.get_clickhouse_cluster_name() }}, system.tables) as t + join system.databases as db on t.database = db.name + where schema = '{{ schema_relation.schema }}' + group by name, schema, type, db_engine + {%- else -%} + 0 as is_on_cluster + from system.tables as t join system.databases as db on t.database = db.name + where schema = '{{ schema_relation.schema }}' + {% endif %} + {% endcall %} {{ return(load_result('list_relations_without_caching').table) }} {% 
endmacro %} @@ -56,22 +66,23 @@ {% macro clickhouse__drop_relation(relation, obj_type='table') -%} {% call statement('drop_relation', auto_begin=False) -%} - drop {{ obj_type }} if exists {{ relation }} {{ on_cluster_clause()}} + {# drop relation on cluster by default if cluster is set #} + drop {{ obj_type }} if exists {{ relation }} {{ on_cluster_clause(relation.without_identifier())}} {%- endcall %} {% endmacro %} {% macro clickhouse__rename_relation(from_relation, to_relation, obj_type='table') -%} {% call statement('drop_relation') %} - drop {{ obj_type }} if exists {{ to_relation }} {{ on_cluster_clause()}} + drop {{ obj_type }} if exists {{ to_relation }} {{ on_cluster_clause(to_relation.without_identifier())}} {% endcall %} {% call statement('rename_relation') %} - rename {{ obj_type }} {{ from_relation }} to {{ to_relation }} {{ on_cluster_clause()}} + rename {{ obj_type }} {{ from_relation }} to {{ to_relation }} {{ on_cluster_clause(from_relation)}} {% endcall %} {% endmacro %} {% macro clickhouse__truncate_relation(relation) -%} {% call statement('truncate_relation') -%} - truncate table {{ relation }} {{ on_cluster_clause()}} + truncate table {{ relation }} {{ on_cluster_clause(relation)}} {%- endcall %} {% endmacro %} @@ -100,17 +111,18 @@ {% macro clickhouse__alter_column_type(relation, column_name, new_column_type) -%} {% call statement('alter_column_type') %} - alter table {{ relation }} {{ on_cluster_clause()}} modify column {{ adapter.quote(column_name) }} {{ new_column_type }} + alter table {{ relation }} {{ on_cluster_clause(relation)}} modify column {{ adapter.quote(column_name) }} {{ new_column_type }} {% endcall %} {% endmacro %} {% macro exchange_tables_atomic(old_relation, target_relation, obj_types='TABLES') %} {%- if adapter.get_clickhouse_cluster_name() is not none and obj_types == 'TABLES' and 'Replicated' in engine_clause() %} - {% do run_query("SYSTEM SYNC REPLICA " + on_cluster_clause() + target_relation.schema + '.' 
+ target_relation.identifier) %} + {%- call statement('exchange_table_sync_replica') -%} + SYSTEM SYNC REPLICA {{ on_cluster_clause(target_relation) }} {{ target_relation.schema }}.{{ target_relation.identifier }} + {% endcall %} {%- endif %} - {%- call statement('exchange_tables_atomic') -%} - EXCHANGE {{ obj_types }} {{ old_relation }} AND {{ target_relation }} {{ on_cluster_clause()}} + EXCHANGE {{ obj_types }} {{ old_relation }} AND {{ target_relation }} {{ on_cluster_clause(target_relation)}} {% endcall %} {% endmacro %} diff --git a/dbt/include/clickhouse/macros/adapters/apply_grants.sql b/dbt/include/clickhouse/macros/adapters/apply_grants.sql index cd9732d4..387b333b 100644 --- a/dbt/include/clickhouse/macros/adapters/apply_grants.sql +++ b/dbt/include/clickhouse/macros/adapters/apply_grants.sql @@ -1,5 +1,5 @@ {% macro clickhouse__get_show_grant_sql(relation) %} - SELECT access_type as privilege_type, COALESCE(user_name, role_name) as grantee FROM system.grants WHERE table = '{{ relation.name }}' + SELECT access_type as privilege_type, COALESCE(user_name, role_name) as grantee from system.grants where table = '{{ relation.name }}' AND database = '{{ relation.schema }}' {%- endmacro %} @@ -13,9 +13,9 @@ {%- macro clickhouse__get_grant_sql(relation, privilege, grantees) -%} - grant {{ on_cluster_clause()}} {{ privilege }} on {{ relation }} to {{ grantees | join(', ') }} + grant {{ on_cluster_clause(relation)}} {{ privilege }} on {{ relation }} to {{ grantees | join(', ') }} {%- endmacro -%} {%- macro clickhouse__get_revoke_sql(relation, privilege, grantees) -%} - revoke {{ on_cluster_clause()}} {{ privilege }} on {{ relation }} from {{ grantees | join(', ') }} + revoke {{ on_cluster_clause(relation)}} {{ privilege }} on {{ relation }} from {{ grantees | join(', ') }} {%- endmacro -%} diff --git a/dbt/include/clickhouse/macros/adapters/relation.sql b/dbt/include/clickhouse/macros/adapters/relation.sql index d6ec3d0f..59ce37ab 100644 --- a/dbt/include/clickhouse/macros/adapters/relation.sql +++ b/dbt/include/clickhouse/macros/adapters/relation.sql @@ -5,12 +5,14 @@ {% endif %} {%- set can_exchange = adapter.can_exchange(schema, type) %} + {%- set should_on_cluster = adapter.should_on_cluster(config.get('materialized'), engine_clause()) %} {%- set new_relation = api.Relation.create( database=None, schema=schema, identifier=identifier, type=type, - can_exchange=can_exchange + can_exchange=can_exchange, + can_on_cluster=should_on_cluster ) -%} {% do return([false, new_relation]) %} {% endmacro %} diff --git a/dbt/include/clickhouse/macros/materializations/distributed_table.sql b/dbt/include/clickhouse/macros/materializations/distributed_table.sql index dfccc0d7..9f920ad9 100644 --- a/dbt/include/clickhouse/macros/materializations/distributed_table.sql +++ b/dbt/include/clickhouse/macros/materializations/distributed_table.sql @@ -9,8 +9,13 @@ {%- set existing_relation = load_cached_relation(this) -%} {%- set target_relation = this.incorporate(type='table') -%} - {% set existing_relation_local = existing_relation.incorporate(path={"identifier": model['name'] + local_suffix}) if existing_relation is not none else none %} - {% set target_relation_local = target_relation.incorporate(path={"identifier": model['name'] + local_suffix}) if target_relation is not none else none %} + {% set on_cluster = on_cluster_clause(target_relation) %} + {% if on_cluster.strip() == '' %} + {% do exceptions.raise_compiler_error('To use distributed materialization cluster setting in dbt profile must be set') %} 
+ {% endif %} + + {% set existing_relation_local = existing_relation.incorporate(path={"identifier": this.identifier + local_suffix}) if existing_relation is not none else none %} + {% set target_relation_local = target_relation.incorporate(path={"identifier": this.identifier + local_suffix}) if target_relation is not none else none %} {%- set backup_relation = none -%} {%- set preexisting_backup_relation = none -%} @@ -59,6 +64,7 @@ {{ drop_relation_if_exists(view_relation) }} -- cleanup {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %} + {% do apply_grants(target_relation_local, grant_config, should_revoke=should_revoke) %} {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} {% do persist_docs(target_relation, model) %} @@ -79,11 +85,13 @@ {%- set cluster = cluster[1:-1] -%} {%- set sharding = config.get('sharding_key') -%} - CREATE TABLE {{ relation }} {{ on_cluster_clause() }} AS {{ local_relation }} + create table {{ relation }} {{ on_cluster_clause(relation) }} as {{ local_relation }} ENGINE = Distributed('{{ cluster}}', '{{ relation.schema }}', '{{ local_relation.name }}' - {% if sharding is not none %} + {%- if sharding is not none and sharding.strip() != '' -%} , {{ sharding }} - {% endif %} + {%- else %} + , rand() + {% endif -%} ) {% endmacro %} @@ -98,7 +106,7 @@ {{ sql_header if sql_header is not none }} create table {{ relation.include(database=False) }} - {{ on_cluster_clause() }} ( + {{ on_cluster_clause(relation) }} ( {{col_list | join(', ')}} ) diff --git a/dbt/include/clickhouse/macros/materializations/distributed_incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental/distributed_incremental.sql similarity index 92% rename from dbt/include/clickhouse/macros/materializations/distributed_incremental.sql rename to dbt/include/clickhouse/macros/materializations/incremental/distributed_incremental.sql index cdc53151..568ada36 100644 --- a/dbt/include/clickhouse/macros/materializations/distributed_incremental.sql +++ b/dbt/include/clickhouse/macros/materializations/incremental/distributed_incremental.sql @@ -9,8 +9,13 @@ {%- set existing_relation = load_cached_relation(this) -%} {%- set target_relation = this.incorporate(type='table') -%} - {% set existing_relation_local = existing_relation.incorporate(path={"identifier": model['name'] + local_suffix}) if existing_relation is not none else none %} - {% set target_relation_local = target_relation.incorporate(path={"identifier": model['name'] + local_suffix}) if target_relation is not none else none %} + {% set on_cluster = on_cluster_clause(target_relation) %} + {% if on_cluster.strip() == '' %} + {% do exceptions.raise_compiler_error('To use distributed materializations cluster setting in dbt profile must be set') %} + {% endif %} + + {% set existing_relation_local = existing_relation.incorporate(path={"identifier": this.identifier + local_suffix}) if existing_relation is not none else none %} + {% set target_relation_local = target_relation.incorporate(path={"identifier": this.identifier + local_suffix}) if target_relation is not none else none %} {%- set unique_key = config.get('unique_key') -%} {% if unique_key is not none and unique_key|length == 0 %} @@ -100,7 +105,7 @@ {% endif %} -- Structure could have changed, need to update distributed table from replaced local table - {% set target_relation_new = target_relation.incorporate(path={"identifier": model['name'] + '_temp'}) %} + {% set target_relation_new = 
target_relation.incorporate(path={"identifier": target_relation.identifier + '_temp'}) %} {{ drop_relation_if_exists(target_relation_new) }} {% do run_query(create_distributed_table(target_relation_new, target_relation_local)) %} @@ -118,6 +123,7 @@ {% set should_revoke = should_revoke(existing_relation, full_refresh_mode) %} {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} + {% do apply_grants(target_relation_local, grant_config, should_revoke=should_revoke) %} {% do persist_docs(target_relation, model) %} diff --git a/dbt/include/clickhouse/macros/materializations/incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql similarity index 93% rename from dbt/include/clickhouse/macros/materializations/incremental.sql rename to dbt/include/clickhouse/macros/materializations/incremental/incremental.sql index 1eb35b0e..9f9fa4bc 100644 --- a/dbt/include/clickhouse/macros/materializations/incremental.sql +++ b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql @@ -135,7 +135,7 @@ {% macro clickhouse__incremental_legacy(existing_relation, intermediate_relation, on_schema_change, unique_key, is_distributed=False) %} - {% set new_data_relation = existing_relation.incorporate(path={"identifier": model['name'] + '__dbt_new_data'}) %} + {% set new_data_relation = existing_relation.incorporate(path={"identifier": existing_relation.identifier + '__dbt_new_data'}) %} {{ drop_relation_if_exists(new_data_relation) }} {%- set inserted_relation = intermediate_relation -%} @@ -144,7 +144,7 @@ -- First create a temporary table for all of the new data {% if is_distributed %} -- Need to use distributed table to have data on all shards - {%- set distributed_new_data_relation = existing_relation.incorporate(path={"identifier": model['name'] + '__dbt_distributed_new_data'}) -%} + {%- set distributed_new_data_relation = existing_relation.incorporate(path={"identifier": existing_relation.identifier + '__dbt_distributed_new_data'}) -%} {%- set inserting_relation = distributed_new_data_relation -%} {{ create_distributed_local_table(distributed_new_data_relation, new_data_relation, existing_relation, sql) }} {% else %} @@ -162,7 +162,7 @@ {{ create_distributed_local_table(distributed_intermediate_relation, intermediate_relation, existing_relation) }} {% else %} {% call statement('main') %} - create table {{ intermediate_relation }} as {{ new_data_relation }} {{ on_cluster_clause() }} + create table {{ intermediate_relation }} as {{ new_data_relation }} {{ on_cluster_clause(existing_relation) }} {% endcall %} {% endif %} @@ -199,10 +199,10 @@ {% macro clickhouse__incremental_delete_insert(existing_relation, unique_key, incremental_predicates, is_distributed=False) %} - {% set new_data_relation = existing_relation.incorporate(path={"identifier": model['name'] + {% set new_data_relation = existing_relation.incorporate(path={"identifier": existing_relation.identifier + '__dbt_new_data_' + invocation_id.replace('-', '_')}) %} {{ drop_relation_if_exists(new_data_relation) }} - {%- set distributed_new_data_relation = existing_relation.incorporate(path={"identifier": model['name'] + '__dbt_distributed_new_data'}) -%} + {%- set distributed_new_data_relation = existing_relation.incorporate(path={"identifier": existing_relation.identifier + '__dbt_distributed_new_data'}) -%} {%- set inserting_relation = new_data_relation -%} @@ -217,8 +217,13 @@ {% endif %} {% call statement('delete_existing_data') %} - delete from {{ existing_relation }} 
where ({{ unique_key }}) in (select {{ unique_key }} + {% if is_distributed %} + delete from {{ existing_relation }}{{ adapter.get_clickhouse_local_suffix() }} {{ on_cluster_clause(existing_relation) }} where ({{ unique_key }}) in (select {{ unique_key }} from {{ inserting_relation }}) + {% else %} + delete from {{ existing_relation }} where ({{ unique_key }}) in (select {{ unique_key }} + from {{ inserting_relation }}) + {% endif %} {%- if incremental_predicates %} {% for predicate in incremental_predicates %} and {{ predicate }} diff --git a/dbt/include/clickhouse/macros/materializations/incremental/is_incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental/is_incremental.sql new file mode 100644 index 00000000..552e0ac5 --- /dev/null +++ b/dbt/include/clickhouse/macros/materializations/incremental/is_incremental.sql @@ -0,0 +1,13 @@ + +{% macro is_incremental() %} + {#-- do not run introspective queries in parsing #} + {% if not execute %} + {{ return(False) }} + {% else %} + {% set relation = adapter.get_relation(this.database, this.schema, this.table) %} + {{ return(relation is not none + and relation.type == 'table' + and (model.config.materialized == 'incremental' or model.config.materialized == 'distributed_incremental' ) + and not should_full_refresh()) }} + {% endif %} +{% endmacro %} diff --git a/dbt/include/clickhouse/macros/materializations/seed.sql b/dbt/include/clickhouse/macros/materializations/seed.sql index c5f7a02b..120e6c48 100644 --- a/dbt/include/clickhouse/macros/materializations/seed.sql +++ b/dbt/include/clickhouse/macros/materializations/seed.sql @@ -17,7 +17,7 @@ {%- set quote_seed_column = model['config'].get('quote_columns', None) -%} {% set sql %} - create table {{ this.render() }} {{ on_cluster_clause() }} ( + create table {{ this.render() }} {{ on_cluster_clause(this) }} ( {%- for col_name in agate_table.column_names -%} {%- set inferred_type = adapter.convert_type(agate_table, loop.index0) -%} {%- set type = column_override.get(col_name, inferred_type) -%} diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 18362a2a..0e93d633 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -121,9 +121,9 @@ {%- endif %} {%- endmacro -%} -{% macro on_cluster_clause(label) %} +{% macro on_cluster_clause(relation) %} {% set active_cluster = adapter.get_clickhouse_cluster_name() %} - {%- if active_cluster is not none %} + {%- if active_cluster is not none and relation.should_on_cluster %} {# Add trailing whitespace to avoid problems when this clause is not last #} ON CLUSTER {{ active_cluster + ' ' }} {%- endif %} @@ -154,7 +154,7 @@ {{ adapter.get_model_settings(model) }} {%- else %} create table {{ relation.include(database=False) }} - {{ on_cluster_clause()}} + {{ on_cluster_clause(relation)}} {{ engine_clause() }} {{ order_cols(label="order by") }} {{ primary_key_clause(label="primary key") }} diff --git a/dbt/include/clickhouse/macros/persist_docs.sql b/dbt/include/clickhouse/macros/persist_docs.sql index 5d9db873..5e175fd1 100644 --- a/dbt/include/clickhouse/macros/persist_docs.sql +++ b/dbt/include/clickhouse/macros/persist_docs.sql @@ -1,13 +1,13 @@ {% macro one_alter_relation(relation, alter_comments) %} - alter table {{ relation }} {{ on_cluster_clause() }} {{ alter_comments }} + alter table {{ relation }} {{ on_cluster_clause(relation) }} {{ alter_comments }} {% endmacro %} {% 
macro one_alter_column_comment(relation, column_name, comment) %} - alter table {{ relation }} {{ on_cluster_clause() }} comment column {{ column_name }} '{{ comment }}' + alter table {{ relation }} {{ on_cluster_clause(relation) }} comment column {{ column_name }} '{{ comment }}' {% endmacro %} {% macro clickhouse__alter_relation_comment(relation, comment) %} - alter table {{ relation }} {{ on_cluster_clause() }} modify comment '{{ comment }}' + alter table {{ relation }} {{ on_cluster_clause(relation) }} modify comment '{{ comment }}' {% endmacro %} {% macro clickhouse__persist_docs(relation, model, for_relation, for_columns) %} diff --git a/tests/integration/adapter/incremental/test_distributed_incremental.py b/tests/integration/adapter/incremental/test_distributed_incremental.py new file mode 100644 index 00000000..bcc0ddf6 --- /dev/null +++ b/tests/integration/adapter/incremental/test_distributed_incremental.py @@ -0,0 +1,205 @@ +import os + +import pytest +from dbt.tests.adapter.basic.files import ( + model_incremental, + schema_base_yml, + seeds_added_csv, + seeds_base_csv, +) +from dbt.tests.adapter.basic.test_incremental import BaseIncremental, BaseIncrementalNotSchemaChange +from dbt.tests.util import run_dbt + +from tests.integration.adapter.incremental.test_incremental import uniq_schema + +uniq_source_model = """ +{{config( + materialized='distributed_table', + engine='MergeTree()', + order_by=['ts'], + unique_key=['impid'] + ) +}} +SELECT now() - toIntervalHour(number) as ts, toInt32(number) as impid, concat('value', toString(number)) as value1 + FROM numbers(100) +""" + +uniq_incremental_model = """ +{{ + config( + materialized='distributed_incremental', + engine='MergeTree()', + order_by=['ts'], + unique_key=['impid'] + ) +}} +select ts, impid from unique_source_one +{% if is_incremental() %} +where ts >= now() - toIntervalHour(1) +{% endif %} +""" + + +class TestSimpleDistributedIncremental: + @pytest.fixture(scope="class") + def models(self): + return { + "unique_source_one.sql": uniq_source_model, + "unique_incremental_one.sql": uniq_incremental_model, + "schema.yml": uniq_schema, + } + + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' + ) + def test_simple_incremental(self, project): + run_dbt(["run", "--select", "unique_source_one"]) + run_dbt(["run", "--select", "unique_incremental_one"]) + + +lw_delete_schema = """ +version: 2 + +models: + - name: "lw_delete_inc" + description: "Incremental table" +""" + +lw_delete_inc = """ +{{ config( + materialized='distributed_incremental', + order_by=['key1'], + unique_key='key1', + incremental_strategy='delete+insert' + ) +}} +{% if is_incremental() %} + WITH (SELECT max(key1) - 20 FROM lw_delete_inc) as old_max + SELECT assumeNotNull(toUInt64(number + old_max + 1)) as key1, toInt64(-(number + old_max)) as key2, toString(number + 30) as value FROM numbers(100) +{% else %} + SELECT toUInt64(number) as key1, toInt64(-number) as key2, toString(number) as value FROM numbers(100) +{% endif %} +""" + + +class TestLWDeleteDistributedIncremental: + @pytest.fixture(scope="class") + def models(self): + return {"lw_delete_inc.sql": lw_delete_inc} + + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' + ) + def test_lw_delete(self, project): + run_dbt() + result = project.run_sql("select count(*) as num_rows from lw_delete_inc", fetch="one") + assert result[0] == 100 + run_dbt() + result = project.run_sql("select count(*) as 
num_rows from lw_delete_inc", fetch="one") + assert result[0] == 180 + + +compound_key_schema = """ +version: 2 + +models: + - name: "compound_key_inc" + description: "Incremental table" +""" + +compound_key_inc = """ +{{ config( + materialized='distributed_incremental', + order_by=['key1', 'key2'], + unique_key='key1, key2', + incremental_strategy='delete+insert' + ) +}} +{% if is_incremental() %} + WITH (SELECT max(key1) - 20 FROM compound_key_inc) as old_max + SELECT assumeNotNull(toUInt64(number + old_max + 1)) as key1, toInt64(-key1) as key2, toString(number + 30) as value FROM numbers(100) +{% else %} + SELECT toUInt64(number) as key1, toInt64(-number) as key2, toString(number) as value FROM numbers(100) +{% endif %} +""" + + +class TestDistributedIncrementalCompoundKey: + @pytest.fixture(scope="class") + def models(self): + return {"compound_key_inc.sql": compound_key_inc} + + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' + ) + def test_compound_key(self, project): + run_dbt() + result = project.run_sql("select count(*) as num_rows from compound_key_inc", fetch="one") + assert result[0] == 100 + run_dbt() + result = project.run_sql("select count(*) as num_rows from compound_key_inc", fetch="one") + assert result[0] == 180 + + +replicated_seed_schema_yml = """ +version: 2 + +seeds: + - name: base + config: + engine: ReplicatedMergeTree('/clickhouse/tables/{uuid}/one_shard', '{server_index}' ) + - name: added + config: + engine: ReplicatedMergeTree('/clickhouse/tables/{uuid}/one_shard', '{server_index}' ) +""" + + +class TestInsertsOnlyDistributedIncrementalMaterialization(BaseIncremental): + @pytest.fixture(scope="class") + def models(self): + config_materialized_incremental = """ + {{ config(order_by='(some_date, id, name)', inserts_only=True, materialized='distributed_incremental', unique_key='id') }} + """ + incremental_sql = config_materialized_incremental + model_incremental + return { + "incremental.sql": incremental_sql, + "schema.yml": schema_base_yml, + } + + @pytest.fixture(scope="class") + def seeds(self): + return { + "base.csv": seeds_base_csv, + "added.csv": seeds_added_csv, + "schema.yml": replicated_seed_schema_yml, + } + + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' + ) + def test_incremental(self, project): + super().test_incremental(project) + + +incremental_not_schema_change_sql = """ +{{ config(materialized="distributed_incremental", unique_key="user_id_current_time",on_schema_change="sync_all_columns") }} +select + toString(1) || '-' || toString(now64()) as user_id_current_time, + {% if is_incremental() %} + 'thisis18characters' as platform + {% else %} + 'okthisis20characters' as platform + {% endif %} +""" + + +class TestDistributedIncrementalNotSchemaChange(BaseIncrementalNotSchemaChange): + @pytest.fixture(scope="class") + def models(self): + return {"incremental_not_schema_change.sql": incremental_not_schema_change_sql} + + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' + ) + def test_incremental_not_schema_change(self, project): + super().test_incremental_not_schema_change(project) diff --git a/tests/integration/adapter/test_aliases.py b/tests/integration/adapter/test_aliases.py new file mode 100644 index 00000000..30575aa8 --- /dev/null +++ b/tests/integration/adapter/test_aliases.py @@ -0,0 +1,101 @@ +import os + +import pytest +from dbt.tests.adapter.aliases.fixtures import ( 
+ MODELS__ALIAS_IN_PROJECT_SQL, + MODELS__ALIAS_IN_PROJECT_WITH_OVERRIDE_SQL, + MODELS__SCHEMA_YML, +) +from dbt.tests.adapter.aliases.test_aliases import ( + BaseAliasErrors, + BaseAliases, + BaseSameAliasDifferentDatabases, + BaseSameAliasDifferentSchemas, +) +from dbt.tests.util import relation_from_name, run_dbt + +MODELS__DISTRIBUTED_FOO_ALIAS_SQL = """ + +{{ + config( + alias='foo', + materialized='distributed_table' + ) +}} + +select {{ string_literal(this.name) }} as tablename + +""" + +MODELS__DISTRIBUTED_REF_FOO_ALIAS_SQL = """ + +{{ + config( + materialized='distributed_table' + ) +}} + +with trigger_ref as ( + + -- we should still be able to ref a model by its filepath + select * from {{ ref('foo_alias') }} + +) + +-- this name should still be the filename +select {{ string_literal(this.name) }} as tablename + +""" + + +class TestAliases(BaseAliases): + pass + + +class TestAliasErrors(BaseAliasErrors): + pass + + +class TestSameAliasDifferentSchemas(BaseSameAliasDifferentSchemas): + pass + + +class TestSameAliasDifferentDatabases(BaseSameAliasDifferentDatabases): + pass + + +class TestDistributedAliases(BaseAliases): + @pytest.fixture(scope="class") + def models(self): + return { + "schema.yml": MODELS__SCHEMA_YML, + "foo_alias.sql": MODELS__DISTRIBUTED_FOO_ALIAS_SQL, + "alias_in_project.sql": MODELS__ALIAS_IN_PROJECT_SQL, + "alias_in_project_with_override.sql": MODELS__ALIAS_IN_PROJECT_WITH_OVERRIDE_SQL, + "ref_foo_alias.sql": MODELS__DISTRIBUTED_REF_FOO_ALIAS_SQL, + } + + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' + ) + def test_alias_model_name(self, project): + results = run_dbt(["run"]) + assert len(results) == 4 + + cluster = project.test_config['cluster'] + relation = relation_from_name(project.adapter, "foo") + + result = project.run_sql( + f"select max(tablename) AS tablename From clusterAllReplicas('{cluster}', {relation}_local) ", + fetch="one", + ) + assert result[0] == "foo" + + relation = relation_from_name(project.adapter, "ref_foo_alias") + result = project.run_sql( + f"select max(tablename) AS tablename From clusterAllReplicas('{cluster}', {relation}_local) ", + fetch="one", + ) + assert result[0] == "ref_foo_alias" + + run_dbt(["test"]) diff --git a/tests/integration/adapter/test_basic.py b/tests/integration/adapter/test_basic.py index fc11e146..645c1181 100644 --- a/tests/integration/adapter/test_basic.py +++ b/tests/integration/adapter/test_basic.py @@ -1,5 +1,7 @@ +import os + import pytest -from dbt.tests.adapter.basic.files import model_base, schema_base_yml +from dbt.tests.adapter.basic.files import model_base, schema_base_yml, seeds_base_csv from dbt.tests.adapter.basic.test_adapter_methods import BaseAdapterMethod from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations from dbt.tests.adapter.basic.test_empty import BaseEmpty @@ -9,7 +11,13 @@ from dbt.tests.adapter.basic.test_singular_tests import BaseSingularTests from dbt.tests.adapter.basic.test_snapshot_check_cols import BaseSnapshotCheckCols from dbt.tests.adapter.basic.test_snapshot_timestamp import BaseSnapshotTimestamp -from dbt.tests.util import check_relation_types, relation_from_name, run_dbt +from dbt.tests.util import ( + check_relation_types, + check_relations_equal, + check_result_nodes_by_name, + relation_from_name, + run_dbt, +) # CSV content with boolean column type. 
seeds_boolean_csv = """ @@ -43,6 +51,27 @@ str1: Nullable(String) """ +replicated_seeds_schema_yml = """ +version: 2 + +seeds: + - name: empty + config: + engine: ReplicatedMergeTree('/clickhouse/tables/{uuid}/one_shard', '{server_index}' ) + column_types: + val2: Nullable(UInt32) + str1: Nullable(String) +""" + +base_seeds_schema_yml = """ +version: 2 + +seeds: + - name: base + config: + engine: ReplicatedMergeTree('/clickhouse/tables/{uuid}/one_shard', '{server_index}' ) +""" + class TestBaseSimpleMaterializations(BaseSimpleMaterializations): pass @@ -128,3 +157,223 @@ def test_seed(self, project): columns = project.run_sql("DESCRIBE TABLE empty", fetch='all') assert columns[2][1] == 'Nullable(UInt32)' assert columns[3][1] == 'Nullable(String)' + + +class TestReplicatedCSVSeed: + @pytest.fixture(scope="class") + def seeds(self): + return { + "schema.yml": replicated_seeds_schema_yml, + "empty.csv": seeds_empty_csv, + } + + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' + ) + def test_seed(self, project): + # seed command + results = run_dbt(["seed"]) + assert len(results) == 1 + columns = project.run_sql("DESCRIBE TABLE empty", fetch='all') + assert columns[2][1] == 'Nullable(UInt32)' + assert columns[3][1] == 'Nullable(String)' + + +class TestDistributedMaterializations(BaseSimpleMaterializations): + '''Test distributed materializations and check if data is properly distributed/replicated''' + + @pytest.fixture(scope="class") + def models(self): + config_distributed_table = """ + {{ config( + order_by='(some_date, id, name)', + engine='MergeTree()', + materialized='distributed_table', + settings={'allow_nullable_key': 1}) + }} + """ + return { + "distributed.sql": config_distributed_table + model_base, + "schema.yml": schema_base_yml, + } + + @pytest.fixture(scope="class") + def seeds(self): + return { + "schema.yml": base_seeds_schema_yml, + "base.csv": seeds_base_csv, + } + + def assert_total_count_correct(self, project): + '''Check if data is properly distributed''' + cluster = project.test_config['cluster'] + table_relation = relation_from_name(project.adapter, "distributed") + cluster_info = project.run_sql( + f"select shard_num,max(host_name) as host_name, count(distinct replica_num) as replica_counts " + f"from system.clusters where cluster='{cluster}' group by shard_num", + fetch="all", + ) + sum_count = project.run_sql( + f"select count() From clusterAllReplicas('{cluster}',{table_relation}_local)", + fetch="one", + ) + total_count = 0 + # total count should be equal to sum(count of each shard * replica_counts) + for shard_num, host_name, replica_counts in cluster_info: + count = project.run_sql( + f"select count() From remote('{host_name}',{table_relation}_local)", + fetch="one", + ) + total_count += count[0] * replica_counts + assert total_count == sum_count[0] + + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' + ) + def test_base(self, project): + # cluster setting must exists + cluster = project.test_config['cluster'] + assert cluster + + # seed command + results = run_dbt(["seed"]) + # seed result length + assert len(results) == 1 + + # run command + results = run_dbt() + # run result length + assert len(results) == 1 + + # names exist in result nodes + check_result_nodes_by_name(results, ["distributed"]) + + # check relation types + expected = { + "base": "table", + "distributed": "table", + } + check_relation_types(project.adapter, expected) + + relation 
= relation_from_name(project.adapter, "base") + # table rowcount + result = project.run_sql(f"select count(*) as num_rows from {relation}", fetch="one") + assert result[0] == 10 + + # relations_equal + check_relations_equal(project.adapter, ["base", "distributed"]) + + # check result + self.assert_total_count_correct(project) + + # run full-refresh + results = run_dbt(['run', '--full-refresh']) + # run result length + assert len(results) == 1 + # check result + self.assert_total_count_correct(project) + + # check relations in catalog + catalog = run_dbt(["docs", "generate"]) + assert len(catalog.nodes) == 2 + assert len(catalog.sources) == 1 + + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() != '', reason='Not on a cluster' + ) + def test_no_cluster_setting(self, project): + result = run_dbt(['run', '--select', 'distributed'], False) + assert result[0].status == 'error' + assert 'Compilation Error' in result[0].message + + +class TestReplicatedTableMaterialization(BaseSimpleMaterializations): + '''Test ReplicatedMergeTree table with table materialization''' + + @pytest.fixture(scope="class") + def models(self): + config_replicated_table = """ + {{ config( + order_by='(some_date, id, name)', + engine="ReplicatedMergeTree('/clickhouse/tables/{uuid}/one_shard', '{server_index}' )", + materialized='table', + settings={'allow_nullable_key': 1}) + }} + """ + + return { + "replicated.sql": config_replicated_table + model_base, + "schema.yml": schema_base_yml, + } + + def assert_total_count_correct(self, project): + '''Check if table is created on cluster and data is properly replicated''' + cluster = project.test_config['cluster'] + # check if data is properly distributed/replicated + table_relation = relation_from_name(project.adapter, "replicated") + # ClickHouse cluster in the docker-compose file + # under tests/integration is configured with 3 nodes + host_count = project.run_sql( + f"select count(host_name) as host_count from system.clusters where cluster='{cluster}'", + fetch="one", + ) + assert host_count[0] == 3 + + table_count = project.run_sql( + f"select count() From clusterAllReplicas('{cluster}', system.tables) " + f"where database='{table_relation.schema}' and name='{table_relation.identifier}'", + fetch="one", + ) + assert table_count[0] == host_count[0] + + sum_count = project.run_sql( + f"select count() From clusterAllReplicas('{cluster}',{table_relation})", + fetch="one", + ) + + assert sum_count[0] == 3 * 10 + + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' + ) + def test_base(self, project): + # cluster setting must exists + cluster = project.test_config['cluster'] + assert cluster + + # seed command + results = run_dbt(["seed"]) + # seed result length + assert len(results) == 1 + + # run command + results = run_dbt() + # run result length + assert len(results) == 1 + + # names exist in result nodes + check_result_nodes_by_name(results, ["replicated"]) + + # check relation types + expected = { + "base": "table", + "replicated": "table", + } + check_relation_types(project.adapter, expected) + + relation = relation_from_name(project.adapter, "base") + # table rowcount + result = project.run_sql(f"select count(*) as num_rows from {relation}", fetch="one") + assert result[0] == 10 + + # relations_equal + check_relations_equal(project.adapter, ["base", "replicated"]) + + self.assert_total_count_correct(project) + + # run full refresh + results = run_dbt(['--debug', 'run', '--full-refresh']) + # 
run result length + assert len(results) == 1 + + self.assert_total_count_correct(project) diff --git a/tests/integration/adapter/test_grants.py b/tests/integration/adapter/test_grants.py index 418264aa..9d6aaab3 100644 --- a/tests/integration/adapter/test_grants.py +++ b/tests/integration/adapter/test_grants.py @@ -1,8 +1,23 @@ +import os + +import pytest from dbt.tests.adapter.grants.test_incremental_grants import BaseIncrementalGrants from dbt.tests.adapter.grants.test_invalid_grants import BaseInvalidGrants from dbt.tests.adapter.grants.test_model_grants import BaseModelGrants from dbt.tests.adapter.grants.test_seed_grants import BaseSeedGrants from dbt.tests.adapter.grants.test_snapshot_grants import BaseSnapshotGrants +from dbt.tests.util import get_manifest, run_dbt_and_capture, write_file + +distributed_table_model_schema_yml = """ +version: 2 +models: + - name: my_model + config: + materialized: distributed_table + grants: + select: ["{{ env_var('DBT_TEST_USER_1') }}"] + insert: ["{{ env_var('DBT_TEST_USER_2') }}"] +""" class TestModelGrants(BaseModelGrants): @@ -28,3 +43,37 @@ def privilege_does_not_exist_error(self): class TestSnapshotGrants(BaseSnapshotGrants): pass + + +class TestDistributedTableModelGrants(BaseModelGrants): + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' + ) + def test_view_table_grants(self, project, get_test_users): + # we want the test to fail, not silently skip + test_users = get_test_users + select_privilege_name = self.privilege_grantee_name_overrides()["select"] + insert_privilege_name = self.privilege_grantee_name_overrides()["insert"] + assert len(test_users) == 3 + # Distributed Table materialization, single select grant + updated_yaml = self.interpolate_name_overrides(distributed_table_model_schema_yml) + write_file(updated_yaml, project.project_root, "models", "schema.yml") + (results, log_output) = run_dbt_and_capture(["--debug", "run"]) + assert len(results) == 1 + manifest = get_manifest(project.project_root) + model_id = "model.test.my_model" + model = manifest.nodes[model_id] + assert model.config.materialized == "distributed_table" + expected = {select_privilege_name: [test_users[0]], insert_privilege_name: [test_users[1]]} + self.assert_expected_grants_match_actual(project, "my_model", expected) + + def assert_expected_grants_match_actual(self, project, relation_name, expected_grants): + super().assert_expected_grants_match_actual(project, relation_name, expected_grants) + + # also needs grants for local table + actual_local_grants = self.get_grants_on_relation(project, relation_name + "_local") + from dbt.context.base import BaseContext + + diff_a_local = BaseContext.diff_of_two_dicts(actual_local_grants, expected_grants) + diff_b_local = BaseContext.diff_of_two_dicts(expected_grants, actual_local_grants) + assert diff_a_local == diff_b_local == {} diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 5e79e256..b4b9c4f9 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -35,6 +35,7 @@ def test_config(ch_test_users, ch_test_version): test_driver = 'native' if test_port in (10900, 9000, 9440) else 'http' test_user = os.environ.get('DBT_CH_TEST_USER', 'default') test_password = os.environ.get('DBT_CH_TEST_PASSWORD', '') + test_cluster = os.environ.get('DBT_CH_TEST_CLUSTER', '') test_db_engine = os.environ.get('DBT_CH_TEST_DB_ENGINE', '') test_secure = test_port in (8443, 9440) test_cluster_mode = 
os.environ.get('DBT_CH_TEST_CLUSTER_MODE', '').lower() in (
@@ -53,6 +54,7 @@ def test_config(ch_test_users, ch_test_version):
         try:
             run_cmd(['docker-compose', '-f', compose_file, 'down', '-v'])
             sys.stderr.write('Starting docker compose')
+            os.environ['PROJECT_ROOT'] = '.'
             up_result = run_cmd(['docker-compose', '-f', compose_file, 'up', '-d'])
             if up_result[0]:
                 raise Exception(f'Failed to start docker: {up_result[2]}')
@@ -74,8 +76,12 @@ def test_config(ch_test_users, ch_test_version):
             secure=test_secure,
         )
         for dbt_user in ch_test_users:
+            cmd = 'CREATE USER IF NOT EXISTS %s IDENTIFIED WITH sha256_hash BY %s'
+            if test_cluster != '':
+                cmd = f'CREATE USER IF NOT EXISTS %s ON CLUSTER "{test_cluster}" IDENTIFIED WITH sha256_hash BY %s'
+
             test_client.command(
-                'CREATE USER IF NOT EXISTS %s IDENTIFIED WITH sha256_hash BY %s',
+                cmd,
                 (dbt_user, '5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8'),
             )
     yield {
@@ -84,6 +90,7 @@
         'port': test_port,
         'user': test_user,
         'password': test_password,
+        'cluster': test_cluster,
         'db_engine': test_db_engine,
         'secure': test_secure,
         'cluster_mode': test_cluster_mode,
@@ -111,6 +118,7 @@ def dbt_profile_target(test_config):
         'user': test_config['user'],
         'password': test_config['password'],
         'port': test_config['port'],
+        'cluster': test_config['cluster'],
         'database_engine': test_config['db_engine'],
         'cluster_mode': test_config['cluster_mode'],
         'secure': test_config['secure'],
diff --git a/tests/integration/docker-compose.yml b/tests/integration/docker-compose.yml
index d3a90fa1..e7810f0f 100644
--- a/tests/integration/docker-compose.yml
+++ b/tests/integration/docker-compose.yml
@@ -1,15 +1,51 @@
 ---
 version: '3'
+x-ch-common: &ch-common
+  volumes:
+    - /var/lib/clickhouse
+    - type: bind
+      source: ${PROJECT_ROOT:-.}/test_settings_${DBT_CH_TEST_SETTINGS:-latest}.xml
+      target: /etc/clickhouse-server/users.d/test_settings.xml
+    - type: bind
+      source: ${PROJECT_ROOT:-.}/test_config.xml
+      target: /etc/clickhouse-server/config.d/test_config.xml
+  ulimits:
+    nofile:
+      soft: 262144
+      hard: 262144
+
 services:
-  ch_server:
+  ch0:
     image: clickhouse/clickhouse-server:${DBT_CH_TEST_CH_VERSION:-latest}
+    environment:
+      - SERVER_INDEX=1
+      - SHARD_NUM=${SHARD_NUM:-1}
+      - REPLICA_NUM=${REPLICA_NUM:-1}
     ports:
+      - "8123:8123"
+      - "8443:8443"
+      - "9000:9000"
+      # for local docker tests
       - "10723:8123"
       - "10743:8443"
       - "10900:9000"
-    volumes:
-      - /var/lib/clickhouse
-      - type: bind
-        source: ./test_settings_${DBT_CH_TEST_SETTINGS:-latest}.xml
-        target: /etc/clickhouse-server/users.d/test_settings.xml
+    <<: *ch-common
+  ch1:
+    image: clickhouse/clickhouse-server:${DBT_CH_TEST_CH_VERSION:-latest}
+    environment:
+      - SERVER_INDEX=2
+      - SHARD_NUM=${SHARD_NUM:-2}
+      - REPLICA_NUM=${REPLICA_NUM:-2}
+    <<: *ch-common
+  ch2:
+    image: clickhouse/clickhouse-server:${DBT_CH_TEST_CH_VERSION:-latest}
+    environment:
+      - SERVER_INDEX=3
+      - SHARD_NUM=${SHARD_NUM:-3}
+      - REPLICA_NUM=${REPLICA_NUM:-3}
+    <<: *ch-common
+
+networks:
+  default:
+    name: integration-test
diff --git a/tests/integration/test_config.xml b/tests/integration/test_config.xml
new file mode 100644
index 00000000..9f2aec4f
--- /dev/null
+++ b/tests/integration/test_config.xml
@@ -0,0 +1,89 @@
+<clickhouse>
+    <http_port>8123</http_port>
+    <tcp_port>9000</tcp_port>
+    <interserver_http_port>9009</interserver_http_port>
+
+    <macros>
+        <shard from_env="SHARD_NUM"/>
+        <replica from_env="REPLICA_NUM"/>
+    </macros>
+    <remote_servers>
+        <test_shard>
+            <shard>
+                <replica>
+                    <host>ch0</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+            <shard>
+                <replica>
+                    <host>ch1</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+            <shard>
+                <replica>
+                    <host>ch2</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </test_shard>
+        <test_replica>
+            <shard>
+                <replica>
+                    <host>ch0</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>ch1</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>ch2</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </test_replica>
+    </remote_servers>
+    <keeper_server>
+        <tcp_port>9181</tcp_port>
+        <server_id from_env="SERVER_INDEX"/>
+
+        <coordination_settings>
+            <operation_timeout_ms>10000</operation_timeout_ms>
+            <session_timeout_ms>30000</session_timeout_ms>
+        </coordination_settings>
+
+        <raft_configuration>
+            <server>
+                <id>1</id>
+                <hostname>ch0</hostname>
+                <port>9234</port>
+            </server>
+            <server>
+                <id>2</id>
+                <hostname>ch1</hostname>
+                <port>9234</port>
+            </server>
+            <server>
+                <id>3</id>
+                <hostname>ch2</hostname>
+                <port>9234</port>
+            </server>
+        </raft_configuration>
+    </keeper_server>
+    <zookeeper>
+        <node>
+            <host>ch0</host>
+            <port>9181</port>
+        </node>
+        <node>
+            <host>ch1</host>
+            <port>9181</port>
+        </node>
+        <node>
+            <host>ch2</host>
+            <port>9181</port>
+        </node>
+    </zookeeper>
+</clickhouse>
From af72593dbdfb6ad929e4c61d126477f113d0a42a Mon Sep 17 00:00:00 2001
From: Geoff Genz
Date: Thu, 26 Oct 2023 17:41:02 -0600
Subject: [PATCH 18/78] Update version and CHANGELOG, incorporate cluster name fix (#203)

---
 CHANGELOG.md | 9 +++++++++
 dbt/adapters/clickhouse/__version__.py | 2 +-
 dbt/adapters/clickhouse/dbclient.py | 2 +-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7fafb9a5..38e6e945 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,12 @@
+### Release [1.4.9], 2023-10-27
+#### Improvement
+- Lots of work on Distributed table materializations. Big thanks to [gfunc](https://github.com/gfunc) for the additional PR
+and [Zhenbang](https://github.com/zli06160) for code review and suggestions. See the README for details on how to
+use the new functionality.
+#### Bug Fix
+- dbt would fail if a cluster name contained a dash. This has been fixed. Thanks to [Andy](https://github.com/the4thamigo-uk)
+for the PR.
+
 ### Release [1.4.8], 2023-08-22
 #### Bug Fix
 - Fixed issues with experimental Distributed table materializations. Closes https://github.com/ClickHouse/dbt-clickhouse/issues/179.
diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py
index 0e921568..d3641e44 100644
--- a/dbt/adapters/clickhouse/__version__.py
+++ b/dbt/adapters/clickhouse/__version__.py
@@ -1 +1 @@
-version = '1.4.8'
+version = '1.4.9'
diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py
index d029c16e..30e1ff4f 100644
--- a/dbt/adapters/clickhouse/dbclient.py
+++ b/dbt/adapters/clickhouse/dbclient.py
@@ -151,7 +151,7 @@ def _ensure_database(self, database_engine, cluster_name) -> None:
         if not db_exists:
             engine_clause = f' ENGINE {database_engine} ' if database_engine else ''
             cluster_clause = (
-                f' ON CLUSTER {cluster_name} '
+                f' ON CLUSTER "{cluster_name}" '
                 if cluster_name is not None and cluster_name.strip() != ''
                 else ''
             )
From 9edb5547efa49f656fc44e84770e177f449e02b7 Mon Sep 17 00:00:00 2001
From: Geoff Genz
Date: Thu, 23 Nov 2023 15:12:18 -0700
Subject: [PATCH 19/78] Release 1 5 0 (#210)

* Initial 1.5.0 commit
* Reorganize basic tests
* Fix lint
* Add case sensitive cache
* Fix s3 bucket bug
* Checkpoint for constraints/contracts
* Fix native column query
* Loosen replication test
* Checkpoint for constraints tests
* Checkpoint for constraints tests
* Add rendering of model level CHECK constraints
* Fix lint
* Reorganize test files
* Add one hooks test
* Fix lint
---
 .github/workflows/test_matrix.yml | 6 +-
 .gitignore | 1 +
 CHANGELOG.md | 8 +
 README.md | 7 +
 dbt/adapters/clickhouse/__version__.py | 2 +-
 dbt/adapters/clickhouse/cache.py | 432 ++++++++++++++++++
 dbt/adapters/clickhouse/connections.py | 7 +-
 dbt/adapters/clickhouse/credentials.py | 6 +-
 dbt/adapters/clickhouse/dbclient.py | 4 +
 dbt/adapters/clickhouse/httpclient.py | 13 +
 dbt/adapters/clickhouse/impl.py | 73 ++-
 dbt/adapters/clickhouse/nativeclient.py | 11 +-
 dbt/adapters/clickhouse/query.py | 14 +
 dbt/adapters/clickhouse/relation.py | 7 +-
 dbt/include/clickhouse/macros/adapters.sql | 13 +-
 dbt/include/clickhouse/macros/catalog.sql | 2 +-
 dbt/include/clickhouse/macros/column_spec_ddl.sql | 40 ++
 dbt/include/clickhouse/macros/materializations/table.sql | 31 +-
 dbt/include/clickhouse/macros/materializations/view.sql | 16 +
 dev_requirements.txt | 6 +-
 tests/conftest.py | 7 +
 tests/integration/adapter/{ => aliases}/test_aliases.py | 0
 tests/integration/adapter/basic/test_adapter_methods.py | 9 +
 tests/integration/adapter/basic/test_base.py | 5 +
tests/integration/adapter/basic/test_basic.py | 95 ++++ .../test_docs_generate.py} | 0 tests/integration/adapter/basic/test_empty.py | 5 + .../adapter/basic/test_ephemeral.py | 5 + .../adapter/basic/test_generic_tests.py | 5 + .../adapter/basic/test_incremental.py | 24 + .../adapter/basic/test_singular_tests.py | 5 + .../test_singular_tests_ephemeral.py} | 5 - .../adapter/basic/test_snapshot_check_cols.py | 5 + .../adapter/basic/test_snapshot_timestamp.py | 5 + .../basic/test_table_materialization.py | 5 + .../adapter/basic/test_validate_connection.py | 5 + .../adapter/caching/test_caching.py | 99 ++++ .../test_clickhouse_comments.py} | 0 .../test_clickhouse_errors.py} | 0 .../test_clickhouse_s3.py} | 29 ++ .../test_clickhouse_source_schema.py} | 0 .../clickhouse/test_clickhouse_sql_header.py | 28 ++ ...test_clickhouse_table_materializations.py} | 139 +----- .../test_clickhouse_upper_case.py} | 0 .../test_column_types.py} | 0 .../{ => concurrency}/test_concurrency.py | 0 .../constraints/fixtures_contraints.py | 258 +++++++++++ .../adapter/constraints/test_constraints.py | 190 ++++++++ .../adapter/dbt_debug/test_dbt_debug.py | 22 + .../adapter/dbt_show/test_dbt_show.py | 9 + .../test_distributed_grants.py} | 29 -- .../adapter/grants/test_incremental_grants.py | 5 + .../adapter/grants/test_invalid_grants.py | 10 + .../adapter/grants/test_model_grants.py | 5 + .../adapter/grants/test_seed_grants.py | 5 + .../adapter/grants/test_snapshot_grants.py | 5 + .../adapter/hooks/test_model_hooks.py | 16 + ...ncremental.py => test_base_incremental.py} | 20 +- .../test_distributed_incremental.py | 2 +- .../test_query_comment.py} | 0 .../test_changing_relation_type.py | 0 tests/integration/conftest.py | 10 +- 62 files changed, 1513 insertions(+), 252 deletions(-) create mode 100644 dbt/adapters/clickhouse/cache.py create mode 100644 dbt/adapters/clickhouse/query.py create mode 100644 dbt/include/clickhouse/macros/column_spec_ddl.sql rename tests/integration/adapter/{ => aliases}/test_aliases.py (100%) create mode 100644 tests/integration/adapter/basic/test_adapter_methods.py create mode 100644 tests/integration/adapter/basic/test_base.py create mode 100644 tests/integration/adapter/basic/test_basic.py rename tests/integration/adapter/{test_docs.py => basic/test_docs_generate.py} (100%) create mode 100644 tests/integration/adapter/basic/test_empty.py create mode 100644 tests/integration/adapter/basic/test_ephemeral.py create mode 100644 tests/integration/adapter/basic/test_generic_tests.py create mode 100644 tests/integration/adapter/basic/test_incremental.py create mode 100644 tests/integration/adapter/basic/test_singular_tests.py rename tests/integration/adapter/{test_singular.py => basic/test_singular_tests_ephemeral.py} (56%) create mode 100644 tests/integration/adapter/basic/test_snapshot_check_cols.py create mode 100644 tests/integration/adapter/basic/test_snapshot_timestamp.py create mode 100644 tests/integration/adapter/basic/test_table_materialization.py create mode 100644 tests/integration/adapter/basic/test_validate_connection.py create mode 100644 tests/integration/adapter/caching/test_caching.py rename tests/integration/adapter/{test_comments.py => clickhouse/test_clickhouse_comments.py} (100%) rename tests/integration/adapter/{test_errors.py => clickhouse/test_clickhouse_errors.py} (100%) rename tests/integration/adapter/{test_s3.py => clickhouse/test_clickhouse_s3.py} (74%) rename tests/integration/adapter/{test_relations.py => clickhouse/test_clickhouse_source_schema.py} (100%) create mode 
100644 tests/integration/adapter/clickhouse/test_clickhouse_sql_header.py rename tests/integration/adapter/{test_basic.py => clickhouse/test_clickhouse_table_materializations.py} (71%) rename tests/integration/adapter/{test_upper_case.py => clickhouse/test_clickhouse_upper_case.py} (100%) rename tests/integration/adapter/{test_column.py => column_types/test_column_types.py} (100%) rename tests/integration/adapter/{ => concurrency}/test_concurrency.py (100%) create mode 100644 tests/integration/adapter/constraints/fixtures_contraints.py create mode 100644 tests/integration/adapter/constraints/test_constraints.py create mode 100644 tests/integration/adapter/dbt_debug/test_dbt_debug.py create mode 100644 tests/integration/adapter/dbt_show/test_dbt_show.py rename tests/integration/adapter/{test_grants.py => grants/test_distributed_grants.py} (74%) create mode 100644 tests/integration/adapter/grants/test_incremental_grants.py create mode 100644 tests/integration/adapter/grants/test_invalid_grants.py create mode 100644 tests/integration/adapter/grants/test_model_grants.py create mode 100644 tests/integration/adapter/grants/test_seed_grants.py create mode 100644 tests/integration/adapter/grants/test_snapshot_grants.py create mode 100644 tests/integration/adapter/hooks/test_model_hooks.py rename tests/integration/adapter/incremental/{test_incremental.py => test_base_incremental.py} (86%) rename tests/integration/adapter/{test_query_comments.py => query_comment/test_query_comment.py} (100%) rename tests/integration/adapter/{ => relations}/test_changing_relation_type.py (100%) diff --git a/.github/workflows/test_matrix.yml b/.github/workflows/test_matrix.yml index 1efb9485..4976354e 100644 --- a/.github/workflows/test_matrix.yml +++ b/.github/workflows/test_matrix.yml @@ -23,15 +23,13 @@ jobs: strategy: matrix: python-version: - - '3.8' - '3.9' - '3.10' - '3.11' clickhouse-version: - - '22.8' - '23.3' - - '23.5' - - '23.6' + - '23.3' + - '23.8' - latest steps: diff --git a/.gitignore b/.gitignore index 745da238..583c17ae 100644 --- a/.gitignore +++ b/.gitignore @@ -96,3 +96,4 @@ dbt-tut # local development stuff dev/ .python-version +*_project/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 38e6e945..a9efbce1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +### Release [1.5.0], TBD +#### Improvements +- Compatible with dbt 1.5.x +- Contract support (using exact column data types) + +#### Bug Fix +- Fix s3 macro when bucket includes `https://` prefix. Closes https://github.com/ClickHouse/dbt-clickhouse/issues/192. + ### Release [1.4.9], 2023-10-27 #### Improvement - Lots of work on Distributed table materializations. Big thanks to [gfunc](https://github.com/gfunc) for the additional PR diff --git a/README.md b/README.md index 083d38ac..8e008998 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ pip install dbt-clickhouse - [x] Ephemeral materialization - [x] Distributed table materialization (experimental) - [x] Distributed incremental materialization (experimental) +- [x] Contracts # Usage Notes @@ -185,6 +186,12 @@ keys used to populate the parameters of the S3 table function: See the [S3 test file](https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/test_s3.py) for examples of how to use this macro. +# Contracts and Constraints + +Only exact column type contracts are supported. For example, a contract with a UInt32 column type will fail if the model returns a UInt64 or other integer type. 
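+For instance, an enforced contract on a hypothetical `events` model might look like this minimal sketch
+(`contract.enforced` and `data_type` are standard dbt 1.5 model properties; the model and column names
+are illustrative only):
+
+```yaml
+# models/schema.yml
+models:
+  - name: events
+    config:
+      contract:
+        enforced: true
+    columns:
+      - name: event_id
+        data_type: UInt64
+      - name: event_type
+        data_type: String
+```
+
+```sql
+-- models/events.sql: must return exactly the contracted names and ClickHouse types
+select
+    toUInt64(number) as event_id,
+    'click' as event_type
+from numbers(10)
+```
+
+Returning `toUInt32(number)` instead would fail the build, because the adapter compares exact ClickHouse
+type names rather than coercing between compatible integer widths.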
+ClickHouse also supports _only_ `CHECK` constraints on the entire table/model. Primary key, foreign key, unique, and column level CHECK constraints are not supported.
+(See ClickHouse documentation on primary/order by keys.)
+
 # Distributed materializations
 
 Notes:
diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py
index d3641e44..4139253f 100644
--- a/dbt/adapters/clickhouse/__version__.py
+++ b/dbt/adapters/clickhouse/__version__.py
@@ -1 +1 @@
-version = '1.4.9'
+version = '1.5.0'
diff --git a/dbt/adapters/clickhouse/cache.py b/dbt/adapters/clickhouse/cache.py
new file mode 100644
index 00000000..28d9fa21
--- /dev/null
+++ b/dbt/adapters/clickhouse/cache.py
@@ -0,0 +1,432 @@
+import threading
+from collections import namedtuple
+from copy import deepcopy
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
+
+from dbt.events.functions import fire_event, fire_event_if
+from dbt.events.types import CacheAction, CacheDumpGraph
+from dbt.exceptions import (
+    NewNameAlreadyInCacheError,
+    NoneRelationFoundError,
+    TruncatedModelNameCausedCollisionError,
+)
+from dbt.flags import get_flags
+
+ReferenceKey = namedtuple("ReferenceKey", "schema identifier")
+
+
+def dot_separated(key: ReferenceKey) -> str:
+    """Return the key in dot-separated string form.
+
+    :param _ReferenceKey key: The key to stringify.
+    """
+    return ".".join(map(str, key))
+
+
+class CachedRelation:
+    """Nothing about _CachedRelation is guaranteed to be thread-safe!
+
+    :attr str schema: The schema of this relation.
+    :attr str identifier: The identifier of this relation.
+    :attr Dict[ReferenceKey, CachedRelation] referenced_by: The relations
+        that refer to this relation.
+    :attr BaseRelation inner: The underlying dbt relation.
+    """
+
+    def __init__(self, inner):
+        self.referenced_by = {}
+        self.inner = inner
+
+    def __str__(self) -> str:
+        return "CachedRelation(schema={}, identifier={}, inner={})".format(
+            self.schema, self.identifier, self.inner
+        )
+
+    @property
+    def schema(self) -> Optional[str]:
+        return self.inner.schema
+
+    @property
+    def identifier(self) -> Optional[str]:
+        return self.inner.identifier
+
+    def __copy__(self):
+        new = self.__class__(self.inner)
+        new.__dict__.update(self.__dict__)
+        return new
+
+    def __deepcopy__(self, memo):
+        new = self.__class__(self.inner.incorporate())
+        new.__dict__.update(self.__dict__)
+        new.referenced_by = deepcopy(self.referenced_by, memo)
+        # the deepcopy protocol requires returning the new instance
+        return new
+
+    def is_referenced_by(self, key):
+        return key in self.referenced_by
+
+    def key(self):
+        """Get the _ReferenceKey that represents this relation
+
+        :return _ReferenceKey: A key for this relation.
+        """
+        return ReferenceKey(self.schema, self.identifier)
+
+    def add_reference(self, referrer: "CachedRelation"):
+        """Add a reference from referrer to self, indicating that if this node
+        were drop...cascaded, the referrer would be dropped as well.
+
+        :param _CachedRelation referrer: The node that refers to this node.
+        """
+        self.referenced_by[referrer.key()] = referrer
+
+    def collect_consequences(self):
+        """Recursively collect a set of _ReferenceKeys that would
+        consequentially get dropped if this were dropped via
+        "drop ... cascade".
+ + :return Set[_ReferenceKey]: All the relations that would be dropped + """ + consequences = {self.key()} + for relation in self.referenced_by.values(): + consequences.update(relation.collect_consequences()) + return consequences + + def release_references(self, keys): + """Non-recursively indicate that an iterable of _ReferenceKey no longer + exist. Unknown keys are ignored. + + :param Iterable[_ReferenceKey] keys: The keys to drop. + """ + keys = set(self.referenced_by) & set(keys) + for key in keys: + self.referenced_by.pop(key) + + def rename(self, new_relation): + """Rename this cached relation to new_relation. + Note that this will change the output of key(), all refs must be + updated! + + :param _CachedRelation new_relation: The new name to apply to the + relation + """ + # Relations store this stuff inside their `path` dict. But they + # also store a table_name, and usually use it in their .render(), + # so we need to update that as well. It doesn't appear that + # table_name is ever anything but the identifier (via .create()) + self.inner = self.inner.incorporate( + path={"identifier": new_relation.inner.identifier}, + ) + + def rename_key(self, old_key, new_key): + """Rename a reference that may or may not exist. Only handles the + reference itself, so this is the other half of what `rename` does. + + If old_key is not in referenced_by, this is a no-op. + + :param _ReferenceKey old_key: The old key to be renamed. + :param _ReferenceKey new_key: The new key to rename to. + :raises InternalError: If the new key already exists. + """ + if new_key in self.referenced_by: + raise NewNameAlreadyInCacheError(old_key, new_key) + + if old_key not in self.referenced_by: + return + value = self.referenced_by.pop(old_key) + self.referenced_by[new_key] = value + + def dump_graph_entry(self): + """Return a key/value pair representing this key and its referents. + + return List[str]: The dot-separated form of all referent keys. + """ + return [dot_separated(r) for r in self.referenced_by] + + +class ClickHouseRelationsCache: + """A cache of the relations known to dbt. Keeps track of relationships + declared between tables and handles renames/drops as a real database would. + + :attr Dict[_ReferenceKey, _CachedRelation] relations: The known relations. + :attr threading.RLock lock: The lock around relations, held during updates. + The adapters also hold this lock while filling the cache. + :attr Set[str] schemas: The set of known/cached schemas + """ + + def __init__(self) -> None: + self.relations: Dict[ReferenceKey, CachedRelation] = {} + self.lock = threading.RLock() + self.schemas: Set[Optional[str]] = set() + + def add_schema( + self, + _database: Optional[str], + schema: Optional[str], + ) -> None: + """Add a schema to the set of known schemas (case-insensitive) + + :param _database: The database name to add (not used in ClickHouse) + :param schema: The schema name to add. + """ + self.schemas.add(schema) + + def drop_schema( + self, + _database: Optional[str], + schema: Optional[str], + ) -> None: + """Drop the given schema and remove it from the set of known schemas. + + Then remove all its contents (and their dependents, etc) as well. + """ + key = schema + if key not in self.schemas: + return + + # avoid iterating over self.relations while removing things by + # collecting the list first. 
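+        # _remove_all() below then drops each collected relation, and each drop
+        # cascades to any cached relations that reference it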
+ + with self.lock: + to_remove = self._list_relations_in_schema(schema) + self._remove_all(to_remove) + # handle a drop_schema race by using discard() over remove() + self.schemas.discard(key) + + def update_schemas(self, schemas: Iterable[Tuple[Optional[str], str]]): + """Add multiple schemas to the set of known schemas + + :param schemas: An iterable of the schema names to add. + """ + self.schemas.update(s[1] for s in schemas) + + def __contains__(self, schema_id: Tuple[Optional[str], str]): + """A schema is 'in' the relations cache if it is in the set of cached + schemas. + + :param schema_id: The db name and schema name to look up. + """ + return schema_id[1] in self.schemas + + def dump_graph(self): + """Dump a key-only representation of the schema to a dictionary. Every + known relation is a key with a value of a list of keys it is referenced + by. + """ + # we have to hold the lock for the entire dump, if other threads modify + # self.relations or any cache entry's referenced_by during iteration + # it's a runtime error! + with self.lock: + return {dot_separated(k): str(v.dump_graph_entry()) for k, v in self.relations.items()} + + def _setdefault(self, relation: CachedRelation): + """Add a relation to the cache, or return it if it already exists. + + :param CachedRelation relation: The relation to set or get. + :return CachedRelation: The relation stored under the given relation's + key + """ + self.add_schema(None, relation.schema) + key = relation.key() + return self.relations.setdefault(key, relation) + + def add(self, relation): + """Add the relation inner to the cache + + :param BaseRelation relation: The underlying relation. + """ + flags = get_flags() + cached = CachedRelation(relation) + fire_event_if( + flags.LOG_CACHE_EVENTS, + lambda: CacheDumpGraph(before_after="before", action="adding", dump=self.dump_graph()), + ) + fire_event(CacheAction(action="add_relation", ref_key=_make_ref_key_dict(cached))) + + with self.lock: + self._setdefault(cached) + fire_event_if( + flags.LOG_CACHE_EVENTS, + lambda: CacheDumpGraph(before_after="after", action="adding", dump=self.dump_graph()), + ) + + def _remove_refs(self, keys): + """Removes all references to all entries in keys. This does not + cascade! + + :param Iterable[_ReferenceKey] keys: The keys to remove. + """ + # remove direct refs + for key in keys: + del self.relations[key] + # then remove all entries from each child + for cached in self.relations.values(): + cached.release_references(keys) + + def drop(self, relation): + """Drop the named relation and cascade it appropriately to all + dependent relations. + + Because dbt proactively does many `drop relation if exist ... cascade` + that are noops, nonexistent relation drops cause a debug log and no + other actions. + + :param relation relation: The relation to drop. 
+ + """ + dropped_key = _make_ref_key(relation) + dropped_key_msg = _make_ref_key_dict(relation) + fire_event(CacheAction(action="drop_relation", ref_key=dropped_key_msg)) + with self.lock: + if dropped_key not in self.relations: + fire_event(CacheAction(action="drop_missing_relation", ref_key=dropped_key_msg)) + return + consequences = self.relations[dropped_key].collect_consequences() + # convert from a list of _ReferenceKeys to a list of ReferenceKeyMsgs + consequence_msgs = [key._asdict() for key in consequences] + fire_event( + CacheAction( + action="drop_cascade", ref_key=dropped_key_msg, ref_list=consequence_msgs + ) + ) + self._remove_refs(consequences) + + def _rename_relation(self, old_key, new_relation): + """Rename a relation named old_key to new_key, updating references. + Return whether here was a key to rename. + + :param _ReferenceKey old_key: The existing key, to rename from. + :param _CachedRelation new_relation: The new relation, to rename to. + """ + # On the database level, a rename updates all values that were + # previously referenced by old_name to be referenced by new_name. + # basically, the name changes but some underlying ID moves. Kind of + # like an object reference! + relation = self.relations.pop(old_key) + new_key = new_relation.key() + + # relation has to rename its innards, so it needs the _CachedRelation. + relation.rename(new_relation) + # update all the relations that refer to it + for cached in self.relations.values(): + if cached.is_referenced_by(old_key): + fire_event( + CacheAction( + action="update_reference", + ref_key=_make_ref_key_dict(old_key), + ref_key_2=_make_ref_key_dict(new_key), + ref_key_3=_make_ref_key_dict(cached.key()), + ) + ) + + cached.rename_key(old_key, new_key) + + self.relations[new_key] = relation + # also fixup the schemas! + self.add_schema(None, new_key.schema) + + return True + + def _check_rename_constraints(self, old_key, new_key): + """Check the rename constraints, and return whether the rename can proceed. + + If the new key is already present, that is an error. + If the old key is absent, we debug log and return False, assuming it's + a temp table being renamed. + + :param _ReferenceKey old_key: The existing key, to rename from. + :param _ReferenceKey new_key: The new key, to rename to. + :return bool: If the old relation exists for renaming. + :raises InternalError: If the new key is already present. + """ + if new_key in self.relations: + # Tell user when collision caused by model names truncated during + # materialization. + raise TruncatedModelNameCausedCollisionError(new_key, self.relations) + + if old_key not in self.relations: + fire_event(CacheAction(action="temporary_relation", ref_key=old_key._asdict())) + return False + return True + + def rename(self, old, new): + """Rename the old schema/identifier to the new schema/identifier and + update references. + + If the new schema/identifier is already present, that is an error. + If the schema/identifier key is absent, we only debug log and return, + assuming it's a temp table being renamed. + + :param BaseRelation old: The existing relation name information. + :param BaseRelation new: The new relation name information. + :raises InternalError: If the new key is already present. 
+ """ + old_key = _make_ref_key(old) + new_key = _make_ref_key(new) + fire_event( + CacheAction( + action="rename_relation", + ref_key=old_key._asdict(), + ref_key_2=new_key._asdict(), + ) + ) + flags = get_flags() + fire_event_if( + flags.LOG_CACHE_EVENTS, + lambda: CacheDumpGraph(before_after="before", action="rename", dump=self.dump_graph()), + ) + + with self.lock: + if self._check_rename_constraints(old_key, new_key): + self._rename_relation(old_key, CachedRelation(new)) + else: + self._setdefault(CachedRelation(new)) + + fire_event_if( + flags.LOG_CACHE_EVENTS, + lambda: CacheDumpGraph(before_after="after", action="rename", dump=self.dump_graph()), + ) + + def get_relations(self, _database: Optional[str], schema: Optional[str]) -> List[Any]: + """Yield all relations matching the given schema (ClickHouse database).""" + with self.lock: + results = [r.inner for r in self.relations.values() if r.schema == schema] + + if None in results: + raise NoneRelationFoundError() + return results + + def clear(self): + """Clear the cache""" + with self.lock: + self.relations.clear() + self.schemas.clear() + + def _list_relations_in_schema(self, schema: Optional[str]) -> List[CachedRelation]: + """Get the relations in a schema. Callers should hold the lock.""" + key = schema + + to_remove: List[CachedRelation] = [] + for cachekey, relation in self.relations.items(): + if cachekey.schema == key: + to_remove.append(relation) + return to_remove + + def _remove_all(self, to_remove: List[CachedRelation]): + """Remove all the listed relations. Ignore relations that have been + cascaded out. + """ + for relation in to_remove: + # it may have been cascaded out already + drop_key = _make_ref_key(relation) + if drop_key in self.relations: + self.drop(drop_key) + + +def _make_ref_key(relation: Any) -> ReferenceKey: + return ReferenceKey(relation.schema, relation.identifier) + + +def _make_ref_key_dict(relation: Any): + return { + "schema": relation.schema, + "identifier": relation.identifier, + } diff --git a/dbt/adapters/clickhouse/connections.py b/dbt/adapters/clickhouse/connections.py index 85c141be..c4098649 100644 --- a/dbt/adapters/clickhouse/connections.py +++ b/dbt/adapters/clickhouse/connections.py @@ -1,7 +1,7 @@ import re import time from contextlib import contextmanager -from typing import Any, Optional, Tuple +from typing import Any, Optional, Tuple, Union import agate import dbt.exceptions @@ -141,3 +141,8 @@ def begin(self): def commit(self): pass + + @classmethod + def data_type_code_to_name(cls, type_code: Union[int, str]) -> str: + assert isinstance(type_code, int) + return '' diff --git a/dbt/adapters/clickhouse/credentials.py b/dbt/adapters/clickhouse/credentials.py index 4ccf2d63..427b94c1 100644 --- a/dbt/adapters/clickhouse/credentials.py +++ b/dbt/adapters/clickhouse/credentials.py @@ -16,7 +16,7 @@ class ClickHouseCredentials(Credentials): port: Optional[int] = None user: Optional[str] = 'default' retries: int = 1 - database: Optional[str] = None + database: Optional[str] = '' schema: Optional[str] = 'default' password: str = '' cluster: Optional[str] = None @@ -43,7 +43,7 @@ def unique_field(self): return self.host def __post_init__(self): - if self.database is not None and self.database != self.schema: + if self.database and self.database != self.schema: raise DbtRuntimeError( f' schema: {self.schema} \n' f' database: {self.database} \n' @@ -51,7 +51,7 @@ def __post_init__(self): f'On Clickhouse, database must be omitted or have the same value as' f' schema.' 
) - self.database = None + self.database = '' def _connection_keys(self): return ( diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index 30e1ff4f..ef069ab3 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -88,6 +88,10 @@ def query(self, sql: str, **kwargs): def command(self, sql: str, **kwargs): pass + @abstractmethod + def columns_in_query(self, sql: str, **kwargs): + pass + @abstractmethod def get_ch_setting(self, setting_name): pass diff --git a/dbt/adapters/clickhouse/httpclient.py b/dbt/adapters/clickhouse/httpclient.py index 6a1991e9..6e074464 100644 --- a/dbt/adapters/clickhouse/httpclient.py +++ b/dbt/adapters/clickhouse/httpclient.py @@ -1,8 +1,11 @@ +from typing import List + import clickhouse_connect from clickhouse_connect.driver.exceptions import DatabaseError, OperationalError from dbt.exceptions import DbtDatabaseError from dbt.version import __version__ as dbt_version +from dbt.adapters.clickhouse import ClickHouseColumn from dbt.adapters.clickhouse.__version__ import version as dbt_clickhouse_version from dbt.adapters.clickhouse.dbclient import ChClientWrapper, ChRetryableException @@ -20,6 +23,16 @@ def command(self, sql, **kwargs): except DatabaseError as ex: raise DbtDatabaseError(str(ex).strip()) from ex + def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: + try: + query_result = self._client.query(f'{sql} LIMIT 0', **kwargs) + return [ + ClickHouseColumn.create(name, ch_type.name) + for name, ch_type in zip(query_result.column_names, query_result.column_types) + ] + except DatabaseError as ex: + raise DbtDatabaseError(str(ex).strip()) from ex + def get_ch_setting(self, setting_name): setting = self._client.server_settings.get(setting_name) return setting.value if setting else None diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index c59e0497..ec4b3c07 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -2,23 +2,26 @@ import io from concurrent.futures import Future from dataclasses import dataclass -from typing import Callable, Dict, List, Optional, Set, Union +from typing import Any, Callable, Dict, List, Optional, Set, Union import agate -import dbt.exceptions from dbt.adapters.base import AdapterConfig, available -from dbt.adapters.base.impl import catch_as_completed +from dbt.adapters.base.impl import BaseAdapter, ConstraintSupport, catch_as_completed from dbt.adapters.base.relation import BaseRelation, InformationSchema from dbt.adapters.sql import SQLAdapter -from dbt.clients.agate_helper import table_from_rows from dbt.contracts.graph.manifest import Manifest +from dbt.contracts.graph.nodes import ConstraintType, ModelLevelConstraint from dbt.contracts.relation import RelationType +from dbt.events.functions import warn_or_error +from dbt.events.types import ConstraintNotSupported from dbt.exceptions import DbtInternalError, DbtRuntimeError, NotImplementedError from dbt.utils import executor, filter_null_values +from dbt.adapters.clickhouse.cache import ClickHouseRelationsCache from dbt.adapters.clickhouse.column import ClickHouseColumn from dbt.adapters.clickhouse.connections import ClickHouseConnectionManager from dbt.adapters.clickhouse.logger import logger +from dbt.adapters.clickhouse.query import quote_identifier from dbt.adapters.clickhouse.relation import ClickHouseRelation GET_CATALOG_MACRO_NAME = 'get_catalog' @@ -39,6 +42,18 @@ class ClickHouseAdapter(SQLAdapter): ConnectionManager 
= ClickHouseConnectionManager AdapterSpecificConfigs = ClickHouseConfig + CONSTRAINT_SUPPORT = { + ConstraintType.check: ConstraintSupport.ENFORCED, + ConstraintType.not_null: ConstraintSupport.NOT_SUPPORTED, + ConstraintType.unique: ConstraintSupport.NOT_SUPPORTED, + ConstraintType.primary_key: ConstraintSupport.NOT_SUPPORTED, + ConstraintType.foreign_key: ConstraintSupport.NOT_SUPPORTED, + } + + def __init__(self, config): + BaseAdapter.__init__(self, config) + self.cache = ClickHouseRelationsCache() + @classmethod def date_function(cls): return 'now()' @@ -163,13 +178,12 @@ def s3source_clause( fmt = fmt or s3config.get('fmt') bucket = bucket or s3config.get('bucket', '') path = path or s3config.get('path', '') - url = bucket + url = bucket.replace('https://', '') if path: if bucket and path and not bucket.endswith('/') and not bucket.startswith('/'): path = f'/{path}' url = f'{url}{path}'.replace('//', '/') - if not url.startswith('http'): - url = f'https://{url}' + url = f'https://{url}' access = '' if aws_access_key_id and not aws_secret_access_key: raise DbtRuntimeError('S3 aws_access_key_id specified without aws_secret_access_key') @@ -218,7 +232,7 @@ def list_relations_without_caching( ) relation = self.Relation.create( - database=None, + database='', schema=schema, identifier=name, type=rel_type, @@ -230,7 +244,7 @@ def list_relations_without_caching( return relations def get_relation(self, database: Optional[str], schema: str, identifier: str): - return super().get_relation(None, schema, identifier) + return super().get_relation('', schema, identifier) @available.parse_none def get_ch_database(self, schema: str): @@ -269,21 +283,11 @@ def _get_one_catalog( manifest: Manifest, ) -> agate.Table: if len(schemas) != 1: - dbt.exceptions.raise_compiler_error( - f'Expected only one schema in clickhouse _get_one_catalog, found ' f'{schemas}' + raise DbtRuntimeError( + f"Expected only one schema in clickhouse _get_one_catalog, found ' f'{schemas}'" ) - return super()._get_one_catalog(information_schema, schemas, manifest) - @classmethod - def _catalog_filter_table(cls, table: agate.Table, manifest: Manifest) -> agate.Table: - table = table_from_rows( - table.rows, - table.column_names, - text_only_columns=['table_schema', 'table_name'], - ) - return table.where(_catalog_filter_schemas(manifest)) - def get_rows_different_sql( self, relation_a: ClickHouseRelation, @@ -369,6 +373,33 @@ def get_model_settings(self, model): res.append(f' {key}={settings[key]}') return '' if len(res) == 0 else 'SETTINGS ' + ', '.join(res) + '\n' + @available.parse_none + def get_column_schema_from_query(self, sql: str, *_) -> List[ClickHouseColumn]: + """Get a list of the Columns with names and data types from the given sql.""" + conn = self.connections.get_if_exists() + return conn.handle.columns_in_query(sql) + + @available.parse_none + def format_columns(self, columns) -> List[Dict]: + return [{'name': column.name, 'data_type': column.dtype} for column in columns] + + @classmethod + def render_raw_columns_constraints(cls, raw_columns: Dict[str, Dict[str, Any]]) -> List: + rendered_columns = [] + for v in raw_columns.values(): + rendered_columns.append(f"{quote_identifier(v['name'])} {v['data_type']}") + if v.get("constraints"): + warn_or_error(ConstraintNotSupported(constraint='column', adapter='clickhouse')) + return rendered_columns + + @classmethod + def render_model_constraint(cls, constraint: ModelLevelConstraint) -> Optional[str]: + if constraint.type == ConstraintType.check and constraint.expression: 
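+            # ClickHouse CHECK constraints apply to the table as a whole, and the
+            # DDL rendered below (CONSTRAINT <name> CHECK (<expression>)) requires
+            # an explicit constraint name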
+ if not constraint.name: + raise DbtRuntimeError("CHECK Constraint 'name' is required") + return f"CONSTRAINT {constraint.name} CHECK ({constraint.expression})" + return None + @dataclass class ClickHouseDatabase: diff --git a/dbt/adapters/clickhouse/nativeclient.py b/dbt/adapters/clickhouse/nativeclient.py index 0f400a09..6fbff418 100644 --- a/dbt/adapters/clickhouse/nativeclient.py +++ b/dbt/adapters/clickhouse/nativeclient.py @@ -1,10 +1,12 @@ +from typing import List + import clickhouse_driver import pkg_resources from clickhouse_driver.errors import NetworkError, SocketTimeoutError from dbt.exceptions import DbtDatabaseError from dbt.version import __version__ as dbt_version -from dbt.adapters.clickhouse import ClickHouseCredentials +from dbt.adapters.clickhouse import ClickHouseColumn, ClickHouseCredentials from dbt.adapters.clickhouse.__version__ import version as dbt_clickhouse_version from dbt.adapters.clickhouse.dbclient import ChClientWrapper, ChRetryableException from dbt.adapters.clickhouse.logger import logger @@ -30,6 +32,13 @@ def command(self, sql, **kwargs): except clickhouse_driver.errors.Error as ex: raise DbtDatabaseError(str(ex).strip()) from ex + def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: + try: + _, columns = self._client.execute(f'{sql} LIMIT 0', with_column_types=True) + return [ClickHouseColumn.create(column[0], column[1]) for column in columns] + except clickhouse_driver.errors.Error as ex: + raise DbtDatabaseError(str(ex).strip()) from ex + def get_ch_setting(self, setting_name): try: result = self._client.execute( diff --git a/dbt/adapters/clickhouse/query.py b/dbt/adapters/clickhouse/query.py new file mode 100644 index 00000000..f69222e0 --- /dev/null +++ b/dbt/adapters/clickhouse/query.py @@ -0,0 +1,14 @@ +BS = '\\' +must_escape = (BS, '\'', '`') + + +def quote_identifier(identifier: str): + first_char = identifier[0] + if first_char in ('`', '"') and identifier[-1] == first_char: + # Identifier is already quoted, assume that it's valid + return identifier + return f'`{escape_str(identifier)}`' + + +def escape_str(value: str): + return ''.join(f'{BS}{c}' if c in must_escape else c for c in value) diff --git a/dbt/adapters/clickhouse/relation.py b/dbt/adapters/clickhouse/relation.py index 3fc91437..818928d8 100644 --- a/dbt/adapters/clickhouse/relation.py +++ b/dbt/adapters/clickhouse/relation.py @@ -33,6 +33,7 @@ class ClickHouseRelation(BaseRelation): def __post_init__(self): if self.database != self.schema and self.database: raise DbtRuntimeError(f'Cannot set database {self.database} in clickhouse!') + self.path.database = '' def render(self): if self.include_policy.database and self.include_policy.schema: @@ -44,7 +45,7 @@ def render(self): def matches( self, - database: Optional[str] = None, + database: Optional[str] = '', schema: Optional[str] = None, identifier: Optional[str] = None, ): @@ -86,7 +87,7 @@ def create_from_source(cls: Type[Self], source: SourceDefinition, **kwargs: Any) schema = source.database return cls.create( - database=source.database, + database='', schema=schema, identifier=source.identifier, quote_policy=quote_policy, @@ -112,7 +113,7 @@ def create_from_node( can_on_cluster = cls.get_on_cluster(cluster, materialized, engine) return cls.create( - database=node.database, + database='', schema=node.schema, identifier=node.alias, quote_policy=quote_policy, diff --git a/dbt/include/clickhouse/macros/adapters.sql b/dbt/include/clickhouse/macros/adapters.sql index 8d52a2c1..ae0ef8d6 100644 --- 
a/dbt/include/clickhouse/macros/adapters.sql +++ b/dbt/include/clickhouse/macros/adapters.sql @@ -1,14 +1,3 @@ -{% macro clickhouse__create_view_as(relation, sql) -%} - {%- set sql_header = config.get('sql_header', none) -%} - - {{ sql_header if sql_header is not none }} - - create view {{ relation.include(database=False) }} {{ on_cluster_clause(relation)}} - as ( - {{ sql }} - ) -{%- endmacro %} - {% macro clickhouse__list_schemas(database) %} {% call statement('list_schemas', fetch_result=True, auto_begin=False) %} select name from system.databases @@ -95,7 +84,7 @@ {% macro clickhouse__generate_database_name(custom_database_name=none, node=none) -%} - {% do return(None) %} + {% do return('') %} {%- endmacro %} {% macro clickhouse__get_columns_in_query(select_sql) %} diff --git a/dbt/include/clickhouse/macros/catalog.sql b/dbt/include/clickhouse/macros/catalog.sql index 16b3987e..b2c55999 100644 --- a/dbt/include/clickhouse/macros/catalog.sql +++ b/dbt/include/clickhouse/macros/catalog.sql @@ -1,7 +1,7 @@ {% macro clickhouse__get_catalog(information_schema, schemas) -%} {%- call statement('catalog', fetch_result=True) -%} select - null as table_database, + '' as table_database, columns.database as table_schema, columns.table as table_name, if(tables.engine not in ('MaterializedView', 'View'), 'table', 'view') as table_type, diff --git a/dbt/include/clickhouse/macros/column_spec_ddl.sql b/dbt/include/clickhouse/macros/column_spec_ddl.sql new file mode 100644 index 00000000..24194d91 --- /dev/null +++ b/dbt/include/clickhouse/macros/column_spec_ddl.sql @@ -0,0 +1,40 @@ +{% macro clickhouse__get_assert_columns_equivalent(sql) -%} + {%- set user_defined_columns = model['columns'] -%} + + {%- if not user_defined_columns -%} + {{ exceptions.raise_contract_error([], []) }} + {%- endif -%} + + {%- set yaml_columns = user_defined_columns.values() -%} + + {%- set sql_file_provided_columns = adapter.get_column_schema_from_query(sql) -%} + {%- set sql_columns = adapter.format_columns(sql_file_provided_columns) -%} + + {%- if sql_columns|length != yaml_columns|length -%} + {%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%} + {%- endif -%} + + {%- if sql_columns|length != yaml_columns|length -%} + {%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%} + {%- endif -%} + + {%- for sql_col in sql_columns -%} + {%- set yaml_col = [] -%} + {%- for this_col in yaml_columns -%} + {%- if this_col['name'] == sql_col['name'] -%} + {%- do yaml_col.append(this_col) -%} + {%- break -%} + {%- endif -%} + {%- endfor -%} + {%- if not yaml_col -%} + {#-- Column with name not found in yaml #} + {%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%} + {%- endif -%} + {%- if sql_col['data_type'] != yaml_col[0]['data_type'] -%} + {#-- Column data types don't match #} + {%- do exceptions.raise_contract_error(yaml_columns, sql_columns) -%} + {%- endif -%} + {%- endfor -%} + +{% endmacro %} + diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 0e93d633..8386593b 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -130,18 +130,19 @@ {%- endmacro -%} {% macro clickhouse__create_table_as(temporary, relation, sql) -%} - {% set create_table = create_table_or_empty(temporary, relation, sql) %} + {% set has_contract = config.get('contract').enforced %} + {% set create_table = create_table_or_empty(temporary, relation, 
sql, has_contract) %} {% if adapter.is_before_version('22.7.1.2484') -%} {{ create_table }} {%- else %} {% call statement('create_table_empty') %} {{ create_table }} {% endcall %} - {{ clickhouse__insert_into(relation.include(database=False), sql) }} + {{ clickhouse__insert_into(relation.include(database=False), sql, has_contract) }} {%- endif %} {%- endmacro %} -{% macro create_table_or_empty(temporary, relation, sql) -%} +{% macro create_table_or_empty(temporary, relation, sql, has_contract) -%} {%- set sql_header = config.get('sql_header', none) -%} {{ sql_header if sql_header is not none }} @@ -152,27 +153,39 @@ {{ order_cols(label="order by") }} {{ partition_cols(label="partition by") }} {{ adapter.get_model_settings(model) }} + as ( {{ sql }} ) {%- else %} create table {{ relation.include(database=False) }} {{ on_cluster_clause(relation)}} + {%- if has_contract%} + {{ get_assert_columns_equivalent(sql) }} + {{ get_table_columns_and_constraints() }} + {%- endif %} {{ engine_clause() }} {{ order_cols(label="order by") }} {{ primary_key_clause(label="primary key") }} {{ partition_cols(label="partition by") }} {{ adapter.get_model_settings(model) }} - {% if not adapter.is_before_version('22.7.1.2484') -%} + + {%- if not has_contract %} + {%- if not adapter.is_before_version('22.7.1.2484') %} empty + {%- endif %} + as ( {{ sql }} ) {%- endif %} {%- endif %} - as ( - {{ sql }} - ) + {%- endmacro %} -{% macro clickhouse__insert_into(target_relation, sql) %} +{% macro clickhouse__insert_into(target_relation, sql, has_contract) %} {%- set dest_columns = adapter.get_columns_in_relation(target_relation) -%} {%- set dest_cols_csv = dest_columns | map(attribute='quoted') | join(', ') -%} insert into {{ target_relation }} ({{ dest_cols_csv }}) - {{ sql }} + {%- if has_contract -%} + -- Use a subquery to get columns in the right order + SELECT {{ dest_cols_csv }} FROM ( {{ sql }} ) + {%- else -%} + {{ sql }} + {%- endif -%} {%- endmacro %} diff --git a/dbt/include/clickhouse/macros/materializations/view.sql b/dbt/include/clickhouse/macros/materializations/view.sql index 01ea6dcf..735ec973 100644 --- a/dbt/include/clickhouse/macros/materializations/view.sql +++ b/dbt/include/clickhouse/macros/materializations/view.sql @@ -65,3 +65,19 @@ {{ return({'relations': [target_relation]}) }} {%- endmaterialization -%} + + +{% macro clickhouse__create_view_as(relation, sql) -%} + {%- set sql_header = config.get('sql_header', none) -%} + {{ sql_header if sql_header is not none }} + + create view {{ relation.include(database=False) }} {{ on_cluster_clause(relation)}} + {% set contract_config = config.get('contract') %} + {% if contract_config.enforced %} + {{ get_assert_columns_equivalent(sql) }} + {%- endif %} + as ( + {{ sql }} + ) +{%- endmacro %} + diff --git a/dev_requirements.txt b/dev_requirements.txt index 4f5a9403..746e2cc0 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -1,9 +1,9 @@ -dbt-core~=1.4.1 -clickhouse-connect>=0.5.24 +dbt-core~=1.5.8 +clickhouse-connect>=0.6.18 clickhouse-driver>=0.2.3 pytest>=7.2.0 pytest-dotenv==0.5.2 -dbt-tests-adapter~=1.4.1 +dbt-tests-adapter~=1.5.8 black==22.3.0 isort==5.10.1 mypy==0.991 diff --git a/tests/conftest.py b/tests/conftest.py index a04b964b..89fc9395 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,10 @@ +import os +import time + +os.environ['TZ'] = 'UTC' +time.tzset() + + # Import the standard integration fixtures as a plugin # Note: fixtures with session scope need to be local pytest_plugins = 
["dbt.tests.fixtures.project"] diff --git a/tests/integration/adapter/test_aliases.py b/tests/integration/adapter/aliases/test_aliases.py similarity index 100% rename from tests/integration/adapter/test_aliases.py rename to tests/integration/adapter/aliases/test_aliases.py diff --git a/tests/integration/adapter/basic/test_adapter_methods.py b/tests/integration/adapter/basic/test_adapter_methods.py new file mode 100644 index 00000000..8a70c3c6 --- /dev/null +++ b/tests/integration/adapter/basic/test_adapter_methods.py @@ -0,0 +1,9 @@ +from dbt.tests.adapter.basic.test_adapter_methods import BaseAdapterMethod + + +class TestBaseAdapterMethod(BaseAdapterMethod): + pass + + +class TestBaseCaching(BaseAdapterMethod): + pass diff --git a/tests/integration/adapter/basic/test_base.py b/tests/integration/adapter/basic/test_base.py new file mode 100644 index 00000000..e5ef1a69 --- /dev/null +++ b/tests/integration/adapter/basic/test_base.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations + + +class TestBaseSimpleMaterializations(BaseSimpleMaterializations): + pass diff --git a/tests/integration/adapter/basic/test_basic.py b/tests/integration/adapter/basic/test_basic.py new file mode 100644 index 00000000..e340b5e1 --- /dev/null +++ b/tests/integration/adapter/basic/test_basic.py @@ -0,0 +1,95 @@ +import os + +import pytest +from dbt.tests.util import run_dbt + +# CSV content with boolean column type. +seeds_boolean_csv = """ +key,value +abc,true +def,false +hij,true +klm,false +""".lstrip() + +# CSV content with empty fields. +seeds_empty_csv = """ +key,val1,val2,str1 +abc,1,1,some_str +abc,1,0,"another string" +def,1,0, +hij,1,1,Caps +hij,1,,"second string" +klm,1,0,"test" +klm,1,,"test4" +""".lstrip() + +seeds_schema_yml = """ +version: 2 + +seeds: + - name: empty + config: + column_types: + val2: Nullable(UInt32) + str1: Nullable(String) +""" + +replicated_seeds_schema_yml = """ +version: 2 + +seeds: + - name: empty + config: + engine: ReplicatedMergeTree('/clickhouse/tables/{uuid}/one_shard', '{server_index}' ) + column_types: + val2: Nullable(UInt32) + str1: Nullable(String) +""" + +base_seeds_schema_yml = """ +version: 2 + +seeds: + - name: base + config: + engine: ReplicatedMergeTree('/clickhouse/tables/{uuid}/one_shard', '{server_index}' ) +""" + + +class TestCSVSeed: + @pytest.fixture(scope="class") + def seeds(self): + return { + "schema.yml": seeds_schema_yml, + "boolean.csv": seeds_boolean_csv, + "empty.csv": seeds_empty_csv, + } + + def test_seed(self, project): + # seed command + results = run_dbt(["seed"]) + assert len(results) == 2 + columns = project.run_sql("DESCRIBE TABLE empty", fetch='all') + assert columns[2][1] == 'Nullable(UInt32)' + assert columns[3][1] == 'Nullable(String)' + + +class TestReplicatedCSVSeed: + @pytest.fixture(scope="class") + def seeds(self): + return { + "schema.yml": replicated_seeds_schema_yml, + "empty.csv": seeds_empty_csv, + } + + @pytest.mark.skipif( + os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' + ) + def test_seed(self, project): + # seed command + results = run_dbt(["seed"]) + assert len(results) == 1 + columns = project.run_sql("DESCRIBE TABLE empty", fetch='all') + assert columns[2][1] == 'Nullable(UInt32)' + assert columns[3][1] == 'Nullable(String)' diff --git a/tests/integration/adapter/test_docs.py b/tests/integration/adapter/basic/test_docs_generate.py similarity index 100% rename from tests/integration/adapter/test_docs.py rename to 
tests/integration/adapter/basic/test_docs_generate.py diff --git a/tests/integration/adapter/basic/test_empty.py b/tests/integration/adapter/basic/test_empty.py new file mode 100644 index 00000000..4ef30c3f --- /dev/null +++ b/tests/integration/adapter/basic/test_empty.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.basic.test_empty import BaseEmpty + + +class TestEmpty(BaseEmpty): + pass diff --git a/tests/integration/adapter/basic/test_ephemeral.py b/tests/integration/adapter/basic/test_ephemeral.py new file mode 100644 index 00000000..c04caa0b --- /dev/null +++ b/tests/integration/adapter/basic/test_ephemeral.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.basic.test_ephemeral import BaseEphemeral + + +class TestEphemeral(BaseEphemeral): + pass diff --git a/tests/integration/adapter/basic/test_generic_tests.py b/tests/integration/adapter/basic/test_generic_tests.py new file mode 100644 index 00000000..63246ea9 --- /dev/null +++ b/tests/integration/adapter/basic/test_generic_tests.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.basic.test_generic_tests import BaseGenericTests + + +class TestGenericTests(BaseGenericTests): + pass diff --git a/tests/integration/adapter/basic/test_incremental.py b/tests/integration/adapter/basic/test_incremental.py new file mode 100644 index 00000000..3cc4cce9 --- /dev/null +++ b/tests/integration/adapter/basic/test_incremental.py @@ -0,0 +1,24 @@ +import pytest +from dbt.tests.adapter.basic.test_incremental import BaseIncremental, BaseIncrementalNotSchemaChange + + +class TestIncremental(BaseIncremental): + pass + + +incremental_not_schema_change_sql = """ +{{ config(materialized="incremental", unique_key="user_id_current_time",on_schema_change="sync_all_columns") }} +select + toString(1) || '-' || toString(now64()) as user_id_current_time, + {% if is_incremental() %} + 'thisis18characters' as platform + {% else %} + 'okthisis20characters' as platform + {% endif %} +""" + + +class TestIncrementalNotSchemaChange(BaseIncrementalNotSchemaChange): + @pytest.fixture(scope="class") + def models(self): + return {"incremental_not_schema_change.sql": incremental_not_schema_change_sql} diff --git a/tests/integration/adapter/basic/test_singular_tests.py b/tests/integration/adapter/basic/test_singular_tests.py new file mode 100644 index 00000000..2e5d7917 --- /dev/null +++ b/tests/integration/adapter/basic/test_singular_tests.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.basic.test_singular_tests import BaseSingularTests + + +class TestSingularTests(BaseSingularTests): + pass diff --git a/tests/integration/adapter/test_singular.py b/tests/integration/adapter/basic/test_singular_tests_ephemeral.py similarity index 56% rename from tests/integration/adapter/test_singular.py rename to tests/integration/adapter/basic/test_singular_tests_ephemeral.py index c81bafe1..89919591 100644 --- a/tests/integration/adapter/test_singular.py +++ b/tests/integration/adapter/basic/test_singular_tests_ephemeral.py @@ -1,10 +1,5 @@ -from dbt.tests.adapter.basic.test_singular_tests import BaseSingularTests from dbt.tests.adapter.basic.test_singular_tests_ephemeral import BaseSingularTestsEphemeral -class TestSingularTests(BaseSingularTests): - pass - - class TestSingularTestsEphemeral(BaseSingularTestsEphemeral): pass diff --git a/tests/integration/adapter/basic/test_snapshot_check_cols.py b/tests/integration/adapter/basic/test_snapshot_check_cols.py new file mode 100644 index 00000000..3a57d7f4 --- /dev/null +++ b/tests/integration/adapter/basic/test_snapshot_check_cols.py @@ -0,0 +1,5 @@ +from 
dbt.tests.adapter.basic.test_snapshot_check_cols import BaseSnapshotCheckCols + + +class TestSnapshotCheckCols(BaseSnapshotCheckCols): + pass diff --git a/tests/integration/adapter/basic/test_snapshot_timestamp.py b/tests/integration/adapter/basic/test_snapshot_timestamp.py new file mode 100644 index 00000000..d9ebf373 --- /dev/null +++ b/tests/integration/adapter/basic/test_snapshot_timestamp.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.basic.test_snapshot_timestamp import BaseSnapshotTimestamp + + +class TestSnapshotTimestamp(BaseSnapshotTimestamp): + pass diff --git a/tests/integration/adapter/basic/test_table_materialization.py b/tests/integration/adapter/basic/test_table_materialization.py new file mode 100644 index 00000000..4664f189 --- /dev/null +++ b/tests/integration/adapter/basic/test_table_materialization.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.basic.test_table_materialization import BaseTableMaterialization + + +class TestTableMat(BaseTableMaterialization): + pass diff --git a/tests/integration/adapter/basic/test_validate_connection.py b/tests/integration/adapter/basic/test_validate_connection.py new file mode 100644 index 00000000..e1389e65 --- /dev/null +++ b/tests/integration/adapter/basic/test_validate_connection.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.basic.test_validate_connection import BaseValidateConnection + + +class TestValidateConnection(BaseValidateConnection): + pass diff --git a/tests/integration/adapter/caching/test_caching.py b/tests/integration/adapter/caching/test_caching.py new file mode 100644 index 00000000..f71dffd0 --- /dev/null +++ b/tests/integration/adapter/caching/test_caching.py @@ -0,0 +1,99 @@ +import pytest +from dbt.tests.util import run_dbt + +model_sql = """ +{{ + config( + materialized='table' + ) +}} +select 1 as id +""" + +another_schema_model_sql = """ +{{ + config( + materialized='table', + schema='another_schema' + ) +}} +select 1 as id +""" + + +class BaseCachingTest: + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "config-version": 2, + "quoting": { + "identifier": False, + "schema": False, + }, + } + + def run_and_inspect_cache(self, project, run_args=None): + run_dbt(run_args) + + # the cache was empty at the start of the run. + # the model materialization returned a relation and added to the cache. + adapter = project.adapter + assert len(adapter.cache.relations) == 1 + relation = list(adapter.cache.relations).pop() + assert relation.schema == project.test_schema + + # on the second run, dbt will find a relation in the database during cache population. 
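+        # (the adapter's ClickHouseRelationsCache keys entries by (schema, identifier)
+        # alone, so the rediscovered relation collapses into the same single entry)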
+ run_dbt(run_args) + adapter = project.adapter + assert len(adapter.cache.relations) == 1 + second_relation = list(adapter.cache.relations).pop() + + for key in ["schema", "identifier"]: + assert getattr(relation, key) == getattr(second_relation, key) + + def test_cache(self, project): + self.run_and_inspect_cache(project, run_args=["run"]) + + +class TestNoPopulateCache(BaseCachingTest): + @pytest.fixture(scope="class") + def models(self): + return { + "model.sql": model_sql, + } + + def test_cache(self, project): + # --no-populate-cache still allows the cache to populate all relations + # under a schema, so the behavior here remains the same as other tests + run_args = ["--no-populate-cache", "run"] + self.run_and_inspect_cache(project, run_args) + + +class TestCachingLowerCaseModel(BaseCachingTest): + @pytest.fixture(scope="class") + def models(self): + return { + "model.sql": model_sql, + } + + +class TestCachingUppercaseModel(BaseCachingTest): + @pytest.fixture(scope="class") + def models(self): + return { + "MODEL.sql": model_sql, + } + + +class TestCachingSelectedSchemaOnly(BaseCachingTest): + @pytest.fixture(scope="class") + def models(self): + return { + "model.sql": model_sql, + "another_schema_model.sql": another_schema_model_sql, + } + + def test_cache(self, project): + # this should only cache the schema containing the selected model + run_args = ["--cache-selected-only", "run", "--select", "model"] + self.run_and_inspect_cache(project, run_args) diff --git a/tests/integration/adapter/test_comments.py b/tests/integration/adapter/clickhouse/test_clickhouse_comments.py similarity index 100% rename from tests/integration/adapter/test_comments.py rename to tests/integration/adapter/clickhouse/test_clickhouse_comments.py diff --git a/tests/integration/adapter/test_errors.py b/tests/integration/adapter/clickhouse/test_clickhouse_errors.py similarity index 100% rename from tests/integration/adapter/test_errors.py rename to tests/integration/adapter/clickhouse/test_clickhouse_errors.py diff --git a/tests/integration/adapter/test_s3.py b/tests/integration/adapter/clickhouse/test_clickhouse_s3.py similarity index 74% rename from tests/integration/adapter/test_s3.py rename to tests/integration/adapter/clickhouse/test_clickhouse_s3.py index 8fb8727f..10f1289e 100644 --- a/tests/integration/adapter/test_s3.py +++ b/tests/integration/adapter/clickhouse/test_clickhouse_s3.py @@ -27,6 +27,10 @@ select * from {{ clickhouse_s3source('taxi_s3', path='/trips_4.gz') }} LIMIT 5000 """ +s3_taxis_full_source = """ +select * from {{ clickhouse_s3source('taxi_s3', path='/trips_5.gz') }} LIMIT 1000 +""" + s3_taxis_inc = """ {{ config( materialized='incremental', @@ -84,3 +88,28 @@ def test_s3_incremental(self, project): ) assert 5000 < result[0] < 10000 assert result[1] > 0 + + +class TestS3Bucket: + @pytest.fixture(scope="class") + def project_config_update(self): + return { + 'vars': { + 'taxi_s3': { + 'bucket': 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/', + 'fmt': 'TabSeparatedWithNames', + } + } + } + + @pytest.fixture(scope="class") + def models(self): + return { + "s3_taxis_source.sql": s3_taxis_full_source, + "schema.yml": schema_yaml, + } + + def test_read(self, project): + run_dbt(["run", "--select", "s3_taxis_source.sql"]) + result = project.run_sql("select count() as num_rows from s3_taxis_source", fetch="one") + assert result[0] == 1000 diff --git a/tests/integration/adapter/test_relations.py b/tests/integration/adapter/clickhouse/test_clickhouse_source_schema.py 
similarity index 100% rename from tests/integration/adapter/test_relations.py rename to tests/integration/adapter/clickhouse/test_clickhouse_source_schema.py diff --git a/tests/integration/adapter/clickhouse/test_clickhouse_sql_header.py b/tests/integration/adapter/clickhouse/test_clickhouse_sql_header.py new file mode 100644 index 00000000..0b135a96 --- /dev/null +++ b/tests/integration/adapter/clickhouse/test_clickhouse_sql_header.py @@ -0,0 +1,28 @@ +import pytest +from dbt.tests.util import run_dbt_and_capture + +my_model_sql_header_sql = """ +{{ + config( + materialized = "table", + ) +}} + +{% call set_sql_header(config) %} +set log_comment = 'TEST_LOG_COMMENT'; +{%- endcall %} +select getSettings('log_comment') as column_name +""" + + +class TestSQLHeader: + @pytest.fixture(scope="class") + def models(self): + return { + "my_model_sql_header.sql": my_model_sql_header_sql, + } + + def test__sql_header(self, project): + _, log_output = run_dbt_and_capture(["run", "-s", "my_model_sql_header"], expect_pass=False) + + assert 'Multi-statements' in log_output diff --git a/tests/integration/adapter/test_basic.py b/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py similarity index 71% rename from tests/integration/adapter/test_basic.py rename to tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py index 645c1181..0c0f1bbb 100644 --- a/tests/integration/adapter/test_basic.py +++ b/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py @@ -2,15 +2,7 @@ import pytest from dbt.tests.adapter.basic.files import model_base, schema_base_yml, seeds_base_csv -from dbt.tests.adapter.basic.test_adapter_methods import BaseAdapterMethod from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations -from dbt.tests.adapter.basic.test_empty import BaseEmpty -from dbt.tests.adapter.basic.test_ephemeral import BaseEphemeral -from dbt.tests.adapter.basic.test_generic_tests import BaseGenericTests -from dbt.tests.adapter.basic.test_incremental import BaseIncremental -from dbt.tests.adapter.basic.test_singular_tests import BaseSingularTests -from dbt.tests.adapter.basic.test_snapshot_check_cols import BaseSnapshotCheckCols -from dbt.tests.adapter.basic.test_snapshot_timestamp import BaseSnapshotTimestamp from dbt.tests.util import ( check_relation_types, check_relations_equal, @@ -19,94 +11,7 @@ run_dbt, ) -# CSV content with boolean column type. -seeds_boolean_csv = """ -key,value -abc,true -def,false -hij,true -klm,false -""".lstrip() - -# CSV content with empty fields. 
-seeds_empty_csv = """ -key,val1,val2,str1 -abc,1,1,some_str -abc,1,0,"another string" -def,1,0, -hij,1,1,Caps -hij,1,,"second string" -klm,1,0,"test" -klm,1,,"test4" -""".lstrip() - -seeds_schema_yml = """ -version: 2 - -seeds: - - name: empty - config: - column_types: - val2: Nullable(UInt32) - str1: Nullable(String) -""" - -replicated_seeds_schema_yml = """ -version: 2 - -seeds: - - name: empty - config: - engine: ReplicatedMergeTree('/clickhouse/tables/{uuid}/one_shard', '{server_index}' ) - column_types: - val2: Nullable(UInt32) - str1: Nullable(String) -""" - -base_seeds_schema_yml = """ -version: 2 - -seeds: - - name: base - config: - engine: ReplicatedMergeTree('/clickhouse/tables/{uuid}/one_shard', '{server_index}' ) -""" - - -class TestBaseSimpleMaterializations(BaseSimpleMaterializations): - pass - - -class TestEmpty(BaseEmpty): - pass - - -class TestIncremental(BaseIncremental): - pass - - -class TestEphemeral(BaseEphemeral): - pass - - -class TestSnapshotTimestamp(BaseSnapshotTimestamp): - pass - - -class TestSnapshotCheckCols(BaseSnapshotCheckCols): - pass - - -class TestSingularTests(BaseSingularTests): - pass - - -class TestGenericTests(BaseGenericTests): - pass - - -class TestBaseAdapterMethod(BaseAdapterMethod): - pass +from tests.integration.adapter.basic.test_basic import base_seeds_schema_yml class TestMergeTreeTableMaterialization(BaseSimpleMaterializations): @@ -141,44 +46,6 @@ def test_base(self, project): assert result[0] == 10 -class TestCSVSeed: - @pytest.fixture(scope="class") - def seeds(self): - return { - "schema.yml": seeds_schema_yml, - "boolean.csv": seeds_boolean_csv, - "empty.csv": seeds_empty_csv, - } - - def test_seed(self, project): - # seed command - results = run_dbt(["seed"]) - assert len(results) == 2 - columns = project.run_sql("DESCRIBE TABLE empty", fetch='all') - assert columns[2][1] == 'Nullable(UInt32)' - assert columns[3][1] == 'Nullable(String)' - - -class TestReplicatedCSVSeed: - @pytest.fixture(scope="class") - def seeds(self): - return { - "schema.yml": replicated_seeds_schema_yml, - "empty.csv": seeds_empty_csv, - } - - @pytest.mark.skipif( - os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' - ) - def test_seed(self, project): - # seed command - results = run_dbt(["seed"]) - assert len(results) == 1 - columns = project.run_sql("DESCRIBE TABLE empty", fetch='all') - assert columns[2][1] == 'Nullable(UInt32)' - assert columns[3][1] == 'Nullable(String)' - - class TestDistributedMaterializations(BaseSimpleMaterializations): '''Test distributed materializations and check if data is properly distributed/replicated''' @@ -317,7 +184,7 @@ def assert_total_count_correct(self, project): f"select count(host_name) as host_count from system.clusters where cluster='{cluster}'", fetch="one", ) - assert host_count[0] == 3 + assert host_count[0] > 1 table_count = project.run_sql( f"select count() From clusterAllReplicas('{cluster}', system.tables) " @@ -331,7 +198,7 @@ def assert_total_count_correct(self, project): fetch="one", ) - assert sum_count[0] == 3 * 10 + assert sum_count[0] >= 20 @pytest.mark.skipif( os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' diff --git a/tests/integration/adapter/test_upper_case.py b/tests/integration/adapter/clickhouse/test_clickhouse_upper_case.py similarity index 100% rename from tests/integration/adapter/test_upper_case.py rename to tests/integration/adapter/clickhouse/test_clickhouse_upper_case.py diff --git 
a/tests/integration/adapter/test_column.py b/tests/integration/adapter/column_types/test_column_types.py similarity index 100% rename from tests/integration/adapter/test_column.py rename to tests/integration/adapter/column_types/test_column_types.py diff --git a/tests/integration/adapter/test_concurrency.py b/tests/integration/adapter/concurrency/test_concurrency.py similarity index 100% rename from tests/integration/adapter/test_concurrency.py rename to tests/integration/adapter/concurrency/test_concurrency.py diff --git a/tests/integration/adapter/constraints/fixtures_contraints.py b/tests/integration/adapter/constraints/fixtures_contraints.py new file mode 100644 index 00000000..508b25b1 --- /dev/null +++ b/tests/integration/adapter/constraints/fixtures_contraints.py @@ -0,0 +1,258 @@ +contract_model_schema_yml = """ +version: 2 +models: + - name: my_model + config: + contract: + enforced: true + columns: + - name: id + data_type: Int32 + description: hello + - name: color + data_type: String + - name: date_day + data_type: Date + - name: my_model_error + config: + contract: + enforced: true + columns: + - name: id + data_type: Int32 + description: hello + tests: + - unique + - name: color + data_type: String + - name: date_day + data_type: Date + - name: my_model_wrong_order + config: + contract: + enforced: true + columns: + - name: id + data_type: UInt32 + description: hello + tests: + - unique + - name: color + data_type: String + - name: date_day + data_type: Date + - name: my_model_wrong_name + config: + contract: + enforced: true + columns: + - name: id + data_type: Int32 + description: hello + - name: color + data_type: String + - name: date_day + data_type: Date +""" + + +# model columns in a different order to schema definitions +my_model_wrong_order_sql = """ +{{ + config( + materialized = "table" + ) +}} + +select + 'blue' as color, + 1::UInt32 as id, + toDate('2019-01-01') as date_day +""" + + +# model columns name different to schema definitions +my_model_wrong_name_sql = """ +{{ + config( + materialized = "table" + ) +}} + +select + 'blue' as color, + 1 as error, + '2019-01-01' as date_day +""" + + +my_model_data_type_sql = """ +{{{{ + config( + materialized = "table" + ) +}}}} + +select + {sql_value} as wrong_data_type_column_name +""" + + +model_data_type_schema_yml = """ +version: 2 +models: + - name: my_model_data_type + config: + contract: + enforced: true + columns: + - name: wrong_data_type_column_name + data_type: {data_type} +""" + +my_model_view_wrong_name_sql = """ +{{ + config( + materialized = "view" + ) +}} + +select + 'blue' as color, + 1 as error, + toDate('2019-01-01') as date_day +""" + +my_model_view_wrong_order_sql = """ +{{ + config( + materialized = "view" + ) +}} + +select + 'blue' as color, + 1::UInt32 as id, + toDate('2019-01-01') as date_day +""" + + +my_model_incremental_wrong_order_sql = """ +{{ + config( + materialized = "incremental", + on_schema_change='append_new_columns' + ) +}} + +select + 'blue' as color, + 1::UInt32 as id, + toDate('2019-01-01') as date_day +""" + +my_model_incremental_wrong_name_sql = """ +{{ + config( + materialized = "incremental", + on_schema_change='append_new_columns' + ) +}} + +select + 'blue' as color, + 1 as error, + '2019-01-01' as date_day +""" + +constraint_model_schema_yml = """ +version: 2 +models: + - name: bad_column_constraint_model + materialized: table + config: + contract: + enforced: true + columns: + - name: id + data_type: Int32 + constraints: + - type: check + expression: '> 0' + - name: color + 
data_type: String + - name: date_day + data_type: Date + - name: bad_foreign_key_model + config: + contract: + enforced: true + constraints: + - type: foreign_key + columns: [ id ] + expression: 'foreign_key_model (id)' + columns: + - name: id + data_type: Int32 + - name: check_constraints_model + config: + contract: + enforced: true + constraints: + - type: check + name: valid_id + expression: 'id > 100 and id < 200' + columns: + - name: id + data_type: Int32 + - name: color + data_type: String + - name: date_day + data_type: Date +""" + +bad_column_constraint_model_sql = """ +{{ + config( + materialized = "table" + ) +}} + +SELECT 5::Int32 as id, 'black' as color, toDate('2023-01-01') as date_day +""" + +bad_foreign_key_model_sql = """ +{{ + config( + materialized = "table" + ) +}} + +SELECT 1::Int32 as id +""" + +check_constraints_model_sql = """ +{{ + config( + materialized = "table", + ) +}} + +select + 'blue' as color, + 101::Int32 as id, + toDate('2019-01-01') as date_day +""" + +check_constraints_model_fail_sql = """ +{{ + config( + materialized = "table", + ) +}} + +select + 'blue' as color, + 1::Int32 as id, + toDate('2019-01-01') as date_day +""" diff --git a/tests/integration/adapter/constraints/test_constraints.py b/tests/integration/adapter/constraints/test_constraints.py new file mode 100644 index 00000000..2fe35537 --- /dev/null +++ b/tests/integration/adapter/constraints/test_constraints.py @@ -0,0 +1,190 @@ +import pytest +from dbt.tests.util import get_manifest, run_dbt, run_dbt_and_capture, write_file +from fixtures_contraints import ( + bad_column_constraint_model_sql, + bad_foreign_key_model_sql, + check_constraints_model_fail_sql, + check_constraints_model_sql, + constraint_model_schema_yml, + contract_model_schema_yml, + model_data_type_schema_yml, + my_model_data_type_sql, + my_model_incremental_wrong_name_sql, + my_model_incremental_wrong_order_sql, + my_model_view_wrong_name_sql, + my_model_view_wrong_order_sql, + my_model_wrong_name_sql, + my_model_wrong_order_sql, +) + + +class ClickHouseContractColumnsEqual: + """ + dbt should catch these mismatches during its "preflight" checks. 
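+    The tests below write parametrized model and schema files, run dbt, and
+    assert on the captured log output or on the contract config in the manifest.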
+ """ + + @pytest.fixture + def data_types(self): + # sql_column_value, schema_data_type, error_data_type + return [ + ["1::Int32", "Int32", "Int32"], + ["'1'", "String", "String"], + ["true", "Bool", "Bool"], + ["'2013-11-03'::DateTime", "DateTime", "DateTime"], + ["['a','b','c']", "Array(String)", "Array(String)"], + ["[1::Int32,2::Int32,3::Int32]", "Array(Int32)", "Array(Int32)"], + ["'1'::Float64", "Float64", "Float64"], + ] + + def test__contract_wrong_column_order(self, project): + # This no longer causes an error, since we enforce yaml column order + run_dbt(["run", "-s", "my_model_wrong_order"], expect_pass=True) + manifest = get_manifest(project.project_root) + model_id = "model.test.my_model_wrong_order" + my_model_config = manifest.nodes[model_id].config + contract_actual_config = my_model_config.contract + + assert contract_actual_config.enforced is True + + def test__contract_wrong_column_names(self, project): + _, log_output = run_dbt_and_capture(["run", "-s", "my_model_wrong_name"], expect_pass=False) + run_dbt(["run", "-s", "my_model_wrong_name"], expect_pass=False) + manifest = get_manifest(project.project_root) + model_id = "model.test.my_model_wrong_name" + my_model_config = manifest.nodes[model_id].config + contract_actual_config = my_model_config.contract + + assert contract_actual_config.enforced is True + + expected = ["id", "error", "missing in definition", "missing in contract"] + assert all([(exp in log_output or exp.upper() in log_output) for exp in expected]) + + def test__contract_wrong_column_data_types(self, project, data_types): + for (sql_column_value, schema_data_type, error_data_type) in data_types: + # Write parametrized data_type to sql file + write_file( + my_model_data_type_sql.format(sql_value=sql_column_value), + "models", + "my_model_data_type.sql", + ) + write_file( + model_data_type_schema_yml.format(data_type='Int128'), + "models", + "contract_schema.yml", + ) + + results, log_output = run_dbt_and_capture( + ["run", "-s", "my_model_data_type"], expect_pass=False + ) + manifest = get_manifest(project.project_root) + model_id = "model.test.my_model_data_type" + my_model_config = manifest.nodes[model_id].config + contract_actual_config = my_model_config.contract + + assert contract_actual_config.enforced is True + expected = [ + "wrong_data_type_column_name", + error_data_type, + "Int128", + "data type mismatch", + ] + assert all([(exp in log_output or exp.upper() in log_output) for exp in expected]) + + def test__contract_correct_column_data_types(self, project, data_types): + for (sql_column_value, schema_data_type, _) in data_types: + # Write parametrized data_type to sql file + write_file( + my_model_data_type_sql.format(sql_value=sql_column_value), + "models", + "my_model_data_type.sql", + ) + # Write correct data_type to corresponding schema file + write_file( + model_data_type_schema_yml.format(data_type=schema_data_type), + "models", + "contract_schema.yml", + ) + + run_dbt(["run", "-s", "my_model_data_type"]) + + manifest = get_manifest(project.project_root) + model_id = "model.test.my_model_data_type" + my_model_config = manifest.nodes[model_id].config + contract_actual_config = my_model_config.contract + + assert contract_actual_config.enforced is True + + +class TestTableContractColumnsEqual(ClickHouseContractColumnsEqual): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model_wrong_order.sql": my_model_wrong_order_sql, + "my_model_wrong_name.sql": my_model_wrong_name_sql, + "contract_schema.yml": 
contract_model_schema_yml, + } + + +class TestViewContractColumnsEqual(ClickHouseContractColumnsEqual): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model_wrong_order.sql": my_model_view_wrong_order_sql, + "my_model_wrong_name.sql": my_model_view_wrong_name_sql, + "contract_schema.yml": contract_model_schema_yml, + } + + +class TestIncrementalContractColumnsEqual(ClickHouseContractColumnsEqual): + @pytest.fixture(scope="class") + def models(self): + return { + "my_model_wrong_order.sql": my_model_incremental_wrong_order_sql, + "my_model_wrong_name.sql": my_model_incremental_wrong_name_sql, + "contract_schema.yml": contract_model_schema_yml, + } + + +class TestBadConstraints: + @pytest.fixture(scope="class") + def models(self): + return { + "bad_column_constraint_model.sql": bad_column_constraint_model_sql, + "bad_foreign_key_model.sql": bad_foreign_key_model_sql, + "constraints_schema.yml": constraint_model_schema_yml, + } + + def test_invalid_column_constraint(self, project): + _, log_output = run_dbt_and_capture(["run", "-s", "bad_column_constraint_model"]) + assert "not supported" in log_output + + def test_invalid_fk_constraint(self, project): + _, log_output = run_dbt_and_capture(["run", "-s", "bad_foreign_key_model"]) + assert "not supported" in log_output + + +class TestModelConstraints: + @pytest.fixture(scope="class") + def models(self): + return { + "check_constraints_model.sql": check_constraints_model_sql, + "constraints_schema.yml": constraint_model_schema_yml, + } + + def test_model_constraints_ddl(self, project): + run_dbt(["run", "-s", "check_constraints_model"]) + + +class TestModelConstraintApplied: + @pytest.fixture(scope="class") + def models(self): + return { + "check_constraints_model.sql": check_constraints_model_fail_sql, + "constraints_schema.yml": constraint_model_schema_yml, + } + + def test_model_constraints_fail_ddl(self, project): + _, log_output = run_dbt_and_capture( + ["run", "-s", "check_constraints_model"], expect_pass=False + ) + assert 'violated' in log_output.lower() diff --git a/tests/integration/adapter/dbt_debug/test_dbt_debug.py b/tests/integration/adapter/dbt_debug/test_dbt_debug.py new file mode 100644 index 00000000..044d0634 --- /dev/null +++ b/tests/integration/adapter/dbt_debug/test_dbt_debug.py @@ -0,0 +1,22 @@ +import re + +from dbt.tests.adapter.dbt_debug.test_dbt_debug import BaseDebug +from dbt.tests.util import run_dbt + + +class TestDebugClickHouse(BaseDebug): + def test_ok(self, project): + run_dbt(["debug"]) + assert "ERROR" not in self.capsys.readouterr().out + + def test_nopass(self, project): + run_dbt(["debug", "--target", "nopass"], expect_pass=False) + self.assertGotValue(re.compile(r"\s+profiles\.yml file"), "ERROR invalid") + + def test_wronguser(self, project): + run_dbt(["debug", "--target", "wronguser"], expect_pass=False) + self.assertGotValue(re.compile(r"\s+Connection test"), "ERROR") + + def test_empty_target(self, project): + run_dbt(["debug", "--target", "none_target"], expect_pass=False) + self.assertGotValue(re.compile(r"\s+output 'none_target'"), "misconfigured") diff --git a/tests/integration/adapter/dbt_show/test_dbt_show.py b/tests/integration/adapter/dbt_show/test_dbt_show.py new file mode 100644 index 00000000..98d60315 --- /dev/null +++ b/tests/integration/adapter/dbt_show/test_dbt_show.py @@ -0,0 +1,9 @@ +from dbt.tests.adapter.dbt_show.test_dbt_show import BaseShowLimit, BaseShowSqlHeader + + +class TestShowLimit(BaseShowLimit): + pass + + +class 
TestShowSqlHeader(BaseShowSqlHeader): + pass diff --git a/tests/integration/adapter/test_grants.py b/tests/integration/adapter/grants/test_distributed_grants.py similarity index 74% rename from tests/integration/adapter/test_grants.py rename to tests/integration/adapter/grants/test_distributed_grants.py index 9d6aaab3..4f2aca32 100644 --- a/tests/integration/adapter/test_grants.py +++ b/tests/integration/adapter/grants/test_distributed_grants.py @@ -1,11 +1,7 @@ import os import pytest -from dbt.tests.adapter.grants.test_incremental_grants import BaseIncrementalGrants -from dbt.tests.adapter.grants.test_invalid_grants import BaseInvalidGrants from dbt.tests.adapter.grants.test_model_grants import BaseModelGrants -from dbt.tests.adapter.grants.test_seed_grants import BaseSeedGrants -from dbt.tests.adapter.grants.test_snapshot_grants import BaseSnapshotGrants from dbt.tests.util import get_manifest, run_dbt_and_capture, write_file distributed_table_model_schema_yml = """ @@ -20,31 +16,6 @@ """ -class TestModelGrants(BaseModelGrants): - pass - - -class TestIncrementalGrants(BaseIncrementalGrants): - pass - - -class TestSeedGrants(BaseSeedGrants): - pass - - -class TestInvalidGrants(BaseInvalidGrants): - def grantee_does_not_exist_error(self): - return "511" - - # ClickHouse doesn't give a very specific error for an invalid privilege - def privilege_does_not_exist_error(self): - return "Syntax error" - - -class TestSnapshotGrants(BaseSnapshotGrants): - pass - - class TestDistributedTableModelGrants(BaseModelGrants): @pytest.mark.skipif( os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' diff --git a/tests/integration/adapter/grants/test_incremental_grants.py b/tests/integration/adapter/grants/test_incremental_grants.py new file mode 100644 index 00000000..06c1aad8 --- /dev/null +++ b/tests/integration/adapter/grants/test_incremental_grants.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.grants.test_incremental_grants import BaseIncrementalGrants + + +class TestIncrementalGrants(BaseIncrementalGrants): + pass diff --git a/tests/integration/adapter/grants/test_invalid_grants.py b/tests/integration/adapter/grants/test_invalid_grants.py new file mode 100644 index 00000000..2f54e290 --- /dev/null +++ b/tests/integration/adapter/grants/test_invalid_grants.py @@ -0,0 +1,10 @@ +from dbt.tests.adapter.grants.test_invalid_grants import BaseInvalidGrants + + +class TestInvalidGrants(BaseInvalidGrants): + def grantee_does_not_exist_error(self): + return "511" + + # ClickHouse doesn't give a very specific error for an invalid privilege + def privilege_does_not_exist_error(self): + return "Syntax error" diff --git a/tests/integration/adapter/grants/test_model_grants.py b/tests/integration/adapter/grants/test_model_grants.py new file mode 100644 index 00000000..a6db5924 --- /dev/null +++ b/tests/integration/adapter/grants/test_model_grants.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.grants.test_model_grants import BaseModelGrants + + +class TestModelGrants(BaseModelGrants): + pass diff --git a/tests/integration/adapter/grants/test_seed_grants.py b/tests/integration/adapter/grants/test_seed_grants.py new file mode 100644 index 00000000..e08361b0 --- /dev/null +++ b/tests/integration/adapter/grants/test_seed_grants.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.grants.test_seed_grants import BaseSeedGrants + + +class TestSeedGrants(BaseSeedGrants): + pass diff --git a/tests/integration/adapter/grants/test_snapshot_grants.py b/tests/integration/adapter/grants/test_snapshot_grants.py new 
file mode 100644 index 00000000..098a996b --- /dev/null +++ b/tests/integration/adapter/grants/test_snapshot_grants.py @@ -0,0 +1,5 @@ +from dbt.tests.adapter.grants.test_snapshot_grants import BaseSnapshotGrants + + +class TestSnapshotGrants(BaseSnapshotGrants): + pass diff --git a/tests/integration/adapter/hooks/test_model_hooks.py b/tests/integration/adapter/hooks/test_model_hooks.py new file mode 100644 index 00000000..3df77579 --- /dev/null +++ b/tests/integration/adapter/hooks/test_model_hooks.py @@ -0,0 +1,16 @@ +import pytest +from dbt.exceptions import CompilationError +from dbt.tests.adapter.hooks.fixtures import models__hooks_error +from dbt.tests.util import run_dbt + + +class TestDuplicateHooksInConfigs: + @pytest.fixture(scope="class") + def models(self): + return {"hooks.sql": models__hooks_error} + + def test_run_duplicate_hook_defs(self, project): + with pytest.raises(CompilationError) as exc: + run_dbt() + assert "pre_hook" in str(exc.value) + assert "pre-hook" in str(exc.value) diff --git a/tests/integration/adapter/incremental/test_incremental.py b/tests/integration/adapter/incremental/test_base_incremental.py similarity index 86% rename from tests/integration/adapter/incremental/test_incremental.py rename to tests/integration/adapter/incremental/test_base_incremental.py index bfa97fab..aa9812aa 100644 --- a/tests/integration/adapter/incremental/test_incremental.py +++ b/tests/integration/adapter/incremental/test_base_incremental.py @@ -1,6 +1,6 @@ import pytest from dbt.tests.adapter.basic.files import model_incremental, schema_base_yml -from dbt.tests.adapter.basic.test_incremental import BaseIncremental, BaseIncrementalNotSchemaChange +from dbt.tests.adapter.basic.test_incremental import BaseIncremental from dbt.tests.util import run_dbt uniq_schema = """ @@ -146,21 +146,3 @@ def models(self): "incremental.sql": incremental_sql, "schema.yml": schema_base_yml, } - - -incremental_not_schema_change_sql = """ -{{ config(materialized="incremental", unique_key="user_id_current_time",on_schema_change="sync_all_columns") }} -select - toString(1) || '-' || toString(now64()) as user_id_current_time, - {% if is_incremental() %} - 'thisis18characters' as platform - {% else %} - 'okthisis20characters' as platform - {% endif %} -""" - - -class TestIncrementalNotSchemaChange(BaseIncrementalNotSchemaChange): - @pytest.fixture(scope="class") - def models(self): - return {"incremental_not_schema_change.sql": incremental_not_schema_change_sql} diff --git a/tests/integration/adapter/incremental/test_distributed_incremental.py b/tests/integration/adapter/incremental/test_distributed_incremental.py index bcc0ddf6..f132933d 100644 --- a/tests/integration/adapter/incremental/test_distributed_incremental.py +++ b/tests/integration/adapter/incremental/test_distributed_incremental.py @@ -10,7 +10,7 @@ from dbt.tests.adapter.basic.test_incremental import BaseIncremental, BaseIncrementalNotSchemaChange from dbt.tests.util import run_dbt -from tests.integration.adapter.incremental.test_incremental import uniq_schema +from tests.integration.adapter.incremental.test_base_incremental import uniq_schema uniq_source_model = """ {{config( diff --git a/tests/integration/adapter/test_query_comments.py b/tests/integration/adapter/query_comment/test_query_comment.py similarity index 100% rename from tests/integration/adapter/test_query_comments.py rename to tests/integration/adapter/query_comment/test_query_comment.py diff --git a/tests/integration/adapter/test_changing_relation_type.py 
b/tests/integration/adapter/relations/test_changing_relation_type.py similarity index 100% rename from tests/integration/adapter/test_changing_relation_type.py rename to tests/integration/adapter/relations/test_changing_relation_type.py diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index b4b9c4f9..50b1af6a 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -32,7 +32,10 @@ def test_config(ch_test_users, ch_test_version): compose_file = f'{Path(__file__).parent}/docker-compose.yml' test_host = os.environ.get('DBT_CH_TEST_HOST', 'localhost') test_port = int(os.environ.get('DBT_CH_TEST_PORT', 8123)) - test_driver = 'native' if test_port in (10900, 9000, 9440) else 'http' + client_port = int(os.environ.get('DBT_CH_TEST_CLIENT_PORT', 0)) + test_driver = os.environ.get('DBT_CH_TEST_DRIVER', '').lower() + if test_driver == '': + test_driver = 'native' if test_port in (10900, 9000, 9440) else 'http' test_user = os.environ.get('DBT_CH_TEST_USER', 'default') test_password = os.environ.get('DBT_CH_TEST_PASSWORD', '') test_cluster = os.environ.get('DBT_CH_TEST_CLUSTER', '') @@ -49,7 +52,7 @@ def test_config(ch_test_users, ch_test_version): docker = os.environ.get('DBT_CH_TEST_USE_DOCKER', '').lower() in ('1', 'true', 'yes') if docker: - client_port = 10723 + client_port = client_port or 10723 test_port = 10900 if test_driver == 'native' else client_port try: run_cmd(['docker-compose', '-f', compose_file, 'down', '-v']) @@ -62,7 +65,7 @@ def test_config(ch_test_users, ch_test_version): wait_until_responsive(timeout=30.0, pause=0.5, check=lambda: is_responsive(url)) except Exception as e: raise Exception('Failed to run docker-compose: {}', str(e)) - else: + elif not client_port: if test_driver == 'native': client_port = 8443 if test_port == 9440 else 8123 else: @@ -94,6 +97,7 @@ def test_config(ch_test_users, ch_test_version): 'db_engine': test_db_engine, 'secure': test_secure, 'cluster_mode': test_cluster_mode, + 'database': '', } if docker: From 5e8e54b20adc9dc6e2cf999a86df9c052b9e0c52 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Thu, 23 Nov 2023 15:29:20 -0700 Subject: [PATCH 20/78] Update test and dependency versions. 
(#211) --- .github/workflows/test_cloud.yml | 4 ++-- .github/workflows/test_matrix.yml | 3 ++- CHANGELOG.md | 2 +- dev_requirements.txt | 4 ++-- setup.py | 7 +++---- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test_cloud.yml b/.github/workflows/test_cloud.yml index 5d98542f..d6403b1d 100644 --- a/.github/workflows/test_cloud.yml +++ b/.github/workflows/test_cloud.yml @@ -23,10 +23,10 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Setup Python 3.10 + - name: Setup Python 3.11 uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: '3.11' - name: Install requirements run: pip3 install -r dev_requirements.txt diff --git a/.github/workflows/test_matrix.yml b/.github/workflows/test_matrix.yml index 4976354e..636b2a71 100644 --- a/.github/workflows/test_matrix.yml +++ b/.github/workflows/test_matrix.yml @@ -27,9 +27,10 @@ jobs: - '3.10' - '3.11' clickhouse-version: - - '23.3' - '23.3' - '23.8' + - '23.9' + - '23.10' - latest steps: diff --git a/CHANGELOG.md b/CHANGELOG.md index a9efbce1..8da90756 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -### Release [1.5.0], TBD +### Release [1.5.0], 2023-11-23 #### Improvements - Compatible with dbt 1.5.x - Contract support (using exact column data types) diff --git a/dev_requirements.txt b/dev_requirements.txt index 746e2cc0..5e1771ce 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -1,6 +1,6 @@ dbt-core~=1.5.8 -clickhouse-connect>=0.6.18 -clickhouse-driver>=0.2.3 +clickhouse-connect>=0.6.21 +clickhouse-driver>=0.2.6 pytest>=7.2.0 pytest-dotenv==0.5.2 dbt-tests-adapter~=1.5.8 diff --git a/setup.py b/setup.py index 1a2d2359..0bb32f68 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def _dbt_clickhouse_version(): package_version = _dbt_clickhouse_version() description = '''The Clickhouse plugin for dbt (data build tool)''' -dbt_version = '1.4.0' +dbt_version = '1.5.0' dbt_minor = '.'.join(dbt_version.split('.')[0:2]) if not package_version.startswith(dbt_minor): @@ -55,8 +55,8 @@ def _dbt_clickhouse_version(): }, install_requires=[ f'dbt-core~={dbt_version}', - 'clickhouse-connect>=0.5.24', - 'clickhouse-driver>=0.2.3', + 'clickhouse-connect>=0.6.21', + 'clickhouse-driver>=0.2.6', ], python_requires=">=3.7", platforms='any', @@ -66,7 +66,6 @@ def _dbt_clickhouse_version(): 'Operating System :: Microsoft :: Windows', 'Operating System :: MacOS :: MacOS X', 'Operating System :: POSIX :: Linux', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', From 588e5ca41f04c98e4bec8791f45a0521fbc465d0 Mon Sep 17 00:00:00 2001 From: Kristof Szaloki Date: Mon, 27 Nov 2023 17:29:28 +0100 Subject: [PATCH 21/78] Adjust the wrapper parenthesis around the table materialization sql code (#212) --- dbt/include/clickhouse/macros/materializations/table.sql | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 8386593b..6282ad40 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -153,7 +153,9 @@ {{ order_cols(label="order by") }} {{ partition_cols(label="partition by") }} {{ adapter.get_model_settings(model) }} - as ( {{ sql }} ) + as ( + {{ sql }} + ) {%- else %} create table {{ relation.include(database=False) }} {{ 
on_cluster_clause(relation)}} @@ -171,7 +173,9 @@ {%- if not adapter.is_before_version('22.7.1.2484') %} empty {%- endif %} - as ( {{ sql }} ) + as ( + {{ sql }} + ) {%- endif %} {%- endif %} From 1f17ec2e8f98a4e540446ce1e015cf6866e61ce7 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Mon, 27 Nov 2023 09:39:19 -0700 Subject: [PATCH 22/78] Update for 1.5.1 bug fix --- CHANGELOG.md | 4 ++++ dbt/adapters/clickhouse/__version__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8da90756..f8b1f0ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +### Release [1.5.1], 2023-11-27 +#### Bug Fix +- Fix table materialization for compatibility with SQLFluff. Thanks to [Kristof Szaloki](https://github.com/kris947) for the PR! + ### Release [1.5.0], 2023-11-23 #### Improvements - Compatible with dbt 1.5.x diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index 4139253f..b9148ac4 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.5.0' +version = '1.5.1' From 9997b825530994b9dbe11672731d74f7076a0423 Mon Sep 17 00:00:00 2001 From: Steven Reitsma <4895139+StevenReitsma@users.noreply.github.com> Date: Tue, 28 Nov 2023 16:22:43 +0100 Subject: [PATCH 23/78] Fix creation of replicated tables when using legacy materialization (#208) --- .../macros/materializations/incremental/incremental.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql index 9f9fa4bc..042a94e8 100644 --- a/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql +++ b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql @@ -162,7 +162,7 @@ {{ create_distributed_local_table(distributed_intermediate_relation, intermediate_relation, existing_relation) }} {% else %} {% call statement('main') %} - create table {{ intermediate_relation }} as {{ new_data_relation }} {{ on_cluster_clause(existing_relation) }} + create table {{ intermediate_relation }} {{ on_cluster_clause(existing_relation) }} as {{ new_data_relation }} {% endcall %} {% endif %} From 3fec9a4e669a6032c8ec712391812e897ff326c3 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Tue, 28 Nov 2023 08:48:34 -0700 Subject: [PATCH 24/78] On cluster sync cleanup --- .github/workflows/test_matrix.yml | 1 + CHANGELOG.md | 7 +++++++ dbt/adapters/clickhouse/__version__.py | 2 +- dbt/include/clickhouse/macros/adapters.sql | 2 +- dbt/include/clickhouse/macros/materializations/table.sql | 5 ++++- 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_matrix.yml b/.github/workflows/test_matrix.yml index 636b2a71..3da76748 100644 --- a/.github/workflows/test_matrix.yml +++ b/.github/workflows/test_matrix.yml @@ -66,6 +66,7 @@ jobs: - name: Run Native tests env: DBT_CH_TEST_PORT: 9000 + DBT_CH_TEST_CLUSTER: test_shard run: | PYTHONPATH="${PYTHONPATH}:dbt" pytest tests diff --git a/CHANGELOG.md b/CHANGELOG.md index f8b1f0ac..bd0431d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +### Release [1.5.2], 2023-11-28 +#### Bug Fix +- The `ON CLUSTER` clause was in the incorrect place for legacy incremental materializations. This has been fixed. Thanks to +[Steven Reitsma](https://github.com/StevenReitsma) for the fix! 
+- The `ON CLUSTER` DDL for drop tables did not include a SYNC modifier, which might be the cause of some "table already exists" +errors + ### Release [1.5.1], 2023-11-27 #### Bug Fix - Fix table materialization for compatibility with SQLFluff. Thanks to [Kristof Szaloki](https://github.com/kris947) for the PR! diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index b9148ac4..e8b09c2b 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.5.1' +version = '1.5.2' diff --git a/dbt/include/clickhouse/macros/adapters.sql b/dbt/include/clickhouse/macros/adapters.sql index ae0ef8d6..718c775a 100644 --- a/dbt/include/clickhouse/macros/adapters.sql +++ b/dbt/include/clickhouse/macros/adapters.sql @@ -56,7 +56,7 @@ {% macro clickhouse__drop_relation(relation, obj_type='table') -%} {% call statement('drop_relation', auto_begin=False) -%} {# drop relation on cluster by default if cluster is set #} - drop {{ obj_type }} if exists {{ relation }} {{ on_cluster_clause(relation.without_identifier())}} + drop {{ obj_type }} if exists {{ relation }} {{ on_cluster_clause(relation.without_identifier(), True)}} {%- endcall %} {% endmacro %} diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 6282ad40..ca07cdbe 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -121,11 +121,14 @@ {%- endif %} {%- endmacro -%} -{% macro on_cluster_clause(relation) %} +{% macro on_cluster_clause(relation, force_sync) %} {% set active_cluster = adapter.get_clickhouse_cluster_name() %} {%- if active_cluster is not none and relation.should_on_cluster %} {# Add trailing whitespace to avoid problems when this clause is not last #} ON CLUSTER {{ active_cluster + ' ' }} + {%- if force_sync %} + SYNC + {%- endif %} {%- endif %} {%- endmacro -%} From bf11cbea501d5e254888c51c7a6ebb4e34a1d188 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Tue, 28 Nov 2023 18:04:49 -0700 Subject: [PATCH 25/78] Bug fixes related to model settings. (#214) --- CHANGELOG.md | 21 ++++++- README.md | 33 ++++++---- dbt/adapters/clickhouse/credentials.py | 2 + dbt/adapters/clickhouse/dbclient.py | 10 +++ dbt/adapters/clickhouse/impl.py | 12 +++- .../incremental/incremental.sql | 9 +-- .../macros/materializations/seed.sql | 2 +- .../macros/materializations/table.sql | 4 +- tests/integration/adapter/basic/test_basic.py | 2 + .../test_clickhouse_table_materializations.py | 11 +++- .../adapter/constraints/test_constraints.py | 4 +- .../incremental/test_base_incremental.py | 61 +++++++++++++++---- 12 files changed, 133 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd0431d3..5f4e6162 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,26 @@ ### Release [1.5.2], 2023-11-28 -#### Bug Fix +#### Bug Fixes - The `ON CLUSTER` clause was in the incorrect place for legacy incremental materializations. This has been fixed. Thanks to [Steven Reitsma](https://github.com/StevenReitsma) for the fix! - The `ON CLUSTER` DDL for drop tables did not include a SYNC modifier, which might be the cause of some "table already exists" -errors +errors. The `SYNC` modifier has been added to the `on_cluster` macro when dropping relations. +- Fixed a bug where using table settings such as `allow_nullable_key` would break "legacy" incremental materializations. 
Closes
+https://github.com/ClickHouse/dbt-clickhouse/issues/209. Also see the new model `config` property `query_settings` described
+below.
+- Fixed an issue where incremental materializations would incorrectly exclude duplicated inserted elements due to "automatic"
+ClickHouse deduplication on replicated tables. Closes https://github.com/ClickHouse/dbt-clickhouse/issues/213. The fix consists
+of always sending a `replicated_deduplication_window=0` table setting when creating the incremental relations. This
+behavior can be overridden by setting the new profile parameter `allow_automatic_deduplication` to `True`, although for
+general dbt operations this is probably not necessary and not recommended. Finally, thanks to [Andy](https://github.com/andy-miracl)
+for the report and debugging help!
+
+#### Improvements
+- Added a new profile property `allow_automatic_deduplication`, which defaults to `False`. ClickHouse Replicated deduplication is
+now disabled for incremental inserts, but this property can be set to `True` if for some reason the default ClickHouse behavior
+for inserted blocks is desired.
+- Added a new model `config` property `query_settings` for any ClickHouse settings that should be sent with the `INSERT INTO`
+or `DELETE FROM` queries used with materializations. Note this is distinct from the existing property `settings`, which is
+used for ClickHouse "table" settings in DDL statements like `CREATE TABLE ... AS`.

 ### Release [1.5.1], 2023-11-27
 #### Bug Fix
diff --git a/README.md b/README.md
index 8e008998..b5c8b8b8 100644
--- a/README.md
+++ b/README.md
@@ -77,6 +77,7 @@ your_profile_name:
     use_lw_deletes: [False] Use the strategy `delete+insert` as the default incremental strategy.
     check_exchange: [True] # Validate that clickhouse support the atomic EXCHANGE TABLES command.  (Not needed for most ClickHouse versions)
     local_suffix [_local] # Table suffix of local tables on shards for distributed materializations.
+    allow_automatic_deduplication [False] # Enable ClickHouse automatic deduplication for Replicated tables
     custom_settings: [{}] # A dictionary/mapping of custom ClickHouse settings for the connection - default is empty.
 
     # Native (clickhouse-driver) connection settings
@@ -87,17 +88,27 @@ your_profile_name:
 
 ## Model Configuration
 
-| Option                 | Description | Required? |
-|------------------------|-------------|-----------|
-| engine                 | The table engine (type of table) to use when creating tables | Optional (default: `MergeTree()`) |
-| order_by               | A tuple of column names or arbitrary expressions. This allows you to create a small sparse index that helps find data faster. | Optional (default: `tuple()`) |
-| partition_by           | A partition is a logical combination of records in a table by a specified criterion. The partition key can be any expression from the table columns. | Optional |
-| sharding_key           | Sharding key determines the destination server when inserting into distributed engine table. The sharding key can be random or as an output of a hash function | Optional (default: `rand()`) |
-| primary_key            | Like order_by, a ClickHouse primary key expression. If not specified, ClickHouse will use the order by expression as the primary key |
-| unique_key             | A tuple of column names that uniquely identify rows. Used with incremental models for updates. | Optional |
-| inserts_only           | If set to True for an incremental model, incremental updates will be inserted directly to the target table without creating intermediate table. It has been deprecated in favor of the `append` incremental `strategy`, which operates in the same way | Optional |
-| incremental_strategy   | Incremental model update strategy of `delete+insert` or `append`. See the following Incremental Model Strategies | Optional (default: `default`) |
-| incremental_predicates | Additional conditions to be applied to the incremental materialization (only applied to `delete+insert` strategy |
+| Option                 | Description | Default if any |
+|------------------------|-------------|----------------|
+| engine                 | The table engine (type of table) to use when creating tables | `MergeTree()` |
+| order_by               | A tuple of column names or arbitrary expressions. This allows you to create a small sparse index that helps find data faster. | `tuple()` |
+| partition_by           | A partition is a logical combination of records in a table by a specified criterion. The partition key can be any expression from the table columns. | |
+| sharding_key           | Sharding key determines the destination server when inserting into a distributed engine table. The sharding key can be random or the output of a hash function | `rand()` |
+| primary_key            | Like order_by, a ClickHouse primary key expression. If not specified, ClickHouse will use the order by expression as the primary key | |
+| unique_key             | A tuple of column names that uniquely identify rows. Used with incremental models for updates. | |
+| inserts_only           | If set to True for an incremental model, incremental updates will be inserted directly to the target table without creating an intermediate table. It has been deprecated in favor of the `append` incremental `strategy`, which operates in the same way | |
+| incremental_strategy   | Incremental model update strategy of `delete+insert` or `append`. See the following Incremental Model Strategies | `default` |
+| incremental_predicates | Additional conditions to be applied to the incremental materialization (only applied to the `delete+insert` strategy) | |
+| settings               | A map/dictionary of "TABLE" settings to be used in DDL statements like 'CREATE TABLE' with this model | |
+| query_settings         | A map/dictionary of ClickHouse user level settings to be used with `INSERT` or `DELETE` statements in conjunction with this model | |
+
+## A Note on Model Settings
+ClickHouse has several types/levels of "settings". In the model configuration above, two types of these are configurable. `settings` means the `SETTINGS`
+clause used in `CREATE TABLE/VIEW` types of DDL statements, so this is generally settings that are specific to the particular ClickHouse table engine. The new
+`query_settings` is used to add a `SETTINGS` clause to the `INSERT` and `DELETE` queries used for model materialization (including incremental materializations).
+There are hundreds of ClickHouse settings, and it's not always clear which is a "table" setting and which is a "user" setting (although the latter are generally
+available in the `system.settings` table). In general the defaults are recommended, and any use of these properties should be carefully researched and tested.
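+
+As a minimal illustrative sketch (the two setting names below are just one common example of each level, not a
+recommendation for any particular model), a single model can use both properties:
+
+```sql
+{{ config(
+    materialized='table',
+    order_by='(id)',
+    settings={'allow_nullable_key': 1},
+    query_settings={'mutations_sync': 2}
+) }}
+select 1 as id
+```
+
+Here `allow_nullable_key` is a MergeTree "table" setting rendered in the `CREATE TABLE ... SETTINGS` DDL clause, while
+`mutations_sync` is a "user" setting appended to the `INSERT`/`DELETE` queries that populate the model.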
+ ## ClickHouse Cluster `cluster` setting in profile enables dbt-clickhouse to run against a ClickHouse cluster. diff --git a/dbt/adapters/clickhouse/credentials.py b/dbt/adapters/clickhouse/credentials.py index 427b94c1..cbf65069 100644 --- a/dbt/adapters/clickhouse/credentials.py +++ b/dbt/adapters/clickhouse/credentials.py @@ -33,6 +33,7 @@ class ClickHouseCredentials(Credentials): custom_settings: Optional[Dict[str, Any]] = None use_lw_deletes: bool = False local_suffix: str = 'local' + allow_automatic_deduplication = False @property def type(self): @@ -73,4 +74,5 @@ def _connection_keys(self): 'check_exchange', 'custom_settings', 'use_lw_deletes', + 'allow_automatic_deduplication', ) diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index ef069ab3..ab5567e8 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -1,5 +1,6 @@ import uuid from abc import ABC, abstractmethod +from typing import Dict from dbt.exceptions import DbtDatabaseError, FailedToConnectError @@ -8,6 +9,7 @@ LW_DELETE_SETTING = 'allow_experimental_lightweight_delete' ND_MUTATION_SETTING = 'allow_nondeterministic_mutations' +DEDUP_WINDOW_SETTING = 'replicated_deduplication_window' def get_db_client(credentials: ClickHouseCredentials): @@ -79,6 +81,9 @@ def __init__(self, credentials: ClickHouseCredentials): except Exception as ex: self.close() raise ex + self._model_settings = {} + if not credentials.allow_automatic_deduplication: + self._model_settings[DEDUP_WINDOW_SETTING] = '0' @abstractmethod def query(self, sql: str, **kwargs): @@ -115,6 +120,11 @@ def _set_client_database(self): def _server_version(self): pass + def update_model_settings(self, model_settings: Dict[str, str]): + for key, value in self._model_settings.items(): + if key not in model_settings: + model_settings[key] = value + def _check_lightweight_deletes(self, requested: bool): lw_deletes = self.get_ch_setting(LW_DELETE_SETTING) nd_mutations = self.get_ch_setting(ND_MUTATION_SETTING) diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index ec4b3c07..6cc6055f 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -367,7 +367,17 @@ def run_sql_for_tests(self, sql, fetch, conn): @available def get_model_settings(self, model): - settings = model['config'].get('settings', dict()) + settings = model['config'].get('settings', {}) + conn = self.connections.get_if_exists() + conn.handle.update_model_settings(settings) + res = [] + for key in settings: + res.append(f' {key}={settings[key]}') + return '' if len(res) == 0 else 'SETTINGS ' + ', '.join(res) + '\n' + + @available + def get_model_query_settings(self, model): + settings = model['config'].get('query_settings', {}) res = [] for key in settings: res.append(f' {key}={settings[key]}') diff --git a/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql index 042a94e8..ca15991b 100644 --- a/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql +++ b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql @@ -178,7 +178,7 @@ select {{ unique_key }} from {{ inserting_relation }} ) - {{ adapter.get_model_settings(model) }} + {{ adapter.get_model_query_settings(model) }} {% endcall %} -- Insert all of the new data into the temporary table @@ -186,7 +186,7 @@ insert into {{ inserted_relation }} ({{ dest_cols_csv }}) select {{ dest_cols_csv }} 
from {{ inserting_relation }} - {{ adapter.get_model_settings(model) }} + {{ adapter.get_model_query_settings(model) }} {% endcall %} {% do adapter.drop_relation(new_data_relation) %} @@ -228,13 +228,14 @@ {% for predicate in incremental_predicates %} and {{ predicate }} {% endfor %} - {%- endif -%}; + {%- endif -%} + {{ adapter.get_model_query_settings(model) }} {% endcall %} {%- set dest_columns = adapter.get_columns_in_relation(existing_relation) -%} {%- set dest_cols_csv = dest_columns | map(attribute='quoted') | join(', ') -%} {% call statement('insert_new_data') %} - insert into {{ existing_relation }} select {{ dest_cols_csv }} from {{ inserting_relation }} + insert into {{ existing_relation }} {{ adapter.get_model_query_settings(model) }} select {{ dest_cols_csv }} from {{ inserting_relation }} {% endcall %} {% do adapter.drop_relation(new_data_relation) %} {{ drop_relation_if_exists(distributed_new_data_relation) }} diff --git a/dbt/include/clickhouse/macros/materializations/seed.sql b/dbt/include/clickhouse/macros/materializations/seed.sql index 120e6c48..f05a5ac4 100644 --- a/dbt/include/clickhouse/macros/materializations/seed.sql +++ b/dbt/include/clickhouse/macros/materializations/seed.sql @@ -4,7 +4,7 @@ {% set sql -%} insert into {{ this.render() }} ({{ cols_sql }}) - {{ adapter.get_model_settings(model) }} + {{ adapter.get_model_query_settings(model) }} format CSV {{ data_sql }} {%- endset %} diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index ca07cdbe..22537f5c 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -188,11 +188,13 @@ {%- set dest_columns = adapter.get_columns_in_relation(target_relation) -%} {%- set dest_cols_csv = dest_columns | map(attribute='quoted') | join(', ') -%} - insert into {{ target_relation }} ({{ dest_cols_csv }}) + insert into {{ target_relation }} + ({{ dest_cols_csv }}) {%- if has_contract -%} -- Use a subquery to get columns in the right order SELECT {{ dest_cols_csv }} FROM ( {{ sql }} ) {%- else -%} {{ sql }} + {{ adapter.get_model_query_settings(model) }} {%- endif -%} {%- endmacro %} diff --git a/tests/integration/adapter/basic/test_basic.py b/tests/integration/adapter/basic/test_basic.py index e340b5e1..75936f0b 100644 --- a/tests/integration/adapter/basic/test_basic.py +++ b/tests/integration/adapter/basic/test_basic.py @@ -33,6 +33,8 @@ column_types: val2: Nullable(UInt32) str1: Nullable(String) + settings: + allow_nullable_key: 1 """ replicated_seeds_schema_yml = """ diff --git a/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py b/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py index 0c0f1bbb..c7f20e00 100644 --- a/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py +++ b/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py @@ -18,8 +18,13 @@ class TestMergeTreeTableMaterialization(BaseSimpleMaterializations): @pytest.fixture(scope="class") def models(self): config_materialized_table = """ - {{ config(order_by='(some_date, id, name)', engine='MergeTree()', materialized='table', - settings={'allow_nullable_key': 1}) }} + {{ config( + order_by='(some_date, id, name)', + engine='MergeTree()', + materialized='table', + settings={'allow_nullable_key': 1}, + query_settings={'allow_nondeterministic_mutations': 1}) + }} """ base_table_sql = config_materialized_table + 
model_base return { @@ -204,7 +209,7 @@ def assert_total_count_correct(self, project): os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' ) def test_base(self, project): - # cluster setting must exists + # cluster setting must exist cluster = project.test_config['cluster'] assert cluster diff --git a/tests/integration/adapter/constraints/test_constraints.py b/tests/integration/adapter/constraints/test_constraints.py index 2fe35537..f18a7ca9 100644 --- a/tests/integration/adapter/constraints/test_constraints.py +++ b/tests/integration/adapter/constraints/test_constraints.py @@ -60,7 +60,7 @@ def test__contract_wrong_column_names(self, project): assert all([(exp in log_output or exp.upper() in log_output) for exp in expected]) def test__contract_wrong_column_data_types(self, project, data_types): - for (sql_column_value, schema_data_type, error_data_type) in data_types: + for sql_column_value, schema_data_type, error_data_type in data_types: # Write parametrized data_type to sql file write_file( my_model_data_type_sql.format(sql_value=sql_column_value), @@ -91,7 +91,7 @@ def test__contract_wrong_column_data_types(self, project, data_types): assert all([(exp in log_output or exp.upper() in log_output) for exp in expected]) def test__contract_correct_column_data_types(self, project, data_types): - for (sql_column_value, schema_data_type, _) in data_types: + for sql_column_value, schema_data_type, _ in data_types: # Write parametrized data_type to sql file write_file( my_model_data_type_sql.format(sql_value=sql_column_value), diff --git a/tests/integration/adapter/incremental/test_base_incremental.py b/tests/integration/adapter/incremental/test_base_incremental.py index aa9812aa..24635db5 100644 --- a/tests/integration/adapter/incremental/test_base_incremental.py +++ b/tests/integration/adapter/incremental/test_base_incremental.py @@ -33,7 +33,8 @@ materialized='incremental', engine='MergeTree()', order_by=['ts'], - unique_key=['impid'] + unique_key=['impid'], + settings={'allow_nullable_key':'1'} ) }} select ts, impid from unique_source_one @@ -57,25 +58,18 @@ def test_simple_incremental(self, project): run_dbt(["run", "--select", "unique_incremental_one"]) -lw_delete_schema = """ -version: 2 - -models: - - name: "lw_delete_inc" - description: "Incremental table" -""" - lw_delete_inc = """ {{ config( materialized='incremental', order_by=['key1'], unique_key='key1', - incremental_strategy='delete+insert' + incremental_strategy='delete+insert', + settings={'allow_nullable_key':1} ) }} {% if is_incremental() %} - WITH (SELECT max(key1) - 20 FROM lw_delete_inc) as old_max - SELECT assumeNotNull(toUInt64(number + old_max + 1)) as key1, toInt64(-(number + old_max)) as key2, toString(number + 30) as value FROM numbers(100) + select 2 as key1, 500 as key2, 'test' as value UNION ALL + select 102 as key1, 400 as key2, 'test2' as value {% else %} SELECT toUInt64(number) as key1, toInt64(-number) as key2, toString(number) as value FROM numbers(100) {% endif %} @@ -93,7 +87,45 @@ def test_lw_delete(self, project): assert result[0] == 100 run_dbt() result = project.run_sql("select count(*) as num_rows from lw_delete_inc", fetch="one") - assert result[0] == 180 + assert result[0] == 101 + run_dbt() + result = project.run_sql("select count(*) as num_rows from lw_delete_inc", fetch="one") + assert result[0] == 101 + + +legacy_inc = """ +{{ config( + materialized='incremental', + order_by=['key1'], + unique_key='key1', + incremental_strategy='legacy', + 
settings={'allow_nullable_key':1} + ) +}} +{% if is_incremental() %} + select 2 as key1, 500 as key2, 'test' as value UNION ALL + select 102 as key1, 400 as key2, 'test2' as value +{% else %} + SELECT toUInt64(number) as key1, toInt64(-number) as key2, toString(number) as value FROM numbers(100) +{% endif %} +""" + + +class TestLegacyIncremental: + @pytest.fixture(scope="class") + def models(self): + return {"legacy_inc.sql": legacy_inc} + + def test_legacy(self, project): + run_dbt() + result = project.run_sql("select count(*) as num_rows from legacy_inc", fetch="one") + assert result[0] == 100 + run_dbt() + result = project.run_sql("select count(*) as num_rows from legacy_inc", fetch="one") + assert result[0] == 101 + run_dbt() + result = project.run_sql("select count(*) as num_rows from legacy_inc", fetch="one") + assert result[0] == 101 compound_key_schema = """ @@ -133,6 +165,9 @@ def test_compound_key(self, project): run_dbt() result = project.run_sql("select count(*) as num_rows from compound_key_inc", fetch="one") assert result[0] == 180 + run_dbt() + result = project.run_sql("select count(*) as num_rows from compound_key_inc", fetch="one") + assert result[0] == 260 class TestInsertsOnlyIncrementalMaterialization(BaseIncremental): From 8561210cebcd3ab100a7a2a8c618faf6bbef3447 Mon Sep 17 00:00:00 2001 From: Rory Sawyer Date: Wed, 29 Nov 2023 14:03:45 -0500 Subject: [PATCH 26/78] Add materialization macro for materialized view (#207) * Add materialization macro for materialized view * fix isort issues in materialized view test --- .../materializations/materialized_view.sql | 120 +++++++++++++ .../adapter/test_materialized_view.py | 164 ++++++++++++++++++ 2 files changed, 284 insertions(+) create mode 100644 dbt/include/clickhouse/macros/materializations/materialized_view.sql create mode 100644 tests/integration/adapter/test_materialized_view.py diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql new file mode 100644 index 00000000..f3c66cfd --- /dev/null +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -0,0 +1,120 @@ +{#- + Create or update a materialized view in ClickHouse. + This involves creating both the materialized view itself and a + target table that the materialized view writes to. 
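+  The view itself is created under the model name with an '_mv' suffix and
+  continuously inserts into the target table, which holds the queryable data.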
+-#} +{%- materialization materialized_view, adapter='clickhouse' -%} + + {%- set target_relation = this.incorporate(type='table') -%} + {%- set mv_name = target_relation.name + '_mv' -%} + {%- set target_mv = api.Relation.create(identifier=mv_name, schema=schema, database=database, type='materializedview') -%} + {%- set cluster_clause = on_cluster_clause(target_relation) -%} + + {# look for an existing relation for the target table and create backup relations if necessary #} + {%- set existing_relation = load_cached_relation(this) -%} + {%- set backup_relation = none -%} + {%- set preexisting_backup_relation = none -%} + {%- set preexisting_intermediate_relation = none -%} + {% if existing_relation is not none %} + {%- set backup_relation_type = existing_relation.type -%} + {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%} + {%- set preexisting_backup_relation = load_cached_relation(backup_relation) -%} + {% if not existing_relation.can_exchange %} + {%- set intermediate_relation = make_intermediate_relation(target_relation) -%} + {%- set preexisting_intermediate_relation = load_cached_relation(intermediate_relation) -%} + {% endif %} + {% endif %} + + {% set grant_config = config.get('grants') %} + + {{ run_hooks(pre_hooks, inside_transaction=False) }} + + -- drop the temp relations if they exist already in the database + {{ drop_relation_if_exists(preexisting_intermediate_relation) }} + {{ drop_relation_if_exists(preexisting_backup_relation) }} + + -- `BEGIN` happens here: + {{ run_hooks(pre_hooks, inside_transaction=True) }} + + {% if backup_relation is none %} + {{ log('Creating new materialized view ' + target_relation.name )}} + {% call statement('main') -%} + {{ clickhouse__get_create_materialized_view_as_sql(target_relation, sql) }} + {%- endcall %} + {% elif existing_relation.can_exchange %} + {{ log('Replacing existing materialized view' + target_relation.name) }} + {% call statement('drop existing materialized view') %} + drop view if exists {{ mv_name }} {{ cluster_clause }} + {% endcall %} + {% call statement('main') -%} + {{ get_create_table_as_sql(False, backup_relation, sql) }} + {%- endcall %} + {% do exchange_tables_atomic(backup_relation, existing_relation) %} + {% call statement('create new materialized view') %} + {{ clickhouse__create_mv_sql(mv_name, existing_relation.name, cluster_clause, sql) }} + {% endcall %} + {% else %} + {{ log('Replacing existing materialized view' + target_relation.name) }} + {{ clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql) }} + {% endif %} + + -- cleanup + {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %} + {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} + + {% do persist_docs(target_relation, model) %} + + {{ run_hooks(post_hooks, inside_transaction=True) }} + + {{ adapter.commit() }} + + {{ drop_relation_if_exists(backup_relation) }} + + {{ run_hooks(post_hooks, inside_transaction=False) }} + + {{ return({'relations': [target_relation, target_mv]}) }} + +{%- endmaterialization -%} + + +{# + There are two steps to creating a materialized view: + 1. Create a new table based on the SQL in the model + 2. 
Create a materialized view using the SQL in the model that inserts
+    data into the table created during step 1
+#}
+{% macro clickhouse__get_create_materialized_view_as_sql(relation, sql) -%}
+  {% call statement('create_target_table') %}
+    {{ get_create_table_as_sql(False, relation, sql) }}
+  {% endcall %}
+  {%- set cluster_clause = on_cluster_clause(relation) -%}
+  {%- set mv_name = relation.name + '_mv' -%}
+  {{ clickhouse__create_mv_sql(mv_name, relation.name, cluster_clause, sql) }}
+{%- endmacro %}
+
+
+{% macro clickhouse__create_mv_sql(relation_name, target_table, cluster_clause, sql) -%}
+  create materialized view if not exists {{ relation_name }} {{ cluster_clause }}
+  to {{ target_table }}
+  as {{ sql }}
+{%- endmacro %}
+
+
+{% macro clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql) %}
+  {# drop existing materialized view while we recreate the target table #}
+  {%- set cluster_clause = on_cluster_clause(target_relation) -%}
+  {%- set mv_name = target_relation.name + '_mv' -%}
+  {% call statement('drop existing mv') -%}
+    drop view if exists {{ mv_name }} {{ cluster_clause }}
+  {%- endcall %}
+
+  {# recreate the target table #}
+  {% call statement('main') -%}
+    {{ get_create_table_as_sql(False, intermediate_relation, sql) }}
+  {%- endcall %}
+  {{ adapter.rename_relation(existing_relation, backup_relation) }}
+  {{ adapter.rename_relation(intermediate_relation, target_relation) }}
+
+  {# now that the target table is recreated, we can finally create our new view #}
+  {{ clickhouse__create_mv_sql(mv_name, target_relation.name, cluster_clause, sql) }}
+{% endmacro %}
diff --git a/tests/integration/adapter/test_materialized_view.py b/tests/integration/adapter/test_materialized_view.py
new file mode 100644
index 00000000..23452c40
--- /dev/null
+++ b/tests/integration/adapter/test_materialized_view.py
@@ -0,0 +1,164 @@
+"""
+test materialized view creation
+"""
+
+import json
+
+import pytest
+from dbt.tests.util import check_relation_types, run_dbt
+
+PEOPLE_SEED_CSV = """
+id,name,age,department
+1231,Dade,33,engineering
+6666,Ksenia,48,engineering
+8888,Kate,50,engineering
+""".lstrip()
+
+# This model is parameterized, in a way, by the "run_type" dbt project variable
+# This is to be able to switch between different model definitions within
+# the same test run and allow us to test the evolution of a materialized view
+MV_MODEL = """
+{{ config(
+       materialized='materialized_view',
+       engine='MergeTree()',
+       order_by='(id)',
+) }}
+
+{% if var('run_type', '') == '' %}
+select
+    id,
+    name,
+    case
+        when name like 'Dade' then 'crash_override'
+        when name like 'Kate' then 'acid burn'
+        else 'N/A'
+    end as hacker_alias
+from {{ source('raw', 'people') }}
+where department = 'engineering'
+
+{% else %}
+
+select
+    id,
+    name,
+    case
+        -- Dade wasn't always known as 'crash override'!
+ when name like 'Dade' and age = 11 then 'zero cool' + when name like 'Dade' and age != 11 then 'crash override' + when name like 'Kate' then 'acid burn' + else 'N/A' + end as hacker_alias +from {{ source('raw', 'people') }} +where department = 'engineering' + +{% endif %} +""" + + +SEED_SCHEMA_YML = """ +version: 2 + +sources: + - name: raw + schema: "{{ target.schema }}" + tables: + - name: people +""" + + +class TestBasicMV: + @pytest.fixture(scope="class") + def seeds(self): + """ + we need a base table to pull from + """ + return { + "people.csv": PEOPLE_SEED_CSV, + "schema.yml": SEED_SCHEMA_YML, + } + + @pytest.fixture(scope="class") + def models(self): + return { + "hackers.sql": MV_MODEL, + } + + def test_create(self, project): + """ + 1. create a base table via dbt seed + 2. create a model as a materialized view, selecting from the table created in (1) + 3. insert data into the base table and make sure it's there in the target table created in (2) + """ + results = run_dbt(["seed"]) + assert len(results) == 1 + columns = project.run_sql("DESCRIBE TABLE people", fetch="all") + assert columns[0][1] == "Int32" + + # create the model + results = run_dbt() + assert len(results) == 1 + + columns = project.run_sql("DESCRIBE TABLE hackers", fetch="all") + assert columns[0][1] == "Int32" + + columns = project.run_sql("DESCRIBE hackers_mv", fetch="all") + assert columns[0][1] == "Int32" + + check_relation_types( + project.adapter, + { + "hackers_mv": "view", + "hackers": "table", + }, + ) + + # insert some data and make sure it reaches the target table + project.run_sql( + f""" + insert into {project.test_schema}.people ("id", "name", "age", "department") + values (1232,'Dade',16,'engineering'), (9999,'eugene',40,'malware'); + """ + ) + + result = project.run_sql("select count(*) from hackers", fetch="all") + assert result[0][0] == 4 + + +class TestUpdateMV: + @pytest.fixture(scope="class") + def seeds(self): + """ + we need a base table to pull from + """ + return { + "people.csv": PEOPLE_SEED_CSV, + "schema.yml": SEED_SCHEMA_YML, + } + + @pytest.fixture(scope="class") + def models(self): + return { + "hackers.sql": MV_MODEL, + } + + def test_update(self, project): + # create our initial materialized view + run_dbt(["seed"]) + run_dbt() + + # re-run dbt but this time with the new MV SQL + run_vars = {"run_type": "extended_schema"} + run_dbt(["run", "--vars", json.dumps(run_vars)]) + + project.run_sql( + f""" + insert into {project.test_schema}.people ("id", "name", "age", "department") + values (1232,'Dade',11,'engineering'), (9999,'eugene',40,'malware'); + """ + ) + + # assert that we now have both of Dade's aliases in our hackers table + result = project.run_sql( + "select distinct hacker_alias from hackers where name = 'Dade'", fetch="all" + ) + assert len(result) == 2 From 246a4d878c5fd08bba45b6924f9504f13c57d3fb Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Wed, 29 Nov 2023 18:03:18 -0700 Subject: [PATCH 27/78] Release 1 6 0 (#215) * Initial dbt 1.6 update * Add skipped clone test * Clean up MV PR --- CHANGELOG.md | 10 ++++++ README.md | 31 +++++++++++++------ dbt/adapters/clickhouse/__version__.py | 2 +- dbt/adapters/clickhouse/connections.py | 2 +- dbt/adapters/clickhouse/dbclient.py | 4 ++- .../materializations/materialized_view.sql | 2 +- dev_requirements.txt | 8 ++--- pyproject.toml | 2 +- setup.py | 4 +-- .../adapter/dbt_clone/test_dbt_clone.py | 7 +++++ .../test_materialized_view.py | 3 +- 11 files changed, 53 insertions(+), 22 deletions(-) create mode 100644 
tests/integration/adapter/dbt_clone/test_dbt_clone.py
 rename tests/integration/adapter/{ => materialized_view}/test_materialized_view.py (96%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5f4e6162..1ad6630f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,13 @@
+### Release [1.6.0], 2023-11-30
+#### Improvements
+- Compatible with dbt 1.6.x. Note that the new dbt `clone` feature is not supported, as ClickHouse has no native "light weight"
+clone functionality, and copying tables without actual data transfer is not possible in ClickHouse (barring file manipulation
+outside ClickHouse itself).
+- A new ClickHouse specific Materialized View materialization contributed by [Rory Sawyer](https://github.com/SoryRawyer).
+This creates a ClickHouse Materialized view using the `TO` form with the name `<model_name>_mv` and the associated target
+table `<model_name>`. It's highly recommended to fully understand how ClickHouse materialized views work before using
+this materialization.
+
 ### Release [1.5.2], 2023-11-28
 #### Bug Fixes
 - The `ON CLUSTER` clause was in the incorrect place for legacy incremental materializations. This has been fixed. Thanks to
diff --git a/README.md b/README.md
index b5c8b8b8..8022f214 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ pip install dbt-clickhouse
 - [x] Table materialization
 - [x] View materialization
 - [x] Incremental materialization
+- [x] Materialized View materializations (uses the `TO` form of MATERIALIZED VIEW, experimental)
 - [x] Seeds
 - [x] Sources
 - [x] Docs generate
@@ -102,16 +103,9 @@ your_profile_name:
 | settings | A map/dictionary of "TABLE" settings to be used to DDL statements like 'CREATE TABLE' with this model | |
 | query_settings | A map/dictionary of ClickHouse user level settings to be used with `INSERT` or `DELETE` statements in conjunction with this model | |
 
-## A Note on Model Settings
-ClickHouse has several types/levels of "settings". In the model configuration above, two types of these are configurable. `settings` means the `SETTINGS`
-clause used in `CREATE TABLE/VIEW` types of DDL statements, so this is generally settings that are specific to the specific ClickHouse table engine. The new
-`query_settings` is use to add a `SETTINGS` clause to the `INSERT` and `DELETE` queries used for model materialization (including incremental materializations).
-There are hundreds of ClickHouse settings, and it's not always clear which is a "table" setting and which is a "user" setting (although the latter are generally
-available in the `system.settings` table.) In general the defaults are recommended, and any use of these properties should be carefully researched and tested.
-
 ## ClickHouse Cluster
 
-`cluster` setting in profile enables dbt-clickhouse to run against a ClickHouse cluster.
+The `cluster` setting in profile enables dbt-clickhouse to run against a ClickHouse cluster.
 
 ### Effective Scope
 
@@ -130,6 +124,15 @@ table and incremental materializations with non-replicated engine will not be af
 If a model has been created without a `cluster` setting, dbt-clickhouse will detect the situation and run all
 DDL/DML without `on cluster` clause for this model.
 
+## A Note on Model Settings
+
+ClickHouse has several types/levels of "settings". In the model configuration above, two types of these are configurable. `settings` means the `SETTINGS`
+clause used in `CREATE TABLE/VIEW` types of DDL statements, so this is generally settings that are specific to the specific ClickHouse table engine.
The new
+`query_settings` is used to add a `SETTINGS` clause to the `INSERT` and `DELETE` queries used for model materialization (including incremental materializations).
+There are hundreds of ClickHouse settings, and it's not always clear which is a "table" setting and which is a "user" setting (although the latter are generally
+available in the `system.settings` table.) In general the defaults are recommended, and any use of these properties should be carefully researched and tested.
+
+
 ## Known Limitations
 
 * Ephemeral models/CTEs don't work if placed before the "INSERT INTO" in a ClickHouse insert statement, see
 https://github.com/ClickHouse/ClickHouse/issues/30323. This
@@ -192,10 +195,10 @@ keys used to populate the parameters of the S3 table function:
 | fmt | The expected ClickHouse input format (such as `TSV` or `CSVWithNames`) of the referenced S3 objects. |
 | structure | The column structure of the data in bucket, as a list of name/datatype pairs, such as `['id UInt32', 'date DateTime', 'value String']` If not provided ClickHouse will infer the structure. |
 | aws_access_key_id | The S3 access key id. |
-| aws_secret_access_key | The S3 secrete key. |
+| aws_secret_access_key | The S3 secret key. |
 | compression | The compression method used with the S3 objects. If not provided ClickHouse will attempt to determine compression based on the file name. |
 
-See the [S3 test file](https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/test_s3.py) for examples of how to use this macro.
+See the [S3 test file](https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/clickhouse/test_clickhouse_s3.py) for examples of how to use this macro.
 
 # Contracts and Constraints
 
@@ -203,6 +206,14 @@ Only exact column type contracts are supported. For example, a contract with a
 ClickHouse also supports _only_ `CHECK` constraints on the entire table/model. Primary key, foreign key, unique, and column level CHECK constraints are not supported. (See ClickHouse documentation on primary/order by keys.)
 
+# Materialized Views (Experimental)
+A `materialized_view` materialization should be a `SELECT` from an existing (source) table. The adapter will create a target table with the model name
+and a ClickHouse MATERIALIZED VIEW with the name `<model_name>_mv`. Unlike PostgreSQL, a ClickHouse materialized view is not "static" (and has
+no corresponding REFRESH operation). Instead, it acts as an "insert trigger", and will insert new rows into the target table using the defined `SELECT`
+"transformation" in the view definition on rows inserted into the source table. See the
+[test file](https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/materialized_view/test_materialized_view.py) for an introductory example
+of how to use this functionality.
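+
+As a minimal sketch (the model name and columns here are hypothetical, mirroring the integration test linked above), a
+materialized view model could look like:
+
+```sql
+{{ config(
+    materialized='materialized_view',
+    engine='MergeTree()',
+    order_by='(id)'
+) }}
+
+-- each insert into the source table is filtered by this SELECT and the
+-- resulting rows are written to the target table backing this model
+select id, name, department
+from {{ source('raw', 'people') }}
+where department = 'engineering'
+```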
+ # Distributed materializations Notes: diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index e8b09c2b..f7c7de21 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.5.2' +version = '1.6.0' diff --git a/dbt/adapters/clickhouse/connections.py b/dbt/adapters/clickhouse/connections.py index c4098649..dcb411f8 100644 --- a/dbt/adapters/clickhouse/connections.py +++ b/dbt/adapters/clickhouse/connections.py @@ -73,7 +73,7 @@ def get_table_from_response(cls, response, column_names) -> agate.Table: return dbt.clients.agate_helper.table_from_data_flat(data, column_names) def execute( - self, sql: str, auto_begin: bool = False, fetch: bool = False + self, sql: str, auto_begin: bool = False, fetch: bool = False, limit: Optional[int] = None ) -> Tuple[AdapterResponse, agate.Table]: # Don't try to fetch result of clustered DDL responses, we don't know what to do with them if fetch and ddl_re.match(sql): diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index ab5567e8..9b8e1ee1 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -169,7 +169,9 @@ def _ensure_database(self, database_engine, cluster_name) -> None: if cluster_name is not None and cluster_name.strip() != '' else '' ) - self.command(f'CREATE DATABASE {self.database}{cluster_clause}{engine_clause}') + self.command( + f'CREATE DATABASE IF NOT EXISTS {self.database}{cluster_clause}{engine_clause}' + ) db_exists = self.command(check_db) if not db_exists: raise FailedToConnectError( diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index f3c66cfd..8ba96d02 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -7,7 +7,7 @@ {%- set target_relation = this.incorporate(type='table') -%} {%- set mv_name = target_relation.name + '_mv' -%} - {%- set target_mv = api.Relation.create(identifier=mv_name, schema=schema, database=database, type='materializedview') -%} + {%- set target_mv = api.Relation.create(identifier=mv_name, schema=schema, database=database, type='materialized_view') -%} {%- set cluster_clause = on_cluster_clause(target_relation) -%} {# look for an existing relation for the target table and create backup relations if necessary #} diff --git a/dev_requirements.txt b/dev_requirements.txt index 5e1771ce..8906bfec 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -1,16 +1,16 @@ -dbt-core~=1.5.8 +dbt-core~=1.6.9 clickhouse-connect>=0.6.21 clickhouse-driver>=0.2.6 pytest>=7.2.0 pytest-dotenv==0.5.2 -dbt-tests-adapter~=1.5.8 -black==22.3.0 +dbt-tests-adapter~=1.6.9 +black==23.11.0 isort==5.10.1 mypy==0.991 yamllint==1.26.3 flake8==4.0.1 types-requests==2.27.29 -agate~=1.6.3 +agate~=1.7.1 requests~=2.27.1 setuptools~=65.3.0 types-setuptools==67.1.0.0 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 34c3848d..68570715 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.black] line-length = 100 skip-string-normalization = true -target-version = ['py38', 'py39'] +target-version = ['py310', 'py311'] exclude = '(\.eggs|\.git|\.mypy_cache|\.venv|venv|env|_build|build|build|dist|)' [tool.isort] diff --git a/setup.py b/setup.py index 0bb32f68..fb2d5311 100644 --- a/setup.py +++ b/setup.py @@ -25,7 
+25,7 @@ def _dbt_clickhouse_version(): package_version = _dbt_clickhouse_version() description = '''The Clickhouse plugin for dbt (data build tool)''' -dbt_version = '1.5.0' +dbt_version = '1.6.0' dbt_minor = '.'.join(dbt_version.split('.')[0:2]) if not package_version.startswith(dbt_minor): @@ -58,7 +58,7 @@ def _dbt_clickhouse_version(): 'clickhouse-connect>=0.6.21', 'clickhouse-driver>=0.2.6', ], - python_requires=">=3.7", + python_requires=">=3.8", platforms='any', classifiers=[ 'Development Status :: 5 - Production/Stable', diff --git a/tests/integration/adapter/dbt_clone/test_dbt_clone.py b/tests/integration/adapter/dbt_clone/test_dbt_clone.py new file mode 100644 index 00000000..0252a2f7 --- /dev/null +++ b/tests/integration/adapter/dbt_clone/test_dbt_clone.py @@ -0,0 +1,7 @@ +import pytest +from dbt.tests.adapter.dbt_clone.test_dbt_clone import BaseClonePossible + + +@pytest.mark.skip("clone not supported") +class TestBaseClonePossible(BaseClonePossible): + pass diff --git a/tests/integration/adapter/test_materialized_view.py b/tests/integration/adapter/materialized_view/test_materialized_view.py similarity index 96% rename from tests/integration/adapter/test_materialized_view.py rename to tests/integration/adapter/materialized_view/test_materialized_view.py index 23452c40..b5efb018 100644 --- a/tests/integration/adapter/test_materialized_view.py +++ b/tests/integration/adapter/materialized_view/test_materialized_view.py @@ -1,5 +1,6 @@ """ -test materialized view creation +test materialized view creation. This is ClickHouse specific, which has a significantly different implementation +of materialized views from PostgreSQL or Oracle """ import json From 08bbbf97b67e5ef4eb78ad14bba0079f8c7aa5e2 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Mon, 4 Dec 2023 17:29:19 -0700 Subject: [PATCH 28/78] Release 1 6 1 (#217) * Identifier quoting checkpoint * Identifier quoting checkpoint * Fix distributed table local quoting * Fix issues with deduplication settings --- CHANGELOG.md | 12 +++++++++ dbt/adapters/clickhouse/__version__.py | 2 +- dbt/adapters/clickhouse/credentials.py | 2 +- dbt/adapters/clickhouse/dbclient.py | 13 +++++++--- dbt/adapters/clickhouse/impl.py | 13 +--------- dbt/adapters/clickhouse/relation.py | 26 ++++++++++--------- dbt/adapters/clickhouse/util.py | 13 ++++++++++ .../incremental/incremental.sql | 3 ++- .../materializations/materialized_view.sql | 23 ++++++++-------- .../macros/materializations/snapshot.sql | 2 +- .../macros/materializations/table.sql | 6 ++--- .../adapter/aliases/test_aliases.py | 8 +++--- .../test_clickhouse_table_materializations.py | 10 +++---- .../test_materialized_view.py | 6 +++-- tests/unit/{test_adapter.py => test_util.py} | 2 +- 15 files changed, 82 insertions(+), 59 deletions(-) create mode 100644 dbt/adapters/clickhouse/util.py rename tests/unit/{test_adapter.py => test_util.py} (89%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ad6630f..1db44176 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +### Release [1.6.1], 2023-12-04 +#### Bug Fixes +- Identifier quoting was disabled for tables/databases etc. This would cause failures for schemas or tables using reserved words +or containing special characters. This has been fixed and some macros have been updated to correctly handle such identifiers. +Note that there still may be untested edge cases where nonstandard identifiers cause issues, so they are still not recommended. +Closes https://github.com/ClickHouse/dbt-clickhouse/issues/144. 
Thanks to [Alexandru Pisarenco](https://github.com/apisarenco) for the
+report and initial PR!
+- The new `allow_automatic_deduplication` setting was not being correctly propagated to the adapter, so setting it to `True`
+did not have the intended effect. In addition, this setting is now ignored for older ClickHouse versions that
+do not support `CREATE TABLE AS SELECT ... EMPTY`, since the automatic deduplication window is required to allow correct
+inserts in Replicated tables on those older versions. Fixes https://github.com/ClickHouse/dbt-clickhouse/issues/216.
+
 ### Release [1.6.0], 2023-11-30
 #### Improvements
 - Compatible with dbt 1.6.x. Note that the new dbt `clone` feature is not supported, as ClickHouse has no native "light weight"
diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py
index f7c7de21..43239b87 100644
--- a/dbt/adapters/clickhouse/__version__.py
+++ b/dbt/adapters/clickhouse/__version__.py
@@ -1 +1 @@
-version = '1.6.0'
+version = '1.6.1'
diff --git a/dbt/adapters/clickhouse/credentials.py b/dbt/adapters/clickhouse/credentials.py
index cbf65069..d0775c6a 100644
--- a/dbt/adapters/clickhouse/credentials.py
+++ b/dbt/adapters/clickhouse/credentials.py
@@ -33,7 +33,7 @@ class ClickHouseCredentials(Credentials):
     custom_settings: Optional[Dict[str, Any]] = None
     use_lw_deletes: bool = False
     local_suffix: str = 'local'
-    allow_automatic_deduplication = False
+    allow_automatic_deduplication: bool = False
 
     @property
     def type(self):
diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py
index 9b8e1ee1..bf6f3725 100644
--- a/dbt/adapters/clickhouse/dbclient.py
+++ b/dbt/adapters/clickhouse/dbclient.py
@@ -6,6 +6,8 @@
 
 from dbt.adapters.clickhouse.credentials import ClickHouseCredentials
 from dbt.adapters.clickhouse.logger import logger
+from dbt.adapters.clickhouse.query import quote_identifier
+from dbt.adapters.clickhouse.util import compare_versions
 
 LW_DELETE_SETTING = 'allow_experimental_lightweight_delete'
 ND_MUTATION_SETTING = 'allow_nondeterministic_mutations'
@@ -82,7 +84,10 @@ def __init__(self, credentials: ClickHouseCredentials):
             self.close()
             raise ex
         self._model_settings = {}
-        if not credentials.allow_automatic_deduplication:
+        if (
+            not credentials.allow_automatic_deduplication
+            and compare_versions(self._server_version(), '22.7.1.2484') >= 0
+        ):
             self._model_settings[DEDUP_WINDOW_SETTING] = '0'
 
     @abstractmethod
@@ -159,7 +164,7 @@ def _check_lightweight_deletes(self, requested: bool):
     def _ensure_database(self, database_engine, cluster_name) -> None:
         if not self.database:
             return
-        check_db = f'EXISTS DATABASE {self.database}'
+        check_db = f'EXISTS DATABASE {quote_identifier(self.database)}'
         try:
             db_exists = self.command(check_db)
             if not db_exists:
@@ -170,7 +175,7 @@
                 else ''
             )
             self.command(
-                f'CREATE DATABASE IF NOT EXISTS {self.database}{cluster_clause}{engine_clause}'
+                f'CREATE DATABASE IF NOT EXISTS {quote_identifier(self.database)}{cluster_clause}{engine_clause}'
             )
             db_exists = self.command(check_db)
             if not db_exists:
@@ -194,7 +199,7 @@ def _check_atomic_exchange(self) -> bool:
         table_id = str(uuid.uuid1()).replace('-', '')
         swap_tables = [f'__dbt_exchange_test_{x}_{table_id}' for x in range(0, 2)]
         for table in swap_tables:
-            self.command(create_cmd.format(table))
+            self.command(create_cmd.format(quote_identifier(table)))
         try:
             self.command('EXCHANGE TABLES {} AND {}'.format(*swap_tables))
             return True
diff --git
a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index 6cc6055f..bd20fb03 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -23,6 +23,7 @@ from dbt.adapters.clickhouse.logger import logger from dbt.adapters.clickhouse.query import quote_identifier from dbt.adapters.clickhouse.relation import ClickHouseRelation +from dbt.adapters.clickhouse.util import compare_versions GET_CATALOG_MACRO_NAME = 'get_catalog' LIST_SCHEMAS_MACRO_NAME = 'list_schemas' @@ -438,18 +439,6 @@ def test(row: agate.Row) -> bool: return test -def compare_versions(v1: str, v2: str) -> int: - v1_parts = v1.split('.') - v2_parts = v2.split('.') - for part1, part2 in zip(v1_parts, v2_parts): - try: - if int(part1) != int(part2): - return 1 if int(part1) > int(part2) else -1 - except ValueError: - raise DbtRuntimeError("Version must consist of only numbers separated by '.'") - return 0 - - COLUMNS_EQUAL_SQL = ''' SELECT row_count_diff.difference as row_count_difference, diff --git a/dbt/adapters/clickhouse/relation.py b/dbt/adapters/clickhouse/relation.py index 818928d8..cc2865f4 100644 --- a/dbt/adapters/clickhouse/relation.py +++ b/dbt/adapters/clickhouse/relation.py @@ -3,16 +3,18 @@ from dbt.adapters.base.relation import BaseRelation, Policy, Self from dbt.contracts.graph.nodes import ManifestNode, SourceDefinition -from dbt.contracts.relation import HasQuoting +from dbt.contracts.relation import HasQuoting, Path, RelationType from dbt.exceptions import DbtRuntimeError from dbt.utils import deep_merge, merge +from dbt.adapters.clickhouse.query import quote_identifier + @dataclass class ClickHouseQuotePolicy(Policy): - database: bool = False - schema: bool = False - identifier: bool = False + database: bool = True + schema: bool = True + identifier: bool = True @dataclass @@ -26,7 +28,7 @@ class ClickHouseIncludePolicy(Policy): class ClickHouseRelation(BaseRelation): quote_policy: Policy = field(default_factory=lambda: ClickHouseQuotePolicy()) include_policy: Policy = field(default_factory=lambda: ClickHouseIncludePolicy()) - quote_character: str = '' + quote_character: str = '`' can_exchange: bool = False can_on_cluster: bool = False @@ -35,13 +37,13 @@ def __post_init__(self): raise DbtRuntimeError(f'Cannot set database {self.database} in clickhouse!') self.path.database = '' - def render(self): - if self.include_policy.database and self.include_policy.schema: - raise DbtRuntimeError( - 'Got a clickhouse relation with schema and database set to ' - 'include, but only one can be set' - ) - return super().render() + def render(self) -> str: + return ".".join(quote_identifier(part) for _, part in self._render_iterator() if part) + + def derivative(self, suffix: str, relation_type: Optional[str] = None) -> BaseRelation: + path = Path(schema=self.path.schema, database='', identifier=self.path.identifier + suffix) + derivative_type = RelationType[relation_type] if relation_type else self.type + return ClickHouseRelation(type=derivative_type, path=path) def matches( self, diff --git a/dbt/adapters/clickhouse/util.py b/dbt/adapters/clickhouse/util.py new file mode 100644 index 00000000..bfe7d239 --- /dev/null +++ b/dbt/adapters/clickhouse/util.py @@ -0,0 +1,13 @@ +from dbt.exceptions import DbtRuntimeError + + +def compare_versions(v1: str, v2: str) -> int: + v1_parts = v1.split('.') + v2_parts = v2.split('.') + for part1, part2 in zip(v1_parts, v2_parts): + try: + if int(part1) != int(part2): + return 1 if int(part1) > int(part2) else -1 + except ValueError: + raise 
DbtRuntimeError("Version must consist of only numbers separated by '.'") + return 0 diff --git a/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql index ca15991b..7ab105d7 100644 --- a/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql +++ b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql @@ -218,7 +218,8 @@ {% call statement('delete_existing_data') %} {% if is_distributed %} - delete from {{ existing_relation }}{{ adapter.get_clickhouse_local_suffix() }} {{ on_cluster_clause(existing_relation) }} where ({{ unique_key }}) in (select {{ unique_key }} + {%- set existing_local = existing_relation.derivative(adapter.get_clickhouse_local_suffix()) %} + delete from {{ existing_local }} {{ on_cluster_clause(existing_relation) }} where ({{ unique_key }}) in (select {{ unique_key }} from {{ inserting_relation }}) {% else %} delete from {{ existing_relation }} where ({{ unique_key }}) in (select {{ unique_key }} diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 8ba96d02..293cc41b 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -6,8 +6,7 @@ {%- materialization materialized_view, adapter='clickhouse' -%} {%- set target_relation = this.incorporate(type='table') -%} - {%- set mv_name = target_relation.name + '_mv' -%} - {%- set target_mv = api.Relation.create(identifier=mv_name, schema=schema, database=database, type='materialized_view') -%} + {%- set mv_relation = target_relation.derivative('_mv', 'MaterializedView') -%} {%- set cluster_clause = on_cluster_clause(target_relation) -%} {# look for an existing relation for the target table and create backup relations if necessary #} @@ -44,14 +43,14 @@ {% elif existing_relation.can_exchange %} {{ log('Replacing existing materialized view' + target_relation.name) }} {% call statement('drop existing materialized view') %} - drop view if exists {{ mv_name }} {{ cluster_clause }} + drop view if exists {{ mv_relation }} {{ cluster_clause }} {% endcall %} {% call statement('main') -%} {{ get_create_table_as_sql(False, backup_relation, sql) }} {%- endcall %} {% do exchange_tables_atomic(backup_relation, existing_relation) %} {% call statement('create new materialized view') %} - {{ clickhouse__create_mv_sql(mv_name, existing_relation.name, cluster_clause, sql) }} + {{ clickhouse__create_mv_sql(mv_relation, existing_relation.name, cluster_clause, sql) }} {% endcall %} {% else %} {{ log('Replacing existing materialized view' + target_relation.name) }} @@ -72,7 +71,7 @@ {{ run_hooks(post_hooks, inside_transaction=False) }} - {{ return({'relations': [target_relation, target_mv]}) }} + {{ return({'relations': [target_relation, mv_relation]}) }} {%- endmaterialization -%} @@ -88,13 +87,13 @@ {{ get_create_table_as_sql(False, relation, sql) }} {% endcall %} {%- set cluster_clause = on_cluster_clause(relation) -%} - {%- set mv_name = relation.name + '_mv' -%} - {{ clickhouse__create_mv_sql(mv_name, relation.name, cluster_clause, sql) }} + {%- set mv_relation = relation.derivative('_mv', 'MaterializedView') -%} + {{ clickhouse__create_mv_sql(mv_relation, relation, cluster_clause, sql) }} {%- endmacro %} -{% macro clickhouse__create_mv_sql(relation_name, target_table, cluster_clause, sql) -%} - create 
materialized view if not exists {{ relation_name }} {{ cluster_clause }} +{% macro clickhouse__create_mv_sql(mv_relation, target_table, cluster_clause, sql) -%} + create materialized view if not exists {{ mv_relation }} {{ cluster_clause }} to {{ target_table }} as {{ sql }} {%- endmacro %} @@ -103,9 +102,9 @@ {% macro clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql) %} {# drop existing materialized view while we recreate the target table #} {%- set cluster_clause = on_cluster_clause(target_relation) -%} - {%- set mv_name = target_relation.name + '_mv' -%} + {%- set mv_relation = target_relation.derivative('_mv', 'MaterializedView') -%} {% call statement('drop existing mv') -%} - drop view if exists {{ mv_name }} {{ cluster_clause }} + drop view if exists {{ mv_relation }} {{ cluster_clause }} {%- endcall %} {# recreate the target table #} @@ -116,5 +115,5 @@ {{ adapter.rename_relation(intermediate_relation, target_relation) }} {# now that the target table is recreated, we can finally create our new view #} - {{ clickhouse__create_mv_sql(mv_name, target_relation.name, cluster_clause, sql) }} + {{ clickhouse__create_mv_sql(mv_relation, target_relation, cluster_clause, sql) }} {% endmacro %} diff --git a/dbt/include/clickhouse/macros/materializations/snapshot.sql b/dbt/include/clickhouse/macros/materializations/snapshot.sql index 2a317736..71e5acc5 100644 --- a/dbt/include/clickhouse/macros/materializations/snapshot.sql +++ b/dbt/include/clickhouse/macros/materializations/snapshot.sql @@ -25,7 +25,7 @@ {%- set insert_cols_csv = insert_cols | join(', ') -%} {%- set valid_to_col = adapter.quote('dbt_valid_to') -%} - {%- set upsert = target ~ '__snapshot_upsert' -%} + {%- set upsert = target.derivative('__snapshot_upsert') -%} {% call statement('create_upsert_relation') %} create table if not exists {{ upsert }} as {{ target }} {% endcall %} diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 22537f5c..72cc72c8 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -141,7 +141,7 @@ {% call statement('create_table_empty') %} {{ create_table }} {% endcall %} - {{ clickhouse__insert_into(relation.include(database=False), sql, has_contract) }} + {{ clickhouse__insert_into(relation, sql, has_contract) }} {%- endif %} {%- endmacro %} @@ -151,7 +151,7 @@ {{ sql_header if sql_header is not none }} {% if temporary -%} - create temporary table {{ relation.name }} + create temporary table {{ relation }} engine Memory {{ order_cols(label="order by") }} {{ partition_cols(label="partition by") }} @@ -160,7 +160,7 @@ {{ sql }} ) {%- else %} - create table {{ relation.include(database=False) }} + create table {{ relation }} {{ on_cluster_clause(relation)}} {%- if has_contract%} {{ get_assert_columns_equivalent(sql) }} diff --git a/tests/integration/adapter/aliases/test_aliases.py b/tests/integration/adapter/aliases/test_aliases.py index 30575aa8..a9a3d585 100644 --- a/tests/integration/adapter/aliases/test_aliases.py +++ b/tests/integration/adapter/aliases/test_aliases.py @@ -83,17 +83,17 @@ def test_alias_model_name(self, project): assert len(results) == 4 cluster = project.test_config['cluster'] - relation = relation_from_name(project.adapter, "foo") + local_relation = relation_from_name(project.adapter, "foo_local") result = project.run_sql( - f"select max(tablename) AS tablename From 
clusterAllReplicas('{cluster}', {relation}_local) ", + f"select max(tablename) AS tablename From clusterAllReplicas('{cluster}', {local_relation}) ", fetch="one", ) assert result[0] == "foo" - relation = relation_from_name(project.adapter, "ref_foo_alias") + local_relation = relation_from_name(project.adapter, "ref_foo_alias_local") result = project.run_sql( - f"select max(tablename) AS tablename From clusterAllReplicas('{cluster}', {relation}_local) ", + f"select max(tablename) AS tablename From clusterAllReplicas('{cluster}', {local_relation}) ", fetch="one", ) assert result[0] == "ref_foo_alias" diff --git a/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py b/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py index c7f20e00..ff6e2efb 100644 --- a/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py +++ b/tests/integration/adapter/clickhouse/test_clickhouse_table_materializations.py @@ -77,23 +77,23 @@ def seeds(self): } def assert_total_count_correct(self, project): - '''Check if data is properly distributed''' + # Check if data is properly distributed cluster = project.test_config['cluster'] - table_relation = relation_from_name(project.adapter, "distributed") + table_relation = relation_from_name(project.adapter, "distributed_local") cluster_info = project.run_sql( f"select shard_num,max(host_name) as host_name, count(distinct replica_num) as replica_counts " f"from system.clusters where cluster='{cluster}' group by shard_num", fetch="all", ) sum_count = project.run_sql( - f"select count() From clusterAllReplicas('{cluster}',{table_relation}_local)", + f"select count() From clusterAllReplicas('{cluster}',{table_relation})", fetch="one", ) total_count = 0 # total count should be equal to sum(count of each shard * replica_counts) for shard_num, host_name, replica_counts in cluster_info: count = project.run_sql( - f"select count() From remote('{host_name}',{table_relation}_local)", + f"select count() From remote('{host_name}',{table_relation})", fetch="one", ) total_count += count[0] * replica_counts @@ -103,7 +103,7 @@ def assert_total_count_correct(self, project): os.environ.get('DBT_CH_TEST_CLUSTER', '').strip() == '', reason='Not on a cluster' ) def test_base(self, project): - # cluster setting must exists + # cluster setting must exist cluster = project.test_config['cluster'] assert cluster diff --git a/tests/integration/adapter/materialized_view/test_materialized_view.py b/tests/integration/adapter/materialized_view/test_materialized_view.py index b5efb018..9305d064 100644 --- a/tests/integration/adapter/materialized_view/test_materialized_view.py +++ b/tests/integration/adapter/materialized_view/test_materialized_view.py @@ -8,6 +8,8 @@ import pytest from dbt.tests.util import check_relation_types, run_dbt +from dbt.adapters.clickhouse.query import quote_identifier + PEOPLE_SEED_CSV = """ id,name,age,department 1231,Dade,33,engineering @@ -116,7 +118,7 @@ def test_create(self, project): # insert some data and make sure it reaches the target table project.run_sql( f""" - insert into {project.test_schema}.people ("id", "name", "age", "department") + insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department") values (1232,'Dade',16,'engineering'), (9999,'eugene',40,'malware'); """ ) @@ -153,7 +155,7 @@ def test_update(self, project): project.run_sql( f""" - insert into {project.test_schema}.people ("id", "name", "age", "department") + insert into 
{quote_identifier(project.test_schema)}.people ("id", "name", "age", "department") values (1232,'Dade',11,'engineering'), (9999,'eugene',40,'malware'); """ ) diff --git a/tests/unit/test_adapter.py b/tests/unit/test_util.py similarity index 89% rename from tests/unit/test_adapter.py rename to tests/unit/test_util.py index 0faf9dbe..d87d2e57 100644 --- a/tests/unit/test_adapter.py +++ b/tests/unit/test_util.py @@ -1,4 +1,4 @@ -from dbt.adapters.clickhouse.impl import compare_versions +from dbt.adapters.clickhouse.util import compare_versions def test_is_before_version(): From e6e74e494ae09f6ff56ff106f166933e2114bb34 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Wed, 6 Dec 2023 13:39:46 -0700 Subject: [PATCH 29/78] Release 1 6 2 (#219) * Limited fix to completely broken `on_schema_change` * Tweak changelog --- CHANGELOG.md | 12 ++++ dbt/adapters/clickhouse/__version__.py | 2 +- dbt/adapters/clickhouse/errors.py | 24 +++++++ dbt/adapters/clickhouse/impl.py | 40 ++++++++++- dbt/adapters/clickhouse/util.py | 8 +++ .../incremental/incremental.sql | 68 ++++++++---------- .../adapter/basic/test_incremental.py | 2 +- .../adapter/incremental/test_schema_change.py | 71 +++++++++++++++++++ 8 files changed, 185 insertions(+), 42 deletions(-) create mode 100644 dbt/adapters/clickhouse/errors.py create mode 100644 tests/integration/adapter/incremental/test_schema_change.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1db44176..ec8ddc32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +### Release [1.6.2], 2023-12-06 +#### Bug Fix +- The dbt `on_schema_change` configuration value for incremental models was effectively being ignored. This has been fixed +with a very limited implementation. Closes https://github.com/ClickHouse/dbt-clickhouse/issues/199. Because of the way that +ORDER BY/SORT BY/PARTITION BY/PRIMARY KEYS work in ClickHouse, plus the complexities of correctly transforming ClickHouse data types, +`sync_all_columns` is not currently supported (although an implementation that works for non-key columns is theoretically possible, +such an enhancement is not currently planned). Accordingly, only `ignore`, `fail`, and `append_new_columns` values are supported +for `on_schema_change`. It is also not currently supported for Distributed tables. + +Note that actually appending new columns requires a fallback to the `legacy` incremental strategy, which is quite inefficient, +so while theoretically possible, using `append_new_columns` is not recommended except for very small data volumes. + ### Release [1.6.1], 2023-12-04 #### Bug Fixes - Identifier quoting was disabled for tables/databases etc. This would cause failures for schemas or tables using reserved words diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index 43239b87..5ccf9d1c 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.6.1' +version = '1.6.2' diff --git a/dbt/adapters/clickhouse/errors.py b/dbt/adapters/clickhouse/errors.py new file mode 100644 index 00000000..1d3b5c69 --- /dev/null +++ b/dbt/adapters/clickhouse/errors.py @@ -0,0 +1,24 @@ +schema_change_fail_error = """ +The source and target schemas on this incremental model are out of sync. + They can be reconciled in several ways: + - set the `on_schema_change` config to `append_new_columns`. (ClickHouse does not support `sync_all_columns`) + - Re-run the incremental model with `full_refresh: True` to update the target schema. 
+      - update the schema manually and re-run the process.
+
+    Additional troubleshooting context:
+       Source columns not in target: {0}
+       Target columns not in source: {1}
+       New column types: {2}
+"""
+
+schema_change_datatype_error = """
+The source and target schemas on this incremental model contain different data types. This is not supported.
+
+    Changed column types: {0}
+"""
+
+schema_change_missing_source_error = """
+The target schema on this incremental model contains a column not in the source schema. This is not supported.
+
+    Source columns not in target: {0}
+"""
diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py
index bd20fb03..ca0c3a44 100644
--- a/dbt/adapters/clickhouse/impl.py
+++ b/dbt/adapters/clickhouse/impl.py
@@ -20,10 +20,15 @@
 from dbt.adapters.clickhouse.cache import ClickHouseRelationsCache
 from dbt.adapters.clickhouse.column import ClickHouseColumn
 from dbt.adapters.clickhouse.connections import ClickHouseConnectionManager
+from dbt.adapters.clickhouse.errors import (
+    schema_change_datatype_error,
+    schema_change_fail_error,
+    schema_change_missing_source_error,
+)
 from dbt.adapters.clickhouse.logger import logger
 from dbt.adapters.clickhouse.query import quote_identifier
 from dbt.adapters.clickhouse.relation import ClickHouseRelation
-from dbt.adapters.clickhouse.util import compare_versions
+from dbt.adapters.clickhouse.util import NewColumnDataType, compare_versions
 
 GET_CATALOG_MACRO_NAME = 'get_catalog'
 LIST_SCHEMAS_MACRO_NAME = 'list_schemas'
@@ -151,6 +156,39 @@ def calculate_incremental_strategy(self, strategy: str) -> str:
             strategy = 'legacy'
         return strategy
 
+    @available.parse_none
+    def check_incremental_schema_changes(
+        self, on_schema_change, existing, target_sql
+    ) -> List[ClickHouseColumn]:
+        if on_schema_change not in ('fail', 'ignore', 'append_new_columns'):
+            raise DbtRuntimeError(
+                "Only `fail`, `ignore`, and `append_new_columns` supported for `on_schema_change`"
+            )
+        source = self.get_columns_in_relation(existing)
+        source_map = {column.name: column for column in source}
+        target = self.get_column_schema_from_query(target_sql)
+        target_map = {column.name: column for column in target}
+        source_not_in_target = [column for column in source if column.name not in target_map.keys()]
+        target_not_in_source = [column for column in target if column.name not in source_map.keys()]
+        new_column_data_types = []
+        for target_column in target:
+            source_column = source_map.get(target_column.name)
+            if source_column and source_column.dtype != target_column.dtype:
+                new_column_data_types.append(
+                    NewColumnDataType(source_column.name, target_column.dtype)
+                )
+        if new_column_data_types:
+            raise DbtRuntimeError(schema_change_datatype_error.format(new_column_data_types))
+        if source_not_in_target:
+            raise DbtRuntimeError(schema_change_missing_source_error.format(source_not_in_target))
+        if target_not_in_source and on_schema_change == 'fail':
+            raise DbtRuntimeError(
+                schema_change_fail_error.format(
+                    source_not_in_target, target_not_in_source, new_column_data_types
+                )
+            )
+        return target_not_in_source
+
     @available.parse_none
     def s3source_clause(
         self,
diff --git a/dbt/adapters/clickhouse/util.py b/dbt/adapters/clickhouse/util.py
index bfe7d239..7114dbde 100644
--- a/dbt/adapters/clickhouse/util.py
+++ b/dbt/adapters/clickhouse/util.py
@@ -1,3 +1,5 @@
+from dataclasses import dataclass
+
 from dbt.exceptions import DbtRuntimeError
 
 
@@ -11,3 +13,9 @@ def compare_versions(v1: str, v2: str) -> int:
         except ValueError:
             raise
DbtRuntimeError("Version must consist of only numbers separated by '.'") return 0 + + +@dataclass +class NewColumnDataType: + column_name: str + new_type: str diff --git a/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql index 7ab105d7..742642d2 100644 --- a/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql +++ b/dbt/include/clickhouse/macros/materializations/incremental/incremental.sql @@ -50,21 +50,23 @@ {% endcall %} {% else %} - {% set schema_changes = none %} + {% set column_changes = none %} {% set incremental_strategy = adapter.calculate_incremental_strategy(config.get('incremental_strategy')) %} {% set incremental_predicates = config.get('predicates', none) or config.get('incremental_predicates', none) %} - {% if on_schema_change != 'ignore' %} - {%- set schema_changes = check_for_schema_changes(existing_relation, target_relation) -%} - {% if schema_changes['schema_changed'] and incremental_strategy in ('append', 'delete_insert') %} - {% set incremental_strategy = 'legacy' %} - {% do log('Schema changes detected, switching to legacy incremental strategy') %} + {%- if on_schema_change != 'ignore' %} + {%- set column_changes = adapter.check_incremental_schema_changes(on_schema_change, existing_relation, sql) -%} + {%- if column_changes %} + {%- if incremental_strategy in ('append', 'delete_insert') %} + {% set incremental_strategy = 'legacy' %} + {{ log('Schema changes detected, switching to legacy incremental strategy') }} + {%- endif %} {% endif %} {% endif %} {% if incremental_strategy != 'delete_insert' and incremental_predicates %} {% do exceptions.raise_compiler_error('Cannot apply incremental predicates with ' + incremental_strategy + ' strategy.') %} {% endif %} {% if incremental_strategy == 'legacy' %} - {% do clickhouse__incremental_legacy(existing_relation, intermediate_relation, schema_changes, unique_key) %} + {% do clickhouse__incremental_legacy(existing_relation, intermediate_relation, column_changes, unique_key) %} {% set need_swap = true %} {% elif incremental_strategy == 'delete_insert' %} {% do clickhouse__incremental_delete_insert(existing_relation, unique_key, incremental_predicates) %} @@ -109,32 +111,7 @@ {%- endmaterialization %} - -{% macro process_schema_changes(on_schema_change, source_relation, target_relation) %} - - {%- set schema_changes_dict = check_for_schema_changes(source_relation, target_relation) -%} - {% if not schema_changes_dict['schema_changed'] %} - {{ return }} - {% endif %} - - {% if on_schema_change == 'fail' %} - {% set fail_msg %} - The source and target schemas on this incremental model are out of sync! - They can be reconciled in several ways: - - set the `on_schema_change` config to either append_new_columns or sync_all_columns, depending on your situation. - - Re-run the incremental model with `full_refresh: True` to update the target schema. - - update the schema manually and re-run the process. 
- {% endset %} - {% do exceptions.raise_compiler_error(fail_msg) %} - {{ return }} - {% endif %} - - {% do sync_column_schemas(on_schema_change, target_relation, schema_changes_dict) %} - -{% endmacro %} - - -{% macro clickhouse__incremental_legacy(existing_relation, intermediate_relation, on_schema_change, unique_key, is_distributed=False) %} +{% macro clickhouse__incremental_legacy(existing_relation, intermediate_relation, column_changes, unique_key, is_distributed=False) %} {% set new_data_relation = existing_relation.incorporate(path={"identifier": existing_relation.identifier + '__dbt_new_data'}) %} {{ drop_relation_if_exists(new_data_relation) }} @@ -143,10 +120,17 @@ -- First create a temporary table for all of the new data {% if is_distributed %} + {% if column_changes %} + {% do exceptions.raise_compiler_error('Schema changes not supported with Distributed tables ') %} + {% endif %} -- Need to use distributed table to have data on all shards {%- set distributed_new_data_relation = existing_relation.incorporate(path={"identifier": existing_relation.identifier + '__dbt_distributed_new_data'}) -%} {%- set inserting_relation = distributed_new_data_relation -%} {{ create_distributed_local_table(distributed_new_data_relation, new_data_relation, existing_relation, sql) }} + {% elif column_changes %} + {% call statement('create_new_data_temp') %} + {{ get_create_table_as_sql(False, new_data_relation, sql) }} + {% endcall %} {% else %} {% call statement('create_new_data_temp') %} {{ get_create_table_as_sql(False, new_data_relation, sql) }} @@ -168,11 +152,11 @@ -- Insert all the existing rows into the new temporary table, ignoring any rows that have keys in the "new data" -- table. - {%- set dest_columns = adapter.get_columns_in_relation(existing_relation) -%} - {%- set dest_cols_csv = dest_columns | map(attribute='quoted') | join(', ') -%} + {%- set source_columns = adapter.get_columns_in_relation(existing_relation) -%} + {%- set source_columns_csv = source_columns | map(attribute='quoted') | join(', ') -%} {% call statement('insert_existing_data') %} - insert into {{ inserted_relation }} ({{ dest_cols_csv }}) - select {{ dest_cols_csv }} + insert into {{ inserted_relation }} ({{ source_columns_csv }}) + select {{ source_columns_csv }} from {{ existing_relation }} where ({{ unique_key }}) not in ( select {{ unique_key }} @@ -182,9 +166,15 @@ {% endcall %} -- Insert all of the new data into the temporary table + {% if column_changes %} + {%- set dest_columns = adapter.get_columns_in_relation(new_data_relation) -%} + {%- set dest_columns_csv = dest_columns | map(attribute='quoted') | join(', ') -%} + {% else %} + {%- set dest_columns_csv = source_columns_csv %} + {% endif %} {% call statement('insert_new_data') %} - insert into {{ inserted_relation }} ({{ dest_cols_csv }}) - select {{ dest_cols_csv }} + insert into {{ inserted_relation }} ({{ dest_columns_csv }}) + select {{ dest_columns_csv }} from {{ inserting_relation }} {{ adapter.get_model_query_settings(model) }} {% endcall %} diff --git a/tests/integration/adapter/basic/test_incremental.py b/tests/integration/adapter/basic/test_incremental.py index 3cc4cce9..c50d477a 100644 --- a/tests/integration/adapter/basic/test_incremental.py +++ b/tests/integration/adapter/basic/test_incremental.py @@ -7,7 +7,7 @@ class TestIncremental(BaseIncremental): incremental_not_schema_change_sql = """ -{{ config(materialized="incremental", unique_key="user_id_current_time",on_schema_change="sync_all_columns") }} +{{ config(materialized="incremental", 
unique_key="user_id_current_time",on_schema_change="append_new_columns") }} select toString(1) || '-' || toString(now64()) as user_id_current_time, {% if is_incremental() %} diff --git a/tests/integration/adapter/incremental/test_schema_change.py b/tests/integration/adapter/incremental/test_schema_change.py new file mode 100644 index 00000000..9bccaf4e --- /dev/null +++ b/tests/integration/adapter/incremental/test_schema_change.py @@ -0,0 +1,71 @@ +import pytest +from dbt.tests.util import run_dbt, run_dbt_and_capture + +schema_change_sql = """ +{{ + config( + materialized='incremental', + unique_key='col_1', + on_schema_change='%schema_change%' + ) +}} + +{% if not is_incremental() %} +select + number as col_1, + number + 1 as col_2 +from numbers(3) +{% else %} +select + number as col_1, + number + 1 as col_2, + number + 2 as col_3 +from numbers(2, 3) +{% endif %} +""" + + +class TestOnSchemaChange: + @pytest.fixture(scope="class") + def models(self): + return { + "schema_change_ignore.sql": schema_change_sql.replace("%schema_change%", "ignore"), + "schema_change_fail.sql": schema_change_sql.replace("%schema_change%", "fail"), + "schema_change_append.sql": schema_change_sql.replace( + "%schema_change%", "append_new_columns" + ), + } + + def test_ignore(self, project): + run_dbt(["run", "--select", "schema_change_ignore"]) + result = project.run_sql("select * from schema_change_ignore order by col_1", fetch="all") + assert len(result) == 3 + assert result[0][1] == 1 + run_dbt(["run", "--select", "schema_change_ignore"]) + result = project.run_sql("select * from schema_change_ignore", fetch="all") + assert len(result) == 5 + + def test_fail(self, project): + run_dbt(["run", "--select", "schema_change_fail"]) + result = project.run_sql("select * from schema_change_fail order by col_1", fetch="all") + assert len(result) == 3 + assert result[0][1] == 1 + _, log_output = run_dbt_and_capture( + [ + "run", + "--select", + "schema_change_fail", + ], + expect_pass=False, + ) + assert 'out of sync' in log_output.lower() + + def test_append(self, project): + run_dbt(["run", "--select", "schema_change_append"]) + result = project.run_sql("select * from schema_change_append order by col_1", fetch="all") + assert len(result) == 3 + assert result[0][1] == 1 + run_dbt(["--debug", "run", "--select", "schema_change_append"]) + result = project.run_sql("select * from schema_change_append order by col_1", fetch="all") + assert result[0][2] == 0 + assert result[3][2] == 5 From 2e72a0063934d3b5fa238d62fbfaa4fe87784057 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Wed, 6 Dec 2023 21:11:29 -0700 Subject: [PATCH 30/78] Release 1 7 0 (#220) * Initial dependency updates for 1.7.x * Initial dependency updates for 1.7.x --- CHANGELOG.md | 7 +++ dbt/adapters/clickhouse/__version__.py | 2 +- dbt/adapters/clickhouse/impl.py | 72 +++++++++++++------------- dev_requirements.txt | 6 +-- setup.py | 5 +- 5 files changed, 50 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ec8ddc32..d37c1f4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +### Release [1.7.0], 2023-12-07 +#### Improvements +- Minimal compatibility with dbt 1.7.x. The date_spine macro and additional automated tests have not been implemented, +but are planned for a future patch release. 
+- DBT 1.7 introduces a (complex) optimization mechanism for retrieving a dbt catalog which is overkill for ClickHouse +(which has no separate schema/database level), so this release includes some internal catalog changes to simplify that process. + ### Release [1.6.2], 2023-12-06 #### Bug Fix - The dbt `on_schema_change` configuration value for incremental models was effectively being ignored. This has been fixed diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index 5ccf9d1c..95381e51 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.6.2' +version = '1.7.0' diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index ca0c3a44..f5b9b0cf 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -1,22 +1,23 @@ import csv import io -from concurrent.futures import Future from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Set, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union import agate from dbt.adapters.base import AdapterConfig, available -from dbt.adapters.base.impl import BaseAdapter, ConstraintSupport, catch_as_completed +from dbt.adapters.base.impl import BaseAdapter, ConstraintSupport from dbt.adapters.base.relation import BaseRelation, InformationSchema +from dbt.adapters.capability import Capability, CapabilityDict, CapabilitySupport, Support from dbt.adapters.sql import SQLAdapter from dbt.contracts.graph.manifest import Manifest from dbt.contracts.graph.nodes import ConstraintType, ModelLevelConstraint -from dbt.contracts.relation import RelationType +from dbt.contracts.relation import Path, RelationType from dbt.events.functions import warn_or_error from dbt.events.types import ConstraintNotSupported from dbt.exceptions import DbtInternalError, DbtRuntimeError, NotImplementedError -from dbt.utils import executor, filter_null_values +from dbt.utils import filter_null_values +import dbt from dbt.adapters.clickhouse.cache import ClickHouseRelationsCache from dbt.adapters.clickhouse.column import ClickHouseColumn from dbt.adapters.clickhouse.connections import ClickHouseConnectionManager @@ -56,6 +57,13 @@ class ClickHouseAdapter(SQLAdapter): ConstraintType.foreign_key: ConstraintSupport.NOT_SUPPORTED, } + _capabilities: CapabilityDict = CapabilityDict( + { + Capability.SchemaMetadataByRelations: CapabilitySupport(support=Support.Unsupported), + Capability.TableLastModifiedMetadata: CapabilitySupport(support=Support.Unsupported), + } + ) + def __init__(self, config): BaseAdapter.__init__(self, config) self.cache = ClickHouseRelationsCache() @@ -295,37 +303,29 @@ def get_ch_database(self, schema: str): except DbtRuntimeError: return None - def get_catalog(self, manifest): - schema_map = self._get_catalog_schemas(manifest) - - with executor(self.config) as tpe: - futures: List[Future[agate.Table]] = [] - for info, schemas in schema_map.items(): - for schema in schemas: - futures.append( - tpe.submit_connected( - self, - schema, - self._get_one_catalog, - info, - [schema], - manifest, - ) - ) - catalogs, exceptions = catch_as_completed(futures) - return catalogs, exceptions - - def _get_one_catalog( - self, - information_schema: InformationSchema, - schemas: Set[str], - manifest: Manifest, - ) -> agate.Table: - if len(schemas) != 1: - raise DbtRuntimeError( - f"Expected only one schema in clickhouse _get_one_catalog, found ' f'{schemas}'" - ) - return 
super()._get_one_catalog(information_schema, schemas, manifest)
+    def get_catalog(self, manifest) -> Tuple[agate.Table, List[Exception]]:
+        relations = self._get_catalog_relations(manifest)
+        schemas = set(relation.schema for relation in relations)
+        if schemas:
+            catalog = self._get_one_catalog(InformationSchema(Path()), schemas, manifest)
+        else:
+            catalog = dbt.clients.agate_helper.empty_table()
+        return catalog, []
+
+    def get_filtered_catalog(
+        self, manifest: Manifest, relations: Optional[Set[BaseRelation]] = None
+    ):
+        catalog, exceptions = self.get_catalog(manifest)
+        if relations and catalog:
+            relation_map = {(r.schema, r.identifier) for r in relations}
+
+            def in_map(row: agate.Row):
+                s = _expect_row_value("table_schema", row)
+                i = _expect_row_value("table_name", row)
+                return (s, i) in relation_map
+
+            catalog = catalog.where(in_map)
+        return catalog, exceptions

     def get_rows_different_sql(
         self,
diff --git a/dev_requirements.txt b/dev_requirements.txt
index 8906bfec..fcadbaaf 100644
--- a/dev_requirements.txt
+++ b/dev_requirements.txt
@@ -1,9 +1,9 @@
-dbt-core~=1.6.9
-clickhouse-connect>=0.6.21
+dbt-core~=1.7.3
+clickhouse-connect>=0.6.22
 clickhouse-driver>=0.2.6
 pytest>=7.2.0
 pytest-dotenv==0.5.2
-dbt-tests-adapter~=1.6.9
+dbt-tests-adapter~=1.7.3
 black==23.11.0
 isort==5.10.1
 mypy==0.991
diff --git a/setup.py b/setup.py
index fb2d5311..7beb9ba9 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ def _dbt_clickhouse_version():
 package_version = _dbt_clickhouse_version()
 description = '''The Clickhouse plugin for dbt (data build tool)'''

-dbt_version = '1.6.0'
+dbt_version = '1.7.0'
 dbt_minor = '.'.join(dbt_version.split('.')[0:2])

 if not package_version.startswith(dbt_minor):
@@ -55,7 +55,7 @@ def _dbt_clickhouse_version():
     },
     install_requires=[
         f'dbt-core~={dbt_version}',
-        'clickhouse-connect>=0.6.21',
+        'clickhouse-connect>=0.6.22',
         'clickhouse-driver>=0.2.6',
     ],
     python_requires=">=3.8",
@@ -70,5 +70,6 @@ def _dbt_clickhouse_version():
         'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
         'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
     ],
 )

From 5ccdad5e80af3d64647a147c01f0e0be33c00485 Mon Sep 17 00:00:00 2001
From: Geoff Genz
Date: Thu, 7 Dec 2023 18:24:24 -0700
Subject: [PATCH 31/78] Correctly warn or error if light weight deletes not available

---
 CHANGELOG.md                            |  9 +++++
 dbt/adapters/clickhouse/__version__.py  |  2 +-
 dbt/adapters/clickhouse/dbclient.py     | 54 ++++++++++++++++---------
 dbt/adapters/clickhouse/errors.py       | 21 ++++++++++
 dbt/adapters/clickhouse/httpclient.py   |  2 +-
 dbt/adapters/clickhouse/nativeclient.py |  4 +-
 6 files changed, 68 insertions(+), 24 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d37c1f4d..64d5d633 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,12 @@
+### Release [1.7.1], TBD
+#### Bug Fix
+- It was possible for incremental models with the delete+insert strategy to fail if ClickHouse "light weight deletes" were
+not enabled or the required setting `allow_nondeterministic_mutations` was not enabled and the user did not have permission
+to apply it. This condition is now detected on startup, and an exception will be thrown if `use_lw_deletes` is configured
+in the profile. Otherwise, a warning will be logged that incremental models will be slower (because such models will
+be downgraded to use the `legacy` incremental strategy).
This should prevent the confusing behavior in +https://github.com/ClickHouse/dbt-clickhouse/issues/197 by throwing an early exception for an unsupported configuration. + ### Release [1.7.0], 2023-12-07 #### Improvements - Minimal compatibility with dbt 1.7.x. The date_spine macro and additional automated tests have not been implemented, diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index 95381e51..1f796f9b 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.7.0' +version = '1.7.1' diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index bf6f3725..c693a82e 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -2,9 +2,15 @@ from abc import ABC, abstractmethod from typing import Dict -from dbt.exceptions import DbtDatabaseError, FailedToConnectError +from dbt.exceptions import DbtConfigError, DbtDatabaseError, FailedToConnectError from dbt.adapters.clickhouse.credentials import ClickHouseCredentials +from dbt.adapters.clickhouse.errors import ( + lw_deletes_not_enabled_error, + lw_deletes_not_enabled_warning, + nd_mutations_not_enabled_error, + nd_mutations_not_enabled_warning, +) from dbt.adapters.clickhouse.logger import logger from dbt.adapters.clickhouse.query import quote_identifier from dbt.adapters.clickhouse.util import compare_versions @@ -131,34 +137,42 @@ def update_model_settings(self, model_settings: Dict[str, str]): model_settings[key] = value def _check_lightweight_deletes(self, requested: bool): - lw_deletes = self.get_ch_setting(LW_DELETE_SETTING) - nd_mutations = self.get_ch_setting(ND_MUTATION_SETTING) + lw_deletes, lw_read_only = self.get_ch_setting(LW_DELETE_SETTING) + nd_mutations, nd_mutations_read_only = self.get_ch_setting(ND_MUTATION_SETTING) if lw_deletes is None or nd_mutations is None: if requested: - logger.warning( - 'use_lw_deletes requested but are not available on this ClickHouse server' - ) + logger.warning(lw_deletes_not_enabled_error) return False, False lw_deletes = int(lw_deletes) > 0 if not lw_deletes: - try: - self.command(f'SET {LW_DELETE_SETTING} = 1') - self._conn_settings[LW_DELETE_SETTING] = '1' - lw_deletes = True - except DbtDatabaseError: - pass + if lw_read_only: + lw_deletes = False + if requested: + raise DbtConfigError(lw_deletes_not_enabled_error) + logger.warning(lw_deletes_not_enabled_warning) + else: + try: + self.command(f'SET {LW_DELETE_SETTING} = 1') + self._conn_settings[LW_DELETE_SETTING] = '1' + lw_deletes = True + except DbtDatabaseError: + logger.warning(lw_deletes_not_enabled_warning) nd_mutations = int(nd_mutations) > 0 if lw_deletes and not nd_mutations: - try: - self.command(f'SET {ND_MUTATION_SETTING} = 1') - self._conn_settings[ND_MUTATION_SETTING] = '1' - nd_mutations = True - except DbtDatabaseError: - pass + if nd_mutations_read_only: + nd_mutations = False + if requested: + raise DbtConfigError(nd_mutations_not_enabled_error) + logger.warning(nd_mutations_not_enabled_warning) + else: + try: + self.command(f'SET {ND_MUTATION_SETTING} = 1') + self._conn_settings[ND_MUTATION_SETTING] = '1' + nd_mutations = True + except DbtDatabaseError: + logger.warning(nd_mutations_not_enabled_warning) if lw_deletes and nd_mutations: return True, requested - if requested: - logger.warning('use_lw_deletes requested but cannot enable on this ClickHouse server') return False, False def _ensure_database(self, database_engine, cluster_name) -> 
None:
diff --git a/dbt/adapters/clickhouse/errors.py b/dbt/adapters/clickhouse/errors.py
index 1d3b5c69..bfcd5f95 100644
--- a/dbt/adapters/clickhouse/errors.py
+++ b/dbt/adapters/clickhouse/errors.py
@@ -22,3 +22,24 @@
 Source columns not in target:
     {0}
 """
+
+lw_deletes_not_enabled_error = """
+Attempting to apply the configuration `use_lw_deletes` to enable the delete+insert incremental strategy, but
+`light weight deletes` are either not available or not enabled on this ClickHouse server.
+"""
+
+lw_deletes_not_enabled_warning = """
+`light weight deletes` are either not available or not enabled on this ClickHouse server. This prevents the use
+of the delete+insert incremental strategy, which may negatively affect performance for incremental models.
+"""
+
+nd_mutations_not_enabled_error = """
+Attempting to apply the configuration `use_lw_deletes` to enable the delete+insert incremental strategy, but
+the required `allow_nondeterministic_mutations` is not enabled and is `read_only` for this user.
+"""
+
+nd_mutations_not_enabled_warning = """
+The setting `allow_nondeterministic_mutations` is not enabled and is `read_only` for this user. This prevents the use
+of `light weight deletes` and therefore the delete+insert incremental strategy. This may negatively affect performance
+for incremental models.
+"""
diff --git a/dbt/adapters/clickhouse/httpclient.py b/dbt/adapters/clickhouse/httpclient.py
index 6e074464..161d1256 100644
--- a/dbt/adapters/clickhouse/httpclient.py
+++ b/dbt/adapters/clickhouse/httpclient.py
@@ -35,7 +35,7 @@ def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]:

     def get_ch_setting(self, setting_name):
         setting = self._client.server_settings.get(setting_name)
-        return setting.value if setting else None
+        return (setting.value, setting.readonly) if setting else (None, 0)

     def database_dropped(self, database: str):
         # This is necessary for the http client to avoid exceptions when ClickHouse doesn't recognize the database
diff --git a/dbt/adapters/clickhouse/nativeclient.py b/dbt/adapters/clickhouse/nativeclient.py
index 6fbff418..d7532ef5 100644
--- a/dbt/adapters/clickhouse/nativeclient.py
+++ b/dbt/adapters/clickhouse/nativeclient.py
@@ -42,12 +42,12 @@ def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]:

     def get_ch_setting(self, setting_name):
         try:
             result = self._client.execute(
-                f"SELECT value FROM system.settings WHERE name = '{setting_name}'"
+                f"SELECT value, readonly FROM system.settings WHERE name = '{setting_name}'"
             )
         except clickhouse_driver.errors.Error as ex:
             logger.warn('Unexpected error retrieving ClickHouse server setting', ex)
             return None
-        return result[0][0] if result else None
+        return (result[0][0], result[0][1]) if result else (None, 0)

     def close(self):
         self._client.disconnect()

From 2d5c675fa8c1312ea2b786b73b230afb603c7045 Mon Sep 17 00:00:00 2001
From: ptemarvelde <45282601+ptemarvelde@users.noreply.github.com>
Date: Wed, 13 Dec 2023 16:52:31 +0100
Subject: [PATCH 32/78] Wrap columns_in_query query in select statement (#222)

* Wrap columns_in_query query in select statement

* formatting
---
 dbt/adapters/clickhouse/httpclient.py   | 2 +-
 dbt/adapters/clickhouse/nativeclient.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/dbt/adapters/clickhouse/httpclient.py b/dbt/adapters/clickhouse/httpclient.py
index 161d1256..17795e44 100644
--- a/dbt/adapters/clickhouse/httpclient.py
+++ b/dbt/adapters/clickhouse/httpclient.py
@@ -25,7 +25,7 @@ def
columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]:
     try:
-            query_result = self._client.query(f'{sql} LIMIT 0', **kwargs)
+            query_result = self._client.query(f"SELECT * FROM ({sql}) LIMIT 0", **kwargs)
             return [
                 ClickHouseColumn.create(name, ch_type.name)
                 for name, ch_type in zip(query_result.column_names, query_result.column_types)
diff --git a/dbt/adapters/clickhouse/nativeclient.py b/dbt/adapters/clickhouse/nativeclient.py
index d7532ef5..aaec97f9 100644
--- a/dbt/adapters/clickhouse/nativeclient.py
+++ b/dbt/adapters/clickhouse/nativeclient.py
@@ -34,7 +34,9 @@ def command(self, sql, **kwargs):

     def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]:
         try:
-            _, columns = self._client.execute(f'{sql} LIMIT 0', with_column_types=True)
+            _, columns = self._client.execute(
+                f"SELECT * FROM ({sql}) LIMIT 0", with_column_types=True
+            )
             return [ClickHouseColumn.create(column[0], column[1]) for column in columns]
         except clickhouse_driver.errors.Error as ex:
             raise DbtDatabaseError(str(ex).strip()) from ex

From ca9da0b6ebacc14d67139b0b417e0d4083b68a9c Mon Sep 17 00:00:00 2001
From: Geoff Genz
Date: Wed, 13 Dec 2023 09:54:39 -0700
Subject: [PATCH 33/78] Update changelog

---
 CHANGELOG.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 64d5d633..35f64950 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
-### Release [1.7.1], TBD
-#### Bug Fix
+### Release [1.7.1], 2023-12-13
+#### Bug Fixes
+- Some models with LIMIT clauses were broken in recent releases. This has been fixed. Thanks to
+[ptemarvelde](https://github.com/ptemarvelde) for the PR!
 - It was possible for incremental models with the delete+insert strategy to fail if ClickHouse "light weight deletes" were
 not enabled or the required setting `allow_nondeterministic_mutations` was not enabled and the user did not have permission
 to apply it.
This condition is now detected on startup, and an exception will be thrown if `use_lw_deletes` is configured From 8551bb1110e30e87e58ff02962be37ae025e379b Mon Sep 17 00:00:00 2001 From: Dmitrii Tcimokha Date: Sat, 23 Dec 2023 20:46:22 +0100 Subject: [PATCH 34/78] allows to add a comment in table's or view's metadata --- dbt/include/clickhouse/macros/materializations/table.sql | 5 +++++ dbt/include/clickhouse/macros/materializations/view.sql | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 72cc72c8..3fbb3cb0 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -181,6 +181,11 @@ ) {%- endif %} {%- endif %} + + {% set comment = config.get('comment') %} + {% if comment %} + COMMENT '{{ comment }}' + {%- endif %} {%- endmacro %} diff --git a/dbt/include/clickhouse/macros/materializations/view.sql b/dbt/include/clickhouse/macros/materializations/view.sql index 735ec973..5d3ff492 100644 --- a/dbt/include/clickhouse/macros/materializations/view.sql +++ b/dbt/include/clickhouse/macros/materializations/view.sql @@ -79,5 +79,9 @@ as ( {{ sql }} ) + {% set comment = config.get('comment') %} + {% if comment %} + COMMENT '{{ comment }}' + {%- endif %} {%- endmacro %} From 2b81b7670537df555880cb920bece582c649d767 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 28 Jan 2024 19:19:15 +0200 Subject: [PATCH 35/78] add settings_section flag as comment for code using settings --- dbt/adapters/clickhouse/impl.py | 19 +++++++++++++++++-- .../macros/schema_tests/relationships.sql | 1 + 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index f5b9b0cf..1eea8d8b 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -412,7 +412,14 @@ def get_model_settings(self, model): res = [] for key in settings: res.append(f' {key}={settings[key]}') - return '' if len(res) == 0 else 'SETTINGS ' + ', '.join(res) + '\n' + if len(res) == 0: + return '' + else: + settings_str = 'SETTINGS ' + ', '.join(res) + '\n' + return f""" + -- settings_section + {settings_str} + """ @available def get_model_query_settings(self, model): @@ -420,7 +427,15 @@ def get_model_query_settings(self, model): res = [] for key in settings: res.append(f' {key}={settings[key]}') - return '' if len(res) == 0 else 'SETTINGS ' + ', '.join(res) + '\n' + + if len(res) == 0: + return '' + else: + settings_str = 'SETTINGS ' + ', '.join(res) + '\n' + return f""" + -- settings_section + {settings_str} + """ @available.parse_none def get_column_schema_from_query(self, sql: str, *_) -> List[ClickHouseColumn]: diff --git a/dbt/include/clickhouse/macros/schema_tests/relationships.sql b/dbt/include/clickhouse/macros/schema_tests/relationships.sql index b756a99c..a0f09fd3 100644 --- a/dbt/include/clickhouse/macros/schema_tests/relationships.sql +++ b/dbt/include/clickhouse/macros/schema_tests/relationships.sql @@ -19,6 +19,7 @@ left join parent on child.from_field = parent.to_field where parent.to_field is null +-- settings_section settings join_use_nulls = 1 {% endmacro %} From 2b0ba1982a1a3c3aa35ee0ab256fd6b2f8de3636 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 28 Jan 2024 19:20:24 +0200 Subject: [PATCH 36/78] override test sql macro and add limit-placer macro --- dbt/include/clickhouse/macros/utils/utils.sql | 30 +++++++++++++++++++ 1 file 
changed, 30 insertions(+) diff --git a/dbt/include/clickhouse/macros/utils/utils.sql b/dbt/include/clickhouse/macros/utils/utils.sql index 9eb391f9..aa65abff 100644 --- a/dbt/include/clickhouse/macros/utils/utils.sql +++ b/dbt/include/clickhouse/macros/utils/utils.sql @@ -1,3 +1,33 @@ +{% macro clickhouse__get_test_sql(main_sql, fail_calc, warn_if, error_if, limit) -%} + {% set main_sql_formatted = clickhouse__place_limit(main_sql, limit) if limit !=None else main_sql%} + select + {{ fail_calc }} as failures, + {{ fail_calc }} {{ warn_if }} as should_warn, + {{ fail_calc }} {{ error_if }} as should_error + from ( + {{ main_sql_formatted }} + ) dbt_internal_test + +{%- endmacro %} + + +-- This macro is designed to add a LIMIT clause to a ClickHouse SQL query while preserving any ClickHouse settings specified in the query. +-- When multiple queries are nested, the limit will be attached to the outer query +{% macro clickhouse__place_limit(query, limit) -%} + {% if 'settings' in query.lower()%} + {% if '-- settings_section' not in query.lower()%} + {{exceptions.raise_compiler_error("-- settings_section must be set when using ClickHouse settings")}} + {% endif %} + {% set split_by_settings_sections = query.split("-- settings_section")%} + {% set split_by_settings_sections_with_limit = split_by_settings_sections[-2] + "\n LIMIT " + limit|string + "\n" %} + {% set query_with_limit = "-- settings_section".join(split_by_settings_sections[:-2] + [split_by_settings_sections_with_limit, split_by_settings_sections[-1]])%} + {{query_with_limit}} + {% else %} + {{query}} + {{"limit " ~ limit}} + {% endif %} +{%- endmacro %} + {% macro clickhouse__any_value(expression) -%} any({{ expression }}) {%- endmacro %} From 047308e524c8a2b530fd1aa58994433bee0aadf8 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 28 Jan 2024 20:40:08 +0200 Subject: [PATCH 37/78] update CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 35f64950..b3afa542 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +### Release [1.7.2], 2024-XX-XX +#### Bug Fixes +- A few tests with LIMIT clause were broken due to parsing error when having settings in the query ([issue](https://github.com/ClickHouse/dbt-clickhouse/issues/223)). We added a dedicated limit placer, that takes into account the settings section (using a comment flag `-- settings_section` within the query). + ### Release [1.7.1], 2023-12-13 #### Bug Fixes - Some models with LIMIT clauses were broken in recent releases. This has been fixed. Thanks to From 092c618935df6012bed07710a7eb4366ff240091 Mon Sep 17 00:00:00 2001 From: Rory Sawyer Date: Fri, 2 Feb 2024 11:15:50 -0500 Subject: [PATCH 38/78] fix: use correct schema for MV target tables (#244) * fix: use correct schema when updating MVs The existing implementation passes just the name for `target_table`, which ultimately means that the target schema is not included when the final SQL is generated. By passing the entire relation object, the correct target schema will be present in the final SQL. 
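As a rough sketch of the resulting DDL (the relation names follow the tests below; the exact rendering is illustrative, not a verbatim macro output):

    -- before the fix: the TO target is rendered without its schema, so it
    -- resolves against the connection's default database
    CREATE MATERIALIZED VIEW custom_schema.hackers_mv TO hackers
    AS SELECT * FROM custom_schema.people;

    -- after the fix: passing the full relation keeps the target schema-qualified
    CREATE MATERIALIZED VIEW custom_schema.hackers_mv TO custom_schema.hackers
    AS SELECT * FROM custom_schema.people;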
* update MV tests Provide a custom schema to make sure that the full target table name (schema + relation name) is included in the CREATE MATERIALIZED VIEW statement --- .../macros/materializations/materialized_view.sql | 6 +++--- .../materialized_view/test_materialized_view.py | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 293cc41b..2ebe0dae 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -41,7 +41,7 @@ {{ clickhouse__get_create_materialized_view_as_sql(target_relation, sql) }} {%- endcall %} {% elif existing_relation.can_exchange %} - {{ log('Replacing existing materialized view' + target_relation.name) }} + {{ log('Replacing existing materialized view ' + target_relation.name) }} {% call statement('drop existing materialized view') %} drop view if exists {{ mv_relation }} {{ cluster_clause }} {% endcall %} @@ -50,10 +50,10 @@ {%- endcall %} {% do exchange_tables_atomic(backup_relation, existing_relation) %} {% call statement('create new materialized view') %} - {{ clickhouse__create_mv_sql(mv_relation, existing_relation.name, cluster_clause, sql) }} + {{ clickhouse__create_mv_sql(mv_relation, existing_relation, cluster_clause, sql) }} {% endcall %} {% else %} - {{ log('Replacing existing materialized view' + target_relation.name) }} + {{ log('Replacing existing materialized view ' + target_relation.name) }} {{ clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql) }} {% endif %} diff --git a/tests/integration/adapter/materialized_view/test_materialized_view.py b/tests/integration/adapter/materialized_view/test_materialized_view.py index 9305d064..06b88e6e 100644 --- a/tests/integration/adapter/materialized_view/test_materialized_view.py +++ b/tests/integration/adapter/materialized_view/test_materialized_view.py @@ -25,6 +25,7 @@ materialized='materialized_view', engine='MergeTree()', order_by='(id)', + schema='custom_schema', ) }} {% if var('run_type', '') == '' %} @@ -92,6 +93,7 @@ def test_create(self, project): 2. create a model as a materialized view, selecting from the table created in (1) 3. 
insert data into the base table and make sure it's there in the target table created in (2) """ + schema = quote_identifier(project.test_schema + "_custom_schema") results = run_dbt(["seed"]) assert len(results) == 1 columns = project.run_sql("DESCRIBE TABLE people", fetch="all") @@ -101,10 +103,10 @@ def test_create(self, project): results = run_dbt() assert len(results) == 1 - columns = project.run_sql("DESCRIBE TABLE hackers", fetch="all") + columns = project.run_sql(f"DESCRIBE TABLE {schema}.hackers", fetch="all") assert columns[0][1] == "Int32" - columns = project.run_sql("DESCRIBE hackers_mv", fetch="all") + columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv", fetch="all") assert columns[0][1] == "Int32" check_relation_types( @@ -123,7 +125,7 @@ def test_create(self, project): """ ) - result = project.run_sql("select count(*) from hackers", fetch="all") + result = project.run_sql(f"select count(*) from {schema}.hackers", fetch="all") assert result[0][0] == 4 @@ -145,6 +147,7 @@ def models(self): } def test_update(self, project): + schema = quote_identifier(project.test_schema + "_custom_schema") # create our initial materialized view run_dbt(["seed"]) run_dbt() @@ -162,6 +165,6 @@ def test_update(self, project): # assert that we now have both of Dade's aliases in our hackers table result = project.run_sql( - "select distinct hacker_alias from hackers where name = 'Dade'", fetch="all" + f"select distinct hacker_alias from {schema}.hackers where name = 'Dade'", fetch="all" ) assert len(result) == 2 From febc2208965476c1e042e29c56893c319163b9db Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Fri, 2 Feb 2024 09:26:40 -0700 Subject: [PATCH 39/78] Update changelog --- CHANGELOG.md | 5 +++++ README.md | 3 +-- dbt/adapters/clickhouse/__version__.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 35f64950..1cbef238 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +### Release [1.7.2], 2024-02-09 +#### Bug Fix +- Fixed an issue where Materialize Views would break with a custom schema. Thanks to [Rory Sawyer](https://github.com/SoryRawyer) +for the PR! + ### Release [1.7.1], 2023-12-13 #### Bug Fixes - Some models with LIMIT clauses were broken in recent releases. This has been fixed. Thanks to diff --git a/README.md b/README.md index 8022f214..8e413f67 100644 --- a/README.md +++ b/README.md @@ -156,8 +156,7 @@ operations, because they don't require rewriting ClickHouse data parts. The inc incremental materializations that perform significantly better than the "legacy" strategy. However, there are important caveats to using this strategy: - Lightweight deletes must be enabled on your ClickHouse server using the setting `allow_experimental_lightweight_delete=1` or you must set `use_lw_deletes=true` in your profile (which will enable that setting for your dbt sessions) -- As suggested by the setting name, lightweight delete functionality is still experimental and there are still known issues that must be resolved before the feature is considered production ready, -so usage should be limited to datasets that are easily recreated +- Lightweight deletes are now production ready, but there may be performance and other problems on ClickHouse versions earlier than 23.3. 
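For reference, the server feature these caveats describe can be exercised directly in ClickHouse (database and table names here are illustrative only; the two settings are the ones probed by the adapter on startup):

    -- on older or locked-down servers these settings may be absent or read_only
    SET allow_experimental_lightweight_delete = 1;
    SET allow_nondeterministic_mutations = 1;
    -- a lightweight delete marks rows as deleted instead of rewriting data parts
    DELETE FROM analytics.events WHERE event_date < '2023-01-01';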
- This strategy operates directly on the affected table/relation (without creating any intermediate or temporary tables), so if there
is an issue during the operation, the data in the incremental model is likely to be in an invalid state
- When using lightweight deletes, dbt-clickhouse enables the setting `allow_nondeterministic_mutations`. In some very rare cases using non-deterministic incremental_predicates
diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py
index 1f796f9b..41aad93f 100644
--- a/dbt/adapters/clickhouse/__version__.py
+++ b/dbt/adapters/clickhouse/__version__.py
@@ -1 +1 @@
-version = '1.7.1'
+version = '1.7.2'

From 36303c33e8a1d0fe366b96bc53c9f800c789fd12 Mon Sep 17 00:00:00 2001
From: bentsileviav
Date: Sun, 4 Feb 2024 13:09:41 +0200
Subject: [PATCH 40/78] rename end of query flag

---
 dbt/adapters/clickhouse/impl.py                      | 2 +-
 .../clickhouse/macros/schema_tests/relationships.sql | 2 +-
 dbt/include/clickhouse/macros/utils/utils.sql        | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py
index 1eea8d8b..09047502 100644
--- a/dbt/adapters/clickhouse/impl.py
+++ b/dbt/adapters/clickhouse/impl.py
@@ -417,7 +417,7 @@ def get_model_settings(self, model):
         else:
             settings_str = 'SETTINGS ' + ', '.join(res) + '\n'
             return f"""
-            -- settings_section
+            -- end_of_sql
             {settings_str}
             """
diff --git a/dbt/include/clickhouse/macros/schema_tests/relationships.sql b/dbt/include/clickhouse/macros/schema_tests/relationships.sql
index a0f09fd3..f602fecc 100644
--- a/dbt/include/clickhouse/macros/schema_tests/relationships.sql
+++ b/dbt/include/clickhouse/macros/schema_tests/relationships.sql
@@ -19,7 +19,7 @@
 left join parent
     on child.from_field = parent.to_field
 where parent.to_field is null
--- settings_section
+-- end_of_sql
 settings join_use_nulls = 1

 {% endmacro %}
diff --git a/dbt/include/clickhouse/macros/utils/utils.sql b/dbt/include/clickhouse/macros/utils/utils.sql
index aa65abff..d81e6da3 100644
--- a/dbt/include/clickhouse/macros/utils/utils.sql
+++ b/dbt/include/clickhouse/macros/utils/utils.sql
@@ -15,12 +15,12 @@
 -- When multiple queries are nested, the limit will be attached to the outer query
 {% macro clickhouse__place_limit(query, limit) -%}
   {% if 'settings' in query.lower()%}
-    {% if '-- settings_section' not in query.lower()%}
-      {{exceptions.raise_compiler_error("-- settings_section must be set when using ClickHouse settings")}}
+    {% if '-- end_of_sql' not in query.lower()%}
+      {{exceptions.raise_compiler_error("-- end_of_sql must be set when using ClickHouse settings")}}
     {% endif %}
-    {% set split_by_settings_sections = query.split("-- settings_section")%}
+    {% set split_by_settings_sections = query.split("-- end_of_sql")%}
     {% set split_by_settings_sections_with_limit = split_by_settings_sections[-2] + "\n LIMIT " + limit|string + "\n" %}
-    {% set query_with_limit = "-- settings_section".join(split_by_settings_sections[:-2] + [split_by_settings_sections_with_limit, split_by_settings_sections[-1]])%}
+    {% set query_with_limit = "-- end_of_sql".join(split_by_settings_sections[:-2] + [split_by_settings_sections_with_limit, split_by_settings_sections[-1]])%}
     {{query_with_limit}}
   {% else %}
     {{query}}
     {{"limit " ~ limit}}
   {% endif %}
{%- endmacro %}

From d8afb93929f9c07c04bc0b3a86ead29b2e0752df Mon Sep 17 00:00:00 2001
From: Bentsi Leviav
Date: Sun, 4 Feb 2024 22:09:56 +0200
Subject: [PATCH 41/78] Bug/223 relationship test with limit (#245)

* add settings_section flag as comment for code using settings

*
override test sql macro and add limit-placer macro * update CHANGELOG.md * rename end of query flag --- CHANGELOG.md | 1 + dbt/adapters/clickhouse/impl.py | 19 ++++++++++-- .../macros/schema_tests/relationships.sql | 1 + dbt/include/clickhouse/macros/utils/utils.sql | 30 +++++++++++++++++++ 4 files changed, 49 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1cbef238..6e5a1057 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ #### Bug Fix - Fixed an issue where Materialize Views would break with a custom schema. Thanks to [Rory Sawyer](https://github.com/SoryRawyer) for the PR! +- A few tests with LIMIT clause were broken due to parsing error when having settings in the query ([issue](https://github.com/ClickHouse/dbt-clickhouse/issues/223)). We added a dedicated limit placer, that takes into account the settings section (using a comment flag `-- settings_section` within the query). ### Release [1.7.1], 2023-12-13 #### Bug Fixes diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index f5b9b0cf..09047502 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -412,7 +412,14 @@ def get_model_settings(self, model): res = [] for key in settings: res.append(f' {key}={settings[key]}') - return '' if len(res) == 0 else 'SETTINGS ' + ', '.join(res) + '\n' + if len(res) == 0: + return '' + else: + settings_str = 'SETTINGS ' + ', '.join(res) + '\n' + return f""" + -- end_of_sql + {settings_str} + """ @available def get_model_query_settings(self, model): @@ -420,7 +427,15 @@ def get_model_query_settings(self, model): res = [] for key in settings: res.append(f' {key}={settings[key]}') - return '' if len(res) == 0 else 'SETTINGS ' + ', '.join(res) + '\n' + + if len(res) == 0: + return '' + else: + settings_str = 'SETTINGS ' + ', '.join(res) + '\n' + return f""" + -- settings_section + {settings_str} + """ @available.parse_none def get_column_schema_from_query(self, sql: str, *_) -> List[ClickHouseColumn]: diff --git a/dbt/include/clickhouse/macros/schema_tests/relationships.sql b/dbt/include/clickhouse/macros/schema_tests/relationships.sql index b756a99c..f602fecc 100644 --- a/dbt/include/clickhouse/macros/schema_tests/relationships.sql +++ b/dbt/include/clickhouse/macros/schema_tests/relationships.sql @@ -19,6 +19,7 @@ left join parent on child.from_field = parent.to_field where parent.to_field is null +-- end_of_sql settings join_use_nulls = 1 {% endmacro %} diff --git a/dbt/include/clickhouse/macros/utils/utils.sql b/dbt/include/clickhouse/macros/utils/utils.sql index 9eb391f9..d81e6da3 100644 --- a/dbt/include/clickhouse/macros/utils/utils.sql +++ b/dbt/include/clickhouse/macros/utils/utils.sql @@ -1,3 +1,33 @@ +{% macro clickhouse__get_test_sql(main_sql, fail_calc, warn_if, error_if, limit) -%} + {% set main_sql_formatted = clickhouse__place_limit(main_sql, limit) if limit !=None else main_sql%} + select + {{ fail_calc }} as failures, + {{ fail_calc }} {{ warn_if }} as should_warn, + {{ fail_calc }} {{ error_if }} as should_error + from ( + {{ main_sql_formatted }} + ) dbt_internal_test + +{%- endmacro %} + + +-- This macro is designed to add a LIMIT clause to a ClickHouse SQL query while preserving any ClickHouse settings specified in the query. 
+-- When multiple queries are nested, the limit will be attached to the outer query +{% macro clickhouse__place_limit(query, limit) -%} + {% if 'settings' in query.lower()%} + {% if '-- end_of_sql' not in query.lower()%} + {{exceptions.raise_compiler_error("-- end_of_sql must be set when using ClickHouse settings")}} + {% endif %} + {% set split_by_settings_sections = query.split("-- end_of_sql")%} + {% set split_by_settings_sections_with_limit = split_by_settings_sections[-2] + "\n LIMIT " + limit|string + "\n" %} + {% set query_with_limit = "-- end_of_sql".join(split_by_settings_sections[:-2] + [split_by_settings_sections_with_limit, split_by_settings_sections[-1]])%} + {{query_with_limit}} + {% else %} + {{query}} + {{"limit " ~ limit}} + {% endif %} +{%- endmacro %} + {% macro clickhouse__any_value(expression) -%} any({{ expression }}) {%- endmacro %} From b791bcc61bc284ccf8282e16064b2cc53358c7c8 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Sun, 4 Feb 2024 13:10:38 -0700 Subject: [PATCH 42/78] Revert "Bug/223 relationship test with limit (#245)" (#247) This reverts commit d8afb93929f9c07c04bc0b3a86ead29b2e0752df. --- CHANGELOG.md | 1 - dbt/adapters/clickhouse/impl.py | 19 ++---------- .../macros/schema_tests/relationships.sql | 1 - dbt/include/clickhouse/macros/utils/utils.sql | 30 ------------------- 4 files changed, 2 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e5a1057..1cbef238 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,6 @@ #### Bug Fix - Fixed an issue where Materialize Views would break with a custom schema. Thanks to [Rory Sawyer](https://github.com/SoryRawyer) for the PR! -- A few tests with LIMIT clause were broken due to parsing error when having settings in the query ([issue](https://github.com/ClickHouse/dbt-clickhouse/issues/223)). We added a dedicated limit placer, that takes into account the settings section (using a comment flag `-- settings_section` within the query). 
### Release [1.7.1], 2023-12-13 #### Bug Fixes diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index 09047502..f5b9b0cf 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -412,14 +412,7 @@ def get_model_settings(self, model): res = [] for key in settings: res.append(f' {key}={settings[key]}') - if len(res) == 0: - return '' - else: - settings_str = 'SETTINGS ' + ', '.join(res) + '\n' - return f""" - -- end_of_sql - {settings_str} - """ + return '' if len(res) == 0 else 'SETTINGS ' + ', '.join(res) + '\n' @available def get_model_query_settings(self, model): @@ -427,15 +420,7 @@ def get_model_query_settings(self, model): res = [] for key in settings: res.append(f' {key}={settings[key]}') - - if len(res) == 0: - return '' - else: - settings_str = 'SETTINGS ' + ', '.join(res) + '\n' - return f""" - -- settings_section - {settings_str} - """ + return '' if len(res) == 0 else 'SETTINGS ' + ', '.join(res) + '\n' @available.parse_none def get_column_schema_from_query(self, sql: str, *_) -> List[ClickHouseColumn]: diff --git a/dbt/include/clickhouse/macros/schema_tests/relationships.sql b/dbt/include/clickhouse/macros/schema_tests/relationships.sql index f602fecc..b756a99c 100644 --- a/dbt/include/clickhouse/macros/schema_tests/relationships.sql +++ b/dbt/include/clickhouse/macros/schema_tests/relationships.sql @@ -19,7 +19,6 @@ left join parent on child.from_field = parent.to_field where parent.to_field is null --- end_of_sql settings join_use_nulls = 1 {% endmacro %} diff --git a/dbt/include/clickhouse/macros/utils/utils.sql b/dbt/include/clickhouse/macros/utils/utils.sql index d81e6da3..9eb391f9 100644 --- a/dbt/include/clickhouse/macros/utils/utils.sql +++ b/dbt/include/clickhouse/macros/utils/utils.sql @@ -1,33 +1,3 @@ -{% macro clickhouse__get_test_sql(main_sql, fail_calc, warn_if, error_if, limit) -%} - {% set main_sql_formatted = clickhouse__place_limit(main_sql, limit) if limit !=None else main_sql%} - select - {{ fail_calc }} as failures, - {{ fail_calc }} {{ warn_if }} as should_warn, - {{ fail_calc }} {{ error_if }} as should_error - from ( - {{ main_sql_formatted }} - ) dbt_internal_test - -{%- endmacro %} - - --- This macro is designed to add a LIMIT clause to a ClickHouse SQL query while preserving any ClickHouse settings specified in the query. 
--- When multiple queries are nested, the limit will be attached to the outer query -{% macro clickhouse__place_limit(query, limit) -%} - {% if 'settings' in query.lower()%} - {% if '-- end_of_sql' not in query.lower()%} - {{exceptions.raise_compiler_error("-- end_of_sql must be set when using ClickHouse settings")}} - {% endif %} - {% set split_by_settings_sections = query.split("-- end_of_sql")%} - {% set split_by_settings_sections_with_limit = split_by_settings_sections[-2] + "\n LIMIT " + limit|string + "\n" %} - {% set query_with_limit = "-- end_of_sql".join(split_by_settings_sections[:-2] + [split_by_settings_sections_with_limit, split_by_settings_sections[-1]])%} - {{query_with_limit}} - {% else %} - {{query}} - {{"limit " ~ limit}} - {% endif %} -{%- endmacro %} - {% macro clickhouse__any_value(expression) -%} any({{ expression }}) {%- endmacro %} From a63cbd75b390767b621d9ff47802271896358d89 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 13 Feb 2024 12:30:07 +0200 Subject: [PATCH 43/78] always return --end_of_sql when asking for settings --- dbt/adapters/clickhouse/impl.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index 09047502..e87ab906 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -412,14 +412,11 @@ def get_model_settings(self, model): res = [] for key in settings: res.append(f' {key}={settings[key]}') - if len(res) == 0: - return '' - else: - settings_str = 'SETTINGS ' + ', '.join(res) + '\n' - return f""" - -- end_of_sql - {settings_str} - """ + settings_str = '' if len(res) == 0 else 'SETTINGS ' + ', '.join(res) + '\n' + return f""" + -- end_of_sql + {settings_str} + """ @available def get_model_query_settings(self, model): From 29179048e1535038e1f85ec32993e4130f0b9ee7 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Tue, 13 Feb 2024 16:21:06 +0200 Subject: [PATCH 44/78] Add model settings based on materialization type --- dbt/adapters/clickhouse/dbclient.py | 25 ++++++++++++++++++------- dbt/adapters/clickhouse/impl.py | 3 ++- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index c693a82e..7d1f5fcd 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -1,4 +1,5 @@ import uuid +import copy from abc import ABC, abstractmethod from typing import Dict @@ -18,7 +19,7 @@ LW_DELETE_SETTING = 'allow_experimental_lightweight_delete' ND_MUTATION_SETTING = 'allow_nondeterministic_mutations' DEDUP_WINDOW_SETTING = 'replicated_deduplication_window' - +DEDUP_WINDOW_SETTING_SUPPORTED_MATERIALIZATION = ["table", "incremental", "ephemeral", "materialized_view"] def get_db_client(credentials: ClickHouseCredentials): driver = credentials.driver @@ -89,12 +90,20 @@ def __init__(self, credentials: ClickHouseCredentials): except Exception as ex: self.close() raise ex - self._model_settings = {} + self._model_settings = { + "table": {}, + "view": {}, + "incremental": {}, + "ephemeral": {}, + "materialized_view": {}, + "general": {} + } if ( - not credentials.allow_automatic_deduplication - and compare_versions(self._server_version(), '22.7.1.2484') >= 0 + not credentials.allow_automatic_deduplication + and compare_versions(self._server_version(), '22.7.1.2484') >= 0 ): - self._model_settings[DEDUP_WINDOW_SETTING] = '0' + for materialization in DEDUP_WINDOW_SETTING_SUPPORTED_MATERIALIZATION: + 
self._model_settings[materialization][DEDUP_WINDOW_SETTING] = '0'

     @abstractmethod
     def query(self, sql: str, **kwargs):
@@ -131,8 +140,10 @@ def _set_client_database(self):
     def _server_version(self):
         pass

-    def update_model_settings(self, model_settings: Dict[str, str]):
-        for key, value in self._model_settings.items():
+    def update_model_settings(self, model_settings: Dict[str, str], materialization_type: str):
+        model_settings_to_add = copy.deepcopy(self._model_settings[materialization_type])
+        model_settings_to_add.update(self._model_settings['general'])
+        for key, value in model_settings_to_add.items():
             if key not in model_settings:
                 model_settings[key] = value

diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py
index f5b9b0cf..60741889 100644
--- a/dbt/adapters/clickhouse/impl.py
+++ b/dbt/adapters/clickhouse/impl.py
@@ -407,8 +407,9 @@ def run_sql_for_tests(self, sql, fetch, conn):
     @available
     def get_model_settings(self, model):
         settings = model['config'].get('settings', {})
+        materialization_type = model['config'].get('materialized')
         conn = self.connections.get_if_exists()
-        conn.handle.update_model_settings(settings)
+        conn.handle.update_model_settings(settings, materialization_type)
         res = []
         for key in settings:
             res.append(f' {key}={settings[key]}')

From 5bfce513d54624884356d686b3f532fb3be42242 Mon Sep 17 00:00:00 2001
From: bentsileviav
Date: Tue, 13 Feb 2024 16:21:42 +0200
Subject: [PATCH 45/78] support setting clause on view creation

---
 dbt/include/clickhouse/macros/materializations/view.sql | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dbt/include/clickhouse/macros/materializations/view.sql b/dbt/include/clickhouse/macros/materializations/view.sql
index 735ec973..8abe61bc 100644
--- a/dbt/include/clickhouse/macros/materializations/view.sql
+++ b/dbt/include/clickhouse/macros/materializations/view.sql
@@ -79,5 +79,6 @@
   as (
     {{ sql }}
   )
+  {{ adapter.get_model_settings(model) }}

 {%- endmacro %}

From 2dc4e49436d3e00d677ae72f747ea11ae1ec53d2 Mon Sep 17 00:00:00 2001
From: bentsileviav
Date: Tue, 13 Feb 2024 16:46:57 +0200
Subject: [PATCH 46/78] edit CHANGELOG.md

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1cbef238..28230363 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,6 @@
+### Release [1.7.3], 2024-XX-XX
+#### Bug Fix
+- Fixed an [issue](https://github.com/ClickHouse/dbt-clickhouse/issues/231) where passing settings on view creation didn't work.
 ### Release [1.7.2], 2024-02-09
 #### Bug Fix
 - Fixed an issue where Materialize Views would break with a custom schema. Thanks to [Rory Sawyer](https://github.com/SoryRawyer)

From 5b5aa058db2ead8144db933d64a2170f3a258641 Mon Sep 17 00:00:00 2001
From: Geoff Genz
Date: Tue, 13 Feb 2024 08:00:29 -0700
Subject: [PATCH 47/78] Bump version and tweak changelog

---
 CHANGELOG.md                                            | 5 ++++-
 dbt/adapters/clickhouse/__version__.py                  | 2 +-
 .../adapter/clickhouse/test_clickhouse_source_schema.py | 5 ++++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6e5a1057..0d9cf4eb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,11 @@
+### Release [1.7.3], 2024-02-13
+- The `dbt test` command with a LIMIT clause was broken due to a parsing error when having settings in the query ([issue](https://github.com/ClickHouse/dbt-clickhouse/issues/223)).
+We added a dedicated limit placer that takes into account the settings section (using a comment flag `-- end_of_sql` within the query).
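Roughly, the rewrite looks like this (the query shape follows the relationships test macro; the tables and the LIMIT value are illustrative):

    -- compiled test SQL before the limit is applied
    select child.from_field
    from child
    left join parent on child.from_field = parent.to_field
    where parent.to_field is null
    -- end_of_sql
    settings join_use_nulls = 1

    -- clickhouse__place_limit splits on the marker, so the LIMIT lands before it
    -- and the SETTINGS clause stays at the end, where ClickHouse expects it
    select child.from_field
    from child
    left join parent on child.from_field = parent.to_field
    where parent.to_field is null
    LIMIT 100
    -- end_of_sql
    settings join_use_nulls = 1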
+ ### Release [1.7.2], 2024-02-09 #### Bug Fix - Fixed an issue where Materialize Views would break with a custom schema. Thanks to [Rory Sawyer](https://github.com/SoryRawyer) for the PR! -- A few tests with LIMIT clause were broken due to parsing error when having settings in the query ([issue](https://github.com/ClickHouse/dbt-clickhouse/issues/223)). We added a dedicated limit placer, that takes into account the settings section (using a comment flag `-- settings_section` within the query). ### Release [1.7.1], 2023-12-13 #### Bug Fixes diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index 41aad93f..b517477c 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.7.2' +version = '1.7.3' diff --git a/tests/integration/adapter/clickhouse/test_clickhouse_source_schema.py b/tests/integration/adapter/clickhouse/test_clickhouse_source_schema.py index 86fe916b..2b3abab8 100644 --- a/tests/integration/adapter/clickhouse/test_clickhouse_source_schema.py +++ b/tests/integration/adapter/clickhouse/test_clickhouse_source_schema.py @@ -16,7 +16,10 @@ class TestSourceSchema: @pytest.fixture(scope="class") def models(self): sys_tables_sql = """ - {{ config(order_by='(database, name)', engine='MergeTree()', materialized='table') }} + {{ config(order_by='(database, name)', + engine='MergeTree()', + materialized='table', + settings={'allow_nullable_key': 1}) }} select database, name, engine, total_rows from {{ source('system_source', 'tables') }} """ From a4dcd9e7d2cfcf2efc4d68a4954963242112de8a Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Thu, 15 Feb 2024 11:27:31 +0200 Subject: [PATCH 48/78] change list syntax to satisfy lint test --- dbt/adapters/clickhouse/dbclient.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index 7d1f5fcd..f1e3cee3 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -19,7 +19,12 @@ LW_DELETE_SETTING = 'allow_experimental_lightweight_delete' ND_MUTATION_SETTING = 'allow_nondeterministic_mutations' DEDUP_WINDOW_SETTING = 'replicated_deduplication_window' -DEDUP_WINDOW_SETTING_SUPPORTED_MATERIALIZATION = ["table", "incremental", "ephemeral", "materialized_view"] +DEDUP_WINDOW_SETTING_SUPPORTED_MATERIALIZATION = [ + "table", + "incremental", + "ephemeral", + "materialized_view" +] def get_db_client(credentials: ClickHouseCredentials): driver = credentials.driver From b353cf68b000bf80effd01fb0f0ebbeac261097e Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Thu, 15 Feb 2024 14:28:07 +0200 Subject: [PATCH 49/78] change list syntax to satisfy lint test --- dbt/adapters/clickhouse/dbclient.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index f1e3cee3..af0eeac9 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -23,9 +23,10 @@ "table", "incremental", "ephemeral", - "materialized_view" + "materialized_view", ] + def get_db_client(credentials: ClickHouseCredentials): driver = credentials.driver port = credentials.port @@ -101,11 +102,11 @@ def __init__(self, credentials: ClickHouseCredentials): "incremental": {}, "ephemeral": {}, "materialized_view": {}, - "general": {} + "general": {}, } if ( - not credentials.allow_automatic_deduplication - and 
compare_versions(self._server_version(), '22.7.1.2484') >= 0 + not credentials.allow_automatic_deduplication + and compare_versions(self._server_version(), '22.7.1.2484') >= 0 ): for materialization in DEDUP_WINDOW_SETTING_SUPPORTED_MATERIALIZATION: self._model_settings[materialization][DEDUP_WINDOW_SETTING] = '0' From eee0e52caa50c00d45664d93baa4bd64af2d6a5a Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Thu, 15 Feb 2024 14:48:49 +0200 Subject: [PATCH 50/78] change imports order to satisfy lint test --- dbt/adapters/clickhouse/dbclient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index af0eeac9..852dbb22 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -1,5 +1,5 @@ -import uuid import copy +import uuid from abc import ABC, abstractmethod from typing import Dict From 86c69ca9c6f5787cd16ca4d1da63115c4b55f674 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Thu, 15 Feb 2024 16:06:38 +0200 Subject: [PATCH 51/78] Add typing to satisfy lint --- dbt/adapters/clickhouse/dbclient.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index 852dbb22..8f9b61df 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -96,7 +96,7 @@ def __init__(self, credentials: ClickHouseCredentials): except Exception as ex: self.close() raise ex - self._model_settings = { + self._model_settings: Dict = { "table": {}, "view": {}, "incremental": {}, From 57eb850305af8416d4a39786e95d8f9d7dd14a9a Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 18 Feb 2024 14:19:59 +0200 Subject: [PATCH 52/78] Add snapshot materialization to default settings --- dbt/adapters/clickhouse/dbclient.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index 8f9b61df..89435fac 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -102,6 +102,7 @@ def __init__(self, credentials: ClickHouseCredentials): "incremental": {}, "ephemeral": {}, "materialized_view": {}, + "snapshot": {}, "general": {}, } if ( From 2df9462e160bda3e532c1dd4f7c5c14bd1368b24 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 3 Mar 2024 18:58:06 +0200 Subject: [PATCH 53/78] Fix tests - add distributed_table and distributed_incremental materializations --- dbt/adapters/clickhouse/dbclient.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index 89435fac..65ee1c64 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -103,6 +103,8 @@ def __init__(self, credentials: ClickHouseCredentials): "ephemeral": {}, "materialized_view": {}, "snapshot": {}, + "distributed_table": {}, + "distributed_incremental": {}, "general": {}, } if ( From 79218f35beeec8fd1f5f3b77ed0bc85772c37a23 Mon Sep 17 00:00:00 2001 From: bentsileviav Date: Sun, 3 Mar 2024 19:05:22 +0200 Subject: [PATCH 54/78] Fix tests - make sure to call the get_model_settings only when materialization is view --- dbt/include/clickhouse/macros/materializations/view.sql | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dbt/include/clickhouse/macros/materializations/view.sql b/dbt/include/clickhouse/macros/materializations/view.sql index 8abe61bc..dfb28787 100644 --- 
a/dbt/include/clickhouse/macros/materializations/view.sql
+++ b/dbt/include/clickhouse/macros/materializations/view.sql
@@ -79,6 +79,9 @@
   as (
     {{ sql }}
   )
-  {{ adapter.get_model_settings(model) }}
+  {% if model.get('config').get('materialized') == 'view' %}
+    {{ adapter.get_model_settings(model) }}
+  {%- endif %}
+

 {%- endmacro %}

From d811e90be42c27390a1510a2434633aaaea5b85a Mon Sep 17 00:00:00 2001
From: Geoff Genz
Date: Sun, 3 Mar 2024 12:00:19 -0700
Subject: [PATCH 55/78] clean up recent changelog

---
 CHANGELOG.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7b7e2608..cf3cdc04 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,6 @@
-### Release [1.7.4], 2024-XX-XX
-#### Bug Fix
-- Fixed an [issue](https://github.com/ClickHouse/dbt-clickhouse/issues/231) where passing settings on view creation didn't work.
-
-### Release [1.7.3], 2024-02-13
+### Release [1.7.3], 2024-03-11
+#### Bug Fixes
+- Fixed an [issue](https://github.com/ClickHouse/dbt-clickhouse/issues/231) where passing settings on view creation didn't work.
 - The `dbt test` command with a LIMIT clause was broken due to a parsing error when having settings in the query ([issue](https://github.com/ClickHouse/dbt-clickhouse/issues/223)).
 We added a dedicated limit placer that takes into account the settings section (using a comment flag `-- end_of_sql` within the query).

From 24af7d94d4a3bd46686490838257aaab134ac2b1 Mon Sep 17 00:00:00 2001
From: Rory Sawyer
Date: Fri, 8 Mar 2024 11:28:44 -0500
Subject: [PATCH 56/78] Add materialization macro for dictionaries

---
 dbt/adapters/clickhouse/impl.py                    |  26 ++-
 dbt/adapters/clickhouse/relation.py                |  16 +-
 dbt/include/clickhouse/macros/adapters.sql         |   6 +-
 .../macros/materializations/dictionary.sql         | 116 +++++++++++
 .../adapter/dictionary/test_dictionary.py          | 195 ++++++++++++++++++
 5 files changed, 352 insertions(+), 7 deletions(-)

diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py
index 38bfee91..33cb31dd 100644
--- a/dbt/adapters/clickhouse/impl.py
+++ b/dbt/adapters/clickhouse/impl.py
@@ -11,7 +11,7 @@
 from dbt.adapters.sql import SQLAdapter
 from dbt.contracts.graph.manifest import Manifest
 from dbt.contracts.graph.nodes import ConstraintType, ModelLevelConstraint
-from dbt.contracts.relation import Path, RelationType
+from dbt.contracts.relation import Path
 from dbt.events.functions import warn_or_error
 from dbt.events.types import ConstraintNotSupported
 from dbt.exceptions import DbtInternalError, DbtRuntimeError, NotImplementedError
@@ -28,7 +28,7 @@
 )
 from dbt.adapters.clickhouse.logger import logger
 from dbt.adapters.clickhouse.query import quote_identifier
-from dbt.adapters.clickhouse.relation import ClickHouseRelation
+from dbt.adapters.clickhouse.relation import ClickHouseRelation, ClickHouseRelationType
 from dbt.adapters.clickhouse.util import NewColumnDataType, compare_versions

 GET_CATALOG_MACRO_NAME = 'get_catalog'
@@ -271,10 +271,15 @@ def list_relations_without_caching(
         relations = []
         for row in results:
             name, schema, type_info, db_engine, on_cluster = row
-            rel_type = RelationType.View if 'view' in type_info else RelationType.Table
+            if 'view' in type_info:
+                rel_type = ClickHouseRelationType.View
+            elif type_info == 'dictionary':
+                rel_type = ClickHouseRelationType.Dictionary
+            else:
+                rel_type =
ClickHouseRelationType.Table can_exchange = ( conn_supports_exchange - and rel_type == RelationType.Table + and rel_type == ClickHouseRelationType.Table and db_engine in ('Atomic', 'Replicated') ) @@ -445,6 +450,19 @@ def get_column_schema_from_query(self, sql: str, *_) -> List[ClickHouseColumn]: def format_columns(self, columns) -> List[Dict]: return [{'name': column.name, 'data_type': column.dtype} for column in columns] + @available + def get_credentials(self) -> Dict: + conn = self.connections.get_if_exists() + if conn is None or conn.credentials is None: + return dict() + return { + 'user': conn.credentials.user, + 'password': conn.credentials.password, + 'database': conn.credentials.database, + 'host': conn.credentials.host, + 'port': conn.credentials.port, + } + @classmethod def render_raw_columns_constraints(cls, raw_columns: Dict[str, Dict[str, Any]]) -> List: rendered_columns = [] diff --git a/dbt/adapters/clickhouse/relation.py b/dbt/adapters/clickhouse/relation.py index cc2865f4..16c2ef32 100644 --- a/dbt/adapters/clickhouse/relation.py +++ b/dbt/adapters/clickhouse/relation.py @@ -3,7 +3,8 @@ from dbt.adapters.base.relation import BaseRelation, Policy, Self from dbt.contracts.graph.nodes import ManifestNode, SourceDefinition -from dbt.contracts.relation import HasQuoting, Path, RelationType +from dbt.contracts.relation import HasQuoting, Path +from dbt.dataclass_schema import StrEnum from dbt.exceptions import DbtRuntimeError from dbt.utils import deep_merge, merge @@ -24,8 +25,19 @@ class ClickHouseIncludePolicy(Policy): identifier: bool = True +class ClickHouseRelationType(StrEnum): + Table = "table" + View = "view" + CTE = "cte" + MaterializedView = "materialized_view" + External = "external" + Ephemeral = "ephemeral" + Dictionary = "dictionary" + + @dataclass(frozen=True, eq=False, repr=False) class ClickHouseRelation(BaseRelation): + type: Optional[ClickHouseRelationType] = None quote_policy: Policy = field(default_factory=lambda: ClickHouseQuotePolicy()) include_policy: Policy = field(default_factory=lambda: ClickHouseIncludePolicy()) quote_character: str = '`' @@ -42,7 +54,7 @@ def render(self) -> str: def derivative(self, suffix: str, relation_type: Optional[str] = None) -> BaseRelation: path = Path(schema=self.path.schema, database='', identifier=self.path.identifier + suffix) - derivative_type = RelationType[relation_type] if relation_type else self.type + derivative_type = ClickHouseRelationType[relation_type] if relation_type else self.type return ClickHouseRelation(type=derivative_type, path=path) def matches( diff --git a/dbt/include/clickhouse/macros/adapters.sql b/dbt/include/clickhouse/macros/adapters.sql index 718c775a..6ae897d7 100644 --- a/dbt/include/clickhouse/macros/adapters.sql +++ b/dbt/include/clickhouse/macros/adapters.sql @@ -24,7 +24,11 @@ select t.name as name, t.database as schema, - if(engine not in ('MaterializedView', 'View'), 'table', 'view') as type, + multiIf( + engine in ('MaterializedView', 'View'), 'view', + engine = 'Dictionary', 'dictionary', + 'table' + ) as type, db.engine as db_engine, {%- if adapter.get_clickhouse_cluster_name() -%} count(distinct _shard_num) > 1 as is_on_cluster diff --git a/dbt/include/clickhouse/macros/materializations/dictionary.sql b/dbt/include/clickhouse/macros/materializations/dictionary.sql new file mode 100644 index 00000000..7d226238 --- /dev/null +++ b/dbt/include/clickhouse/macros/materializations/dictionary.sql @@ -0,0 +1,116 @@ +{%- materialization dictionary, adapter='clickhouse' -%} + + {%- set 
existing_relation = load_cached_relation(this) -%} + {%- set target_relation = this.incorporate(type='dictionary') -%} + {%- set intermediate_relation = make_intermediate_relation(target_relation) -%} + {%- set existing_intermediate_relation = load_cached_relation(intermediate_relation) -%} + {%- set backup_relation_type = 'dictionary' if existing_relation is none else existing_relation.type -%} + {%- set backup_relation = make_backup_relation(target_relation, backup_relation_type) -%} + {%- set existing_backup_relation = load_cached_relation(backup_relation) -%} + + {%- set grant_config = config.get('grants') -%} + + {{ run_hooks(pre_hooks, inside_transaction=False) }} + + {{ drop_dictionary_if_exists(existing_backup_relation) }} + {{ drop_dictionary_if_exists(existing_intermediate_relation) }} + + + {{ run_hooks(pre_hooks, inside_transaction=True) }} + + {# create our new dictionary #} + {% call statement('main') -%} + {{ clickhouse__get_create_dictionary_as_sql(intermediate_relation, sql) }} + {%- endcall %} + + {# cleanup #} + {% if existing_relation is not none %} + {% set existing_relation = load_cached_relation(existing_relation) %} + {% if existing_relation is not none %} + {{ adapter.rename_relation(existing_relation, backup_relation) }} + {% endif %} + {% endif %} + {{ adapter.rename_relation(intermediate_relation, target_relation) }} + + {% set should_revoke = should_revoke(existing_relation, full_refresh_mode=True) %} + {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} + + {% do persist_docs(target_relation, model) %} + + {{ run_hooks(post_hooks, inside_transaction=True) }} + + {{ adapter.commit() }} + + {{ drop_dictionary_if_exists(backup_relation) }} + + {{ run_hooks(post_hooks, inside_transaction=False) }} + + {{ return({'relations': [target_relation]}) }} + +{%- endmaterialization -%} + + +{% macro clickhouse__get_create_dictionary_as_sql(relation, sql) %} + {%- set fields = config.get('fields') -%} + {%- set source_type = config.get('source_type') -%} + + CREATE DICTIONARY {{ relation }} {{ on_cluster_clause(relation) }} + ( + {%- for (name, data_type) in fields -%} + {{ name }} {{ data_type }}{%- if not loop.last -%},{%- endif -%} + {%- endfor -%} + ) + {{ primary_key_clause(label="primary key") }} + SOURCE( + {%- if source_type == 'http' %} + {{ http_source() }} + {% else %} + {{ clickhouse_source(sql) }} + {% endif -%} + ) + LAYOUT({{ config.get('layout') }}) + LIFETIME({{ config.get('lifetime') }}) +{% endmacro %} + + +{% macro http_source() %} + HTTP(URL '{{ config.get("url") }}' FORMAT '{{ config.get("format") }}') +{% endmacro %} + + +{% macro clickhouse_source(sql) %} + {%- set credentials = adapter.get_credentials() -%} + {%- set table = config.get('table') -%} + CLICKHOUSE( + user '{{ credentials.get("user") }}' + {% if credentials.get("password") != '' -%} + password '{{ credentials.get("password") }}' + {%- endif %} + {% if credentials.get("database") != '' -%} + db '{{ credentials.get("database") }}' + {%- endif %} + {% if credentials.get("host") != '' and credentials.get("host") != 'localhost' -%} + host '{{ credentials.get("host") }}' + {% if credentials.get("port") != '' -%} + port '{{ credentials.get("port") }}' + {%- endif %} + {%- endif %} + {%- if table is not none %} + table '{{ table }}' + {% else %} + query "{{ sql }}" + {% endif -%} + ) +{% endmacro %} + + +{% macro drop_dictionary_if_exists(relation) %} + {% if relation.type != 'dictionary' %} + {{ log(relation ~ ' is not a dictionary; defaulting to 
drop_relation_if_exists') }} + {{ drop_relation_if_exists(relation) }} + {% else %} + {% call statement('drop_dictionary_if_exists') %} + drop dictionary if exists {{ relation }} + {% endcall %} + {% endif %} +{% endmacro %} diff --git a/tests/integration/adapter/dictionary/test_dictionary.py b/tests/integration/adapter/dictionary/test_dictionary.py new file mode 100644 index 00000000..b65eff82 --- /dev/null +++ b/tests/integration/adapter/dictionary/test_dictionary.py @@ -0,0 +1,195 @@ +""" +test dictionary support in dbt-clickhouse +""" + +import json +import os + +import pytest +from dbt.tests.util import run_dbt + +testing_s3 = os.environ.get('DBT_CH_TEST_INCLUDE_S3', '').lower() in ('1', 'true', 'yes') + + +PEOPLE_SEED_CSV = """ +id,name,age,department +1231,Dade,33,engineering +6666,Ksenia,48,engineering +8888,Kate,50,engineering +""".lstrip() + +# This model is parameterized, in a way, by the "run_type" dbt project variable +# This is to be able to switch between different model definitions within +# the same test run and allow us to test the evolution of a materialized view +HACKERS_MODEL = """ +{{ config( + materialized='dictionary', + fields=[ + ('id', 'Int32'), + ('name', 'String'), + ('hacker_alias', 'String') + ], + primary_key='id', + layout='COMPLEX_KEY_HASHED()', + lifetime='1', + source_type='clickhouse', +) }} + +{% if var('run_type', '') == '' %} +select + id, + name, + case + when name like 'Dade' then 'crash_override' + when name like 'Kate' then 'acid burn' + when name like 'Eugene' then 'the plague' + else 'N/A' + end as hacker_alias +from {{ source('raw', 'people') }} + +{% else %} + +select + id, + name, + case + -- Dade wasn't always known as 'crash override'! + when name like 'Dade' and age = 11 then 'zero cool' + when name like 'Dade' and age != 11 then 'crash override' + when name like 'Kate' then 'acid burn' + when name like 'Eugene' then 'the plague' + else 'N/A' + end as hacker_alias +from {{ source('raw', 'people') }} +{% endif %} +""" + + +TAXI_ZONE_DICTIONARY = """ +{{ config( + materialized='dictionary', + fields=[ + ('LocationID', 'UInt16 DEFAULT 0'), + ('Borough', 'String'), + ('Zone', 'String'), + ('service_zone', 'String'), + ], + primary_key='LocationID', + layout='HASHED()', + lifetime='MIN 0 MAX 0', + source_type='http', + url='https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/taxi_zone_lookup.csv', + format='CSVWithNames' +) }} + +select 1 +""" + + +PEOPLE_DICT_MODEL = """ +{{ config( + materialized='dictionary', + fields=[ + ('id', 'Int32'), + ('name', 'String'), + ], + primary_key='id', + layout='HASHED()', + lifetime='1', + source_type='clickhouse', + table='people' +) }} + +select 1 +""" + + +SEED_SCHEMA_YML = """ +version: 2 + +sources: + - name: raw + schema: "{{ target.schema }}" + tables: + - name: people +""" + + +class TestQueryDictionary: + @pytest.fixture(scope="class") + def seeds(self): + return { + "people.csv": PEOPLE_SEED_CSV, + "schema.yml": SEED_SCHEMA_YML, + } + + @pytest.fixture(scope="class") + def models(self): + return { + "hackers.sql": HACKERS_MODEL, + } + + def test_create_and_update(self, project): + run_dbt(["seed"]) + + result = project.run_sql("DESCRIBE TABLE people", fetch="all") + assert result[0][1] == "Int32" + + run_dbt() + result = project.run_sql("select count(distinct id) from hackers", fetch="all") + assert result[0][0] == 3 + + # insert some data and make sure it reaches the target dictionary + project.run_sql( + f""" + insert into people ("id", "name", "age", "department") + values 
(1232,'Dade',11,'engineering'), (9999,'Eugene',40,'malware');
            """
        )
        # force the dictionary to be rebuilt to include the new records in `people`
        project.run_sql("system reload dictionary hackers")
        result = project.run_sql("select count(distinct id) from hackers", fetch="all")
        assert result[0][0] == 5

        # re-run dbt, but this time with the new dictionary SQL
        run_vars = {"run_type": "extended_schema"}
        run_dbt(["run", "--vars", json.dumps(run_vars)])
        results = project.run_sql("select distinct hacker_alias from hackers", fetch="all")
        names = set(i[0] for i in results)
        assert names == set(["zero cool", "crash override", "acid burn", "the plague", "N/A"])


class TestTableDictionary:
    @pytest.fixture(scope="class")
    def seeds(self):
        return {
            "people.csv": PEOPLE_SEED_CSV,
            "schema.yml": SEED_SCHEMA_YML,
        }

    @pytest.fixture(scope="class")
    def models(self):
        return {"people_dict.sql": PEOPLE_DICT_MODEL}

    def test_create(self, project):
        run_dbt(["seed"])
        run_dbt()

        results = project.run_sql("select distinct name from people_dict", fetch="all")
        names = set(i[0] for i in results)
        assert names == set(["Dade", "Kate", "Ksenia"])


class TestHttpDictionary:
    @pytest.fixture(scope="class")
    def models(self):
        return {"taxi_zone_dictionary.sql": TAXI_ZONE_DICTIONARY}

    @pytest.mark.skipif(not testing_s3, reason='Testing S3 disabled')
    def test_create(self, project):
        run_dbt()

        results = project.run_sql(
            "select count(distinct LocationID) from taxi_zone_dictionary", fetch="all"
        )
        assert results[0][0] == 265

From 9e88d9f13f6eecdada703c122c35d68b91dca78f Mon Sep 17 00:00:00 2001
From: Rory Sawyer
Date: Fri, 8 Mar 2024 12:09:47 -0500
Subject: [PATCH 57/78] address lint issue in dictionary test

---
 tests/integration/adapter/dictionary/test_dictionary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/adapter/dictionary/test_dictionary.py b/tests/integration/adapter/dictionary/test_dictionary.py
index b65eff82..77ee1aae 100644
--- a/tests/integration/adapter/dictionary/test_dictionary.py
+++ b/tests/integration/adapter/dictionary/test_dictionary.py
@@ -141,7 +141,7 @@ def test_create_and_update(self, project):

         # insert some data and make sure it reaches the target dictionary
         project.run_sql(
-            f"""
+            """
             insert into people ("id", "name", "age", "department")
             values (1232,'Dade',11,'engineering'), (9999,'Eugene',40,'malware');
             """
         )

From b7226793795175c2285fbc0123c0d952ae29554b Mon Sep 17 00:00:00 2001
From: Rory Sawyer
Date: Fri, 8 Mar 2024 12:13:42 -0500
Subject: [PATCH 58/78] address lint issue with enum

---
 dbt/adapters/clickhouse/relation.py                | 2 +-
 .../macros/materializations/materialized_view.sql  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/dbt/adapters/clickhouse/relation.py b/dbt/adapters/clickhouse/relation.py
index 16c2ef32..90e88a63 100644
--- a/dbt/adapters/clickhouse/relation.py
+++ b/dbt/adapters/clickhouse/relation.py
@@ -54,7 +54,7 @@ def render(self) -> str:

     def derivative(self, suffix: str, relation_type: Optional[str] = None) -> BaseRelation:
         path = Path(schema=self.path.schema, database='', identifier=self.path.identifier + suffix)
-        derivative_type = ClickHouseRelationType[relation_type] if relation_type else self.type
+        derivative_type = ClickHouseRelationType(relation_type) if relation_type else self.type
         return ClickHouseRelation(type=derivative_type, path=path)

     def matches(
diff --git a/dbt/include/clickhouse/macros/materializations/materialized_view.sql 
b/dbt/include/clickhouse/macros/materializations/materialized_view.sql index 2ebe0dae..2736debe 100644 --- a/dbt/include/clickhouse/macros/materializations/materialized_view.sql +++ b/dbt/include/clickhouse/macros/materializations/materialized_view.sql @@ -6,7 +6,7 @@ {%- materialization materialized_view, adapter='clickhouse' -%} {%- set target_relation = this.incorporate(type='table') -%} - {%- set mv_relation = target_relation.derivative('_mv', 'MaterializedView') -%} + {%- set mv_relation = target_relation.derivative('_mv', 'materialized_view') -%} {%- set cluster_clause = on_cluster_clause(target_relation) -%} {# look for an existing relation for the target table and create backup relations if necessary #} @@ -87,7 +87,7 @@ {{ get_create_table_as_sql(False, relation, sql) }} {% endcall %} {%- set cluster_clause = on_cluster_clause(relation) -%} - {%- set mv_relation = relation.derivative('_mv', 'MaterializedView') -%} + {%- set mv_relation = relation.derivative('_mv', 'materialized_view') -%} {{ clickhouse__create_mv_sql(mv_relation, relation, cluster_clause, sql) }} {%- endmacro %} @@ -102,7 +102,7 @@ {% macro clickhouse__replace_mv(target_relation, existing_relation, intermediate_relation, backup_relation, sql) %} {# drop existing materialized view while we recreate the target table #} {%- set cluster_clause = on_cluster_clause(target_relation) -%} - {%- set mv_relation = target_relation.derivative('_mv', 'MaterializedView') -%} + {%- set mv_relation = target_relation.derivative('_mv', 'materialized_view') -%} {% call statement('drop existing mv') -%} drop view if exists {{ mv_relation }} {{ cluster_clause }} {%- endcall %} From 23b01e8c2c8230d7ac824346cecc9618583e1fa5 Mon Sep 17 00:00:00 2001 From: Dmitriy Sokolov Date: Sat, 16 Mar 2024 10:37:07 +0200 Subject: [PATCH 59/78] Fix model settings with custom materialization --- dbt/adapters/clickhouse/dbclient.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index 65ee1c64..31037f36 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -150,7 +150,8 @@ def _server_version(self): pass def update_model_settings(self, model_settings: Dict[str, str], materialization_type: str): - model_settings_to_add = copy.deepcopy(self._model_settings[materialization_type]) + settings = self._model_settings.get(materialization_type, {}) + model_settings_to_add = copy.deepcopy(settings) model_settings_to_add.update(self._model_settings['general']) for key, value in model_settings_to_add.items(): if key not in model_settings: From 5675b1d05ef91a318698c1ae5035660fb4d6f69f Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Sat, 23 Mar 2024 04:26:47 -0600 Subject: [PATCH 60/78] Release 1.7.4 housekeeping (#261) --- CHANGELOG.md | 6 ++++++ README.md | 4 ++++ dbt/adapters/clickhouse/__version__.py | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf3cdc04..9f2e8961 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +### Release [1.7.4], 2024-03-23 +#### Improvement +- Adds support for materializing ClickHouse dictionaries. Thanks to [Rory Sawyer](https://github.com/SoryRawyer) for the contribution! +See his excellent [tests](https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/dictionary/test_dictionary.py) +for example usage. 
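For a quick sense of what such a model looks like, here is a minimal sketch along the lines of those tests (the model and source names are illustrative, not taken from the patch):

    {{ config(
        materialized='dictionary',
        fields=[
            ('id', 'Int32'),
            ('name', 'String')
        ],
        primary_key='id',
        layout='HASHED()',
        lifetime='1',
        source_type='clickhouse'
    ) }}

    select id, name from {{ source('raw', 'people') }}

The `fields` pairs feed the column list of the generated CREATE DICTIONARY statement, and with `source_type='clickhouse'` the macro wraps the model's SELECT in a CLICKHOUSE(...) source clause built from the profile credentials.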
+
 ### Release [1.7.3], 2024-03-11
 #### Bug Fixes
 - Fixed an [issue](https://github.com/ClickHouse/dbt-clickhouse/issues/231) where passing settings on view creation didn't work.
diff --git a/README.md b/README.md
index 8e413f67..c38582e1 100644
--- a/README.md
+++ b/README.md
@@ -213,6 +213,10 @@ no corresponding REFRESH operation). Instead, it acts as an "insert trigger", a
(https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/materialized_view/test_materialized_view.py)
for an introductory example of how to use this functionality.

+# Dictionary materializations (experimental)
+See the tests in https://github.com/ClickHouse/dbt-clickhouse/blob/main/tests/integration/adapter/dictionary/test_dictionary.py for examples of how to
+implement materializations for ClickHouse dictionaries.
+
 # Distributed materializations

 Notes:
diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py
index b517477c..86d86eea 100644
--- a/dbt/adapters/clickhouse/__version__.py
+++ b/dbt/adapters/clickhouse/__version__.py
@@ -1 +1 @@
-version = '1.7.3'
+version = '1.7.4'

From 62efd62f0d9eea0e1a73a5024e0045d546b9a7b2 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sat, 23 Mar 2024 04:51:01 -0600
Subject: [PATCH 61/78] Bump black from 23.11.0 to 24.3.0 (#259)

Bumps [black](https://github.com/psf/black) from 23.11.0 to 24.3.0.
- [Release notes](https://github.com/psf/black/releases)
- [Changelog](https://github.com/psf/black/blob/main/CHANGES.md)
- [Commits](https://github.com/psf/black/compare/23.11.0...24.3.0)

---
updated-dependencies:
- dependency-name: black
  dependency-type: direct:development
...

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 dev_requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev_requirements.txt b/dev_requirements.txt
index fcadbaaf..257cf72c 100644
--- a/dev_requirements.txt
+++ b/dev_requirements.txt
@@ -4,7 +4,7 @@ clickhouse-driver>=0.2.6
 pytest>=7.2.0
 pytest-dotenv==0.5.2
 dbt-tests-adapter~=1.7.3
-black==23.11.0
+black==24.3.0
 isort==5.10.1
 mypy==0.991
 yamllint==1.26.3

From ef350a4cecd4c205b12e822c27f78ff8da336a7c Mon Sep 17 00:00:00 2001
From: Geoff Genz
Date: Tue, 2 Apr 2024 00:27:34 -0600
Subject: [PATCH 62/78] Release 1 7 5 (#265)

* Release 1.7.5 housekeeping
* Upgrade setuptools requirement for clickhouse_driver install
* Remove flake8 checks for the moment
* Update workflow actions
* Fix black comma
---
 .github/workflows/lint.yml             |  8 ++++----
 .github/workflows/pypi.yml             |  6 +++---
 .github/workflows/test_cloud.yml       |  4 ++--
 .github/workflows/test_matrix.yml      | 11 ++++++-----
 CHANGELOG.md                           |  6 ++++++
 Makefile                               |  8 ++------
 dbt/adapters/clickhouse/__version__.py |  2 +-
 dbt/adapters/clickhouse/dbclient.py    |  8 ++++----
 dev_requirements.txt                   | 13 ++++++-------
 pyproject.toml                         |  2 +-
 setup.py                               |  1 +
 11 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index f5f9a9bc..4286da28 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -16,12 +16,12 @@ jobs:

     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

-      - name: Setup Python 3.9
-        uses: actions/setup-python@v4
+      - name: Setup Python 3.11
+        uses: actions/setup-python@v5
         with:
-          python-version: 3.9
+          python-version: 3.11

       - name: Upgrade Setuptools
         run: pip install --upgrade setuptools 
wheel diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 6e4fcc6d..4a8915d6 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -16,12 +16,12 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: - python-version: "3.10" + python-version: "3.11" - name: Upgrade Setuptools run: pip install --upgrade setuptools wheel diff --git a/.github/workflows/test_cloud.yml b/.github/workflows/test_cloud.yml index d6403b1d..07d94bce 100644 --- a/.github/workflows/test_cloud.yml +++ b/.github/workflows/test_cloud.yml @@ -21,10 +21,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Python 3.11 - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.11' diff --git a/.github/workflows/test_matrix.yml b/.github/workflows/test_matrix.yml index 3da76748..224d17d3 100644 --- a/.github/workflows/test_matrix.yml +++ b/.github/workflows/test_matrix.yml @@ -26,16 +26,17 @@ jobs: - '3.9' - '3.10' - '3.11' + - '3.12' clickhouse-version: - - '23.3' - '23.8' - - '23.9' - - '23.10' + - '24.1' + - '24.2' + - '24.3' - latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set environment variables if: ${{ matrix.clickhouse-version == '22.3' }} @@ -49,7 +50,7 @@ jobs: run: REPLICA_NUM=1 docker-compose -f ${{ github.workspace }}/tests/integration/docker-compose.yml up -d - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f2e8961..02588474 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +### Release [1.7.5], 2024-04-02 +#### Bug Fixes +- Requirements and tests upgraded to include Python 3.12. Closes https://github.com/ClickHouse/dbt-clickhouse/issues/264 +- Model settings were not working correctly for customer materializations. Thanks to original dbt-clickhouse [silentsokolov](https://github.com/silentsokolov) +for the PR! + ### Release [1.7.4], 2024-03-23 #### Improvement - Adds support for materializing ClickHouse dictionaries. Thanks to [Rory Sawyer](https://github.com/SoryRawyer) for the contribution! diff --git a/Makefile b/Makefile index 1a0d7dfd..feb23310 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Check style and linting -.PHONY: check-black check-isort check-flake8 check-mypy lint +.PHONY: check-black check-isort check-mypy lint check-black: @echo "--> Running black checks" @@ -9,10 +9,6 @@ check-isort: @echo "--> Running isort checks" @isort --check-only . -check-flake8: - @echo "--> Running flake8 checks" - @flake8 . - check-mypy: @echo "--> Running mypy checks" @mypy --exclude dbt/adapters/clickhouse/__init__.py --exclude conftest.py . 
@@ -21,7 +17,7 @@ check-yamllint: @echo "--> Running yamllint checks" @yamllint dbt tests .github -lint: check-black check-isort check-flake8 check-mypy check-yamllint +lint: check-black check-isort check-mypy check-yamllint # Format code .PHONY: fmt diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py index 86d86eea..ee8593ed 100644 --- a/dbt/adapters/clickhouse/__version__.py +++ b/dbt/adapters/clickhouse/__version__.py @@ -1 +1 @@ -version = '1.7.4' +version = '1.7.5' diff --git a/dbt/adapters/clickhouse/dbclient.py b/dbt/adapters/clickhouse/dbclient.py index 31037f36..7bb78785 100644 --- a/dbt/adapters/clickhouse/dbclient.py +++ b/dbt/adapters/clickhouse/dbclient.py @@ -53,20 +53,20 @@ def get_db_client(credentials: ClickHouseCredentials): from dbt.adapters.clickhouse.nativeclient import ChNativeClient return ChNativeClient(credentials) - except ImportError: + except ImportError as ex: raise FailedToConnectError( 'Native adapter required but package clickhouse-driver is not installed' - ) + ) from ex try: import clickhouse_connect # noqa from dbt.adapters.clickhouse.httpclient import ChHttpClient return ChHttpClient(credentials) - except ImportError: + except ImportError as ex: raise FailedToConnectError( 'HTTP adapter required but package clickhouse-connect is not installed' - ) + ) from ex class ChRetryableException(Exception): diff --git a/dev_requirements.txt b/dev_requirements.txt index 257cf72c..9c72b0c5 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -1,16 +1,15 @@ -dbt-core~=1.7.3 -clickhouse-connect>=0.6.22 -clickhouse-driver>=0.2.6 +dbt-core~=1.7.11 +clickhouse-connect>=0.7.6 +clickhouse-driver>=0.2.7 pytest>=7.2.0 pytest-dotenv==0.5.2 -dbt-tests-adapter~=1.7.3 +dbt-tests-adapter~=1.7.11 black==24.3.0 isort==5.10.1 mypy==0.991 yamllint==1.26.3 -flake8==4.0.1 types-requests==2.27.29 agate~=1.7.1 requests~=2.27.1 -setuptools~=65.3.0 -types-setuptools==67.1.0.0 \ No newline at end of file +setuptools>=69.2.0 +types-setuptools>=69.2.0 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 68570715..842526f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.black] line-length = 100 skip-string-normalization = true -target-version = ['py310', 'py311'] +target-version = ['py310', 'py311', 'py312'] exclude = '(\.eggs|\.git|\.mypy_cache|\.venv|venv|env|_build|build|build|dist|)' [tool.isort] diff --git a/setup.py b/setup.py index 7beb9ba9..1958f8c5 100644 --- a/setup.py +++ b/setup.py @@ -57,6 +57,7 @@ def _dbt_clickhouse_version(): f'dbt-core~={dbt_version}', 'clickhouse-connect>=0.6.22', 'clickhouse-driver>=0.2.6', + 'setuptools>=0.69', ], python_requires=">=3.8", platforms='any', From 08cee8732a6988323c621dd4863923a90fbc9ce7 Mon Sep 17 00:00:00 2001 From: triou Date: Mon, 8 Apr 2024 11:34:33 +0200 Subject: [PATCH 63/78] fix(clients): add newlines around subquery when retrieving columns to avoid a syntax error (#262) --- dbt/adapters/clickhouse/httpclient.py | 7 ++++++- dbt/adapters/clickhouse/nativeclient.py | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/dbt/adapters/clickhouse/httpclient.py b/dbt/adapters/clickhouse/httpclient.py index 17795e44..6ece7bc0 100644 --- a/dbt/adapters/clickhouse/httpclient.py +++ b/dbt/adapters/clickhouse/httpclient.py @@ -25,7 +25,12 @@ def command(self, sql, **kwargs): def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: try: - query_result = self._client.query(f"SELECT * FROM ({sql}) LIMIT 0", **kwargs) 
+ query_result = self._client.query( + f"SELECT * FROM ( \n" + f"{sql} \n" + f") LIMIT 0", + **kwargs, + ) return [ ClickHouseColumn.create(name, ch_type.name) for name, ch_type in zip(query_result.column_names, query_result.column_types) diff --git a/dbt/adapters/clickhouse/nativeclient.py b/dbt/adapters/clickhouse/nativeclient.py index aaec97f9..dddc1ed9 100644 --- a/dbt/adapters/clickhouse/nativeclient.py +++ b/dbt/adapters/clickhouse/nativeclient.py @@ -35,7 +35,10 @@ def command(self, sql, **kwargs): def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: try: _, columns = self._client.execute( - f"SELECT * FROM ({sql}) LIMIT 0", with_column_types=True + f"SELECT * FROM ( \n" + f"{sql} \n" + f") LIMIT 0", + with_column_types=True, ) return [ClickHouseColumn.create(column[0], column[1]) for column in columns] except clickhouse_driver.errors.Error as ex: From 84c79a27f122cbe35975c26718fd0411ba4a7173 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Mon, 8 Apr 2024 06:21:20 -0600 Subject: [PATCH 64/78] Fix lint --- dbt/adapters/clickhouse/httpclient.py | 4 +--- dbt/adapters/clickhouse/nativeclient.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/dbt/adapters/clickhouse/httpclient.py b/dbt/adapters/clickhouse/httpclient.py index 6ece7bc0..a34e5a64 100644 --- a/dbt/adapters/clickhouse/httpclient.py +++ b/dbt/adapters/clickhouse/httpclient.py @@ -26,9 +26,7 @@ def command(self, sql, **kwargs): def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: try: query_result = self._client.query( - f"SELECT * FROM ( \n" - f"{sql} \n" - f") LIMIT 0", + f"SELECT * FROM ( \n" f"{sql} \n" f") LIMIT 0", **kwargs, ) return [ diff --git a/dbt/adapters/clickhouse/nativeclient.py b/dbt/adapters/clickhouse/nativeclient.py index dddc1ed9..772c8e96 100644 --- a/dbt/adapters/clickhouse/nativeclient.py +++ b/dbt/adapters/clickhouse/nativeclient.py @@ -35,9 +35,7 @@ def command(self, sql, **kwargs): def columns_in_query(self, sql: str, **kwargs) -> List[ClickHouseColumn]: try: _, columns = self._client.execute( - f"SELECT * FROM ( \n" - f"{sql} \n" - f") LIMIT 0", + f"SELECT * FROM ( \n" f"{sql} \n" f") LIMIT 0", with_column_types=True, ) return [ClickHouseColumn.create(column[0], column[1]) for column in columns] From 0e4ef7050d86aab7295f52bddd8235b96d42b356 Mon Sep 17 00:00:00 2001 From: Daniel Reeves <31971762+dwreeves@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:19:26 -0400 Subject: [PATCH 65/78] lazy load agate (#263) --- dbt/adapters/clickhouse/connections.py | 18 +++++++++----- dbt/adapters/clickhouse/impl.py | 34 +++++++++++++++----------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/dbt/adapters/clickhouse/connections.py b/dbt/adapters/clickhouse/connections.py index dcb411f8..6881e86f 100644 --- a/dbt/adapters/clickhouse/connections.py +++ b/dbt/adapters/clickhouse/connections.py @@ -1,9 +1,8 @@ import re import time from contextlib import contextmanager -from typing import Any, Optional, Tuple, Union +from typing import Any, Optional, Tuple, Union, TYPE_CHECKING -import agate import dbt.exceptions from dbt.adapters.sql import SQLConnectionManager from dbt.contracts.connection import AdapterResponse, Connection @@ -11,6 +10,9 @@ from dbt.adapters.clickhouse.dbclient import ChRetryableException, get_db_client from dbt.adapters.clickhouse.logger import logger +if TYPE_CHECKING: + import agate + retryable_exceptions = [ChRetryableException] ddl_re = re.compile(r'^\s*(CREATE|DROP|ALTER)\s', re.IGNORECASE) @@ -60,21 +62,23 @@ 
def release(self): pass # There is no "release" type functionality in the existing ClickHouse connectors @classmethod - def get_table_from_response(cls, response, column_names) -> agate.Table: + def get_table_from_response(cls, response, column_names) -> "agate.Table": """ Build agate table from response. :param response: ClickHouse query result :param column_names: Table column names """ + from dbt.clients.agate_helper import table_from_data_flat + data = [] for row in response: data.append(dict(zip(column_names, row))) - return dbt.clients.agate_helper.table_from_data_flat(data, column_names) + return table_from_data_flat(data, column_names) def execute( self, sql: str, auto_begin: bool = False, fetch: bool = False, limit: Optional[int] = None - ) -> Tuple[AdapterResponse, agate.Table]: + ) -> Tuple[AdapterResponse, "agate.Table"]: # Don't try to fetch result of clustered DDL responses, we don't know what to do with them if fetch and ddl_re.match(sql): fetch = False @@ -97,7 +101,9 @@ def execute( query_result.result_set, query_result.column_names ) else: - table = dbt.clients.agate_helper.empty_table() + from dbt.clients.agate_helper import empty_table + + table = empty_table() return AdapterResponse(_message=status), table def add_query( diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index 33cb31dd..ea679383 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -1,9 +1,8 @@ import csv import io from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union, TYPE_CHECKING -import agate from dbt.adapters.base import AdapterConfig, available from dbt.adapters.base.impl import BaseAdapter, ConstraintSupport from dbt.adapters.base.relation import BaseRelation, InformationSchema @@ -31,6 +30,9 @@ from dbt.adapters.clickhouse.relation import ClickHouseRelation, ClickHouseRelationType from dbt.adapters.clickhouse.util import NewColumnDataType, compare_versions +if TYPE_CHECKING: + import agate + GET_CATALOG_MACRO_NAME = 'get_catalog' LIST_SCHEMAS_MACRO_NAME = 'list_schemas' @@ -73,29 +75,31 @@ def date_function(cls): return 'now()' @classmethod - def convert_text_type(cls, agate_table: agate.Table, col_idx: int) -> str: + def convert_text_type(cls, agate_table: "agate.Table", col_idx: int) -> str: return 'String' @classmethod - def convert_number_type(cls, agate_table: agate.Table, col_idx: int) -> str: + def convert_number_type(cls, agate_table: "agate.Table", col_idx: int) -> str: + import agate + decimals = agate_table.aggregate(agate.MaxPrecision(col_idx)) # We match these type to the Column.TYPE_LABELS for consistency return 'Float32' if decimals else 'Int32' @classmethod - def convert_boolean_type(cls, agate_table: agate.Table, col_idx: int) -> str: + def convert_boolean_type(cls, agate_table: "agate.Table", col_idx: int) -> str: return 'Bool' @classmethod - def convert_datetime_type(cls, agate_table: agate.Table, col_idx: int) -> str: + def convert_datetime_type(cls, agate_table: "agate.Table", col_idx: int) -> str: return 'DateTime' @classmethod - def convert_date_type(cls, agate_table: agate.Table, col_idx: int) -> str: + def convert_date_type(cls, agate_table: "agate.Table", col_idx: int) -> str: return 'Date' @classmethod - def convert_time_type(cls, agate_table: agate.Table, col_idx: int) -> str: + def convert_time_type(cls, agate_table: "agate.Table", col_idx: int) -> str: raise 
NotImplementedError('`convert_time_type` is not implemented for this adapter!') @available.parse(lambda *a, **k: {}) @@ -308,13 +312,15 @@ def get_ch_database(self, schema: str): except DbtRuntimeError: return None - def get_catalog(self, manifest) -> Tuple[agate.Table, List[Exception]]: + def get_catalog(self, manifest) -> Tuple["agate.Table", List[Exception]]: + from dbt.clients.agate_helper import empty_table + relations = self._get_catalog_relations(manifest) schemas = set(relation.schema for relation in relations) if schemas: catalog = self._get_one_catalog(InformationSchema(Path()), schemas, manifest) else: - catalog = dbt.clients.agate_helper.empty_table() + catalog = empty_table() return catalog, [] def get_filtered_catalog( @@ -324,7 +330,7 @@ def get_filtered_catalog( if relations and catalog: relation_map = {(r.schema, r.identifier) for r in relations} - def in_map(row: agate.Row): + def in_map(row: "agate.Row"): s = _expect_row_value("table_schema", row) i = _expect_row_value("table_name", row) return (s, i) in relation_map @@ -488,17 +494,17 @@ class ClickHouseDatabase: comment: str -def _expect_row_value(key: str, row: agate.Row): +def _expect_row_value(key: str, row: "agate.Row"): if key not in row.keys(): raise DbtInternalError(f'Got a row without \'{key}\' column, columns: {row.keys()}') return row[key] -def _catalog_filter_schemas(manifest: Manifest) -> Callable[[agate.Row], bool]: +def _catalog_filter_schemas(manifest: Manifest) -> Callable[["agate.Row"], bool]: schemas = frozenset((None, s) for d, s in manifest.get_used_schemas()) - def test(row: agate.Row) -> bool: + def test(row: "agate.Row") -> bool: table_database = _expect_row_value('table_database', row) table_schema = _expect_row_value('table_schema', row) if table_schema is None: From 1c9a15d3b60200ed4c2c99bbd3f4b638632cd018 Mon Sep 17 00:00:00 2001 From: Cristhian Garcia Date: Mon, 8 Apr 2024 15:08:38 -0500 Subject: [PATCH 66/78] feat: add TTL support (#254) --- README.md | 2 + dbt/adapters/clickhouse/impl.py | 1 + .../macros/materializations/table.sql | 7 +++ .../clickhouse/test_clickhouse_table_ttl.py | 62 +++++++++++++++++++ 4 files changed, 72 insertions(+) create mode 100644 tests/integration/adapter/clickhouse/test_clickhouse_table_ttl.py diff --git a/README.md b/README.md index c38582e1..68409378 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,7 @@ your_profile_name: | incremental_predicates | Additional conditions to be applied to the incremental materialization (only applied to `delete+insert` strategy | | | settings | A map/dictionary of "TABLE" settings to be used to DDL statements like 'CREATE TABLE' with this model | | | query_settings | A map/dictionary of ClickHouse user level settings to be used with `INSERT` or `DELETE` statements in conjunction with this model | | +| ttl | A TTL expression to be used with the table. The TTL expression is a string that can be used to specify the TTL for the table. | | ## ClickHouse Cluster @@ -179,6 +180,7 @@ The following macros are included to facilitate creating ClickHouse specific tab - `order_cols` -- Uses the `order_by` model configuration to assign a ClickHouse order by/sorting key. If not specified ClickHouse will use an empty tuple() and the table will be unsorted - `primary_key_clause` -- Uses the `primary_key` model configuration property to assign a ClickHouse primary key. By default, primary key is set and ClickHouse will use the order by clause as the primary key. 
- `on_cluster_clause` -- Uses the `cluster` profile property to add an `ON CLUSTER` clause to certain dbt-operations: distributed materializations, views creation, database creation. +- `ttl_config` -- Uses the `ttl` model configuration property to assign a ClickHouse table TTL expression. No TTL is assigned by default. ### s3Source Helper Macro diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index ea679383..27ef4842 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -43,6 +43,7 @@ class ClickHouseConfig(AdapterConfig): order_by: Optional[Union[List[str], str]] = 'tuple()' partition_by: Optional[Union[List[str], str]] = None sharding_key: Optional[Union[List[str], str]] = 'rand()' + ttl: Optional[Union[List[str], str]] = None class ClickHouseAdapter(SQLAdapter): diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index 72cc72c8..d9b4c49b 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -121,6 +121,12 @@ {%- endif %} {%- endmacro -%} +{% macro ttl_config(label) %} + {%- if config.get("ttl")%} + {{ label }} {{ config.get("ttl") }} + {%- endif %} +{%- endmacro -%} + {% macro on_cluster_clause(relation, force_sync) %} {% set active_cluster = adapter.get_clickhouse_cluster_name() %} {%- if active_cluster is not none and relation.should_on_cluster %} @@ -170,6 +176,7 @@ {{ order_cols(label="order by") }} {{ primary_key_clause(label="primary key") }} {{ partition_cols(label="partition by") }} + {{ ttl_config(label="ttl")}} {{ adapter.get_model_settings(model) }} {%- if not has_contract %} diff --git a/tests/integration/adapter/clickhouse/test_clickhouse_table_ttl.py b/tests/integration/adapter/clickhouse/test_clickhouse_table_ttl.py new file mode 100644 index 00000000..6c7d8295 --- /dev/null +++ b/tests/integration/adapter/clickhouse/test_clickhouse_table_ttl.py @@ -0,0 +1,62 @@ +import time +from datetime import datetime + +import pytest +from dbt.tests.adapter.basic.files import model_base, schema_base_yml +from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations +from dbt.tests.util import relation_from_name, run_dbt + + +class TestTableTTL(BaseSimpleMaterializations): + @pytest.fixture(scope="class") + def models(self): + config_materialized_table = """ + {{ config( + order_by='(some_date, id, name)', + engine='MergeTree()', + materialized='table', + settings={'allow_nullable_key': 1}, + ttl='some_date + INTERVAL 5 SECONDS', + query_settings={'allow_nondeterministic_mutations': 1}) + }} + """ + base_table_sql = config_materialized_table + model_base + return { + "table_model.sql": base_table_sql, + "schema.yml": schema_base_yml, + } + + def test_base(self, project): + # seed command + results = run_dbt(["seed"]) + # seed result length + assert len(results) == 1 + + # run command + results = run_dbt() + # run result length + assert len(results) == 1 + + # base table rowcount + relation = relation_from_name(project.adapter, "table_model") + result = project.run_sql(f"select count(*) as num_rows from {relation}", fetch="one") + # the dates from the seed are too old, so those are expired + assert result[0] == 0 + + # insert new data + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + project.run_sql(f"insert into {relation} (*) values (11, 'Elian', '{now}')") + + result = project.run_sql(f"select count(*) as num_rows from {relation}", fetch="one") + # 
the dates from the seed are too old, so those are expired + assert result[0] == 1 + + # wait for TTL to expire + time.sleep(6) + + # optimize table + project.run_sql(f"OPTIMIZE TABLE {relation} FINAL") + + # make sure is empty + result = project.run_sql(f"select count(*) as num_rows from {relation}", fetch="one") + assert result[0] == 0 From 6a6531aab9434af23cfd6d69bb2a6fd6b9679474 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Mon, 8 Apr 2024 14:11:29 -0600 Subject: [PATCH 67/78] Fix lint --- dbt/adapters/clickhouse/connections.py | 2 +- dbt/adapters/clickhouse/impl.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dbt/adapters/clickhouse/connections.py b/dbt/adapters/clickhouse/connections.py index 6881e86f..17f18e1f 100644 --- a/dbt/adapters/clickhouse/connections.py +++ b/dbt/adapters/clickhouse/connections.py @@ -1,7 +1,7 @@ import re import time from contextlib import contextmanager -from typing import Any, Optional, Tuple, Union, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Optional, Tuple, Union import dbt.exceptions from dbt.adapters.sql import SQLConnectionManager diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index 27ef4842..cd6e23a8 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -1,7 +1,7 @@ import csv import io from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple, Union from dbt.adapters.base import AdapterConfig, available from dbt.adapters.base.impl import BaseAdapter, ConstraintSupport From d94f93b07f75c0b01f0c2a2b351de01a2a59bb86 Mon Sep 17 00:00:00 2001 From: Thomas Schmidt Date: Sat, 13 Apr 2024 01:01:53 +0200 Subject: [PATCH 68/78] Update table relation after exchange command (#230) Related to https://github.com/ClickHouse/dbt-clickhouse/issues/226 --- .../macros/materializations/distributed_table.sql | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/distributed_table.sql b/dbt/include/clickhouse/macros/materializations/distributed_table.sql index 9f920ad9..92acadf6 100644 --- a/dbt/include/clickhouse/macros/materializations/distributed_table.sql +++ b/dbt/include/clickhouse/macros/materializations/distributed_table.sql @@ -58,8 +58,8 @@ {% do run_query(create_empty_table_from_relation(intermediate_relation, view_relation)) or '' %} {{ adapter.rename_relation(existing_relation_local, backup_relation) }} {{ adapter.rename_relation(intermediate_relation, target_relation_local) }} - {{ create_distributed_table(target_relation, target_relation_local) }} - {% endif %} + {% endif %} + {% do run_query(create_distributed_table(target_relation, target_relation_local)) or '' %} {% do run_query(clickhouse__insert_into(target_relation, sql)) or '' %} {{ drop_relation_if_exists(view_relation) }} -- cleanup @@ -85,7 +85,7 @@ {%- set cluster = cluster[1:-1] -%} {%- set sharding = config.get('sharding_key') -%} - create table {{ relation }} {{ on_cluster_clause(relation) }} as {{ local_relation }} + create or replace table {{ relation }} {{ on_cluster_clause(relation) }} as {{ local_relation }} ENGINE = Distributed('{{ cluster}}', '{{ relation.schema }}', '{{ local_relation.name }}' {%- if sharding is not none and sharding.strip() != '' -%} , {{ sharding }} @@ -98,6 +98,7 @@ {% macro create_empty_table_from_relation(relation, source_relation) -%} {%- set 
sql_header = config.get('sql_header', none) -%} {%- set columns = adapter.get_columns_in_relation(source_relation) | list -%} + {%- set col_list = [] -%} {% for col in columns %} @@ -109,7 +110,7 @@ {{ on_cluster_clause(relation) }} ( {{col_list | join(', ')}} ) - + {{ engine_clause() }} {{ order_cols(label="order by") }} {{ primary_key_clause(label="primary key") }} From 4d7f4af281ac606e504fb18de7107a36cba20ec5 Mon Sep 17 00:00:00 2001 From: Cristhian Garcia Date: Fri, 12 Apr 2024 18:14:16 -0500 Subject: [PATCH 69/78] feat: allow to add connection overrides for dictionaries (#267) --- dbt/adapters/clickhouse/impl.py | 11 +++++++++-- .../clickhouse/macros/materializations/dictionary.sql | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index cd6e23a8..7fc27d5f 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -458,17 +458,24 @@ def format_columns(self, columns) -> List[Dict]: return [{'name': column.name, 'data_type': column.dtype} for column in columns] @available - def get_credentials(self) -> Dict: + def get_credentials(self, connection_overrides) -> Dict: conn = self.connections.get_if_exists() if conn is None or conn.credentials is None: return dict() - return { + credentials = { 'user': conn.credentials.user, 'password': conn.credentials.password, 'database': conn.credentials.database, 'host': conn.credentials.host, 'port': conn.credentials.port, } + credentials.update(connection_overrides) + + for key in connection_overrides.keys(): + if not credentials[key]: + credentials.pop(key) + + return credentials @classmethod def render_raw_columns_constraints(cls, raw_columns: Dict[str, Dict[str, Any]]) -> List: diff --git a/dbt/include/clickhouse/macros/materializations/dictionary.sql b/dbt/include/clickhouse/macros/materializations/dictionary.sql index 7d226238..ef8e64e5 100644 --- a/dbt/include/clickhouse/macros/materializations/dictionary.sql +++ b/dbt/include/clickhouse/macros/materializations/dictionary.sql @@ -79,7 +79,7 @@ {% macro clickhouse_source(sql) %} - {%- set credentials = adapter.get_credentials() -%} + {%- set credentials = adapter.get_credentials(config.get("connection_overrides", {})) -%} {%- set table = config.get('table') -%} CLICKHOUSE( user '{{ credentials.get("user") }}' From 63b94fc9b1457ea05705a289b363584844879a65 Mon Sep 17 00:00:00 2001 From: Geoff Genz Date: Fri, 12 Apr 2024 19:03:12 -0600 Subject: [PATCH 70/78] Housekeeping for 1.7.6 release (#268) --- CHANGELOG.md | 19 ++++++++++++++++++- dbt/adapters/clickhouse/__version__.py | 2 +- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 02588474..0910b2f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,24 @@ +### Release [1.7.6], 2024-04-12 +#### Bug Fix +- A bug in (experimental) Distributed Table model creation could lead to errors when there was a change in the model definition (see, e.g., +https://github.com/ClickHouse/dbt-clickhouse/issues/226). Thanks to [Thomas Schmidt](https://github.com/Somtom) for the Fix! +- A comment at the end of a model would break the query used to retrieve the result column datatypes. Thanks to [triou](https://github.com/tevariou) +for the bug report and the fix. 
Closes https://github.com/ClickHouse/dbt-clickhouse/issues/256
+
+#### Improvements
+- The new materialization for ClickHouse dictionaries now takes an optional "credentials dictionary" argument that overrides the
+global credentials values for user, password, database, host, and port (including removing any of those values by adding empty values if not needed).
+This allows better control over creating dictionaries on different servers. Thanks to [Cristhian Garcia](https://github.com/Ian2012)
+for the PR!
+- A new `ttl` setting has been added to the model configuration that will insert the provided ClickHouse TTL expression in the appropriate place.
+Thanks to [Evan Rusackas](https://github.com/rusackas) for the contribution!
+- The Agate library should now be lazy loaded. This should modestly improve dbt startup times (after dbt-clickhouse is upgraded to dbt 1.8.x).
+Thanks to [Daniel Reeves](https://github.com/dwreeves) for the PR.
+
 ### Release [1.7.5], 2024-04-02
 #### Bug Fixes
 - Requirements and tests upgraded to include Python 3.12. Closes https://github.com/ClickHouse/dbt-clickhouse/issues/264
-- Model settings were not working correctly for customer materializations. Thanks to original dbt-clickhouse [silentsokolov](https://github.com/silentsokolov)
+- Model settings were not working correctly for custom materializations. Thanks to original dbt-clickhouse [silentsokolov](https://github.com/silentsokolov)
 for the PR!

 ### Release [1.7.4], 2024-03-23
diff --git a/dbt/adapters/clickhouse/__version__.py b/dbt/adapters/clickhouse/__version__.py
index ee8593ed..a50e7bab 100644
--- a/dbt/adapters/clickhouse/__version__.py
+++ b/dbt/adapters/clickhouse/__version__.py
@@ -1 +1 @@
-version = '1.7.5'
+version = '1.7.6'

From 3a0ba3629beb4e634c8c33bce81f0a5e7a399137 Mon Sep 17 00:00:00 2001
From: Bentsi Leviav
Date: Mon, 15 Apr 2024 11:34:00 +0300
Subject: [PATCH 71/78] Revert "allows to add a comment in table's or view's metadata"

---
 dbt/include/clickhouse/macros/materializations/table.sql | 5 -----
 dbt/include/clickhouse/macros/materializations/view.sql  | 6 ------
 2 files changed, 11 deletions(-)

diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql
index 5f6e234d..d9b4c49b 100644
--- a/dbt/include/clickhouse/macros/materializations/table.sql
+++ b/dbt/include/clickhouse/macros/materializations/table.sql
@@ -188,11 +188,6 @@
     )
   {%- endif %}
   {%- endif %}
-
-  {% set comment = config.get('comment') %}
-  {% if comment %}
-    COMMENT '{{ comment }}'
-  {%- endif %}
 {%- endmacro %}

diff --git a/dbt/include/clickhouse/macros/materializations/view.sql b/dbt/include/clickhouse/macros/materializations/view.sql
index e6f1e80b..dfb28787 100644
--- a/dbt/include/clickhouse/macros/materializations/view.sql
+++ b/dbt/include/clickhouse/macros/materializations/view.sql
@@ -79,12 +79,6 @@
 as (
   {{ sql }}
 )
-
-  {% set comment = config.get('comment') %}
-  {% if comment %}
-    COMMENT '{{ comment }}'
-  {%- endif %}
-
  {% if model.get('config').get('materialized') == 'view' %}
    {{ adapter.get_model_settings(model) }}
  {%- endif %}

From 52db8586e7727a8d4334450fedaf6288c6554d33 Mon Sep 17 00:00:00 2001
From: scrawfor
Date: Tue, 16 Apr 2024 10:42:10 -0400
Subject: [PATCH 72/78] Fix bool_or behavior (#270)

---
 dbt/include/clickhouse/macros/utils/utils.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbt/include/clickhouse/macros/utils/utils.sql b/dbt/include/clickhouse/macros/utils/utils.sql
index d81e6da3..d2f7a614 
100644 --- a/dbt/include/clickhouse/macros/utils/utils.sql +++ b/dbt/include/clickhouse/macros/utils/utils.sql @@ -34,7 +34,7 @@ {% macro clickhouse__bool_or(expression) -%} - any({{ expression }}) > 0 + max({{ expression }}) > 0 {%- endmacro %} From c781b098f3dc5891d7eda5b864eed1ed2d538722 Mon Sep 17 00:00:00 2001 From: Robin Norgren <68205730+rjoelnorgren@users.noreply.github.com> Date: Fri, 26 Apr 2024 16:04:10 -0700 Subject: [PATCH 73/78] feat: support column codecs --- dbt/adapters/clickhouse/impl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index 7fc27d5f..1fc1ae7a 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -481,7 +481,10 @@ def get_credentials(self, connection_overrides) -> Dict: def render_raw_columns_constraints(cls, raw_columns: Dict[str, Dict[str, Any]]) -> List: rendered_columns = [] for v in raw_columns.values(): - rendered_columns.append(f"{quote_identifier(v['name'])} {v['data_type']}") + codec = f"CODEC({_codec})" if (_codec := v.get('codec')) else "" + rendered_columns.append( + f"{quote_identifier(v['name'])} {v['data_type']} {codec}".rstrip() + ) if v.get("constraints"): warn_or_error(ConstraintNotSupported(constraint='column', adapter='clickhouse')) return rendered_columns From 7f7c8dff09384e9e46baa7c9cf90e705d0db314a Mon Sep 17 00:00:00 2001 From: Robin Norgren <68205730+rjoelnorgren@users.noreply.github.com> Date: Sat, 27 Apr 2024 09:14:52 -0700 Subject: [PATCH 74/78] Use Column.data_type in ClickHouseAdapter.format_columns --- dbt/adapters/clickhouse/impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/adapters/clickhouse/impl.py b/dbt/adapters/clickhouse/impl.py index 7fc27d5f..415397ce 100644 --- a/dbt/adapters/clickhouse/impl.py +++ b/dbt/adapters/clickhouse/impl.py @@ -455,7 +455,7 @@ def get_column_schema_from_query(self, sql: str, *_) -> List[ClickHouseColumn]: @available.parse_none def format_columns(self, columns) -> List[Dict]: - return [{'name': column.name, 'data_type': column.dtype} for column in columns] + return [{'name': column.name, 'data_type': column.data_type} for column in columns] @available def get_credentials(self, connection_overrides) -> Dict: From 6addd50febb155fa8b847c7e9e1f409659240c84 Mon Sep 17 00:00:00 2001 From: Robin Norgren <68205730+rjoelnorgren@users.noreply.github.com> Date: Mon, 29 Apr 2024 19:59:54 -0700 Subject: [PATCH 75/78] Always apply query_settings in clickhouse__insert_into macro --- dbt/include/clickhouse/macros/materializations/table.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/include/clickhouse/macros/materializations/table.sql b/dbt/include/clickhouse/macros/materializations/table.sql index d9b4c49b..4b436212 100644 --- a/dbt/include/clickhouse/macros/materializations/table.sql +++ b/dbt/include/clickhouse/macros/materializations/table.sql @@ -202,6 +202,6 @@ SELECT {{ dest_cols_csv }} FROM ( {{ sql }} ) {%- else -%} {{ sql }} - {{ adapter.get_model_query_settings(model) }} {%- endif -%} + {{ adapter.get_model_query_settings(model) }} {%- endmacro %} From c996503503e295c756709c9de98f8b0fb52d11ee Mon Sep 17 00:00:00 2001 From: Robin Norgren <68205730+rjoelnorgren@users.noreply.github.com> Date: Fri, 17 May 2024 20:33:51 -0700 Subject: [PATCH 76/78] Add ClickHouseColumn.is_low_cardinality --- dbt/adapters/clickhouse/column.py | 53 ++++++++++++++++++------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git 
a/dbt/adapters/clickhouse/column.py b/dbt/adapters/clickhouse/column.py index 43df6aa3..393d0eaa 100644 --- a/dbt/adapters/clickhouse/column.py +++ b/dbt/adapters/clickhouse/column.py @@ -17,7 +17,9 @@ class ClickHouseColumn(Column): 'INTEGER': 'Int32', } is_nullable: bool = False - _brackets_regex = re.compile(r'^(Nullable|LowCardinality)\((.*)\)$') + is_low_cardinality: bool = False + _low_card_regex = re.compile(r'^LowCardinality\((.*)\)$') + _nullable_regex = re.compile(r'^Nullable\((.*)\)$') _fix_size_regex = re.compile(r'FixedString\((.*?)\)') _decimal_regex = re.compile(r'Decimal\((\d+), (\d+)\)') @@ -26,13 +28,7 @@ def __init__(self, column: str, dtype: str) -> None: numeric_precision = None numeric_scale = None - inner_dtype = self.match_brackets(dtype) - if inner_dtype: - dtype = inner_dtype - if not self.is_nullable: - # Support LowCardinality(Nullable(dtype)) - inner_dtype = self.match_brackets(dtype) - dtype = inner_dtype if inner_dtype else dtype + dtype = self._inner_dtype(dtype) if dtype.lower().startswith('fixedstring'): match_sized = self._fix_size_regex.search(dtype) @@ -56,18 +52,15 @@ def __repr__(self) -> str: def data_type(self) -> str: if self.is_string(): data_t = self.string_type(self.string_size()) - if self.is_nullable: - return "Nullable({})".format(data_t) - return data_t elif self.is_numeric(): data_t = self.numeric_type(self.dtype, self.numeric_precision, self.numeric_scale) - if self.is_nullable: - return "Nullable({})".format(data_t) - return data_t else: - if self.is_nullable: - return "Nullable({})".format(self.dtype) - return self.dtype + data_t = self.dtype + + if self.is_nullable or self.is_low_cardinality: + data_t = self.nested_type(data_t, self.is_low_cardinality, self.is_nullable) + + return data_t def is_string(self) -> bool: return self.dtype.lower() in [ @@ -111,6 +104,15 @@ def string_type(cls, size: int) -> str: def numeric_type(cls, dtype: str, precision: Any, scale: Any) -> str: return f'Decimal({precision}, {scale})' + @classmethod + def nested_type(cls, dtype: str, is_low_cardinality: bool, is_nullable: bool) -> str: + template = "{}" + if is_low_cardinality: + template = template.format("LowCardinality({})") + if is_nullable: + template = template.format("Nullable({})") + return template.format(dtype) + def literal(self, value): return f'to{self.dtype}({value})' @@ -120,8 +122,15 @@ def can_expand_to(self, other_column: 'Column') -> bool: return other_column.string_size() > self.string_size() - def match_brackets(self, dtype): - match = self._brackets_regex.search(dtype.strip()) - if match: - self.is_nullable = match.group(1) == 'Nullable' - return match.group(2) + def _inner_dtype(self, dtype) -> str: + inner_dtype = dtype.strip() + + if low_card_match := self._low_card_regex.search(inner_dtype): + self.is_low_cardinality = True + inner_dtype = low_card_match.group(1) + + if null_match := self._nullable_regex.search(inner_dtype): + self.is_nullable = True + inner_dtype = null_match.group(1) + + return inner_dtype From 888b531f010c471bc3e8fd9c8493e41f93d94336 Mon Sep 17 00:00:00 2001 From: Robin Norgren <68205730+rjoelnorgren@users.noreply.github.com> Date: Fri, 17 May 2024 20:34:19 -0700 Subject: [PATCH 77/78] Update column type test cases for LowCardinality --- .../adapter/column_types/test_column_types.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/integration/adapter/column_types/test_column_types.py b/tests/integration/adapter/column_types/test_column_types.py index 52e9d6f9..8fca387f 
100644 --- a/tests/integration/adapter/column_types/test_column_types.py +++ b/tests/integration/adapter/column_types/test_column_types.py @@ -52,10 +52,16 @@ def test_array_type(self): def test_low_cardinality_nullable_type(self): col = ClickHouseColumn(column='name', dtype='LowCardinality(Nullable(String))') verify_column_types(col, True, False, False, False) - assert repr(col) == '' + assert ( + repr(col) + == '' + ) col = ClickHouseColumn(column='name', dtype='LowCardinality(Nullable(FixedString(16)))') verify_column_types(col, True, False, False, False) - assert repr(col) == '' + assert ( + repr(col) + == '' + ) def test_map_type(self): col = ClickHouseColumn(column='name', dtype='Map(String, UInt64)') @@ -88,7 +94,8 @@ def verify_column( low_cardinality_col = ClickHouseColumn(column=name, dtype=f'LowCardinality({dtype})') verify_column_types(low_cardinality_col, is_string, is_numeric, is_float, is_int) assert ( - repr(low_cardinality_col) == f'' + repr(low_cardinality_col) + == f'' ) return col From 9e82f58ff01640d2f66e7f027aa3a1021aef3441 Mon Sep 17 00:00:00 2001 From: Robin Norgren <68205730+rjoelnorgren@users.noreply.github.com> Date: Sat, 18 May 2024 09:56:12 -0700 Subject: [PATCH 78/78] Omit empty dictionary connection_overrides from materialization DDL --- .../clickhouse/macros/materializations/dictionary.sql | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dbt/include/clickhouse/macros/materializations/dictionary.sql b/dbt/include/clickhouse/macros/materializations/dictionary.sql index ef8e64e5..4d03027e 100644 --- a/dbt/include/clickhouse/macros/materializations/dictionary.sql +++ b/dbt/include/clickhouse/macros/materializations/dictionary.sql @@ -82,16 +82,18 @@ {%- set credentials = adapter.get_credentials(config.get("connection_overrides", {})) -%} {%- set table = config.get('table') -%} CLICKHOUSE( + {% if credentials.get("user") -%} user '{{ credentials.get("user") }}' - {% if credentials.get("password") != '' -%} + {%- endif %} + {% if credentials.get("password") -%} password '{{ credentials.get("password") }}' {%- endif %} - {% if credentials.get("database") != '' -%} + {% if credentials.get("database") -%} db '{{ credentials.get("database") }}' {%- endif %} - {% if credentials.get("host") != '' and credentials.get("host") != 'localhost' -%} + {% if credentials.get("host") and credentials.get("host") != 'localhost' -%} host '{{ credentials.get("host") }}' - {% if credentials.get("port") != '' -%} + {% if credentials.get("port") -%} port '{{ credentials.get("port") }}' {%- endif %} {%- endif %}
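Taken together with the connection override support added to get_credentials in PATCH 69, the macro above lets a dictionary read from a different ClickHouse server while omitting any credential keys it does not need. A minimal sketch of such a model follows (the host name and model body are hypothetical, not taken from these patches):

    {{ config(
        materialized='dictionary',
        fields=[
            ('id', 'Int32'),
            ('name', 'String')
        ],
        primary_key='id',
        layout='HASHED()',
        lifetime='1',
        source_type='clickhouse',
        table='people',
        connection_overrides={'host': 'dict-source.internal', 'password': ''}
    ) }}

    select 1

Because the empty password override is popped inside get_credentials and the macro only emits keys that are present and non-empty, the generated SOURCE clause contains the overridden host but no password entry; and since `table` is set, the dictionary reads directly from that table while the placeholder `select 1` body is ignored.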