diff --git a/README.md b/README.md index 32260cf..5de7192 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,88 @@ You might want to configure your editor to treat `.ibis` files as normal Python }, ``` +## Column name casing +`dbt-ibis` relies on the default Ibis behavior when it comes to quoting column names and writing them as upper or lowercase which can depend on your database. However, for databases such as Snowflake which store case-insensitive identifiers in all uppercase letters, this might not always be the most convenient for you to write dbt models. This section shows with an example what different case conventions can mean and how you can configure `dbt-ibis` to deal with it. For databases where identifiers are always case-insensitive, e.g. DuckDB, you can skip this part. + +For the columns, for which `dbt-ibis` loads the data types from the `.yml` files (see above), it assumes that the column name appears exactly in the database as they are specified in the `.yml` file. Taking the following example: + +```yml +models: + - name: customers + columns: + - name: customer_id + data_type: integer + - name: customer_name + data_type: varchar +``` +and a dbt model which references this table: + +```python +from dbt_ibis import ref, depends_on + +@depends_on(ref("customers")) +def model(customers): + return customers.select("customer_id") +``` +This will be rendered as the following query if you're using Snowflake: + +```sql +SELECT + t0."customer_id" +FROM {{ ref('customers') }} as t0 +``` + +If the column identifier is stored as case-insensitive, this query will fail as the lowercase column `"customer_id"` does not exist. To fix this, you'll have to write the column names in the `.yml` file in uppercase: + +```yml +models: + - name: customers + columns: + - name: CUSTOMER_ID + data_type: integer + - name: CUSTOMER_NAME + data_type: varchar +``` + +and also change it in the model + +```python +@depends_on(ref("customers")) +def model(customers): + return customers.select("CUSTOMER_ID") +``` + +If you want to keep using lowercase column names in your model but case-insensitive (i.e. uppercase) identifiers in the database, it would look something like this: + +```python +@depends_on(ref("customers")) +def model(customers): + customers = customers.rename("snake_case") + customers = customers.select("customer_id") + customers = customers.rename("ALL_CAPS") + return customers +``` + +This is rather cumbersome to do for every model and many of us are used to work with lowercase column names as a convention. To simplify the process, you can tell `dbt-ibis` to do these conversions for you. Going back to our original example of using all lowercase names in the `.yml` file as well as in the model, you can make that work by setting the following variables in your `dbt_project.yml` file: + +```yml +vars: + dbt_ibis_letter_case_in_db_jaffle_shop_prod: upper + dbt_ibis_letter_case_in_model: lower +``` +This tells `dbt-ibis` that in the database, uppercase letters should be used and can be expected, and that in your dbt model you want to use lowercase letters. Both variables accept `upper` and `lower` as values. In addition, the first variable is specific to a profile (`jaffle_shop`) and target (`prod`) following the format `dbt_ibis_letter_case_in_db_{profile}_{target}`. This allows you to set different conventions for different databases. If your `prod` target points to a Snowflake database and `dev` to a local duckdb file, you could do: + +```yml +vars: + dbt_ibis_letter_case_in_db_jaffle_shop_prod: upper + dbt_ibis_letter_case_in_db_jaffle_shop_dev: lower + dbt_ibis_letter_case_in_model: lower +``` + +If all of this sounds confusing, I'd recommend to play around with the different configurations and run `dbt-ibis precompile` to inspect the generated SQL. If you have any questions, feel free to open an Issue in this repository. + +See [this GitHub issue](https://github.com/ibis-project/ibis/issues/6772) for some further explanations and examples on case handling in Ibis and Snowflake. + ## Limitations * There is no database connection available in the Ibis `model` functions. Hence, you cannot use Ibis functionality which would require this. * For non-Ibis models, seeds, snapshots, and for sources, you need to specify the data types of the columns. See "Basic example" above. diff --git a/dbt_ibis/__init__.py b/dbt_ibis/__init__.py index 2caf31d..14a82a7 100644 --- a/dbt_ibis/__init__.py +++ b/dbt_ibis/__init__.py @@ -1,5 +1,5 @@ __all__ = ["ref", "source", "depends_on", "compile_ibis_to_sql_models"] -__version__ = "0.5.0dev" +__version__ = "0.5.0" import graphlib import logging @@ -45,6 +45,7 @@ _RefLookup = dict[str, Union[ModelNode, SeedNode, SnapshotNode]] _SourcesLookup = dict[str, dict[str, SourceDefinition]] +_LetterCase = Literal["lower", "upper"] def _configure_logging(logger: logging.Logger) -> None: @@ -289,6 +290,10 @@ def compile_ibis_to_sql_models(dbt_parse_arguments: Optional[list[str]] = None) ref_infos, source_infos = _extract_ref_and_source_infos(manifest) + letter_case_in_db, letter_case_in_model = _get_letter_case_conversion_rules( + runtime_config + ) + # Schemas of the Ibis models themselves in case they are referenced # by other downstream Ibis models ibis_model_schemas: dict[str, ibis.Schema] = {} @@ -298,7 +303,10 @@ def compile_ibis_to_sql_models(dbt_parse_arguments: Optional[list[str]] = None) for r in ibis_model.depends_on: if isinstance(r, source): schema = _get_schema_for_source( - r, source_infos, ibis_dialect=ibis_dialect + r, + source_infos, + ibis_dialect=ibis_dialect, + letter_case_in_db=letter_case_in_db, ) elif isinstance(r, ref): schema = _get_schema_for_ref( @@ -306,13 +314,21 @@ def compile_ibis_to_sql_models(dbt_parse_arguments: Optional[list[str]] = None) ref_infos, ibis_model_schemas=ibis_model_schemas, ibis_dialect=ibis_dialect, + letter_case_in_db=letter_case_in_db, ) else: raise ValueError(f"Unknown reference type: {type(r)}") ibis_table = r.to_ibis(schema) + ibis_table = _set_letter_case_on_ibis_expression( + ibis_table, letter_case_in_model + ) references.append(ibis_table) ibis_expr = ibis_model.model_func(*references) + ibis_expr = _set_letter_case_on_ibis_expression( + ibis_expr, letter_case_in_db + ) + ibis_model_schemas[ibis_model.name] = ibis_expr.schema() # Convert to SQL and write to file @@ -322,6 +338,16 @@ def compile_ibis_to_sql_models(dbt_parse_arguments: Optional[list[str]] = None) logger.info("Finished compiling Ibis models to SQL") +def _set_letter_case_on_ibis_expression( + ibis_expr: ibis.expr.types.Table, letter_case: Optional[_LetterCase] +) -> ibis.expr.types.Table: + if letter_case == "upper": + ibis_expr = ibis_expr.rename("ALL_CAPS") + elif letter_case == "lower": + ibis_expr = ibis_expr.rename("snake_case") + return ibis_expr + + def _invoke_parse_customized( dbt_parse_arguments: Optional[list[str]], ) -> tuple[Manifest, RuntimeConfig]: @@ -445,11 +471,46 @@ def _extract_ref_and_source_infos( return ref_lookup, dict(sources_lookup) +def _get_letter_case_conversion_rules( + runtime_config: RuntimeConfig, +) -> tuple[Optional[_LetterCase], Optional[_LetterCase]]: + # Variables as defined in e.g. dbt_project.yml + dbt_project_vars = runtime_config.vars.vars + project_name = runtime_config.project_name + target_name = runtime_config.target_name + + in_db_var_name = f"dbt_ibis_letter_case_in_db_{project_name}_{target_name}" + in_model_var_name = "dbt_ibis_letter_case_in_model" + + in_db_raw = dbt_project_vars.get(in_db_var_name, None) + in_model_raw = dbt_project_vars.get(in_model_var_name, None) + in_db = _validate_letter_case_var(in_db_var_name, in_db_raw) + in_model = _validate_letter_case_var(in_model_var_name, in_model_raw) + return in_db, in_model + + +def _validate_letter_case_var(variable_name: str, value: Any) -> Optional[_LetterCase]: + if value is not None and value not in ["lower", "upper"]: + raise ValueError( + f"The {variable_name} variable needs to be set to" + + f" either 'lower' or 'upper' but currently has a value of '{value}'." + + " If you want the default behaviour of Ibis, you can omit this variable." + ) + return value + + def _get_schema_for_source( - source: source, source_infos: _SourcesLookup, ibis_dialect: _dialects.IbisDialect + source: source, + source_infos: _SourcesLookup, + ibis_dialect: _dialects.IbisDialect, + letter_case_in_db: Optional[_LetterCase] = None, ) -> ibis.Schema: columns = source_infos[source.source_name][source.table_name].columns - return _columns_to_ibis_schema(columns, ibis_dialect=ibis_dialect) + return _columns_to_ibis_schema( + columns, + ibis_dialect=ibis_dialect, + letter_case_in_db=letter_case_in_db, + ) def _get_schema_for_ref( @@ -457,6 +518,7 @@ def _get_schema_for_ref( ref_infos: _RefLookup, ibis_model_schemas: dict[str, ibis.Schema], ibis_dialect: _dialects.IbisDialect, + letter_case_in_db: Optional[_LetterCase] = None, ) -> ibis.Schema: schema: Optional[ibis.Schema] = None columns_with_missing_data_types: list[ColumnInfo] = [] @@ -483,7 +545,11 @@ def _get_schema_for_ref( c for c in columns.values() if c.data_type is None ] if not columns_with_missing_data_types: - schema = _columns_to_ibis_schema(columns, ibis_dialect=ibis_dialect) + schema = _columns_to_ibis_schema( + columns, + ibis_dialect=ibis_dialect, + letter_case_in_db=letter_case_in_db, + ) # Do not yet raise an error if there are missing data types as we might # be able to get the schema from the Ibis model itself @@ -509,13 +575,18 @@ def _get_schema_for_ref( def _columns_to_ibis_schema( - columns: dict[str, ColumnInfo], ibis_dialect: _dialects.IbisDialect + columns: dict[str, ColumnInfo], + ibis_dialect: _dialects.IbisDialect, + letter_case_in_db: Optional[_LetterCase] = None, ) -> ibis.Schema: schema_dict: dict[str, dt.DataType] = {} for c in columns.values(): if c.data_type is None: raise ValueError(f"Could not determine data type for column '{c.name}'") - schema_dict[c.name] = _dialects.parse_db_dtype_to_ibis_dtype( + column_name = c.name + if letter_case_in_db is not None: + column_name = getattr(column_name, letter_case_in_db)() + schema_dict[column_name] = _dialects.parse_db_dtype_to_ibis_dtype( c.data_type, ibis_dialect=ibis_dialect ) return ibis.schema(schema_dict)