From d8957ea4ea3810b4b4ae590f88e19ad5591cccad Mon Sep 17 00:00:00 2001 From: Brent Moran Date: Wed, 19 May 2021 18:05:27 +0800 Subject: [PATCH 1/6] bring over inference code from old branch The merge with master was too hairy, started a new branch. --- db/tests/types/test_inference.py | 73 +++++++++++++++++++++++++++++++ db/types/alteration.py | 21 ++++----- db/types/inference.py | 75 ++++++++++++++++++++++++++++++++ 3 files changed, 159 insertions(+), 10 deletions(-) create mode 100644 db/tests/types/test_inference.py create mode 100644 db/types/inference.py diff --git a/db/tests/types/test_inference.py b/db/tests/types/test_inference.py new file mode 100644 index 0000000000..44b0a26dae --- /dev/null +++ b/db/tests/types/test_inference.py @@ -0,0 +1,73 @@ +import pytest +from sqlalchemy import Column, MetaData, Table +from sqlalchemy import BOOLEAN, String, NUMERIC, VARCHAR +from sqlalchemy.schema import CreateSchema, DropSchema +from db.engine import _add_custom_types_to_engine +from db.types import base, inference, install + +TEST_SCHEMA = "test_schema" + + +@pytest.fixture +def engine_with_types(engine): + _add_custom_types_to_engine(engine) + return engine + + +@pytest.fixture +def temporary_testing_schema(engine_with_types): + schema = TEST_SCHEMA + with engine_with_types.begin() as conn: + conn.execute(CreateSchema(schema)) + yield engine_with_types, schema + with engine_with_types.begin() as conn: + conn.execute(DropSchema(schema, cascade=True, if_exists=True)) + + +@pytest.fixture +def engine_email_type(temporary_testing_schema): + engine, schema = temporary_testing_schema + install.install_mathesar_on_database(engine) + yield engine, schema + with engine.begin() as conn: + conn.execute(DropSchema(base.SCHEMA, cascade=True, if_exists=True)) + + +type_data_list = [ + (String, ["t", "false", "true", "f", "f"], BOOLEAN), + (String, ["t", "false", "2", "0"], VARCHAR), + (String, ["a", "cat", "mat", "bat"], VARCHAR), + (String, ["2", "1", "0", "0"], NUMERIC), +] + + +@pytest.mark.parametrize("type_,value_list,expect_type", type_data_list) +def test_type_inference(engine_email_type, type_, value_list, expect_type): + engine, schema = engine_email_type + TEST_TABLE = "test_table" + TEST_COLUMN = "test_column" + metadata = MetaData(bind=engine) + print(value_list) + input_table = Table( + TEST_TABLE, + metadata, + Column(TEST_COLUMN, type_), + schema=schema + ) + input_table.create() + for value in value_list: + ins = input_table.insert(values=(value,)) + with engine.begin() as conn: + conn.execute(ins) + inference.infer_column_type( + schema, + TEST_TABLE, + TEST_COLUMN, + engine + ) + with engine.begin(): + metadata = MetaData(bind=engine, schema=schema) + actual_type = Table( + TEST_TABLE, metadata, schema=schema, autoload_with=engine, + ).columns[TEST_COLUMN].type.__class__ + assert actual_type == expect_type diff --git a/db/types/alteration.py b/db/types/alteration.py index 2cb115f6ae..ef0dea79f2 100644 --- a/db/types/alteration.py +++ b/db/types/alteration.py @@ -2,9 +2,10 @@ from db.types import base, email BOOLEAN = "boolean" -EMAIL = email.QUALIFIED_EMAIL +EMAIL = "email" INTERVAL = "interval" NUMERIC = "numeric" +STRING = "string" TEXT = "text" VARCHAR = "varchar" @@ -13,12 +14,12 @@ def get_supported_alter_column_types(engine): dialect_types = engine.dialect.ischema_names type_map = { # Default Postgres types - "boolean": dialect_types.get("boolean"), - "interval": dialect_types.get("interval"), - "numeric": dialect_types.get("numeric"), - "string": dialect_types.get("name"), + BOOLEAN: dialect_types.get("boolean"), + INTERVAL: dialect_types.get("interval"), + NUMERIC: dialect_types.get("numeric"), + STRING: dialect_types.get("name"), # Custom Mathesar types - "email": dialect_types.get(email.QUALIFIED_EMAIL) + EMAIL: dialect_types.get(email.QUALIFIED_EMAIL) } return {k: v for k, v in type_map.items() if v is not None} @@ -141,18 +142,18 @@ def create_email_casts(engine): DOMAIN). """ type_body_map = { - EMAIL: """ + email.QUALIFIED_EMAIL: """ BEGIN RETURN $1; END; """, TEXT: f""" BEGIN - RETURN $1::{EMAIL}; + RETURN $1::{email.QUALIFIED_EMAIL}; END; """, } - create_cast_functions(EMAIL, type_body_map, engine) + create_cast_functions(email.QUALIFIED_EMAIL, type_body_map, engine) def create_interval_casts(engine): @@ -209,7 +210,7 @@ def create_varchar_casts(engine): RETURN $1::{VARCHAR}; END; """, - EMAIL: f""" + email.QUALIFIED_EMAIL: f""" BEGIN RETURN $1::{VARCHAR}; END; diff --git a/db/types/inference.py b/db/types/inference.py new file mode 100644 index 0000000000..3f2498c00c --- /dev/null +++ b/db/types/inference.py @@ -0,0 +1,75 @@ +import logging +from sqlalchemy import MetaData, Table +from sqlalchemy import VARCHAR, TEXT, Text +from db.types import alteration + +logging.basicConfig( + format='%(asctime)s - %(name)s - %(levelname)s: %(message)s', + level=logging.INFO +) + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +TYPE_INFERENCE_DAG = { + alteration.BOOLEAN: [], + alteration.EMAIL: [], + alteration.INTERVAL: [], + alteration.NUMERIC: [], + alteration.STRING: [ + alteration.BOOLEAN, + alteration.NUMERIC, + alteration.INTERVAL, + alteration.EMAIL, + ], +} + + +class DagCycleError(Exception): + pass + + +def infer_column_type( + schema, + table_name, + column_name, + engine, + depth=0, + type_inference_dag=TYPE_INFERENCE_DAG, +): + if depth > 100: + raise DagCycleError("The type_inference_dag likely has a cycle") + supported_types = alteration.get_supported_alter_column_types(engine) + reverse_type_map = { + Text: alteration.STRING, + TEXT: alteration.STRING, + VARCHAR: alteration.STRING, + } + reverse_type_map.update({v: k for k, v in supported_types.items()}) + with engine.begin(): + metadata = MetaData(bind=engine, schema=schema) + column_type = Table( + table_name, metadata, schema=schema, autoload_with=engine, + ).columns[column_name].type.__class__ + column_type_str = reverse_type_map.get(column_type) + logger.debug(f"column_type_str: {column_type_str}") + for type_str in type_inference_dag.get(column_type_str, []): + try: + alteration.alter_column_type( + schema, table_name, column_name, type_str, engine, + ) + logger.info(f"Column {column_name} altered to type {type_str}") + column_type = infer_column_type( + schema, + table_name, + column_name, + engine, + depth=depth + 1, + type_inference_dag=type_inference_dag, + ) + break + except Exception: + logger.info( + f"Cannot alter column {column_name} to type {type_str}" + ) + return column_type From 1bcdeb822b7a5034b1edd9ee0cc0bca23bed794f Mon Sep 17 00:00:00 2001 From: Brent Moran Date: Wed, 19 May 2021 18:26:40 +0800 Subject: [PATCH 2/6] allow '0' and '1' strings to cast to boolean --- db/tests/types/test_alteration.py | 2 -- db/types/alteration.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/db/tests/types/test_alteration.py b/db/tests/types/test_alteration.py index 67f820c743..74a2ca040e 100644 --- a/db/tests/types/test_alteration.py +++ b/db/tests/types/test_alteration.py @@ -153,8 +153,6 @@ def test_alter_column_type_casts_column_data( type_test_bad_data_list = [ - (String, "boolean", "0"), - (String, "boolean", "1"), (String, "boolean", "cat"), (String, "interval", "1 potato"), (String, "interval", "3"), diff --git a/db/types/alteration.py b/db/types/alteration.py index ef0dea79f2..385c464aed 100644 --- a/db/types/alteration.py +++ b/db/types/alteration.py @@ -80,8 +80,8 @@ def create_boolean_casts(engine): DECLARE istrue {BOOLEAN}; BEGIN - SELECT lower($1)='t' OR lower($1)='true' INTO istrue; - IF istrue OR lower($1)='f' OR lower($1)='false' THEN + SELECT lower($1)='t' OR lower($1)='true' OR $1='1' INTO istrue; + IF istrue OR lower($1)='f' OR lower($1)='false' OR $1='0' THEN RETURN istrue; END IF; {not_bool_exception_str} From a9cc190b9ffefd55411092ba276f386c1b4349ff Mon Sep 17 00:00:00 2001 From: Brent Moran Date: Wed, 19 May 2021 18:27:29 +0800 Subject: [PATCH 3/6] let numerics be inferred to be boolean --- db/types/inference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/db/types/inference.py b/db/types/inference.py index 3f2498c00c..36f7e61c12 100644 --- a/db/types/inference.py +++ b/db/types/inference.py @@ -15,7 +15,9 @@ alteration.BOOLEAN: [], alteration.EMAIL: [], alteration.INTERVAL: [], - alteration.NUMERIC: [], + alteration.NUMERIC: [ + alteration.BOOLEAN, + ], alteration.STRING: [ alteration.BOOLEAN, alteration.NUMERIC, From 82a9db8a034dbce7523c148d4fd99ca830e8acd3 Mon Sep 17 00:00:00 2001 From: Brent Moran Date: Wed, 19 May 2021 21:48:54 +0800 Subject: [PATCH 4/6] add tests to make sure non-0,1 numerics stay numeric --- db/tests/types/test_inference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/db/tests/types/test_inference.py b/db/tests/types/test_inference.py index 44b0a26dae..f21cf83a9d 100644 --- a/db/tests/types/test_inference.py +++ b/db/tests/types/test_inference.py @@ -1,6 +1,6 @@ import pytest from sqlalchemy import Column, MetaData, Table -from sqlalchemy import BOOLEAN, String, NUMERIC, VARCHAR +from sqlalchemy import BOOLEAN, Numeric, NUMERIC, String, VARCHAR from sqlalchemy.schema import CreateSchema, DropSchema from db.engine import _add_custom_types_to_engine from db.types import base, inference, install @@ -34,6 +34,8 @@ def engine_email_type(temporary_testing_schema): type_data_list = [ + (Numeric, [0, 2, 1, 0], NUMERIC), + (Numeric, [0, 1, 1, 0], BOOLEAN), (String, ["t", "false", "true", "f", "f"], BOOLEAN), (String, ["t", "false", "2", "0"], VARCHAR), (String, ["a", "cat", "mat", "bat"], VARCHAR), From 4f1a3ad68d896e84ca8fcd8650194e8a4227bea0 Mon Sep 17 00:00:00 2001 From: Brent Moran Date: Thu, 20 May 2021 17:24:16 +0800 Subject: [PATCH 5/6] clean up logging setup, remove debug print --- db/tests/types/test_inference.py | 1 - db/types/inference.py | 6 ------ 2 files changed, 7 deletions(-) diff --git a/db/tests/types/test_inference.py b/db/tests/types/test_inference.py index f21cf83a9d..7e75ad6270 100644 --- a/db/tests/types/test_inference.py +++ b/db/tests/types/test_inference.py @@ -49,7 +49,6 @@ def test_type_inference(engine_email_type, type_, value_list, expect_type): TEST_TABLE = "test_table" TEST_COLUMN = "test_column" metadata = MetaData(bind=engine) - print(value_list) input_table = Table( TEST_TABLE, metadata, diff --git a/db/types/inference.py b/db/types/inference.py index 36f7e61c12..f9c4296fdd 100644 --- a/db/types/inference.py +++ b/db/types/inference.py @@ -3,13 +3,7 @@ from sqlalchemy import VARCHAR, TEXT, Text from db.types import alteration -logging.basicConfig( - format='%(asctime)s - %(name)s - %(levelname)s: %(message)s', - level=logging.INFO -) - logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) TYPE_INFERENCE_DAG = { alteration.BOOLEAN: [], From 851009cfe72935adbc2f086fded0daba45d8c096 Mon Sep 17 00:00:00 2001 From: Brent Moran Date: Thu, 20 May 2021 17:57:08 +0800 Subject: [PATCH 6/6] use more specific error for control flow --- db/types/inference.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/db/types/inference.py b/db/types/inference.py index f9c4296fdd..e7965952d4 100644 --- a/db/types/inference.py +++ b/db/types/inference.py @@ -1,6 +1,7 @@ import logging from sqlalchemy import MetaData, Table from sqlalchemy import VARCHAR, TEXT, Text +from sqlalchemy.exc import DatabaseError from db.types import alteration logger = logging.getLogger(__name__) @@ -64,7 +65,9 @@ def infer_column_type( type_inference_dag=type_inference_dag, ) break - except Exception: + # It's expected we catch this error when the test to see whether + # a type is appropriate for a column fails. + except DatabaseError: logger.info( f"Cannot alter column {column_name} to type {type_str}" )