Merge pull request #135 from centerofci/infer_column_type_two

Infer column type two
mathesar-foundation · May 20, 2021 · 5e8b206 · 5e8b206
2 parents 202485f + 851009c
commit 5e8b206
Show file tree

Hide file tree

Showing 4 changed files with 161 additions and 14 deletions.
diff --git a/db/tests/types/test_alteration.py b/db/tests/types/test_alteration.py
@@ -153,8 +153,6 @@ def test_alter_column_type_casts_column_data(
 
 
 type_test_bad_data_list = [
-    (String, "boolean", "0"),
-    (String, "boolean", "1"),
     (String, "boolean", "cat"),
     (String, "interval", "1 potato"),
     (String, "interval", "3"),

diff --git a/db/tests/types/test_inference.py b/db/tests/types/test_inference.py
@@ -0,0 +1,74 @@
+import pytest
+from sqlalchemy import Column, MetaData, Table
+from sqlalchemy import BOOLEAN, Numeric, NUMERIC, String, VARCHAR
+from sqlalchemy.schema import CreateSchema, DropSchema
+from db.engine import _add_custom_types_to_engine
+from db.types import base, inference, install
+
+TEST_SCHEMA = "test_schema"
+
+
+@pytest.fixture
+def engine_with_types(engine):
+    _add_custom_types_to_engine(engine)
+    return engine
+
+
+@pytest.fixture
+def temporary_testing_schema(engine_with_types):
+    schema = TEST_SCHEMA
+    with engine_with_types.begin() as conn:
+        conn.execute(CreateSchema(schema))
+    yield engine_with_types, schema
+    with engine_with_types.begin() as conn:
+        conn.execute(DropSchema(schema, cascade=True, if_exists=True))
+
+
+@pytest.fixture
+def engine_email_type(temporary_testing_schema):
+    engine, schema = temporary_testing_schema
+    install.install_mathesar_on_database(engine)
+    yield engine, schema
+    with engine.begin() as conn:
+        conn.execute(DropSchema(base.SCHEMA, cascade=True, if_exists=True))
+
+
+type_data_list = [
+    (Numeric, [0, 2, 1, 0], NUMERIC),
+    (Numeric, [0, 1, 1, 0], BOOLEAN),
+    (String, ["t", "false", "true", "f", "f"], BOOLEAN),
+    (String, ["t", "false", "2", "0"], VARCHAR),
+    (String, ["a", "cat", "mat", "bat"], VARCHAR),
+    (String, ["2", "1", "0", "0"], NUMERIC),
+]
+
+
+@pytest.mark.parametrize("type_,value_list,expect_type", type_data_list)
+def test_type_inference(engine_email_type, type_, value_list, expect_type):
+    engine, schema = engine_email_type
+    TEST_TABLE = "test_table"
+    TEST_COLUMN = "test_column"
+    metadata = MetaData(bind=engine)
+    input_table = Table(
+        TEST_TABLE,
+        metadata,
+        Column(TEST_COLUMN, type_),
+        schema=schema
+    )
+    input_table.create()
+    for value in value_list:
+        ins = input_table.insert(values=(value,))
+        with engine.begin() as conn:
+            conn.execute(ins)
+    inference.infer_column_type(
+        schema,
+        TEST_TABLE,
+        TEST_COLUMN,
+        engine
+    )
+    with engine.begin():
+        metadata = MetaData(bind=engine, schema=schema)
+        actual_type = Table(
+            TEST_TABLE, metadata, schema=schema, autoload_with=engine,
+        ).columns[TEST_COLUMN].type.__class__
+    assert actual_type == expect_type
diff --git a/db/types/alteration.py b/db/types/alteration.py
@@ -2,9 +2,10 @@
 from db.types import base, email
 
 BOOLEAN = "boolean"
-EMAIL = email.QUALIFIED_EMAIL
+EMAIL = "email"
 INTERVAL = "interval"
 NUMERIC = "numeric"
+STRING = "string"
 TEXT = "text"
 VARCHAR = "varchar"
 
@@ -13,12 +14,12 @@ def get_supported_alter_column_types(engine):
     dialect_types = engine.dialect.ischema_names
     type_map = {
         # Default Postgres types
-        "boolean": dialect_types.get("boolean"),
-        "interval": dialect_types.get("interval"),
-        "numeric": dialect_types.get("numeric"),
-        "string": dialect_types.get("name"),
+        BOOLEAN: dialect_types.get("boolean"),
+        INTERVAL: dialect_types.get("interval"),
+        NUMERIC: dialect_types.get("numeric"),
+        STRING: dialect_types.get("name"),
         # Custom Mathesar types
-        "email": dialect_types.get(email.QUALIFIED_EMAIL)
+        EMAIL: dialect_types.get(email.QUALIFIED_EMAIL)
     }
     return {k: v for k, v in type_map.items() if v is not None}
 
@@ -79,8 +80,8 @@ def create_boolean_casts(engine):
         DECLARE
         istrue {BOOLEAN};
         BEGIN
-          SELECT lower($1)='t' OR lower($1)='true' INTO istrue;
-          IF istrue OR lower($1)='f' OR lower($1)='false' THEN
+          SELECT lower($1)='t' OR lower($1)='true' OR $1='1' INTO istrue;
+          IF istrue OR lower($1)='f' OR lower($1)='false' OR $1='0' THEN
             RETURN istrue;
           END IF;
           {not_bool_exception_str}
@@ -141,18 +142,18 @@ def create_email_casts(engine):
                      DOMAIN).
     """
     type_body_map = {
-        EMAIL: """
+        email.QUALIFIED_EMAIL: """
         BEGIN
           RETURN $1;
         END;
         """,
         TEXT: f"""
         BEGIN
-          RETURN $1::{EMAIL};
+          RETURN $1::{email.QUALIFIED_EMAIL};
         END;
         """,
     }
-    create_cast_functions(EMAIL, type_body_map, engine)
+    create_cast_functions(email.QUALIFIED_EMAIL, type_body_map, engine)
 
 
 def create_interval_casts(engine):
@@ -209,7 +210,7 @@ def create_varchar_casts(engine):
           RETURN $1::{VARCHAR};
         END;
         """,
-        EMAIL: f"""
+        email.QUALIFIED_EMAIL: f"""
         BEGIN
           RETURN $1::{VARCHAR};
         END;

diff --git a/db/types/inference.py b/db/types/inference.py
@@ -0,0 +1,74 @@
+import logging
+from sqlalchemy import MetaData, Table
+from sqlalchemy import VARCHAR, TEXT, Text
+from sqlalchemy.exc import DatabaseError
+from db.types import alteration
+
+logger = logging.getLogger(__name__)
+
+TYPE_INFERENCE_DAG = {
+    alteration.BOOLEAN: [],
+    alteration.EMAIL: [],
+    alteration.INTERVAL: [],
+    alteration.NUMERIC: [
+        alteration.BOOLEAN,
+    ],
+    alteration.STRING: [
+        alteration.BOOLEAN,
+        alteration.NUMERIC,
+        alteration.INTERVAL,
+        alteration.EMAIL,
+    ],
+}
+
+
+class DagCycleError(Exception):
+    pass
+
+
+def infer_column_type(
+        schema,
+        table_name,
+        column_name,
+        engine,
+        depth=0,
+        type_inference_dag=TYPE_INFERENCE_DAG,
+):
+    if depth > 100:
+        raise DagCycleError("The type_inference_dag likely has a cycle")
+    supported_types = alteration.get_supported_alter_column_types(engine)
+    reverse_type_map = {
+        Text: alteration.STRING,
+        TEXT: alteration.STRING,
+        VARCHAR: alteration.STRING,
+    }
+    reverse_type_map.update({v: k for k, v in supported_types.items()})
+    with engine.begin():
+        metadata = MetaData(bind=engine, schema=schema)
+        column_type = Table(
+            table_name, metadata, schema=schema, autoload_with=engine,
+        ).columns[column_name].type.__class__
+    column_type_str = reverse_type_map.get(column_type)
+    logger.debug(f"column_type_str: {column_type_str}")
+    for type_str in type_inference_dag.get(column_type_str, []):
+        try:
+            alteration.alter_column_type(
+                schema, table_name, column_name, type_str, engine,
+            )
+            logger.info(f"Column {column_name} altered to type {type_str}")
+            column_type = infer_column_type(
+                schema,
+                table_name,
+                column_name,
+                engine,
+                depth=depth + 1,
+                type_inference_dag=type_inference_dag,
+            )
+            break
+        # It's expected we catch this error when the test to see whether
+        # a type is appropriate for a column fails.
+        except DatabaseError:
+            logger.info(
+                f"Cannot alter column {column_name} to type {type_str}"
+            )
+    return column_type