Skip to content

Commit

Permalink
Merge pull request #135 from centerofci/infer_column_type_two
Browse files Browse the repository at this point in the history
Infer column type two
  • Loading branch information
mathemancer authored May 20, 2021
2 parents 202485f + 851009c commit 5e8b206
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 14 deletions.
2 changes: 0 additions & 2 deletions db/tests/types/test_alteration.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,6 @@ def test_alter_column_type_casts_column_data(


type_test_bad_data_list = [
(String, "boolean", "0"),
(String, "boolean", "1"),
(String, "boolean", "cat"),
(String, "interval", "1 potato"),
(String, "interval", "3"),
Expand Down
74 changes: 74 additions & 0 deletions db/tests/types/test_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import pytest
from sqlalchemy import Column, MetaData, Table
from sqlalchemy import BOOLEAN, Numeric, NUMERIC, String, VARCHAR
from sqlalchemy.schema import CreateSchema, DropSchema
from db.engine import _add_custom_types_to_engine
from db.types import base, inference, install

TEST_SCHEMA = "test_schema"


@pytest.fixture
def engine_with_types(engine):
_add_custom_types_to_engine(engine)
return engine


@pytest.fixture
def temporary_testing_schema(engine_with_types):
schema = TEST_SCHEMA
with engine_with_types.begin() as conn:
conn.execute(CreateSchema(schema))
yield engine_with_types, schema
with engine_with_types.begin() as conn:
conn.execute(DropSchema(schema, cascade=True, if_exists=True))


@pytest.fixture
def engine_email_type(temporary_testing_schema):
engine, schema = temporary_testing_schema
install.install_mathesar_on_database(engine)
yield engine, schema
with engine.begin() as conn:
conn.execute(DropSchema(base.SCHEMA, cascade=True, if_exists=True))


type_data_list = [
(Numeric, [0, 2, 1, 0], NUMERIC),
(Numeric, [0, 1, 1, 0], BOOLEAN),
(String, ["t", "false", "true", "f", "f"], BOOLEAN),
(String, ["t", "false", "2", "0"], VARCHAR),
(String, ["a", "cat", "mat", "bat"], VARCHAR),
(String, ["2", "1", "0", "0"], NUMERIC),
]


@pytest.mark.parametrize("type_,value_list,expect_type", type_data_list)
def test_type_inference(engine_email_type, type_, value_list, expect_type):
engine, schema = engine_email_type
TEST_TABLE = "test_table"
TEST_COLUMN = "test_column"
metadata = MetaData(bind=engine)
input_table = Table(
TEST_TABLE,
metadata,
Column(TEST_COLUMN, type_),
schema=schema
)
input_table.create()
for value in value_list:
ins = input_table.insert(values=(value,))
with engine.begin() as conn:
conn.execute(ins)
inference.infer_column_type(
schema,
TEST_TABLE,
TEST_COLUMN,
engine
)
with engine.begin():
metadata = MetaData(bind=engine, schema=schema)
actual_type = Table(
TEST_TABLE, metadata, schema=schema, autoload_with=engine,
).columns[TEST_COLUMN].type.__class__
assert actual_type == expect_type
25 changes: 13 additions & 12 deletions db/types/alteration.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
from db.types import base, email

BOOLEAN = "boolean"
EMAIL = email.QUALIFIED_EMAIL
EMAIL = "email"
INTERVAL = "interval"
NUMERIC = "numeric"
STRING = "string"
TEXT = "text"
VARCHAR = "varchar"

Expand All @@ -13,12 +14,12 @@ def get_supported_alter_column_types(engine):
dialect_types = engine.dialect.ischema_names
type_map = {
# Default Postgres types
"boolean": dialect_types.get("boolean"),
"interval": dialect_types.get("interval"),
"numeric": dialect_types.get("numeric"),
"string": dialect_types.get("name"),
BOOLEAN: dialect_types.get("boolean"),
INTERVAL: dialect_types.get("interval"),
NUMERIC: dialect_types.get("numeric"),
STRING: dialect_types.get("name"),
# Custom Mathesar types
"email": dialect_types.get(email.QUALIFIED_EMAIL)
EMAIL: dialect_types.get(email.QUALIFIED_EMAIL)
}
return {k: v for k, v in type_map.items() if v is not None}

Expand Down Expand Up @@ -79,8 +80,8 @@ def create_boolean_casts(engine):
DECLARE
istrue {BOOLEAN};
BEGIN
SELECT lower($1)='t' OR lower($1)='true' INTO istrue;
IF istrue OR lower($1)='f' OR lower($1)='false' THEN
SELECT lower($1)='t' OR lower($1)='true' OR $1='1' INTO istrue;
IF istrue OR lower($1)='f' OR lower($1)='false' OR $1='0' THEN
RETURN istrue;
END IF;
{not_bool_exception_str}
Expand Down Expand Up @@ -141,18 +142,18 @@ def create_email_casts(engine):
DOMAIN).
"""
type_body_map = {
EMAIL: """
email.QUALIFIED_EMAIL: """
BEGIN
RETURN $1;
END;
""",
TEXT: f"""
BEGIN
RETURN $1::{EMAIL};
RETURN $1::{email.QUALIFIED_EMAIL};
END;
""",
}
create_cast_functions(EMAIL, type_body_map, engine)
create_cast_functions(email.QUALIFIED_EMAIL, type_body_map, engine)


def create_interval_casts(engine):
Expand Down Expand Up @@ -209,7 +210,7 @@ def create_varchar_casts(engine):
RETURN $1::{VARCHAR};
END;
""",
EMAIL: f"""
email.QUALIFIED_EMAIL: f"""
BEGIN
RETURN $1::{VARCHAR};
END;
Expand Down
74 changes: 74 additions & 0 deletions db/types/inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import logging
from sqlalchemy import MetaData, Table
from sqlalchemy import VARCHAR, TEXT, Text
from sqlalchemy.exc import DatabaseError
from db.types import alteration

logger = logging.getLogger(__name__)

TYPE_INFERENCE_DAG = {
alteration.BOOLEAN: [],
alteration.EMAIL: [],
alteration.INTERVAL: [],
alteration.NUMERIC: [
alteration.BOOLEAN,
],
alteration.STRING: [
alteration.BOOLEAN,
alteration.NUMERIC,
alteration.INTERVAL,
alteration.EMAIL,
],
}


class DagCycleError(Exception):
pass


def infer_column_type(
schema,
table_name,
column_name,
engine,
depth=0,
type_inference_dag=TYPE_INFERENCE_DAG,
):
if depth > 100:
raise DagCycleError("The type_inference_dag likely has a cycle")
supported_types = alteration.get_supported_alter_column_types(engine)
reverse_type_map = {
Text: alteration.STRING,
TEXT: alteration.STRING,
VARCHAR: alteration.STRING,
}
reverse_type_map.update({v: k for k, v in supported_types.items()})
with engine.begin():
metadata = MetaData(bind=engine, schema=schema)
column_type = Table(
table_name, metadata, schema=schema, autoload_with=engine,
).columns[column_name].type.__class__
column_type_str = reverse_type_map.get(column_type)
logger.debug(f"column_type_str: {column_type_str}")
for type_str in type_inference_dag.get(column_type_str, []):
try:
alteration.alter_column_type(
schema, table_name, column_name, type_str, engine,
)
logger.info(f"Column {column_name} altered to type {type_str}")
column_type = infer_column_type(
schema,
table_name,
column_name,
engine,
depth=depth + 1,
type_inference_dag=type_inference_dag,
)
break
# It's expected we catch this error when the test to see whether
# a type is appropriate for a column fails.
except DatabaseError:
logger.info(
f"Cannot alter column {column_name} to type {type_str}"
)
return column_type

0 comments on commit 5e8b206

Please sign in to comment.