feat: support nulls in the csv uploads (apache#10208)
* Support more table properties for the Hive upload

Refactor

Add tests and refactor them to be pytest-friendly

Use lowercase table names

Ignore isort

* Use SQL params

Co-authored-by: bogdan kyryliuk <[email protected]>
bkyryliuk and bogdan-dbx authored Jul 6, 2020
1 parent 318e534 commit 84f8a51
Showing 10 changed files with 337 additions and 172 deletions.
1 change: 0 additions & 1 deletion pytest.ini
@@ -15,7 +15,6 @@
# limitations under the License.
#
[pytest]
addopts = -ra -q
testpaths =
tests
python_files = *_test.py test_*.py *_tests.py
4 changes: 4 additions & 0 deletions superset/config.py
@@ -35,6 +35,7 @@
from dateutil import tz
from flask import Blueprint
from flask_appbuilder.security.manager import AUTH_DB
from pandas.io.parsers import STR_NA_VALUES

from superset.jinja_context import ( # pylint: disable=unused-import
BaseTemplateProcessor,
@@ -622,6 +623,9 @@ class CeleryConfig: # pylint: disable=too-few-public-methods
UPLOADED_CSV_HIVE_NAMESPACE
] if UPLOADED_CSV_HIVE_NAMESPACE else []

# Values that should be treated as nulls for the csv uploads.
CSV_DEFAULT_NA_NAMES = list(STR_NA_VALUES)

# A dictionary of items that gets merged into the Jinja context for
# SQL Lab. The existing context gets updated with this dictionary,
# meaning values for existing keys get overwritten by the content of this
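The new CSV_DEFAULT_NA_NAMES option defaults to pandas' built-in NA token set (STR_NA_VALUES, which includes values such as "", "N/A", "NaN", and "null"). A minimal sketch of how a deployment could narrow it in superset_config.py; the override values below are illustrative, not part of this commit:

# superset_config.py (hypothetical override)
# Treat only empty strings and the literal "NULL" as nulls for CSV uploads.
CSV_DEFAULT_NA_NAMES = ["", "NULL"]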
58 changes: 48 additions & 10 deletions superset/db_engine_specs/hive.py
@@ -106,6 +106,45 @@ def fetch_data(cls, cursor: Any, limit: int) -> List[Tuple[Any, ...]]:
except pyhive.exc.ProgrammingError:
return []

@classmethod
def get_create_table_stmt( # pylint: disable=too-many-arguments
cls,
table: Table,
schema_definition: str,
location: str,
delim: str,
header_line_count: Optional[int],
null_values: Optional[List[str]],
) -> text:
tblproperties = []
# available options:
# https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL
# TODO(bkyryliuk): figure out what to do with the skip rows field.
params: Dict[str, str] = {
"delim": delim,
"location": location,
}
if header_line_count is not None and header_line_count >= 0:
header_line_count += 1
tblproperties.append("'skip.header.line.count'=':header_line_count'")
params["header_line_count"] = str(header_line_count)
if null_values:
# hive only supports 1 value for the null format
tblproperties.append("'serialization.null.format'=':null_value'")
params["null_value"] = null_values[0]

if tblproperties:
tblproperties_stmt = f"tblproperties ({', '.join(tblproperties)})"
sql = f"""CREATE TABLE {str(table)} ( {schema_definition} )
ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
STORED AS TEXTFILE LOCATION :location
{tblproperties_stmt}"""
else:
sql = f"""CREATE TABLE {str(table)} ( {schema_definition} )
ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
STORED AS TEXTFILE LOCATION :location"""
return sql, params

@classmethod
def create_table_from_csv( # pylint: disable=too-many-arguments, too-many-locals
cls,
@@ -182,18 +221,17 @@ def convert_to_hive_type(col_type: str) -> str:
bucket_path,
os.path.join(upload_prefix, table.table, os.path.basename(filename)),
)
sql = text(
f"""CREATE TABLE {str(table)} ( {schema_definition} )
ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
STORED AS TEXTFILE LOCATION :location
tblproperties ('skip.header.line.count'='1')"""

sql, params = cls.get_create_table_stmt(
table,
schema_definition,
location,
csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
int(csv_to_df_kwargs.get("header", 0)),
csv_to_df_kwargs.get("na_values"),
)
engine = cls.get_engine(database)
engine.execute(
sql,
delim=csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
location=location,
)
engine.execute(text(sql), **params)

@classmethod
def convert_dttm(cls, target_type: str, dttm: datetime) -> Optional[str]:
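For reference, a rough sketch of what get_create_table_stmt returns when a header row and a single null token are supplied. The table name, schema, and S3 location are made up, and only the first entry of null_values is used because Hive's serialization.null.format accepts a single value:

from superset.db_engine_specs.hive import HiveEngineSpec
from superset.sql_parse import Table

# Hypothetical inputs, chosen only to illustrate the output shape.
sql, params = HiveEngineSpec.get_create_table_stmt(
    Table("mytable"),
    "a STRING, b BIGINT",
    "s3a://some-bucket/uploads/mytable/data.csv",
    ",",
    0,     # header on row 0 -> one header line to skip
    [""],  # treat empty strings as null
)
# sql resembles:
#   CREATE TABLE mytable ( a STRING, b BIGINT )
#   ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
#   STORED AS TEXTFILE LOCATION :location
#   tblproperties ('skip.header.line.count'=':header_line_count', 'serialization.null.format'=':null_value')
# params == {"delim": ",", "location": "s3a://some-bucket/uploads/mytable/data.csv",
#            "header_line_count": "1", "null_value": ""}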
15 changes: 15 additions & 0 deletions superset/forms.py
@@ -15,12 +15,27 @@
# specific language governing permissions and limitations
# under the License.
"""Contains the logic to create cohesive forms on the explore view"""
import json
from typing import Any, List, Optional

from flask_appbuilder.fieldwidgets import BS3TextFieldWidget
from wtforms import Field


class JsonListField(Field):
widget = BS3TextFieldWidget()
data: List[str] = []

def _value(self) -> str:
return json.dumps(self.data)

def process_formdata(self, valuelist: List[str]) -> None:
if valuelist and valuelist[0]:
self.data = json.loads(valuelist[0])
else:
self.data = []


class CommaSeparatedListField(Field):
widget = BS3TextFieldWidget()
data: List[str] = []
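A quick sketch of how the new JsonListField round-trips form input; the demo form and the sample values are hypothetical, not taken from the commit:

from wtforms import Form

from superset.forms import JsonListField


class NullValuesDemoForm(Form):  # illustration only
    null_values = JsonListField("Null values")


form = NullValuesDemoForm()
form.null_values.process_formdata(['["", "N/A"]'])  # raw text typed into the field
assert form.null_values.data == ["", "N/A"]
assert form.null_values._value() == '["", "N/A"]'

form.null_values.process_formdata([""])  # empty input falls back to an empty list
assert form.null_values.data == []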
26 changes: 25 additions & 1 deletion superset/views/database/forms.py
@@ -26,7 +26,11 @@
from wtforms.validators import DataRequired, Length, NumberRange, Optional

from superset import app, db, security_manager
from superset.forms import CommaSeparatedListField, filter_not_empty_values
from superset.forms import (
CommaSeparatedListField,
filter_not_empty_values,
JsonListField,
)
from superset.models.core import Database

config = app.config
@@ -210,6 +214,16 @@ def at_least_one_schema_is_allowed(database: Database) -> bool:
validators=[Optional()],
widget=BS3TextFieldWidget(),
)
null_values = JsonListField(
_("Null values"),
default=config["CSV_DEFAULT_NA_NAMES"],
description=_(
"Json list of the values that should be treated as null. "
'Examples: [""], ["None", "N/A"], ["nan", "null"]. '
"Warning: Hive database supports only single value. "
'Use [""] for empty string.'
),
)


class ExcelToDatabaseForm(DynamicForm):
@@ -376,3 +390,13 @@ def at_least_one_schema_is_allowed(database: Database) -> bool:
validators=[Optional()],
widget=BS3TextFieldWidget(),
)
null_values = JsonListField(
_("Null values"),
default=config["CSV_DEFAULT_NA_NAMES"],
description=_(
"Json list of the values that should be treated as null. "
'Examples: [""], ["None", "N/A"], ["nan", "null"]. '
"Warning: Hive database supports only single value. "
'Use [""] for empty string.'
),
)
9 changes: 9 additions & 0 deletions superset/views/database/views.py
@@ -149,6 +149,9 @@ def form_post(self, form: CsvToDatabaseForm) -> Response:
database = (
db.session.query(models.Database).filter_by(id=con.data.get("id")).one()
)

# More can be found here:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
csv_to_df_kwargs = {
"sep": form.sep.data,
"header": form.header.data if form.header.data else 0,
@@ -162,6 +165,12 @@ def form_post(self, form: CsvToDatabaseForm) -> Response:
"infer_datetime_format": form.infer_datetime_format.data,
"chunksize": 1000,
}
if form.null_values.data:
csv_to_df_kwargs["na_values"] = form.null_values.data
csv_to_df_kwargs["keep_default_na"] = False

# More can be found here:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html
df_to_sql_kwargs = {
"name": csv_table.table,
"if_exists": form.if_exists.data,
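Roughly what the two new pandas keyword arguments change during parsing; the inline CSV mirrors the fixture used in the tests below:

import io

import pandas as pd

csv_data = "b,c,d\njohn,1,x\npaul,2,"

# With user-supplied na_values and keep_default_na=False, only those tokens become NaN.
df = pd.read_csv(io.StringIO(csv_data), na_values=["", "john"], keep_default_na=False)
# df.values -> [[nan, 1, 'x'], ['paul', 2, nan]]

# Without the override, pandas falls back to its default NA token set,
# so only the empty trailing cell becomes NaN.
df_default = pd.read_csv(io.StringIO(csv_data))
# df_default.values -> [['john', 1, 'x'], ['paul', 2, nan]]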
2 changes: 1 addition & 1 deletion tests/base_tests.py
@@ -27,8 +27,8 @@
from flask_testing import TestCase
from sqlalchemy.orm import Session

from tests.test_app import app
from superset.sql_parse import CtasMethod
from tests.test_app import app # isort:skip
from superset import db, security_manager
from superset.connectors.base.models import BaseDatasource
from superset.connectors.druid.models import DruidCluster, DruidDatasource
21 changes: 19 additions & 2 deletions tests/core_tests.py
@@ -916,12 +916,12 @@ def test_import_csv_explore_database(self):

def test_import_csv(self):
self.login(username="admin")
table_name = "".join(random.choice(string.ascii_uppercase) for _ in range(5))
table_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5))

f1 = "testCSV.csv"
self.create_sample_csvfile(f1, ["a,b", "john,1", "paul,2"])
f2 = "testCSV2.csv"
self.create_sample_csvfile(f2, ["b,c,d", "john,1,x", "paul,2,y"])
self.create_sample_csvfile(f2, ["b,c,d", "john,1,x", "paul,2,"])
self.enable_csv_upload(utils.get_example_database())

try:
@@ -957,6 +957,23 @@ def test_import_csv(self):
table = self.get_table_by_name(table_name)
# make sure the new column name is reflected in the table metadata
self.assertIn("d", table.column_names)

# null values are set
self.upload_csv(
f2,
table_name,
extra={"null_values": '["", "john"]', "if_exists": "replace"},
)
# make sure that john and empty string are replaced with None
data = db.session.execute(f"SELECT * from {table_name}").fetchall()
assert data == [(None, 1, "x"), ("paul", 2, None)]

# default null values
self.upload_csv(f2, table_name, extra={"if_exists": "replace"})
# make sure that john and empty string are replaced with None
data = db.session.execute(f"SELECT * from {table_name}").fetchall()
assert data == [("john", 1, "x"), ("paul", 2, None)]

finally:
os.remove(f1)
os.remove(f2)
6 changes: 3 additions & 3 deletions tests/db_engine_specs/base_tests.py
@@ -14,13 +14,13 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# isort:skip_file
from datetime import datetime

from tests.test_app import app
from tests.base_tests import SupersetTestCase
from superset.db_engine_specs.mysql import MySQLEngineSpec
from superset.models.core import Database
from tests.base_tests import SupersetTestCase

from tests.test_app import app # isort:skip


class TestDbEngineSpec(SupersetTestCase):